In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import *

In [0]:
from datetime import datetime, timedelta

# All synthetic events share a single timestamp: 00:00:00 on the day before
# the date reported by datetime.now().
yesterday = datetime.now().date() - timedelta(days=1)
event_ts = datetime.combine(yesterday, datetime.min.time())

# (user_id, product_id, event_type, price); the shared timestamp is appended below.
raw_events = [
    (1, "A", "view", 100.0),
    (2, "A", "purchase", 100.0),
    (3, "B", "view", 200.0),
    (4, "B", "purchase", 200.0),
    (5, "C", "view", 150.0),
]
data = [event + (event_ts,) for event in raw_events]

# Explicit schema so column types do not depend on inference from the tuples.
column_types = [
    ("user_id", IntegerType()),
    ("product_id", StringType()),
    ("event_type", StringType()),
    ("price", DoubleType()),
    ("event_time", TimestampType()),
]
schema = StructType([StructField(name, dtype) for name, dtype in column_types])

events_df = spark.createDataFrame(data, schema)
events_df.display()

user_id,product_id,event_type,price,event_time
1,A,view,100.0,2026-01-23T00:00:00.000Z
2,A,purchase,100.0,2026-01-23T00:00:00.000Z
3,B,view,200.0,2026-01-23T00:00:00.000Z
4,B,purchase,200.0,2026-01-23T00:00:00.000Z
5,C,view,150.0,2026-01-23T00:00:00.000Z


# Feature Engineering

In [0]:
# Row-level features appended to every event:
#   is_purchase - 1 when event_type is "purchase", otherwise 0
#   log_price   - log(1 + price)
#   event_hour  - hour component of event_time
purchase_flag = F.when(F.col("event_type") == "purchase", 1).otherwise(0)

fe_df = events_df.select(
    "*",
    purchase_flag.alias("is_purchase"),
    F.log1p("price").alias("log_price"),
    F.hour("event_time").alias("event_hour"),
)

fe_df.display()


user_id,product_id,event_type,price,event_time,is_purchase,log_price,event_hour
1,A,view,100.0,2026-01-23T00:00:00.000Z,0,4.61512051684126,0
2,A,purchase,100.0,2026-01-23T00:00:00.000Z,1,4.61512051684126,0
3,B,view,200.0,2026-01-23T00:00:00.000Z,0,5.303304908059076,0
4,B,purchase,200.0,2026-01-23T00:00:00.000Z,1,5.303304908059076,0
5,C,view,150.0,2026-01-23T00:00:00.000Z,0,5.017279836814924,0


# Aggregate to model-ready features

In [0]:
# One output row per product_id, summarising all of that product's events.
product_aggregates = [
    F.avg("price").alias("avg_price"),
    F.avg("log_price").alias("avg_log_price"),
    F.sum("is_purchase").alias("total_purchases"),  # sum of the 0/1 purchase flag
    F.count("*").alias("total_events"),             # every event row, any type
]

features_df = fe_df.groupBy("product_id").agg(*product_aggregates)

features_df.display()


product_id,avg_price,avg_log_price,total_purchases,total_events
A,100.0,4.61512051684126,1,2
B,200.0,5.303304908059076,1,2
C,150.0,5.017279836814924,0,1


# Prepare ML features

In [0]:
from pyspark.ml.feature import VectorAssembler

# Two candidate feature vectors that differ only in how price is represented:
# v1 uses the raw average price, v2 uses the average of log1p(price).
raw_price_inputs = ["avg_price", "total_events"]
log_price_inputs = ["avg_log_price", "total_events"]

assembler = VectorAssembler(outputCol="features_v1", inputCols=raw_price_inputs)
assembler_log = VectorAssembler(outputCol="features_v2", inputCols=log_price_inputs)

# Each assembler appends its vector column to the aggregated per-product frame.
df_v1 = assembler.transform(features_df)
df_v2 = assembler_log.transform(features_df)


# Train & Compare Models

In [0]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

LABEL_COL = "total_purchases"

# A single unfitted estimator and a single metric are shared by both feature sets.
lr = LinearRegression(labelCol=LABEL_COL)

evaluator = RegressionEvaluator(
    labelCol=LABEL_COL,
    metricName="rmse"
)


def fit_and_score(assembled_df, features_col):
    """Fit `lr` on one assembled feature column and score it on the same rows.

    Args:
        assembled_df: DataFrame containing `features_col` and the label column.
        features_col: Name of the vector column to train on.

    Returns:
        (model, rmse): the fitted model and its RMSE computed on `assembled_df`.
    """
    # `lr` is built without a featuresCol override, so the vector column is
    # renamed to the default name "features" before fitting and scoring.
    renamed_df = assembled_df.withColumnRenamed(features_col, "features")
    model = lr.fit(renamed_df.select("features", LABEL_COL))
    return model, evaluator.evaluate(model.transform(renamed_df))


# NOTE(review): both scores are in-sample (each model is evaluated on the rows
# it was fitted on). features_df holds only three product rows and each model
# has two inputs, so a near-zero RMSE is presumably an exact fit rather than
# evidence for or against the log feature. Confirm on held-out data.
model_v1, rmse_v1 = fit_and_score(df_v1, "features_v1")
model_v2, rmse_v2 = fit_and_score(df_v2, "features_v2")

print(f"RMSE without log feature : {rmse_v1}")
print(f"RMSE with log feature    : {rmse_v2}")


RMSE without log feature : 1.3260853798135218e-15
RMSE with log feature    : 1.6764000044290905e-15
