In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Source table for every analysis cell below.
EVENTS_TABLE = "silver.events_part"
events = spark.table(EVENTS_TABLE)


In [0]:
# Summary statistics for the price column only.
price_summary = events.select(F.col("price")).describe()
price_summary.show()

In [0]:
# Spark's dayofweek() numbers days from 1 (Sunday) to 7 (Saturday),
# so the weekend is the pair (1, 7).
SUNDAY, SATURDAY = 1, 7
weekend_flag = F.dayofweek(F.col("event_date")).isin(SUNDAY, SATURDAY)

# Same events, plus a boolean is_weekend column.
events_wd = events.withColumn("is_weekend", weekend_flag)


In [0]:
# View -> purchase conversion, split by weekend vs. weekday.
#
# Each event contributes 1 to the counter matching its event_type and 0
# otherwise, so the sums are plain event counts per group.
view_count = F.sum(F.when(F.col("event_type") == "view", 1).otherwise(0))
purchase_count = F.sum(F.when(F.col("event_type") == "purchase", 1).otherwise(0))

conversion = (
    events_wd
    .groupBy("is_weekend")
    .agg(
        view_count.alias("views"),
        purchase_count.alias("purchases"),
    )
    .withColumn(
        "conversion_rate",
        # Guard the division: a group with no view events would otherwise
        # divide by zero, which raises under ANSI SQL mode. Such groups get
        # a NULL rate instead (when() without otherwise() yields NULL).
        F.when(
            F.col("views") > 0,
            F.col("purchases") / F.col("views") * 100,
        ),
    )
)

conversion.show()


In [0]:
# 1 for purchase events, 0 for everything else (including a NULL event_type,
# which falls through to the otherwise() branch).
purchase_indicator = (
    F.when(F.col("event_type") == "purchase", F.lit(1))
    .otherwise(F.lit(0))
)

# Numeric is_purchase column so it can be correlated with price.
events_corr = events.withColumn("is_purchase", purchase_indicator)


In [0]:
# Pearson correlation between price and the purchase indicator;
# left as the cell's last expression so the notebook displays the value.
events_corr.corr("price", "is_purchase", method="pearson")

In [0]:
# Calendar features derived from the event timestamp and date.
event_hour = F.hour(F.col("event_time"))
event_weekday = F.dayofweek(F.col("event_date"))

features = events.withColumn("hour", event_hour).withColumn(
    "day_of_week", event_weekday
)


In [0]:
# Log-transform price as log(1 + price), so a price of 0 maps to 0.
# log1p is the dedicated function for this: it avoids the precision loss of
# forming (price + 1) explicitly when price is very small.
features = features.withColumn(
    "price_log",
    F.log1p(F.col("price"))
)


In [0]:
# Per-user window ordered chronologically.
window = Window.partitionBy("user_id").orderBy("event_time")

# Seconds elapsed between each event and the user's earliest event.
#
# Ascending order places NULL event_time rows first, so a plain first()
# would return NULL for a user with any NULL timestamp and null out the
# feature for all of that user's rows. ignorenulls=True makes first() pick
# the earliest non-NULL timestamp instead. Rows whose own event_time is NULL
# still get a NULL result.
first_event_time = F.first("event_time", ignorenulls=True).over(window)

features = features.withColumn(
    "time_since_first_event",
    F.unix_timestamp("event_time") - F.unix_timestamp(first_event_time)
)


In [0]:
# Preview the engineered features next to their source columns.
preview_columns = [
    "user_id",
    "event_time",
    "hour",
    "day_of_week",
    "price",
    "price_log",
    "time_since_first_event",
]
features.select(*preview_columns).show(n=10)
