In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
# Load the `silver.events` table; the cells below build every summary and
# feature from this DataFrame.
# NOTE(review): `spark` is not created in this notebook — presumably the
# session object injected by the runtime; confirm.
events = spark.table("silver.events")


In [0]:
# Summary statistics (count, mean, stddev, min, max) for the price column.
events.describe("price").show()


+-------+------------------+
|summary|             price|
+-------+------------------+
|  count|          67501979|
|   mean|292.45931656464654|
| stddev|355.67449958606727|
|    min|               0.0|
|    max|           2574.07|
+-------+------------------+



In [0]:
# Price profile per event type: average price, maximum price and row count.
price_by_event_type = events.groupBy("event_type").agg(
    F.avg("price").alias("avg_price"),
    F.max("price").alias("max_price"),
    F.count("*").alias("event_count"),
)
price_by_event_type.show()


+----------+------------------+---------+-----------+
|event_type|         avg_price|max_price|event_count|
+----------+------------------+---------+-----------+
|  purchase| 300.1234438714189|  2574.07|     916939|
|      cart|289.97343027076136|  2574.07|    3028930|
|      view|292.46721570047384|  2574.07|   63556110|
+----------+------------------+---------+-----------+



In [0]:
# Add the calendar date of each event and a boolean weekend flag.
# Spark's dayofweek() numbers days 1 = Sunday ... 7 = Saturday, so {1, 7}
# selects the weekend.
event_date_col = F.to_date("event_time")
events = (
    events
    .withColumn("event_date", event_date_col)
    .withColumn("is_weekend", F.dayofweek("event_date").isin(1, 7))
)


In [0]:
# Event volume broken down by weekend flag and event type.
weekend_event_counts = events.groupBy("is_weekend", "event_type").count()
weekend_event_counts.show()


+----------+----------+--------+
|is_weekend|event_type|   count|
+----------+----------+--------+
|      true|      view|23102117|
|      true|      cart| 1229688|
|      true|  purchase|  416681|
|     false|  purchase|  500258|
|     false|      view|40453993|
|     false|      cart| 1799242|
+----------+----------+--------+



In [0]:
# Pearson correlation coefficient between price and product_id
# (DataFrame.corr is an alias of DataFrame.stat.corr).
# NOTE(review): product_id looks like an identifier rather than a quantity, so
# this coefficient may not be meaningful — confirm the intended column.
events.corr("price", "product_id")


-0.184331120526865

In [0]:
# Time-based features: hour of day and day of week of each event.
# day_of_week is derived from the event_date column added to `events` earlier.
hour_of_day = F.hour("event_time")
weekday_index = F.dayofweek("event_date")
features = (
    events
    .withColumn("hour", hour_of_day)
    .withColumn("day_of_week", weekday_index)
)


In [0]:
# Price transformation: price_log = ln(1 + price).
# The +1 shift keeps zero prices finite (the observed minimum price is 0.0).
# log1p() is the built-in for this expression and is numerically more accurate
# than log(price + 1) when price is close to zero.
features = features.withColumn("price_log", F.log1p("price"))


In [0]:
# User behaviour feature: seconds elapsed between each event and the earliest
# event of the same user.

# Per-user window ordered by event time. It is no longer used by this cell but
# is left defined because it is a notebook-level name other cells may rely on.
window_spec = Window.partitionBy("user_id").orderBy("event_time")

# Take the earliest timestamp with min() over the whole user partition instead
# of first() over the ordered window: the result does not depend on the
# implicit running frame of an ordered window, and min() skips NULL event
# times, whereas first() would return NULL for every row of a user that has
# any NULL event_time (NULLs sort first in ascending order).
user_partition = Window.partitionBy("user_id")
first_event_time = F.min("event_time").over(user_partition)

# unix_timestamp() yields seconds, so the difference is expressed in seconds.
features = features.withColumn(
    "time_since_first_event",
    F.unix_timestamp("event_time") - F.unix_timestamp(first_event_time),
)


In [0]:
# Preview the engineered features next to the columns they were derived from.
preview_columns = [
    "user_id",
    "event_time",
    "hour",
    "day_of_week",
    "price",
    "price_log",
    "time_since_first_event",
]
features.select(*preview_columns).show(5)


+---------+-------------------+----+-----------+------+------------------+----------------------+
|  user_id|         event_time|hour|day_of_week| price|         price_log|time_since_first_event|
+---------+-------------------+----+-----------+------+------------------+----------------------+
| 65800726|2019-11-27 04:33:16|   4|          4|  81.8| 4.416428061391214|                     0|
| 65800726|2019-11-27 04:35:24|   4|          4|  81.8| 4.416428061391214|                   128|
| 81255481|2019-11-08 07:44:45|   7|          6| 66.35| 4.209902902856373|                     0|
| 81255481|2019-11-21 14:11:26|  14|          5| 66.14| 4.206779991551889|               1146401|
|106416780|2019-11-28 05:43:46|   5|          5|256.53|5.5511362181719965|                     0|
+---------+-------------------+----+-----------+------+------------------+----------------------+
only showing top 5 rows
