In [0]:
# Load the November 2019 e-commerce event log from the workspace volume.
# The first CSV row supplies the column names; Spark infers the column types.
events = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv")
)

# Preview the first five rows.
events.show(n=5)


+-------------------+----------+----------+-------------------+--------------------+------+------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code| brand| price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+------+------+---------+--------------------+
|2019-11-01 00:00:00|      view|   1003461|2053013555631882655|electronics.smart...|xiaomi|489.07|520088904|4d3b30da-a5e4-49d...|
|2019-11-01 00:00:00|      view|   5000088|2053013566100866035|appliances.sewing...|janome|293.65|530496790|8e5f4f83-366c-4f7...|
|2019-11-01 00:00:01|      view|  17302664|2053013553853497655|                NULL| creed| 28.31|561587266|755422e7-9040-477...|
|2019-11-01 00:00:01|      view|   3601530|2053013563810775923|appliances.kitche...|    lg|712.87|518085591|3bfb58cd-7892-48c...|
|2019-11-01 00:00:01|      view|   1004775|2053013555631882655|electronics.smart...|xiaomi

In [0]:
# Complex joins: attach each user's total event count to every event row.

# One row per user_id; groupBy(...).count() puts the tally in a "count" column.
user_counts = events.groupBy("user_id").count()

# Left join on user_id so every row of `events` is kept in the result.
events_joined = events.join(user_counts, on="user_id", how="left")

events_joined.show(n=5)



+---------+-------------------+----------+----------+-------------------+--------------------+-------+------+--------------------+-----+
|  user_id|         event_time|event_type|product_id|        category_id|       category_code|  brand| price|        user_session|count|
+---------+-------------------+----------+----------+-------------------+--------------------+-------+------+--------------------+-----+
|529230007|2019-11-15 19:54:53|      view|  26400470|2053013563651392361|                NULL|   NULL|189.71|42d36263-d0b8-4be...| 1400|
|536833043|2019-11-03 01:28:51|      view|   1801690|2053013554415534427|electronics.video.tv|samsung|369.45|d9fc4bb1-c017-4d9...|   20|
|514728400|2019-11-16 04:07:31|      view|   1005116|2053013555631882655|electronics.smart...|  apple|979.43|ad009ce1-e256-4f6...|   42|
|532284031|2019-11-01 15:35:08|      view|   1004836|2053013555631882655|electronics.smart...|samsung| 229.9|fdb71ebe-d19e-429...|   20|
|529475219|2019-11-26 17:41:44|      view

In [0]:
# Window functions: per-user running total of price, ordered by event time.
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# One partition per user, ordered chronologically. No explicit frame is set,
# so Spark's default frame for an ordered window applies (range-based, from
# unbounded preceding to the current row): rows of the same user that share
# an event_time therefore receive the same running_total.
window = Window.partitionBy("user_id").orderBy("event_time")

# Spark's sum is reached through the module alias `F` rather than imported
# by name, so Python's built-in sum() is not overwritten for later cells.
events.withColumn(
    "running_total",
    F.sum("price").over(window),
).show(5)


+-------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+------------------+
|         event_time|event_type|product_id|        category_id|       category_code|   brand| price|  user_id|        user_session|     running_total|
+-------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+------------------+
|2019-11-29 14:47:32|      view|   1004740|2053013555631882655|electronics.smart...|  xiaomi|239.36| 94584874|e6abb356-512a-447...|            239.36|
|2019-11-26 05:31:47|      view|   4900378|2053013555220840837|appliances.kitche...|scarlett|112.99|122384079|c04d12ef-da1c-4e4...|            112.99|
|2019-11-28 08:25:09|      view|  12702958|2053013553559896355|                NULL|cordiant| 42.47|122384079|6cee7edb-68ae-4be...|155.45999999999998|
|2019-11-28 08:32:51|      view|   4900173|2053013555220840837|appliances.kitche...|moulinex|1

In [0]:
# Peek at the first ten values of the price column.
events.select(events["price"]).show(n=10)


+------+
| price|
+------+
|489.07|
|293.65|
| 28.31|
|712.87|
|183.27|
|360.09|
|514.56|
| 30.86|
| 72.72|
|732.07|
+------+
only showing top 10 rows


In [0]:

# Derived features: flag events whose price is above 200.
from pyspark.sql.functions import when

# 1 when price > 200, otherwise 0.
is_high_value = when(events["price"] > 200, 1).otherwise(0)

events_features = events.withColumn("high_value_txn", is_high_value)
events_features.show()


+-------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+--------------+
|         event_time|event_type|product_id|        category_id|       category_code|   brand| price|  user_id|        user_session|high_value_txn|
+-------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+--------------+
|2019-11-01 00:00:00|      view|   1003461|2053013555631882655|electronics.smart...|  xiaomi|489.07|520088904|4d3b30da-a5e4-49d...|             1|
|2019-11-01 00:00:00|      view|   5000088|2053013566100866035|appliances.sewing...|  janome|293.65|530496790|8e5f4f83-366c-4f7...|             1|
|2019-11-01 00:00:01|      view|  17302664|2053013553853497655|                NULL|   creed| 28.31|561587266|755422e7-9040-477...|             0|
|2019-11-01 00:00:01|      view|   3601530|2053013563810775923|appliances.kitche...|      lg|712.87|518085591|3bfb58cd