In [0]:
%python

df = spark.read.csv(
    "/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv",
    header=True,
    inferSchema=True
)

display(df)


In [0]:
# Imports come first: the cast below uses `F`, so importing it at the bottom of
# the cell (as before) raises NameError when the notebook runs on a fresh kernel.
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Inspect the schema and a small sample before transforming anything.
events.printSchema()
events.show(5)

# Force `price` to double so later sums and comparisons are numeric.
events = events.withColumn("price", F.col("price").cast("double"))


In [0]:
# Per-user window ordered by event time.
# NOTE(review): no explicit frame is given here. Per the PySpark Window docs an
# ordered window defaults to a RANGE frame, so rows that share an event_time
# would receive the same count - confirm that is intended, or use a ROWS frame.
window = Window.partitionBy("user_id").orderBy("event_time")

# Preview only: the resulting DataFrame is not assigned, so `events` is unchanged.
# (A stray `F.sum("price")` expression that built an unused Column was removed.)
events.withColumn(
    "running_events",
    F.count("*").over(window)
)


In [0]:
# Five (product_id, brand) pairs with the highest summed purchase price.
revenue = (
    events
    .where(F.col("event_type") == "purchase")
    .groupBy("product_id", "brand")
    .agg(F.sum(F.col("price").cast("double")).alias("revenue"))
    .orderBy(F.col("revenue").desc())
    .limit(5)
)

display(revenue)


In [0]:
# Per-user window ordered by event time (no explicit frame specified).
window = (
    Window
    .partitionBy("user_id")
    .orderBy("event_time")
)

# Preview only: the new frame is not assigned to a variable.
events.withColumn(
    "cumulative_events",
    F.count("*").over(window),
)

In [0]:
# Per-user window ordered by event time, with an explicit ROWS frame running
# from the first row of the partition up to and including the current row.
window_spec = (
    Window
    .partitionBy("user_id")
    .orderBy("event_time")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)


In [0]:
# Row count over `window_spec`: for each row, the number of rows in its
# user's partition up to and including that row.
running_count = F.count("*").over(window_spec)

events_with_running = events.withColumn("cumulative_events", running_count)

display(events_with_running)


In [0]:
# Total purchase spend per user.
user_spending = (
    events
    .where(F.col("event_type") == "purchase")
    .groupBy("user_id")
    .agg(F.sum("price").alias("total_spent"))
)

# Global ranking by spend, highest first. The window has no partitionBy, so the
# rank is computed across all users at once.
rank_window = Window.orderBy(F.col("total_spent").desc())

user_rank = user_spending.withColumn("rank", F.rank().over(rank_window))

display(user_rank)


In [0]:
from pyspark.sql import functions as F

# Per-category tallies: each row contributes 1 when its event_type matches, else 0.
event_type_col = F.col("event_type")
view_total = F.sum(F.when(event_type_col == "view", 1).otherwise(0)).alias("view")
purchase_total = F.sum(F.when(event_type_col == "purchase", 1).otherwise(0)).alias("purchase")

# Purchases per view as a percentage; categories with zero views get 0 rather
# than dividing by zero.
rate_pct = (
    F.when(F.col("view") == 0, 0)
     .otherwise((F.col("purchase") / F.col("view")) * 100)
)

conversion = (
    events
    .groupBy("category_code")
    .agg(view_total, purchase_total)
    .withColumn("conversion_rate", rate_pct)
)

display(conversion)

In [0]:
def order_category(price, low_max=1000, medium_max=5000):
    """Bucket a price into "Low", "Medium" or "High".

    Args:
        price: Numeric price, or None when the value is missing.
        low_max: Prices strictly below this value are "Low".
        medium_max: Prices from ``low_max`` up to and including this value
            are "Medium"; anything above is "High".

    Returns:
        The bucket label as a string, or None when ``price`` is None.
    """
    # Comparing None with a number raises TypeError in Python 3, which would
    # fail the whole job when this runs as a UDF over a null price.
    if price is None:
        return None
    if price < low_max:
        return "Low"
    if price <= medium_max:
        return "Medium"
    return "High"


from pyspark.sql.types import StringType
from pyspark.sql.functions import udf

# Wrap the plain-Python bucketing function as a Spark UDF that returns strings.
order_category_udf = udf(order_category, returnType=StringType())

# Add the bucket label for each row's price; `events` is rebound to the result.
events = events.withColumn("order_category", order_category_udf("price"))

display(events)


In [0]:
# Derive calendar year and month columns from event_time.
events = (
    events
    .withColumn("event_year", F.year("event_time"))
    .withColumn("event_month", F.month("event_time"))
)

display(events)
