In [0]:
# Load data
# header=True takes column names from the first row; inferSchema=True asks
# Spark to derive column types from the data instead of reading all strings.
events = spark.read.csv("/path/to/sample.csv", header=True, inferSchema=True)

# Basic operations
# Preview three columns of the first 10 rows.
events.select("event_type", "product_name", "price").show(10)

# count() returns a plain int; in the middle of a cell it is not echoed,
# so bind it to a name and print it instead of silently discarding it.
high_price_count = events.filter("price > 100").count()
print(f"Events with price > 100: {high_price_count}")

# Number of events per event type.
events.groupBy("event_type").count().show()

# Five brands with the most events. The DataFrame is lazy, so it has to be
# shown explicitly for the result to be computed and visible.
top_brands = events.groupBy("brand").count().orderBy("count", ascending=False).limit(5)
top_brands.show()

In [0]:
# Load the October 2019 events file (header row + inferred column types).
events = spark.read.csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv", header=True, inferSchema=True)
# Basic operations
# Preview four columns of the first 10 rows.
events.select("event_type","category_id","category_code","price").show(10)

# count() returns a plain int; in the middle of a cell it is not echoed,
# so bind it to a name and print it instead of silently discarding it.
high_price_count = events.filter("price > 100").count()
print(f"Events with price > 100: {high_price_count}")

# Number of events per event type.
events.groupBy("event_type").count().show()

# Ten most expensive events.
events.orderBy("price", ascending=False).show(10)

# Five brands with the most events. The DataFrame is lazy, so it has to be
# shown explicitly for the result to be computed and visible.
top_brands = events.groupBy("brand").count().orderBy("count", ascending=False).limit(5)
top_brands.show()

In [0]:
%python

events.write.mode("overwrite").option("header", "true").csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct-export.csv")

In [0]:
%python
events.coalesce(1).write.mode("overwrite").option("header", "true").csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct-export")

In [0]:
# Load the November 2019 events file (header row + inferred column types).
events = spark.read.csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv", header=True, inferSchema=True)
# Basic operations
# Preview four columns of the first 10 rows.
events.select("event_type","category_id","category_code","price").show(10)

# count() returns a plain int; in the middle of a cell it is not echoed,
# so bind it to a name and print it instead of silently discarding it.
high_price_count = events.filter("price > 100").count()
print(f"Events with price > 100: {high_price_count}")

# Number of events per event type.
events.groupBy("event_type").count().show()

# Ten most expensive events.
events.orderBy("price", ascending=False).show(10)

# Five brands with the most events. The DataFrame is lazy, so it has to be
# shown explicitly for the result to be computed and visible.
top_brands = events.groupBy("brand").count().orderBy("count", ascending=False).limit(5)
top_brands.show()

# Collapse `events` to a single partition before writing so the export is
# produced by one task, then write it as CSV with a header row, replacing
# any previous output at the target path.
events.coalesce(1).write.csv(
    "/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov-export",
    mode="overwrite",
    header="true",
)

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Load the November 2019 events file (header row + inferred column types).
# NOTE(review): an earlier cell reads this same file into `events`; the read
# is kept here so this cell can run on its own.
events = spark.read.csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv", header=True, inferSchema=True)
# Basic operations
# Preview four columns of the first 10 rows.
events.select("event_type","category_id","category_code","price").show(10)

# count() returns a plain int; in the middle of a cell it is not echoed,
# so bind it to a name and print it instead of silently discarding it.
high_price_count = events.filter("price > 100").count()
print(f"Events with price > 100: {high_price_count}")

# Number of events per event type.
events.groupBy("event_type").count().show()

# Ten most expensive events.
events.orderBy("price", ascending=False).show(10)

# Five brands with the most events. The DataFrame is lazy, so it has to be
# shown explicitly for the result to be computed and visible.
top_brands = events.groupBy("brand").count().orderBy("count", ascending=False).limit(5)
top_brands.show()

# Top 5 products by revenue
# Only "purchase" events contribute to revenue.
# NOTE(review): the other cells that read these files never select a
# "product_name" column, so it may not exist in this dataset. The grouping
# key therefore keeps only the product columns that are actually present,
# which avoids an unresolved-column error while still using product_name
# when it is available.
product_keys = [name for name in ("product_id", "product_name") if name in events.columns]
revenue = (
    events.filter(F.col("event_type") == "purchase")
    .groupBy(*product_keys)
    .agg(F.sum("price").alias("revenue"))
    .orderBy(F.desc("revenue"))
    .limit(5)
)
# The DataFrame is lazy; show it so the ranking is computed and visible.
revenue.show()

# Running total per user
# Events are partitioned by user and ordered by event_time.
# NOTE(review): with an ordering and no explicit frame, Spark's default frame
# is range-based (unbounded preceding to current row), so events of one user
# that share the same event_time receive the same count — confirm this is
# the intended tie behaviour.
window = Window.partitionBy("user_id").orderBy("event_time")

# withColumn returns a NEW DataFrame (DataFrames are immutable), so the result
# must be bound to a name; a bare call would be discarded. A separate name is
# used so `events` itself keeps its original columns for later cells.
events_with_running_total = events.withColumn("cumulative_events", F.count("*").over(window))
events_with_running_total.select("user_id", "event_time", "cumulative_events").show(10)

# Conversion rate by category
# Per category_code, count "purchase" and "view" events, then express
# purchases as a percentage of views.
conversion = events.groupBy("category_code").agg(
    F.sum(F.when(F.col("event_type") == "purchase", 1).otherwise(0)).alias("purchase"),
    F.sum(F.when(F.col("event_type") == "view", 1).otherwise(0)).alias("view"),
).withColumn(
    "conversion_rate",
    # Guard against categories with zero views: dividing by zero raises an
    # error when ANSI SQL mode is enabled. With no otherwise() branch the
    # rate is NULL for those categories instead.
    F.when(F.col("view") > 0, (F.col("purchase") / F.col("view")) * 100),
)
display(conversion)

In [0]:
from pyspark.sql import functions as F

# Extend `events` with two columns computed from "price", in one chained
# expression; the result is bound back to `events`.
events = (
    events
    # Derived feature: discounted price (10% off)
    .withColumn("discounted_price", F.col("price") * 0.9)
    # Derived feature: is_high_price (True if price > 100)
    .withColumn("is_high_price", F.col("price") > 100)
)

display(events)