In [0]:
# Root directory of the processed e-commerce datasets. Nothing is loaded in
# this cell; later cells build their parquet input path from this value.
base_path = "/Volumes/workspace/ecommerce/ecommerce_data/processed_data"

#### STEP 1: Create a Sample from your Big Data

In [0]:
# Load the complete `oct_2019` parquet dataset.
df_full = spark.read.parquet(f"{base_path}/oct_2019")

# Draw a ~1% random sample without replacement. The fixed seed keeps the
# draw reproducible between runs.
df_sample = df_full.sample(withReplacement=False, fraction=0.01, seed=42)

# Each count() launches its own Spark job, so evaluate both up front and
# report them together.
full_rows = df_full.count()
sample_rows = df_sample.count()
print(f"Sampling complete. Reduced rows from {full_rows:,} to {sample_rows:,}")

Sampling complete. Reduced rows from 42,448,764 to 424,672


#### STEP 2: Save this Sample as a CSV

In [0]:
sample_path = "/Volumes/workspace/ecommerce/ecommerce_data/sample_dev_data"

# coalesce(1) collapses the sample to one partition so Spark emits a single
# CSV part file (inside the `sample_path` directory) instead of many small ones.
single_partition = df_sample.coalesce(1)
(
    single_partition.write
    .mode("overwrite")          # replace the output of any earlier run
    .option("header", "true")   # first row carries the column names
    .csv(sample_path)
)

print(f"Sample CSV saved to: {sample_path}")

Sample CSV saved to: /Volumes/workspace/ecommerce/ecommerce_data/sample_dev_data


#### STEP 3: The Day 2 Challenge Tasks

1. Upload sample e-commerce CSV
2. Read data into DataFrame
3. Perform basic operations: select, filter, groupBy, orderBy
4. Export results

In [0]:
# Read data into DataFrame
# An explicit schema is used instead of inferSchema=True: inference makes
# Spark scan the CSV an extra time and lets column types drift with the data.
# The columns and types below are exactly what inference produced for this
# sample (see the printSchema() cell). CSV columns are matched by position,
# so update this string if the sample's column layout ever changes.
sample_schema = (
    "event_time TIMESTAMP, event_type STRING, product_id INT, "
    "category_id BIGINT, category_code STRING, brand STRING, "
    "price DOUBLE, user_id INT, user_session STRING"
)

print("Reading the Sample CSV...")
df = spark.read.csv(sample_path, header=True, schema=sample_schema)

print(f"Loaded Sample Data: {df.count():,} rows")

# VIEW the data: show the first 5 rows as a text table
print("Previewing Data:")
df.show(5)

Reading the Sample CSV...
Loaded Sample Data: 424,672 rows
Previewing Data:
+-------------------+----------+----------+-------------------+--------------------+-----+-------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code|brand|  price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+-----+-------+---------+--------------------+
|2019-10-13 06:25:46|      view|   1306558|2053013558920217191|  computers.notebook| acer|1801.82|523366823|0c7f0449-74d5-4b0...|
|2019-10-13 06:25:57|      view|   2702277|2053013563911439225|appliances.kitche...|   lg| 465.72|520011252|595fcb56-7f46-4fe...|
|2019-10-13 06:25:58|      view|   1701197|2053013553031414015|computers.periphe...| asus| 343.87|556005307|b4c4f49c-786a-499...|
|2019-10-13 06:25:58|      view|  26204079|2053013563693335403|                NULL| NULL| 319.96|530303804|2adae820-1a9f-404...|
|2019-10-13 06

In [0]:
# Check the column names & types: print a heading, then the DataFrame's
# schema as a tree (column name, data type, nullability).
print("Schema:")
df.printSchema()

Schema:
root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)



In [0]:
# Render the sample DataFrame as a rich table. `display` is not imported in
# this notebook; it is presumably supplied by the notebook environment.
display(df)

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
2019-10-13T06:25:46.000Z,view,1306558,2053013558920217191,computers.notebook,acer,1801.82,523366823,0c7f0449-74d5-4b0c-bb07-ba04cc7a6681
2019-10-13T06:25:57.000Z,view,2702277,2053013563911439225,appliances.kitchen.refrigerators,lg,465.72,520011252,595fcb56-7f46-4fe8-a26f-b44dd459ad44
2019-10-13T06:25:58.000Z,view,1701197,2053013553031414015,computers.peripherals.monitor,asus,343.87,556005307,b4c4f49c-786a-499e-8eac-b636d9efa8ef
2019-10-13T06:25:58.000Z,view,26204079,2053013563693335403,,,319.96,530303804,2adae820-1a9f-404e-a466-aa576718e614
2019-10-13T06:25:59.000Z,view,2702347,2053013563911439225,appliances.kitchen.refrigerators,lg,552.08,555510454,7f20c2f6-9c6f-47d2-ae55-eb24f49029f9
2019-10-13T06:26:05.000Z,view,28717875,2053013565782098913,apparel.shoes,cover,135.65,517118885,1bb4ad09-8bde-47a9-92ba-b02062cce93e
2019-10-13T06:26:18.000Z,view,1003317,2053013555631882655,electronics.smartphone,apple,952.15,512691790,2484906b-0bd2-aae9-0706-437a660106b5
2019-10-13T06:26:20.000Z,view,6300883,2053013554834964853,appliances.kitchen.kettle,tefal,33.44,518812652,594bb8ea-125d-4e3b-aee1-96227f9242d1
2019-10-13T06:26:34.000Z,view,1004873,2053013555631882655,electronics.smartphone,samsung,378.56,554627283,7d4d6c14-2f76-45fd-8a95-20ab170d78b9
2019-10-13T06:26:34.000Z,view,12718983,2053013553559896355,,,34.75,513346566,60a5091b-903b-4be6-866f-dd094f0d2780


In [0]:
# Basic Operations (Select, Filter, GroupBy, OrderBy)
from pyspark.sql.functions import col, desc

# FILTER & SELECT: Find purchases for a specific brand (e.g., samsung).
# The filter runs BEFORE the select because it needs `event_type`, which is
# not one of the selected columns. Filtering after the projection only works
# when Spark's analyzer silently re-adds the dropped column.
is_samsung_purchase = (col("brand") == "samsung") & (col("event_type") == "purchase")
samsung_purchases = (
    df.filter(is_samsung_purchase)
      .select("event_time", "brand", "price", "user_id")
)

# count() triggers the Spark job that evaluates the filter.
print(f"Found {samsung_purchases.count()} Samsung purchases")

Found 1745 Samsung purchases


In [0]:
# GROUP BY & ORDER BY: number of purchase events per brand, largest first.
# Rows with a NULL brand form their own group and can appear in the ranking.
# The secondary sort on `brand` breaks ties so limit(5) is deterministic.
brand_purchase_counts = (
    df.filter(col("event_type") == "purchase")
      .groupBy("brand")
      .count()
      .orderBy(col("count").desc(), col("brand"))
)

# The aggregation is per brand, not per category. The original variable name
# is kept as an alias so any later cell that references it still works.
category_stats = brand_purchase_counts

print("Top 5 Brands:")
display(category_stats.limit(5))

Top 5 Categories:


brand,count
samsung,1745
apple,1424
xiaomi,590
,558
huawei,222
