In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
# The standard-library imports deliberately stay AFTER the star imports above,
# so a same-named pyspark export can never shadow them.
import builtins  # gives access to Python's own round(); the star import rebinds round
import random
from datetime import datetime, timedelta

# Single local Spark session shared by every cell of the pipeline.
# It runs on all local cores with a 2g driver and 8 shuffle partitions.
spark = (
    SparkSession.builder
    .appName("StreamPulse-MerchPipeline")
    .master("local[*]")
    .config("spark.driver.memory", "2g")
    .config("spark.sql.shuffle.partitions", "8")
    .getOrCreate()
)


In [3]:
# Fix the RNG seed so every run generates the same synthetic orders.
random.seed(42)
num_orders = 300_000

# Value pools that each synthetic order samples from.
categories = [
    "T-Shirts",
    "Vinyl Records",
    "Posters",
    "Hoodies",
    "Stickers",
    "Accessories",
]
regions = [
    "North America",
    "Europe",
    "Asia Pacific",
    "Latin America",
]
payment_methods = [
    "credit_card",
    "debit_card",
    "paypal",
    "apple_pay",
    "gift_card",
]
statuses = [
    "completed",
    "pending",
    "cancelled",
    "refunded",
    "failed",
]

# (lowest, highest) unit price allowed for each merchandise category.
prices = {
    "T-Shirts":      (15.99, 39.99),
    "Vinyl Records": (19.99, 49.99),
    "Posters":       (9.99, 29.99),
    "Hoodies":       (39.99, 79.99),
    "Stickers":      (2.99, 9.99),
    "Accessories":   (5.99, 24.99),
}

# Positions of the two fields that get deliberately corrupted below.
# They must match the order in which each row is assembled.
_UNIT_PRICE_IDX = 4
_ORDER_DATE_IDX = 10

raw_data = []
base_date = datetime(2025, 1, 1)

# Build `num_orders` synthetic order rows. Every value is stored as a string
# (as it would arrive in a CSV) and a small fraction of rows is corrupted on
# purpose so the cleaning stage has something to catch.
# NOTE: the sequence of random.* calls determines the data generated for a
# given seed, so keep the call order stable when editing this loop.
for i in range(num_orders):
    order_date = base_date + timedelta(days=random.randint(0, 179))
    category = random.choice(categories)
    price_range = prices[category]
    # `round` is rebound by `from pyspark.sql.functions import *`, so Python's
    # own round() is reached through the `builtins` module. The previous
    # `__builtins__.round` relied on an implementation detail: `__builtins__`
    # is only a module inside `__main__` and is a dict elsewhere.
    unit_price = builtins.round(random.uniform(*price_range), 2)
    quantity = random.choices([1, 2, 3, 4, 5], weights=[50, 25, 15, 7, 3])[0]
    status = random.choices(statuses, weights=[70, 10, 10, 7, 3])[0]

    # 30% of orders receive one of five discount levels.
    discount = 0.0
    if random.random() < 0.3:
        discount = random.choice([0.05, 0.10, 0.15, 0.20, 0.25])

    # Shipping is only charged when the line value exceeds 10.
    shipping = builtins.round(random.uniform(2.99, 12.99), 2) if unit_price * quantity > 10 else 0.0

    row = [
        f"ORD-{i+1:07d}",
        f"CUST-{random.randint(1, 80000):06d}",
        category,
        random.choice(regions),
        str(unit_price),
        str(quantity),
        str(discount),
        str(shipping),
        random.choice(payment_methods),
        status,
        order_date.strftime("%Y-%m-%d"),
        f"ART-{random.randint(1, 5000):05d}",
    ]

    # Inject bad data: ~0.5% of rows lose their unit price and ~0.3% get an
    # unparseable order date.
    if random.random() < 0.005:
        row[_UNIT_PRICE_IDX] = ""
    if random.random() < 0.003:
        row[_ORDER_DATE_IDX] = "BAD_DATE"

    raw_data.append(tuple(row))

# Column names, in the same order as the fields of each generated row tuple.
raw_columns = [
    "order_id",
    "customer_id",
    "category",
    "region",
    "unit_price",
    "quantity",
    "discount_pct",
    "shipping_cost",
    "payment_method",
    "status",
    "order_date",
    "artist_id",
]

# Materialise the rows as a DataFrame and persist them as CSV with a header
# row, replacing any previous output.
df_raw = spark.createDataFrame(raw_data, raw_columns)
df_raw.write.mode("overwrite").option("header", True).csv("pipeline/raw_orders")

generated_rows = df_raw.count()
print(f"‚úÖ Generated {generated_rows} raw orders")

‚úÖ Generated 300000 raw orders


In [5]:
# Read and Inspect Raw Data
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, DateType

# Every raw column is read as a nullable string first, so malformed numeric or
# date values cannot break schema inference; real types are applied afterwards.
_raw_column_names = [
    "order_id",
    "customer_id",
    "category",
    "region",
    "unit_price",
    "quantity",
    "discount_pct",
    "shipping_cost",
    "payment_method",
    "status",
    "order_date",
    "artist_id",
]
schema = StructType(
    [StructField(name, StringType(), True) for name in _raw_column_names]
)

# Read the CSV with the all-string schema.
df = spark.read.csv("pipeline/raw_orders", header=True, schema=schema)

# column -> (placeholder that marks a malformed value, type to cast to).
# The placeholder is turned into null before the cast is applied.
_cast_rules = {
    "unit_price": ("", DoubleType()),
    "quantity": ("", IntegerType()),
    "discount_pct": ("", DoubleType()),
    "shipping_cost": ("", DoubleType()),
    "order_date": ("BAD_DATE", DateType()),
}
for _column, (_placeholder, _target_type) in _cast_rules.items():
    _without_placeholder = when(col(_column) == _placeholder, None).otherwise(col(_column))
    df = df.withColumn(_column, _without_placeholder.cast(_target_type))

df.show(10)
df.printSchema()
print(f"Total rows: {df.count()}")

print("\n--- Null/Empty Counts ---")
# Empty strings and bad dates were converted to null by the casts, so counting
# nulls covers them. All per-column counts are computed in ONE aggregation pass
# (count() ignores the nulls produced by when() for non-matching rows) instead
# of launching a separate Spark job, and CSV scan, for every column.
null_counts = df.select(
    [count(when(col(col_name).isNull(), 1)).alias(col_name) for col_name in df.columns]
).first()
for col_name in df.columns:
    null_count = null_counts[col_name]
    if null_count > 0:
        print(f"  {col_name}: {null_count} nulls")

# 'BAD_DATE' values are null after the cast, so the order_date null count is reused here.
print(f"\nBad dates: {null_counts['order_date']}")

+-----------+-----------+-------------+-------------+----------+--------+------------+-------------+--------------+---------+----------+---------+
|   order_id|customer_id|     category|       region|unit_price|quantity|discount_pct|shipping_cost|payment_method|   status|order_date|artist_id|
+-----------+-----------+-------------+-------------+----------+--------+------------+-------------+--------------+---------+----------+---------+
|ORD-0149505|CUST-058620|     Stickers| Asia Pacific|      3.89|       1|         0.2|          0.0|   credit_card|completed|2025-06-20|ART-04754|
|ORD-0149506|CUST-067925|  Accessories|Latin America|     12.07|       2|        0.15|        11.38|     apple_pay|  pending|2025-02-26|ART-00387|
|ORD-0149507|CUST-006271|     Stickers|North America|      3.28|       3|         0.0|          0.0|     apple_pay|  pending|2025-06-03|ART-02345|
|ORD-0149508|CUST-033067|      Hoodies|North America|     78.84|       1|        0.05|         4.94|   credit_card|com

In [6]:
# Clean and Transform

# (column name, Spark type, nullable) in CSV column order.
# order_date is intentionally kept as a string at this stage; it is validated
# and converted to a date later in the notebook.
# NOTE(review): nullable=False on the id columns may not be enforced by the CSV
# reader - confirm before relying on it.
_order_fields = [
    ("order_id", StringType(), False),
    ("customer_id", StringType(), False),
    ("category", StringType(), True),
    ("region", StringType(), True),
    ("unit_price", DoubleType(), True),
    ("quantity", IntegerType(), True),
    ("discount_pct", DoubleType(), True),
    ("shipping_cost", DoubleType(), True),
    ("payment_method", StringType(), True),
    ("status", StringType(), True),
    ("order_date", StringType(), True),
    ("artist_id", StringType(), True),
]
order_schema = StructType(
    [StructField(name, dtype, nullable) for name, dtype, nullable in _order_fields]
)

# Re-read the raw CSV, this time with typed numeric columns.
df = spark.read.csv("pipeline/raw_orders", header=True, schema=order_schema)


In [7]:
# Remove invalid records

# Keep only rows with an order id, a positive unit price and a usable date.
# NOTE(review): only the literal "BAD_DATE" marker and nulls are rejected here;
# other unparseable date strings would pass through - confirm that is acceptable.
df_valid = (
    df
    .filter(col("order_id").isNotNull())
    .filter(col("unit_price").isNotNull() & (col("unit_price") > 0))
    .filter(col("order_date") != "BAD_DATE")
    .filter(col("order_date").isNotNull())
)

# Count the input once (the original scanned it twice) and guard the percentage
# against an empty input, which previously raised ZeroDivisionError.
total_rows = df.count()
removed = total_rows - df_valid.count()
removed_pct = removed / total_rows * 100 if total_rows else 0.0
print(f"Removed {removed} invalid records ({removed_pct:.1f}%)")


Removed 2410 invalid records (0.8%)


In [8]:
# Cast and enrich

# Day-of-week expression, reused for both the day_of_week and is_weekend columns.
_weekday = dayofweek(col("order_date"))

# Ordered (column name, expression) steps. Order matters: later steps read
# columns produced by earlier ones (e.g. subtotal -> discount_amount -> total_amount).
# `round` here is the Spark column function brought in by the star import.
_enrichment_steps = [
    ("order_date", to_date(col("order_date"), "yyyy-MM-dd")),
    # Missing discount / shipping values are treated as zero.
    ("discount_pct", coalesce(col("discount_pct"), lit(0.0))),
    ("shipping_cost", coalesce(col("shipping_cost"), lit(0.0))),
    ("subtotal", round(col("unit_price") * col("quantity"), 2)),
    ("discount_amount", round(col("subtotal") * col("discount_pct"), 2)),
    (
        "total_amount",
        round(col("subtotal") - col("discount_amount") + col("shipping_cost"), 2),
    ),
    ("year", year(col("order_date"))),
    ("month", month(col("order_date"))),
    ("day_of_week", _weekday),
    # Days 1 and 7 are flagged as weekend; anything else (including null) is False.
    ("is_weekend", when(_weekday.isin(1, 7), True).otherwise(False)),
]

df_enriched = df_valid
for _new_column, _expression in _enrichment_steps:
    df_enriched = df_enriched.withColumn(_new_column, _expression)

df_enriched.show(5)
df_enriched.printSchema()


+-----------+-----------+-----------+-------------+----------+--------+------------+-------------+--------------+---------+----------+---------+--------+---------------+------------+----+-----+-----------+----------+
|   order_id|customer_id|   category|       region|unit_price|quantity|discount_pct|shipping_cost|payment_method|   status|order_date|artist_id|subtotal|discount_amount|total_amount|year|month|day_of_week|is_weekend|
+-----------+-----------+-----------+-------------+----------+--------+------------+-------------+--------------+---------+----------+---------+--------+---------------+------------+----+-----+-----------+----------+
|ORD-0149505|CUST-058620|   Stickers| Asia Pacific|      3.89|       1|         0.2|          0.0|   credit_card|completed|2025-06-20|ART-04754|    3.89|           0.78|        3.11|2025|    6|          6|     false|
|ORD-0149506|CUST-067925|Accessories|Latin America|     12.07|       2|        0.15|        11.38|     apple_pay|  pending|2025-02-2

In [9]:
# Write Curated Dataset

# Persist the enriched orders as snappy-compressed Parquet, partitioned by
# year and month, replacing any previous output.
_curated_writer = (
    df_enriched
    .coalesce(4)
    .write
    .mode("overwrite")
    .partitionBy("year", "month")
    .option("compression", "snappy")
)
_curated_writer.parquet("pipeline/curated_orders")

# Read the dataset back so later cells work from what is actually on disk.
curated = spark.read.parquet("pipeline/curated_orders")
curated_rows = curated.count()
curated_column_total = len(curated.columns)
print(f"‚úÖ Curated dataset: {curated_rows} rows, {curated_column_total} columns")


‚úÖ Curated dataset: 297590 rows, 19 columns


In [11]:
# Verify the partition structure:

import os

def show_tree(path, prefix="", max_depth=3, depth=0):
    """Print an indented listing of ``path``: sub-directories first, then Parquet files.

    Args:
        path: Directory to list.
        prefix: Text printed in front of every entry; two spaces are added per level.
        max_depth: Number of directory levels to descend before stopping.
        depth: Current recursion level; the top-level call leaves it at 0.

    Directories whose name starts with "_" are skipped. Only files ending in
    ".parquet" are shown, each with its size in whole KB.
    """
    if depth >= max_depth:
        return

    # Single pass over the sorted entries, split into the two groups to print.
    subdirs, parquet_files = [], []
    for entry in sorted(os.listdir(path)):
        if os.path.isdir(os.path.join(path, entry)):
            if not entry.startswith("_"):
                subdirs.append(entry)
        elif entry.endswith(".parquet"):
            parquet_files.append(entry)

    for name in subdirs:
        print(f"{prefix}üìÅ {name}/")
        show_tree(os.path.join(path, name), prefix + "  ", max_depth, depth + 1)

    for name in parquet_files:
        size_kb = os.path.getsize(os.path.join(path, name)) / 1024
        print(f"{prefix}üìÑ {name} ({size_kb:.0f} KB)")

# Print the on-disk directory/file layout of the curated dataset written to this path.
show_tree("pipeline/curated_orders")


üìÅ year=2025/
  üìÅ month=1/
    üìÑ part-00000-30d92c80-cf1c-4b2b-aef3-0afa5e7b8288.c000.snappy.parquet (752 KB)
    üìÑ part-00001-30d92c80-cf1c-4b2b-aef3-0afa5e7b8288.c000.snappy.parquet (747 KB)
  üìÅ month=2/
    üìÑ part-00000-30d92c80-cf1c-4b2b-aef3-0afa5e7b8288.c000.snappy.parquet (691 KB)
    üìÑ part-00001-30d92c80-cf1c-4b2b-aef3-0afa5e7b8288.c000.snappy.parquet (689 KB)
  üìÅ month=3/
    üìÑ part-00000-30d92c80-cf1c-4b2b-aef3-0afa5e7b8288.c000.snappy.parquet (759 KB)
    üìÑ part-00001-30d92c80-cf1c-4b2b-aef3-0afa5e7b8288.c000.snappy.parquet (751 KB)
  üìÅ month=4/
    üìÑ part-00000-30d92c80-cf1c-4b2b-aef3-0afa5e7b8288.c000.snappy.parquet (727 KB)
    üìÑ part-00001-30d92c80-cf1c-4b2b-aef3-0afa5e7b8288.c000.snappy.parquet (729 KB)
  üìÅ month=5/
    üìÑ part-00000-30d92c80-cf1c-4b2b-aef3-0afa5e7b8288.c000.snappy.parquet (752 KB)
    üìÑ part-00001-30d92c80-cf1c-4b2b-aef3-0afa5e7b8288.c000.snappy.parquet (746 KB)
  üìÅ month=6/
    üìÑ part-00000-30d92c80

In [13]:
from pyspark.sql.functions import col, count, sum, avg, countDistinct

# Build Aggregated Summary Tables

# Per-day metrics. `sum` is the Spark aggregate imported at the top of this
# cell, not Python's built-in.
_daily_metrics = [
    count("order_id").alias("total_orders"),
    sum("total_amount").alias("total_revenue"),
    avg("total_amount").alias("avg_order_value"),
    countDistinct("customer_id").alias("unique_customers"),
]

# One row per order_date, computed from completed orders only, oldest day first.
daily_revenue = (
    curated
    .filter(col("status") == "completed")
    .groupBy("order_date")
    .agg(*_daily_metrics)
    .orderBy("order_date")
)

daily_revenue.show(10)

# Collapse to a single partition before writing, replacing any previous output.
daily_revenue.coalesce(1).write.mode("overwrite").parquet("pipeline/summary/daily_revenue")
print("‚úÖ Daily revenue summary written")

+----------+------------+------------------+------------------+----------------+
|order_date|total_orders|     total_revenue|   avg_order_value|unique_customers|
+----------+------------+------------------+------------------+----------------+
|2025-01-01|        1179| 65883.99999999997|55.881255301102605|            1164|
|2025-01-02|        1156|63688.399999999994|55.093771626297574|            1149|
|2025-01-03|        1170| 67130.27000000006|  57.3762991452992|            1163|
|2025-01-04|        1174| 69704.56000000003|59.373560477001725|            1162|
|2025-01-05|        1157| 65451.66999999999| 56.57015557476231|            1152|
|2025-01-06|        1182|64904.819999999934| 54.91101522842634|            1172|
|2025-01-07|        1206| 68506.16000000006|  56.8044444444445|            1197|
|2025-01-08|        1125| 65306.56999999999|58.050284444444436|            1115|
|2025-01-09|        1185| 66108.00000000003|55.787341772151926|            1175|
|2025-01-10|        1127| 63

In [15]:
from pyspark.sql.functions import col, count, sum, avg, dense_rank, desc
from pyspark.sql.window import Window

# Category Performance

# Window over the whole (already aggregated) result, highest revenue first.
_by_revenue_desc = Window.orderBy(desc("total_revenue"))

# Per-category totals for completed orders.
_category_totals = (
    curated
    .filter(col("status") == "completed")
    .groupBy("category")
    .agg(
        count("order_id").alias("total_orders"),
        sum("total_amount").alias("total_revenue"),
        avg("total_amount").alias("avg_order_value"),
        avg("discount_pct").alias("avg_discount"),
        sum("quantity").alias("total_units_sold"),
    )
)

# Rank categories by revenue (1 = highest) and list them in rank order.
category_perf = (
    _category_totals
    .withColumn("revenue_rank", dense_rank().over(_by_revenue_desc))
    .orderBy("revenue_rank")
)

category_perf.show()

category_perf.coalesce(1).write.mode("overwrite").parquet("pipeline/summary/category_performance")
print("‚úÖ Category performance summary written")

+-------------+------------+------------------+------------------+--------------------+----------------+------------+
|     category|total_orders|     total_revenue|   avg_order_value|        avg_discount|total_units_sold|revenue_rank|
+-------------+------------+------------------+------------------+--------------------+----------------+------------+
|      Hoodies|       34477| 3988137.040000002|115.67529193375299| 0.04563042028018676|           64930|           1|
|Vinyl Records|       34992|2475490.0699999915| 70.74445787608572| 0.04515032007315957|           65847|           2|
|     T-Shirts|       34724| 2025033.519999999|58.317979495449805|0.045291441078216754|           65387|           3|
|      Posters|       34764|1520636.3200000052| 43.74169600736409| 0.04502214934990221|           65106|           4|
|  Accessories|       34838|1221213.2199999988| 35.05405649003958|0.044976749526379256|           65602|           5|
|     Stickers|       34770| 521870.5699999997| 15.00921

In [16]:
#  Regional Trends

# Metrics computed for each (region, month) pair.
# NOTE(review): grouping uses month without year, so data spanning several
# years would be merged per calendar month - confirm this is intended.
_regional_metrics = [
    count("order_id").alias("total_orders"),
    sum("total_amount").alias("total_revenue"),
    countDistinct("customer_id").alias("unique_customers"),
]

# Completed orders only, listed by region and then by month.
regional = (
    curated
    .filter(col("status") == "completed")
    .groupBy("region", "month")
    .agg(*_regional_metrics)
    .orderBy("region", "month")
)

regional.show(20)

regional.coalesce(1).write.mode("overwrite").parquet("pipeline/summary/regional_trends")
print("‚úÖ Regional trends summary written")


+-------------+-----+------------+------------------+----------------+
|       region|month|total_orders|     total_revenue|unique_customers|
+-------------+-----+------------+------------------+----------------+
| Asia Pacific|    1|        8868| 497060.9899999998|            8395|
| Asia Pacific|    2|        8055| 453786.0000000006|            7681|
| Asia Pacific|    3|        8918|497517.07000000076|            8452|
| Asia Pacific|    4|        8650|486337.20000000065|            8235|
| Asia Pacific|    5|        8894| 500368.5399999996|            8389|
| Asia Pacific|    6|        8377|469133.65000000026|            7945|
|       Europe|    1|        9060|509821.58999999927|            8561|
|       Europe|    2|        8204| 456793.9799999998|            7813|
|       Europe|    3|        8977|497563.55000000016|            8482|
|       Europe|    4|        8723| 493569.8400000009|            8243|
|       Europe|    5|        9010|506017.30999999953|            8505|
|     

In [17]:
# Payment Method Analysis

# A window with no partition columns spans every row of the aggregated result,
# so summing over it yields the grand total of orders.
_all_rows = Window.partitionBy()
_share_of_orders = round(col("total_orders") / sum("total_orders").over(_all_rows), 4)

# Per-payment-method totals for completed orders.
_payment_totals = (
    curated
    .filter(col("status") == "completed")
    .groupBy("payment_method")
    .agg(
        count("order_id").alias("total_orders"),
        sum("total_amount").alias("total_revenue"),
        avg("total_amount").alias("avg_order_value"),
    )
)

# Add each method's share of all orders and list the most used method first.
payment_analysis = (
    _payment_totals
    .withColumn("pct_of_orders", _share_of_orders)
    .orderBy(desc("total_orders"))
)

payment_analysis.show()

payment_analysis.coalesce(1).write.mode("overwrite").parquet("pipeline/summary/payment_analysis")
print("‚úÖ Payment analysis summary written")


+--------------+------------+------------------+------------------+-------------+
|payment_method|total_orders|     total_revenue|   avg_order_value|pct_of_orders|
+--------------+------------+------------------+------------------+-------------+
|        paypal|       42337|2380316.7599999923| 56.22308524458494|        0.203|
|     apple_pay|       41697|  2360488.59000001| 56.61051370602226|       0.1999|
|   credit_card|       41650| 2346339.889999993|56.334691236494436|       0.1997|
|     gift_card|       41454|2336333.4800000014| 56.35966324118303|       0.1988|
|    debit_card|       41427| 2328902.019999999|56.217008714123615|       0.1986|
+--------------+------------+------------------+------------------+-------------+

‚úÖ Payment analysis summary written


In [18]:
# Pipeline Validation
#
# Re-reads every pipeline output from disk, prints a report and, unlike the
# earlier version, fails loudly when a check does not pass. Previously the
# "PIPELINE COMPLETE" banner was printed even if checks had failed.

print("=" * 60)
print("PIPELINE VALIDATION REPORT")
print("=" * 60)

raw_count = spark.read.csv("pipeline/raw_orders", header=True).count()
# Read the curated dataset once and reuse it for every check below.
curated_df = spark.read.parquet("pipeline/curated_orders")
curated_count = curated_df.count()
daily_count = spark.read.parquet("pipeline/summary/daily_revenue").count()
cat_count = spark.read.parquet("pipeline/summary/category_performance").count()

# Guard the percentage so an empty raw dataset cannot raise ZeroDivisionError.
removed_count = raw_count - curated_count
removed_pct = removed_count / raw_count * 100 if raw_count else 0.0

print(f"\n1. Row Counts:")
print(f"   Raw orders:          {raw_count}")
print(f"   Curated orders:      {curated_count}")
print(f"   Records removed:     {removed_count} ({removed_pct:.1f}%)")

print(f"\n2. Summary Tables:")
print(f"   Daily revenue:       {daily_count} days")
print(f"   Category perf:       {cat_count} categories")

print(f"\n3. Data Quality Checks:")
# All three checks are evaluated in a single aggregation pass. count() skips
# the nulls that when() yields for rows not matching the condition, so each
# value equals the number of offending rows.
quality = curated_df.agg(
    count(when(col("order_id").isNull(), 1)).alias("null_ids"),
    count(when(col("total_amount").isNull(), 1)).alias("null_amounts"),
    count(when(col("total_amount") < 0, 1)).alias("neg_amounts"),
).first()
null_ids = quality["null_ids"]
null_amounts = quality["null_amounts"]
neg_amounts = quality["neg_amounts"]
print(f"   Null order_ids:      {null_ids} {'‚úÖ' if null_ids == 0 else '‚ùå'}")
print(f"   Null total_amounts:  {null_amounts} {'‚úÖ' if null_amounts == 0 else '‚ùå'}")
print(f"   Negative amounts:    {neg_amounts} {'‚úÖ' if neg_amounts == 0 else '‚ùå'}")

print(f"\n4. Schema Validation:")
expected_cols = [
    "order_id", "customer_id", "category", "region", "unit_price",
    "quantity", "discount_pct", "shipping_cost", "payment_method",
    "status", "order_date", "artist_id", "subtotal", "discount_amount",
    "total_amount", "year", "month", "day_of_week", "is_weekend"
]
actual_cols = curated_df.columns
missing = set(expected_cols) - set(actual_cols)
extra = set(actual_cols) - set(expected_cols)
print(f"   Expected columns:    {len(expected_cols)}")
print(f"   Actual columns:      {len(actual_cols)}")
print(f"   Missing:             {missing if missing else '‚úÖ None'}")
print(f"   Extra:               {extra if extra else 'None'}")

# Collect every failed check. Extra columns are reported above but treated as
# informational only, matching the report, which never marks them as a failure.
failures = []
if null_ids:
    failures.append(f"{null_ids} null order_id value(s)")
if null_amounts:
    failures.append(f"{null_amounts} null total_amount value(s)")
if neg_amounts:
    failures.append(f"{neg_amounts} negative total_amount value(s)")
if missing:
    failures.append(f"missing columns: {sorted(missing)}")
if failures:
    # Stop here so a broken dataset can never be reported as complete.
    raise AssertionError("Pipeline validation failed: " + "; ".join(failures))

print("\n" + "=" * 60)
print("PIPELINE COMPLETE ‚úÖ")
print("=" * 60)


PIPELINE VALIDATION REPORT

1. Row Counts:
   Raw orders:          300000
   Curated orders:      297590
   Records removed:     2410 (0.8%)

2. Summary Tables:
   Daily revenue:       180 days
   Category perf:       6 categories

3. Data Quality Checks:
   Null order_ids:      0 ‚úÖ
   Null total_amounts:  0 ‚úÖ
   Negative amounts:    0 ‚úÖ

4. Schema Validation:
   Expected columns:    19
   Actual columns:      19
   Missing:             ‚úÖ None
   Extra:               None

PIPELINE COMPLETE ‚úÖ


What data quality issues did you find?

Missing values in unit_price (empty strings in the raw CSV, which become nulls once the column is parsed as a number).

Malformed strings in order_date ("BAD_DATE").

Dates stored as plain strings, which had to be converted to actual Date objects.

How did your cleaning handle them?

We used DataFrame filters to drop rows where unit_price was missing or the date was invalid, ensuring the final analytics weren't skewed.

We used to_date to standardize the strings into a proper temporal format for time-series analysis.

What would you change for production?

Partitioning: I would add partitionBy("year", "month") when writing to disk to avoid the performance warning you saw.

Schema Enforcement: Instead of letting Spark guess, I'd use a strict StructType schema so that the pipeline fails fast if the source data format changes.
