# Bronze Layer (Raw Ingestion)

In [0]:
from pyspark.sql.functions import *

#Create path variables

base_path = "/Volumes/workspace/ecommerce/ecommerce_data"
bronze_path = f"{base_path}/delta/bronze_events"
silver_path = f"{base_path}/delta/silver_events"
gold_path = f"{base_path}/delta/gold_product"

# 1. READ RAW DATA - using both Oct and Nov data

df_raw = spark.read.option("header", "true") \
                    .option("inferSchema", "true") \
                    .csv(f"{base_path}/2019-*.csv")

In [0]:

#2. Add Audit Columns for bronze
df_bronze = df_raw.withColumn("ingestion_ts", current_timestamp())

#3.Write to Bronze (Append Only for Bronze)
df_bronze.write.format("delta").mode("append").save(bronze_path)

print(f"Bronze Layer successfully built at: {bronze_path}")


Bronze Layer successfully built at: /Volumes/workspace/ecommerce/ecommerce_data/delta/bronze_events


# Silver Layer (Cleaning & Enrichment)

In [0]:
# 1. READ FROM BRONZE
df_bronze_src = spark.read.format("delta").load(bronze_path)

# 2. Cleanup Data
# Minimal fix: cast price to float before filtering
from pyspark.sql.functions import col

df_bronze_cleaned = df_bronze_src \
    .filter(col("price").cast("float") > 0) \
    .filter(col("price").cast("float") < 10000) \
    .dropDuplicates(["user_session", "event_time", "product_id"])

# 3. Enrichment (Derived Columns)
# - Add Date Column for easier partitioning / querying
# - Add 'price_tier' for segmentation Analysis

df_silver = df_bronze_cleaned \
    .withColumn("event_date", to_date(col("event_time"))) \
    .withColumn("price_tier", 
        when(col("price").cast("float") < 50 , "Cheap")
        .when((col("price").cast("float") >= 50) & (col("price").cast("float") < 300), "Standard")
        .otherwise("Luxury")
    )
        
# 4. Write to Silver
df_silver.write.format("delta").mode("overwrite").save(silver_path)

print(f"Silver Layer successfully built at: {silver_path}")

# 5. Validation
df_silver.select("event_time", "event_type", "product_id", "price", "price_tier", "event_date").show(5)

Silver Layer successfully built at: /Volumes/workspace/ecommerce/ecommerce_data/delta/silver_events
+-------------------+----------+----------+------+----------+----------+
|         event_time|event_type|product_id| price|price_tier|event_date|
+-------------------+----------+----------+------+----------+----------+
|2019-11-17 08:43:12|      view|  17300671| 86.12|  Standard|2019-11-17|
|2019-11-17 08:43:19|      view| 100007591| 21.11|     Cheap|2019-11-17|
|2019-11-17 08:43:29|      view|  28718397|103.99|  Standard|2019-11-17|
|2019-11-17 08:43:34|      view|   4700387| 42.17|     Cheap|2019-11-17|
|2019-11-17 08:43:34|  purchase| 100000246| 43.68|     Cheap|2019-11-17|
+-------------------+----------+----------+------+----------+----------+
only showing top 5 rows


# The Gold Layer (Business Aggregates)

In [0]:
#1. Read from Silver
df_silver_src = spark.read.format("delta").load(silver_path)

# 2. Aggregations - Counts vies vs purchase per product

df_gold = df_silver_src.groupBy("product_id", "category_code", "brand") \
    .agg(
        #Count unique users who viewed
        countDistinct(when(col("event_type") == "view", col("user_id"))).alias("unique_views"),
        #Count Unique users who purchased
        countDistinct(when(col("event_type") == "purchase", col("user_id"))).alias("unique_purchases"),
        #Total Revenue
        sum(when(col("event_type") == "purchase", col("price").cast("float"))).alias("revenue")
      )
    
# 3. Add KPIs(Conversion Rate)
# Purchases /Views 
df_gold_final = df_gold.withColumn(
  "conversion_rate_pct", (col("unique_purchases") / (col("unique_views") + 1)) *100
).fillna(0)

# 4. Write to Gold
df_gold_final.write.format("delta").mode("overwrite").save(gold_path)
print(f"Gold Layer successfully built at: {gold_path}")

# 5. Validation
df_gold_final.select("product_id", "category_code", "brand", "unique_views", "unique_purchases", "revenue", "conversion_rate_pct").show(5)
    


Gold Layer successfully built at: /Volumes/workspace/ecommerce/ecommerce_data/delta/gold_product
+----------+------------------+-------+------------+----------------+------------------+-------------------+
|product_id|     category_code|  brand|unique_views|unique_purchases|           revenue|conversion_rate_pct|
+----------+------------------+-------+------------+----------------+------------------+-------------------+
|   8500290|              NULL|   NULL|         982|              40|14674.029891967773|  4.069175991861648|
|   5100573|electronics.clocks|  apple|        7904|             148|  87535.3402709961| 1.8722327640733714|
|   3300488|              NULL|redmond|        3254|              96|13912.220016479492| 2.9493087557603688|
|  29502246|              NULL|   NULL|         191|              10| 451.6999816894531|  5.208333333333334|
|  12704683|              NULL| nokian|        1259|              43| 4911.619964599609| 3.4126984126984126|
+----------+------------------+

# Visualization (Business Value)

In [0]:

# Load gold Data
gold_data = spark.read.format("delta").load(gold_path)

#Visualize Top Performing Products by Revenue
display(gold_data.orderBy(col("revenue").desc()).limit(10))

product_id,category_code,brand,unique_views,unique_purchases,revenue,conversion_rate_pct
1005115,electronics.smartphone,apple,372011,21687,33030410.20373535,5.829650656430438
1005105,electronics.smartphone,apple,240055,10333,21684603.255371094,4.304412303795781
1004249,electronics.smartphone,apple,190109,10754,13543934.59729004,5.656725053916154
1005135,electronics.smartphone,apple,128821,4948,12654328.793701172,3.8409588424337455
1004767,electronics.smartphone,samsung,369934,28926,11004247.829162598,7.819211483098382
1002544,electronics.smartphone,apple,194725,13800,10457979.146636965,7.086881053377566
1004856,electronics.smartphone,samsung,415016,38760,7917145.031646728,9.339376459277572
1005116,electronics.smartphone,apple,84120,4909,7161938.289855957,5.835641516387109
1002524,electronics.smartphone,apple,101861,7892,6965532.939117432,7.747737134554593
1004870,electronics.smartphone,samsung,185228,14756,6057424.0316467285,7.966355160369057
