In [0]:
from pyspark.sql import functions as F


In [0]:
bronze_df = spark.read.csv(
    "/Volumes/main/medalian/meadalian/sales.csv",
    header=True,
    inferSchema=True
).withColumn(
    "ingestion_ts", F.current_timestamp()
)


In [0]:
%sql
CREATE VOLUME IF NOT EXISTS main.medalian.bronze;


In [0]:
%sql
CREATE VOLUME IF NOT EXISTS main.medalian.silver;


In [0]:
%sql
CREATE VOLUME IF NOT EXISTS main.medalian.gold;


In [0]:
%sql
SHOW VOLUMES IN main.medalian;


database,volume_name
medalian,bronze
medalian,gold
medalian,meadalian
medalian,silver


BRONZE LAYER – Raw Ingestion
🔹 Read CSV (already clean, no renaming needed)

In [0]:
from pyspark.sql import functions as F

bronze_df = spark.read.csv(
    "/Volumes/main/medalian/meadalian/sales.csv",
    header=True,
    inferSchema=True
).withColumn("ingestion_ts", F.current_timestamp())

bronze_df.write.format("delta") \
    .mode("overwrite") \
    .save("/Volumes/main/medalian/bronze/sales")


In [0]:
display(bronze_df)
bronze_df.printSchema()


date,order_id,customer_id,category,price,quantity,revenue,ingestion_ts
2023-01-01,20230101-489713,8726,Toys,63.95,4,255.8,2026-01-15T04:04:58.126Z
2023-01-01,20230101-505413,3044,Beauty,27.82,2,55.64,2026-01-15T04:04:58.126Z
2023-01-01,20230101-427515,5200,Electronics,80.12,1,80.12,2026-01-15T04:04:58.126Z
2023-01-01,20230101-516646,2169,Home,83.68,3,251.04,2026-01-15T04:04:58.126Z
2023-01-01,20230101-315601,7141,Electronics,82.78,2,165.56,2026-01-15T04:04:58.126Z
2023-01-01,20230101-320932,2030,Beauty,47.07,1,47.07,2026-01-15T04:04:58.126Z
2023-01-01,20230101-412748,4930,Electronics,146.22,4,584.88,2026-01-15T04:04:58.126Z
2023-01-01,20230101-476162,8325,Electronics,45.91,1,45.91,2026-01-15T04:04:58.126Z
2023-01-01,20230101-547596,7266,Home,43.17,3,129.51,2026-01-15T04:04:58.126Z
2023-01-01,20230101-787805,9721,Toys,12.23,1,12.23,2026-01-15T04:04:58.126Z


root
 |-- date: date (nullable = true)
 |-- order_id: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- category: string (nullable = true)
 |-- price: double (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- revenue: double (nullable = true)
 |-- ingestion_ts: timestamp (nullable = true)



SILVER LAYER – Cleaning & Validation

In [0]:
bronze_df = spark.read.format("delta") \
    .load("/Volumes/main/medalian/bronze/sales")


In [0]:
silver_df = (
    bronze_df
    .filter(F.col("price") > 0)
    .filter(F.col("quantity") > 0)
    .filter(F.col("revenue") > 0)
    .dropDuplicates(["order_id", "customer_id"])
    .withColumn("order_date", F.to_date("date", "dd-MM-yyyy"))
    .withColumn(
        "calculated_revenue",
        F.col("price") * F.col("quantity")
    )
)


Write Silver

In [0]:
silver_df.write.format("delta") \
    .mode("overwrite") \
    .save("/Volumes/main/medalian/silver/sales")


In [0]:
display(silver_df)


date,order_id,customer_id,category,price,quantity,revenue,ingestion_ts,order_date,calculated_revenue
2023-01-08,20230108-989415,5654,Grocery,5.69,2,11.38,2026-01-15T04:04:58.126Z,2023-01-08,11.38
2023-01-17,20230117-805614,8185,Beauty,56.54,4,226.16,2026-01-15T04:04:58.126Z,2023-01-17,226.16
2023-01-21,20230121-176336,2669,Home,85.92,1,85.92,2026-01-15T04:04:58.126Z,2023-01-21,85.92
2023-01-30,20230130-760229,1580,Toys,35.54,1,35.54,2026-01-15T04:04:58.126Z,2023-01-30,35.54
2023-02-11,20230211-255297,5531,Toys,38.94,2,77.88,2026-01-15T04:04:58.126Z,2023-02-11,77.88
2023-02-16,20230216-982018,6342,Home,49.63,1,49.63,2026-01-15T04:04:58.126Z,2023-02-16,49.63
2023-02-22,20230222-150306,8869,Electronics,193.41,3,580.23,2026-01-15T04:04:58.126Z,2023-02-22,580.23
2023-02-28,20230228-511860,7108,Home,66.02,1,66.02,2026-01-15T04:04:58.126Z,2023-02-28,66.02
2023-03-27,20230327-929715,8168,Beauty,47.32,1,47.32,2026-01-15T04:04:58.126Z,2023-03-27,47.32
2023-04-01,20230401-140058,7410,Electronics,45.52,4,182.08,2026-01-15T04:04:58.126Z,2023-04-01,182.08


Read Silver

In [0]:
silver_df = spark.read.format("delta") \
    .load("/Volumes/main/medalian/silver/sales")


GOLD 1: Category-wise Sales Performance

In [0]:
gold_category_df = (
    silver_df
    .groupBy("category")
    .agg(
        F.sum("quantity").alias("total_quantity"),
        F.sum("calculated_revenue").alias("total_revenue"),
        F.countDistinct("order_id").alias("total_orders"),
        F.countDistinct("customer_id").alias("unique_customers")
    )
)


Write Gold (Category)

In [0]:
gold_category_df.write.format("delta") \
    .mode("overwrite") \
    .save("/Volumes/main/medalian/gold/category_sales")


GOLD 2: Daily Revenue Trend (Optional but Strong)

In [0]:
gold_daily_df = (
    silver_df
    .groupBy("order_date")
    .agg(
        F.sum("calculated_revenue").alias("daily_revenue"),
        F.sum("quantity").alias("daily_quantity"),
        F.countDistinct("order_id").alias("daily_orders")
    )
)


In [0]:
gold_daily_df.write.format("delta") \
    .mode("overwrite") \
    .save("/Volumes/main/medalian/gold/daily_sales")


Validate Gold Outputs

In [0]:
display(gold_category_df)
display(gold_daily_df)


category,total_quantity,total_revenue,total_orders,unique_customers
Toys,41311,1658949.3799999994,20881,8152
Home,57245,3533354.679999991,28576,8624
Beauty,46161,1420490.0200000082,23232,8337
Grocery,41582,609129.3500000056,20765,8119
Electronics,72249,11704721.130000023,36221,8820


order_date,daily_revenue,daily_quantity,daily_orders
2024-10-26,32831.98,398,164
2024-11-04,26411.42,365,187
2023-06-12,24310.93,316,164
2023-04-19,24951.590000000004,362,178
2023-09-04,25883.710000000003,340,124
2023-07-05,24484.25,337,168
2023-03-22,24235.830000000005,371,201
2024-11-20,27373.85,341,169
2024-12-20,27812.360000000004,359,186
2024-06-25,28768.66,355,184
