In [0]:
from pyspark.sql import functions as F
from delta.tables import *

##### 1. Setup

In [0]:
# Parquet location read by the bronze ingestion step.
combined_data_path = "/Volumes/workspace/ecommerce/ecommerce_data/processed_data/combined_all"

# Root of the medallion layout; every layer lives in its own sub-directory.
base_medallion = "/Volumes/workspace/ecommerce/ecommerce_data/medallion"
path_bronze, path_silver, path_gold = [
    base_medallion + "/" + layer for layer in ("bronze", "silver", "gold")
]

# Banner so the cell output shows which source this run is loading.
print("STARTING AD-HOC DATA LOAD")
print("   • Source: " + combined_data_path)

STARTING AD-HOC DATA LOAD
   • Source: /Volumes/workspace/ecommerce/ecommerce_data/processed_data/combined_all


##### 2. Define Logic for Each Layer

In [0]:
def run_bronze():
    """Ingest the combined parquet data into the bronze Delta table.

    Reads parquet from ``combined_data_path``, appends two audit columns
    (``ingestion_ts`` = current timestamp, ``source_file`` = the value of
    ``_metadata.file_path``) and writes the result to ``path_bronze`` in
    Delta format using overwrite mode.
    """
    print("\n Ingesting BRONZE...")

    source_df = spark.read.parquet(combined_data_path)

    # Audit columns: when the row was loaded and which file it came from.
    stamped_df = source_df.withColumn("ingestion_ts", F.current_timestamp())
    audited_df = stamped_df.withColumn("source_file", F.col("_metadata.file_path"))

    bronze_writer = audited_df.write.format("delta").mode("overwrite")
    bronze_writer.save(path_bronze)
    print("Bronze Done.")

def run_silver():
    """Derive the silver Delta table from the bronze table.

    Steps, in order:
      1. keep only rows whose ``price`` is greater than 0;
      2. drop duplicate rows on (``user_session``, ``event_time``,
         ``product_id``);
      3. add ``price_tier``: ``"budget"`` when ``price`` < 50, otherwise
         ``"premium"``.

    The result is written to ``path_silver`` in Delta format using
    overwrite mode.
    """
    print("\n Cleaning SILVER...")
    bronze_df = spark.read.format("delta").load(path_bronze)

    price = F.col("price")
    dedup_keys = ["user_session", "event_time", "product_id"]
    tier_expr = F.when(price < 50, "budget").otherwise("premium")

    cleaned_df = (
        bronze_df
        .where(price > 0)
        .dropDuplicates(dedup_keys)
        .withColumn("price_tier", tier_expr)
    )

    cleaned_df.write.format("delta").mode("overwrite").save(path_silver)
    print("Silver Done.")

def run_gold():
    """Aggregate the silver table into a per-product gold Delta table.

    For each ``product_id`` the output holds:
      * ``total_views``     - distinct count of ``user_session``;
      * ``total_revenue``   - sum of ``price``;
      * ``conversion_rate`` - ``total_revenue / total_views`` rounded to
        2 decimals, or 0.0 when ``total_views`` is 0.

    The result is written to ``path_gold`` in Delta format using
    overwrite mode.

    NOTE(review): no event-type filter is applied before aggregating, and
    ``conversion_rate`` is computed as summed price per distinct session
    rather than a purchases/views ratio - confirm this is the intended
    metric before relying on the column name.
    """
    print("\n Aggregating GOLD...")
    silver_df = spark.read.format("delta").load(path_silver)

    per_product = silver_df.groupBy("product_id").agg(
        F.countDistinct("user_session").alias("total_views"),
        F.sum("price").alias("total_revenue"),
    )

    views = F.col("total_views")
    revenue = F.col("total_revenue")
    # Guard the division: a group with zero distinct sessions yields 0.0.
    rate_expr = F.when(views == 0, 0.0).otherwise(F.round(revenue / views, 2))

    gold_df = per_product.withColumn("conversion_rate", rate_expr)

    gold_df.write.format("delta").mode("overwrite").save(path_gold)
    print("Gold Done.")

##### 3. Execution Controller

In [0]:
# Run the layers in dependency order: each stage reads the previous stage's output.
for stage in (run_bronze, run_silver, run_gold):
    stage()

print("\n ALL DATA UPDATED! You can now go to the SQL Editor.")


 Ingesting BRONZE...
Bronze Done.

 Cleaning SILVER...
Silver Done.

 Aggregating GOLD...
Gold Done.

 ALL DATA UPDATED! You can now go to the SQL Editor.
