# Batch Pipeline (Medallion Architecture)

Loading the files to bronze

In [0]:
from pyspark.sql.types import StructType, StructField, TimestampType, StringType, LongType, DoubleType
from pyspark.sql import functions as F

# Explicit schema for the raw e-commerce event CSVs, declared as
# (column name, Spark type) pairs. Every column is nullable.
_event_columns = [
    ("event_time", TimestampType()),
    ("event_type", StringType()),
    ("product_id", LongType()),
    ("category_id", LongType()),
    ("category_code", StringType()),
    ("brand", StringType()),
    ("price", DoubleType()),
    ("user_id", LongType()),
    ("user_session", StringType()),
]

schema = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in _event_columns]
)


In [0]:
# Read the CSV files in the Batch volume folder with the explicit schema and
# tag every row with the file it was read from.
batch_source_dir = "/Volumes/workspace/ecommerce/ecommerce_data/Batch/"
source_file_path = F.col("_metadata.file_path")

loaderdf = (
    spark.read.format("csv")
    .schema(schema)
    # no recursive lookup needed if files are in the same folder
    .options(header=True, pathGlobFilter="*.csv", multiLine=False)
    .load(batch_source_dir)
    .select(
        "*",
        source_file_path.alias("file_path"),
        # Last "/"-separated segment of the path, i.e. the bare file name.
        F.element_at(F.split(source_file_path, "/"), -1).alias("file_name"),
    )
)


In [0]:
# Row count per source file, shown as a table.
rows_per_file = loaderdf.groupBy("file_name").count()
rows_per_file.display()

In [0]:
# Persist the loaded events as the bronze Delta table (overwrite mode, with
# mergeSchema enabled).
bronze_writer = (
    loaderdf.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
)
bronze_writer.saveAsTable("workspace.ecommerce.MLPrepBronze")

Bronze to Silver (Cleanup)

In [0]:
# Aggregate the bronze events into one feature row per product_id.
bronze_events = spark.table("workspace.ecommerce.MLPrepBronze")

# Reusable event-type predicates for the conditional aggregates below.
is_view = F.col("event_type") == "view"
is_cart = F.col("event_type") == "cart"
is_purchase = F.col("event_type") == "purchase"

# NOTE: the result keeps the name `BronzeTbl` because the cell that writes the
# silver table reads this variable.
BronzeTbl = (
    bronze_events
    # Rows without a product_id cannot be attributed to any product.
    .filter(F.col("product_id").isNotNull())
    .groupBy("product_id")
    .agg(
        # count() ignores NULLs, so each when() counts only the matching events.
        F.count(F.when(is_view, 1)).alias("total_views"),
        F.count(F.when(is_cart, 1)).alias("total_carts"),
        F.count(F.when(is_purchase, 1)).alias("total_purchases"),
        # sum() returns NULL when a product has no purchase rows (or only NULL
        # purchase prices); coalesce so revenue is 0.0 in that case, consistent
        # with the zero-valued counts above.
        F.round(
            F.coalesce(
                F.sum(F.when(is_purchase, F.col("price"))),
                F.lit(0.0),
            ),
            2,
        ).alias("total_revenue"),
        # Price statistics are computed over every event row of the product.
        F.round(F.mean("price"), 2).alias("price_mean"),
        F.round(F.stddev("price"), 2).alias("price_stddev"),
        F.round(F.min("price"), 2).alias("price_min"),
        F.round(F.max("price"), 2).alias("price_max"),
        F.expr("percentile_approx(price, 0.5)").alias("price_median"),
    )
    # Timestamp of this feature build; the gold step derives source_window from it.
    .withColumn("feature_generated_at", F.current_timestamp())
)

In [0]:
BronzeTbl.write.format("delta").mode("overwrite").saveAsTable("workspace.ecommerce.MLPrepSilver")

Silver to Gold (Product conversion ratio)

In [0]:
# Derive the gold-level product features from the silver aggregates.
silver_features = spark.read.table("workspace.ecommerce.MLPrepSilver")

# A view-to-purchase rate is only reported for products with more views than
# this threshold; below it the rate is left NULL.
MIN_VIEWS_FOR_RATE = 100

product_features = (
    silver_features
    .withColumn(
        "v2p_rate",
        F.when(
            F.col("total_views") > MIN_VIEWS_FOR_RATE,
            F.round(F.col("total_purchases") / F.col("total_views"), 2),
        ),
    )
    .withColumn("has_views", F.col("total_views") > 0)
    .withColumn("has_carts", F.col("total_carts") > 0)
    # total_revenue can be NULL for products without purchases; comparing NULL
    # with 0 yields NULL, so coalesce first to get a strict True/False flag
    # like the other has_* columns.
    .withColumn("has_revenue", F.coalesce(F.col("total_revenue"), F.lit(0.0)) > 0)
    .withColumn("had_purchases", F.col("total_purchases") > 0)
    # Month number of the feature-generation timestamp.
    # NOTE(review): this is when the features were built, not the month of the
    # underlying events - confirm that is the intended meaning of "source_window".
    .withColumn("source_window", F.month(F.col("feature_generated_at")))
)

                    

In [0]:

# Uniqueness gate: collect every product_id that occurs more than once.
pk_check = product_features.groupBy("product_id").count().where("count > 1")

# Any duplicate key is shown and then aborts the run.
duplicate_key_total = pk_check.count()
if duplicate_key_total > 0:
    pk_check.display()
    raise Exception("Primary key violation: product_id")


In [0]:

# Publish the gold feature table as Delta (overwrite mode, with overwriteSchema
# enabled).
(
    product_features.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("workspace.ecommerce.mlPrepGold")
)


In [0]:
%sql

-- A primary-key column must be NOT NULL, so enforce that first.
ALTER TABLE workspace.ecommerce.mlPrepGold
ALTER COLUMN product_id SET NOT NULL;

-- Remove a constraint left by a previous run so this cell can be re-executed;
-- ADD CONSTRAINT fails if prepgold_pk already exists on the table.
ALTER TABLE workspace.ecommerce.mlPrepGold DROP CONSTRAINT IF EXISTS prepgold_pk;

ALTER TABLE workspace.ecommerce.mlPrepGold ADD CONSTRAINT prepgold_pk PRIMARY KEY (product_id);


Gold Table

In [0]:
# Show the published gold table.
gold_table = spark.read.table("workspace.ecommerce.mlPrepGold")
display(gold_table)

In [0]:
%skip
# Experimental cell: fits a gradient-boosted tree classifier and prints one
# importance score per feature. The %skip magic on the first line is presumably
# there to keep this cell from running - confirm before relying on it.
# NOTE(review): `df_final_prep`, `assembler` and `feature_cols` are not defined
# in the visible part of this notebook - confirm where they come from before
# enabling this cell.
from pyspark.ml.classification import GBTClassifier

# 1. Re-run the Assembler on the new filtered data
# 80/20 random split with a fixed seed; only the training part is used below.
X_train, X_test = df_final_prep.randomSplit([0.8, 0.2], seed=42)
train_data = assembler.transform(X_train)

# 2. Train a GBT Model
# Label column is "is_top_seller"; inputs come from the "features" column.
gbt = GBTClassifier(labelCol="is_top_seller", featuresCol="features", maxIter=10)
gbt_model = gbt.fit(train_data)

# 3. View the new "Clean" Importance
# Pairs each name in feature_cols with the importance at the same position;
# assumes feature_cols is in the assembler's input order - TODO confirm.
importances = gbt_model.featureImportances
for feature, importance in zip(feature_cols, importances):
    print(f"Feature: {feature:25} Importance: {importance:.4f}")