# Product level Analysis (More granular) & Prediction

Already Loaded to Bronze

In [0]:
# Bronze layer: the raw October e-commerce events, read from the catalog table.
BRONZE_EVENTS_TABLE = "workspace.ecommerce.df_october"
events = spark.read.table(BRONZE_EVENTS_TABLE)

In [0]:
from pyspark.sql import functions as F

# Add a `date` column holding the event timestamp cast to a DATE.
event_day = F.col("event_time").cast("date")
events = events.withColumn("date", event_day)


Bronze to silver (Product conversion ratio)

In [0]:

from pyspark.sql import functions as F
from pyspark.sql.window import Window

def _count_of(event_type):
    """Return an aggregate counting the rows whose event_type equals `event_type`.

    `F.when` without `otherwise` yields NULL for non-matching rows, and
    `F.count` skips NULLs, so only matching rows are counted.
    """
    return F.count(F.when(F.col("event_type") == event_type, 1))


# Price of purchase rows only; every other row contributes NULL to the sum.
purchase_price = F.when(F.col("event_type") == "purchase", F.col("price"))

# Rows without a product_id cannot be attributed to a product.
events_with_product = events.filter(F.col("product_id").isNotNull())

# One row per product: funnel counts plus revenue rounded to 2 decimals.
# total_revenue is NULL (not 0) for a product that has no purchase rows.
product_agg = events_with_product.groupBy("product_id").agg(
    _count_of("view").alias("total_views"),
    _count_of("cart").alias("total_carts"),
    _count_of("purchase").alias("total_purchases"),
    F.round(F.sum(purchase_price), 2).alias("total_revenue"),
)


In [0]:

# Boolean presence flags derived from the per-product aggregates.
product_features = (
    product_agg
    .withColumn("has_views", F.col("total_views") > 0)
    .withColumn("has_carts", F.col("total_carts") > 0)
    .withColumn("has_purchases", F.col("total_purchases") > 0)
    # total_revenue is NULL for products with no purchase rows (the SUM sees
    # only NULLs), and NULL > 0 evaluates to NULL. Coalesce to False so this
    # flag is a strict true/false like the three count-based flags above.
    .withColumn(
        "has_revenue",
        F.coalesce(F.col("total_revenue") > 0, F.lit(False)),
    )
)


In [0]:

# Raw conversion ratios, rounded to 4 decimals.
view_to_purchase = F.round(F.col("total_purchases") / F.col("total_views"), 4)
cart_to_purchase = F.round(F.col("total_purchases") / F.col("total_carts"), 4)

# A rate is only kept when its denominator reaches a minimum sample size
# (100 views / 5 carts); below that the `when` has no `otherwise`, so the
# rate is left NULL.
product_features = (
    product_features
    .withColumn(
        "view_to_purchase_rate",
        F.when(F.col("total_views") >= 100, view_to_purchase),
    )
    .withColumn(
        "cart_to_purchase_rate",
        F.when(F.col("total_carts") >= 5, cart_to_purchase),
    )
)


In [0]:

# Per-product price statistics over all event rows that carry a product_id.
# Every statistic except the sample size is rounded to 2 decimals.
price_col = F.col("price")
price_aggregates = [
    F.count(price_col).alias("price_sample_size"),
    F.round(F.mean(price_col), 2).alias("price_mean"),
    F.round(F.stddev(price_col), 2).alias("price_stddev"),
    F.round(F.min(price_col), 2).alias("price_min"),
    F.round(F.max(price_col), 2).alias("price_max"),
]

price_stats = (
    events
    .filter(F.col("product_id").isNotNull())
    .groupBy("product_id")
    .agg(*price_aggregates)
)


In [0]:

# Attach the price statistics to each product, then stamp the rows with
# lineage metadata (generation time and the source window label).
# NOTE: this cell rebinds `product_features` from itself, so running it a
# second time without re-running the cells above joins `price_stats` again.
product_features = (
    product_features
    .join(price_stats, on="product_id", how="left")
    .withColumn("feature_generated_at", F.current_timestamp())
    .withColumn("source_window", F.lit("october_2023"))
)



In [0]:

# Uniqueness guard: collect every product_id that appears on more than one row.
pk_check = (
    product_features
    .groupBy("product_id")
    .count()
    .filter(F.col("count") > 1)
)

# Show the offending keys and abort before anything is written to Gold.
duplicate_key_count = pk_check.count()
if duplicate_key_count > 0:
    pk_check.display()
    raise Exception("Primary key violation: product_id")


Silver to Gold (stats added)

In [0]:

# Publish the feature table to Gold as a Delta table, replacing both the
# existing data and the existing schema.
gold_writer = (
    product_features.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
gold_writer.saveAsTable("ecom.gold.product_features")


In [0]:
%sql

-- The primary-key column is declared NOT NULL before the constraint is added.
ALTER TABLE ecom.gold.product_features
ALTER COLUMN product_id SET NOT NULL;

-- Make the cell re-runnable: adding a constraint whose name already exists
-- fails, so drop any constraint left behind by a previous run first.
ALTER TABLE ecom.gold.product_features DROP CONSTRAINT IF EXISTS product_features_pk;

ALTER TABLE ecom.gold.product_features ADD CONSTRAINT product_features_pk PRIMARY KEY (product_id);


In [0]:
# Preview the Gold table that was just published.
gold_preview = spark.read.table("ecom.gold.product_features")
display(gold_preview)

In [0]:
# Handle on the Gold feature table.
# NOTE(review): no later use of `d` is visible in this notebook (the next
# cell loads the same table as `df`) — confirm before removing.
d = spark.table("ecom.gold.product_features")

Feature Engineering

Outlier

In [0]:
from pyspark.sql import functions as F

df = spark.table("ecom.gold.product_features")

# Check for skewness in revenue and purchase rates
df.select("view_to_purchase_rate", "total_revenue", "price_mean").summary().show()

# Find high-value products that might be outliers (Z-Score logic).
# The aggregate gets its own name: `price_stats` already refers to the
# per-product price DataFrame used by the earlier join cell, and rebinding it
# to a Row here would break that cell on re-run.
price_summary = df.select(
    F.mean("price_mean").alias("avg"),
    F.stddev("price_mean").alias("std"),
).first()
avg_p, std_p = price_summary["avg"], price_summary["std"]

if avg_p is None or std_p is None:
    # The aggregates are NULL when there are too few non-null price_mean
    # values to compute them; no product can be flagged in that case.
    outliers = df.limit(0)
else:
    # Flag products whose mean price is more than 3 standard deviations
    # above the overall mean.
    outliers = df.filter(F.col("price_mean") > (avg_p + 3 * std_p))
print(f"Number of price outliers: {outliers.count()}")


Strict filtering for Outlier | Top 10% revenue Threshold

In [0]:
from pyspark.sql import functions as F

# Approximate 90th-percentile value of total_revenue (relative error 0.01).
revenue_quantiles = df.approxQuantile("total_revenue", [0.9], 0.01)
threshold_value = revenue_quantiles[0]
print(f"Top 10% Revenue Threshold: ${threshold_value:.2f}")

# Binary label: 1 when revenue reaches the threshold, 0 otherwise.
# A NULL revenue does not satisfy the comparison, so it falls to 0.
reaches_threshold = F.col("total_revenue") >= threshold_value
df_labeled = df.withColumn(
    "is_top_seller",
    F.when(reaches_threshold, 1).otherwise(0),
)

# Class balance of the new label.
df_labeled.groupBy("is_top_seller").count().show()

Filters: total_views > 100 | total_carts > 5 | view_to_purchase_rate > 0.01 (the 75% threshold) | cart_to_purchase_rate > 0.4 (the 25% threshold). Products whose rates are NULL fail these comparisons, so they are dropped as well.

In [0]:
# Stricter filters for a high-quality signal, one named condition per rule.
# Comparisons against a NULL rate are not true, so such rows are dropped too.
enough_views = F.col("total_views") > 100
enough_carts = F.col("total_carts") > 5
converts_from_view = F.col("view_to_purchase_rate") > 0.01
converts_from_cart = F.col("cart_to_purchase_rate") > 0.4

df_high_intent = df_labeled.filter(
    enough_views & enough_carts & converts_from_view & converts_from_cart
)

# How many products survive, and how the label is distributed among them.
print(f"Remaining products for training: {df_high_intent.count()}")
df_high_intent.groupBy("is_top_seller").count().show()

Building Vector Assembler

In [0]:
from pyspark.ml.feature import VectorAssembler

# Columns packed into the single `features` vector column used by the model.
feature_cols = [
    "total_views",
    "total_carts",
    "view_to_purchase_rate",
    "cart_to_purchase_rate",
    "price_mean",
]
assembler = VectorAssembler(outputCol="features", inputCols=feature_cols)

ML Model (GBT)

In [0]:
from pyspark.ml.classification import GBTClassifier

# Split the filtered products 80/20 (fixed seed) and assemble the training
# features into the `features` vector column.
X_train, X_test = df_high_intent.randomSplit([0.8, 0.2], seed=42)
train_data = assembler.transform(X_train)

# Gradient-boosted trees classifier, 10 boosting iterations.
gbt = GBTClassifier(maxIter=10, featuresCol="features", labelCol="is_top_seller")
gbt_model = gbt.fit(train_data)

# Print each feature next to its importance; the importance vector is
# indexed in the same order as `feature_cols`.
importances = gbt_model.featureImportances
for position, feature in enumerate(feature_cols):
    importance = importances[position]
    print(f"Feature: {feature:25} Importance: {importance:.4f}")

Training Logic

In [0]:
import mlflow
import mlflow.spark
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Unity Catalog Volume path handed to MLflow as its temporary directory.
dfs_tmp_path = "/Volumes/workspace/ecommerce/ecommerce_data"

# Record the already-trained GBT model, the filter thresholds and the
# evaluation metrics in one MLflow run.
with mlflow.start_run(run_name="Elite_Product_Classifier") as run:

    # Thresholds used by the high-intent filter, logged one by one.
    for param_name, param_value in (
        ("min_views", 100),
        ("min_carts", 5),
        ("v2p_threshold", 0.01),
        ("c2p_threshold", 0.4),
    ):
        mlflow.log_param(param_name, param_value)

    # The model itself was fitted in an earlier cell; it is only logged here.
    mlflow.spark.log_model(
        spark_model=gbt_model,
        artifact_path="gbt-model",
        dfs_tmpdir=dfs_tmp_path,
    )

    # Score the held-out split and log the evaluator's metric as "auc_roc".
    # The evaluator is left on its default metric.
    test_features = assembler.transform(X_test)
    predictions = gbt_model.transform(test_features)
    evaluator = BinaryClassificationEvaluator(labelCol="is_top_seller")
    auc = evaluator.evaluate(predictions)
    mlflow.log_metric("auc_roc", auc)

    # One metric per feature importance.
    for feature, importance in zip(feature_cols, importances):
        mlflow.log_metric(f"importance_{feature}", importance)

    print(f"Model logged successfully! AUC: {auc:.4f}")
    run_id = run.info.run_id

MLFlow Experiment

In [0]:
import mlflow
import mlflow.spark
from mlflow.models.signature import infer_signature
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Unity Catalog Volume path handed to MLflow as its temporary directory.
dfs_tmp_path = "/Volumes/workspace/ecommerce/ecommerce_data"

# Final run: same model and metrics as before, plus a model signature and an
# input example so the run can be registered afterwards.
with mlflow.start_run(run_name="Elite_Product_Classifier_Final") as run:

    # 1. Log Business Logic Parameters (Thresholds)
    mlflow.log_params({
        "min_views": 100,
        "min_carts": 5,
        "v2p_threshold": 0.01,
        "c2p_threshold": 0.4
    })

    # 2. Create a Model Signature from a 5-row sample of the training data.
    # NOTE(review): the signature describes the raw feature columns, while
    # `gbt_model` is applied in this notebook to the assembled `features`
    # vector (after `assembler.transform`) — confirm that consumers of the
    # logged model assemble the vector themselves.
    input_example = X_train.select(feature_cols).limit(5).toPandas()
    output_example = (
        gbt_model.transform(assembler.transform(X_train))
        .select("prediction")
        .limit(5)
        .toPandas()
    )
    signature = infer_signature(input_example, output_example)

    # 3. Log the Model with Signature and Volume Path
    mlflow.spark.log_model(
        spark_model=gbt_model,
        artifact_path="gbt-model",
        dfs_tmpdir=dfs_tmp_path,
        signature=signature,
        input_example=input_example
    )

    # 4. Evaluate on the held-out split and log the metric as "auc_roc"
    predictions = gbt_model.transform(assembler.transform(X_test))
    evaluator = BinaryClassificationEvaluator(labelCol="is_top_seller")
    auc = evaluator.evaluate(predictions)
    mlflow.log_metric("auc_roc", auc)

    # 5. Log Feature Importances for Audit
    for feature, importance in zip(feature_cols, importances):
        mlflow.log_metric(f"importance_{feature}", importance)

    # The status glyphs were mis-decoded UTF-8 (mojibake); restored here.
    print("✅ Model logged successfully!")
    print(f"📊 AUC-ROC: {auc:.4f}")
    print(f"🚀 Run ID: {run.info.run_id}")

Model Registered

In [0]:
# Model name in Unity Catalog format: <catalog>.<schema>.<model_name>
model_name = "workspace.ecommerce.elite_product_classifier"

# Register the "gbt-model" artifact of the most recent MLflow run (`run` is
# the handle bound by the preceding `with mlflow.start_run(...)` cell).
model_uri = f"runs:/{run.info.run_id}/gbt-model"
registered_model = mlflow.register_model(model_uri, model_name)

# The check-mark glyph was mis-decoded UTF-8 (mojibake); restored here.
print(f"✅ Model registered as: {model_name}")

Rising Stars (Products)

In [0]:
from pyspark.ml.functions import vector_to_array

# 1. Load the model version that was just registered.
# The version used to be hard-coded to "1", so a re-run of the notebook
# (which registers a new version) kept scoring with the first, stale model.
model_name = "workspace.ecommerce.elite_product_classifier"
dfs_tmp_path = "/Volumes/workspace/ecommerce/ecommerce_data"

model_version = registered_model.version
loaded_model = mlflow.spark.load_model(
    f"models:/{model_name}/{model_version}",
    dfs_tmpdir=dfs_tmp_path,
)

# 2. Score all high-intent products
scored_df = loaded_model.transform(assembler.transform(df_high_intent))

# 3. Keep products the model rates highly (>80% probability of class 1) that
#    are not labelled Top Sellers yet.
rising_stars_final = (
    scored_df
    .withColumn("success_probability", vector_to_array("probability")[1])
    .filter((F.col("is_top_seller") == 0) & (F.col("success_probability") > 0.8))
    # Sort while `success_probability` is still a column: the select below
    # only exposes it under the rounded alias `prob_score`.
    .orderBy(F.col("success_probability").desc())
    .select(
        "product_id",
        "total_views",
        "total_revenue",
        F.round("success_probability", 4).alias("prob_score"),
        "price_mean"
    )
)

# 4. Save to a Gold table for Business Teams
rising_stars_final.write.mode("overwrite").saveAsTable("workspace.ecommerce.rising_stars_report")

# The rocket glyph was mis-decoded UTF-8 (mojibake); restored here.
print("🚀 Rising Stars table created!")
display(rising_stars_final.limit(10))