### Load E-commerce Transactions Data
Load the raw transactions table from Databricks to begin feature engineering.


In [0]:
from pyspark.sql import functions as F

# Read the raw transactions table through the session's DataFrameReader.
df = spark.read.table("default.ecommerce_transactions")

# Show only a handful of rows as a quick schema / content sanity check.
display(df.limit(5))

Transaction_ID,User_Name,Age,Country,Product_Category,Purchase_Amount,Payment_Method,Transaction_Date
1,Ava Hall,63,Mexico,Clothing,780.69,Debit Card,2023-04-14
2,Sophia Hall,59,India,Beauty,738.56,PayPal,2023-07-30
3,Elijah Thompson,26,France,Books,178.34,Credit Card,2023-09-17
4,Elijah White,43,Mexico,Sports,401.09,UPI,2023-06-21
5,Ava Harris,48,Germany,Beauty,594.83,Net Banking,2024-10-29


### Aggregate Transactions at Customer Level
Group transaction data by user to derive customer-level features such as total spend, transaction count, and average spend.


In [0]:
def _first_non_null_by_txn(col_name):
    """Build a deterministic stand-in for ``F.first(col_name, ignorenulls=True)``.

    ``F.first`` has no defined ordering inside a ``groupBy``: which row is
    "first" depends on how rows arrive after the shuffle, so the chosen value
    can change between runs. This helper instead takes the minimum of a
    ``(Transaction_ID, value)`` struct over the rows where the value is
    non-null, i.e. the value attached to the smallest ``Transaction_ID``.
    The result does not depend on row order.

    Args:
        col_name: Name of the column to pick a representative value from.

    Returns:
        An aggregate ``Column`` aliased to ``col_name``; null when every value
        in the group is null.
    """
    # Rows with a null value yield a null struct, which F.min skips -- this
    # mirrors ignorenulls=True.
    # NOTE(review): "smallest" follows Transaction_ID's column type; if the
    # column is stored as a string the order is lexicographic -- confirm.
    keyed_value = F.when(
        F.col(col_name).isNotNull(),
        F.struct(F.col("Transaction_ID"), F.col(col_name)),
    )
    return F.min(keyed_value).getField(col_name).alias(col_name)


# One row per customer with demographic and spend features.
# NOTE(review): User_Name is used as the customer key; two different customers
# sharing a name would be merged -- confirm no real customer ID is available.
user_df = (
    df.groupBy("User_Name")
      .agg(
          _first_non_null_by_txn("Age"),
          _first_non_null_by_txn("Country"),
          F.count("*").alias("txn_count"),
          F.sum("Purchase_Amount").alias("total_spend"),
          F.avg("Purchase_Amount").alias("avg_spend")
      )
)

display(user_df.limit(10))

User_Name,Age,Country,txn_count,total_spend,avg_spend
Ava Hall,63,Mexico,517,268756.72999999975,519.8389361702123
Sophia Hall,59,India,516,267975.57,519.3325
Elijah Thompson,26,France,514,257888.03999999992,501.7277042801555
Elijah White,43,Mexico,456,227204.04000000012,498.2544736842109
Ava Harris,48,Germany,520,266846.0800000001,513.1655384615386
Elijah Harris,51,India,507,260886.8300000004,514.5696844181467
Oliver Clark,27,Germany,502,256144.86,510.24872509960153
Olivia Allen,46,Canada,490,251259.07000000012,512.7736122448982
Liam Harris,54,France,493,245372.46,497.7129006085192
Liam Allen,60,Canada,495,252962.9600000001,511.0362828282831


### Calculate Spending Quantiles
Compute 33rd and 66th percentile thresholds to define Low, Medium, and High spending tiers.


In [0]:
# Approximate cut-offs on customer total spend at the 33% and 66% marks,
# requested with a relative error of 0.01.
q33, q66 = user_df.approxQuantile(
    col="total_spend",
    probabilities=[0.33, 0.66],
    relativeError=0.01,
)
print("q33 =", q33, "| q66 =", q66)

q33 = 245798.83000000022 | q66 = 256357.6499999997


### Label Customers into Spending Tiers
Create a categorical target variable (Low / Medium / High) based on total customer spend.


In [0]:
# Tier rule, evaluated top-down: at or below q33 -> "Low", otherwise at or
# below q66 -> "Medium", and every remaining row -> "High".
spend = F.col("total_spend")
tier_label = (
    F.when(spend <= F.lit(q33), F.lit("Low"))
     .when(spend <= F.lit(q66), F.lit("Medium"))
     .otherwise(F.lit("High"))
)

# Attach the categorical target to the customer-level features.
ml_df = user_df.withColumn("Spending_Tier", tier_label)

# Class balance first, then a sample of labelled rows.
display(ml_df.groupBy("Spending_Tier").count())
display(ml_df.limit(10))


Spending_Tier,count
High,35
Low,32
Medium,33


User_Name,Age,Country,txn_count,total_spend,avg_spend,Spending_Tier
Ava Hall,63,Mexico,517,268756.72999999975,519.8389361702123,High
Sophia Hall,59,India,516,267975.57,519.3325,High
Elijah Thompson,26,France,514,257888.03999999992,501.7277042801555,High
Elijah White,43,Mexico,456,227204.04000000012,498.2544736842109,Low
Ava Harris,48,Germany,520,266846.0800000001,513.1655384615386,High
Elijah Harris,51,India,507,260886.8300000004,514.5696844181467,High
Oliver Clark,27,Germany,502,256144.86,510.24872509960153,Medium
Olivia Allen,46,Canada,490,251259.07000000012,512.7736122448982,Medium
Liam Harris,54,France,493,245372.46,497.7129006085192,Low
Liam Allen,60,Canada,495,252962.9600000001,511.0362828282831,Medium


### Split Data into Training and Testing Sets
Split the customer-level dataset into training and test sets for model evaluation.


In [0]:

# 80/20 random split with a fixed seed.
_splits = ml_df.randomSplit(weights=[0.8, 0.2], seed=42)
train_df, test_df = _splits[0], _splits[1]

# Report the size of each partition (each count triggers a Spark job).
for _caption, _part in (("Train rows:", train_df), ("Test rows :", test_df)):
    print(_caption, _part.count())

Train rows: 77
Test rows : 23


### Initialize MLflow Experiment
Create and set an MLflow experiment to track model runs and evaluation metrics.


In [0]:
import mlflow

# Make this experiment the active one for the runs started later in the notebook.
mlflow.set_experiment(experiment_name="/Day13_Spending_Tier_Model_Comparison")

<Experiment: artifact_location='dbfs:/databricks/mlflow-tracking/1129139708848343', creation_time=1769015017931, experiment_id='1129139708848343', last_update_time=1769045054807, lifecycle_stage='active', name='/Day13_Spending_Tier_Model_Comparison', tags={'mlflow.experiment.sourceName': '/Day13_Spending_Tier_Model_Comparison',
 'mlflow.experimentKind': 'custom_model_development',
 'mlflow.experimentType': 'MLFLOW_EXPERIMENT',
 'mlflow.ownerEmail': 'saitejaswikondapally@gmail.com',
 'mlflow.ownerId': '76787938685907'}>

### Create Reusable Model Training Function
Define a helper function to train models, evaluate performance, and log metrics to MLflow.


In [0]:
import mlflow
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluators, built once and shared by every run. Both compare the "label"
# column with the "prediction" column of a scored DataFrame.
f1_eval = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
acc_eval = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

# Preprocessing stages shared by every model pipeline.
# NOTE(review): txn_count * avg_spend reconstructs total_spend (exactly so when
# Purchase_Amount has no nulls), and Spending_Tier is cut directly from
# total_spend -- the features therefore leak the target. Confirm this is
# intended before reading much into the reported scores.
common_stages = [
    # Target encoding. handleInvalid is "error" rather than "keep": "keep"
    # reserves an extra index for unseen values, which would add a phantom
    # fourth class to a three-class target. An unexpected tier value should
    # fail loudly instead.
    StringIndexer(inputCol="Spending_Tier", outputCol="label", handleInvalid="error"),
    # Country: index then one-hot encode; "keep" lets countries that appear
    # only in the test split pass through instead of raising.
    StringIndexer(inputCol="Country", outputCol="Country_idx", handleInvalid="keep"),
    OneHotEncoder(inputCol="Country_idx", outputCol="Country_ohe", handleInvalid="keep"),
    # Assemble the numeric columns and the encoded country into one vector.
    VectorAssembler(inputCols=["Age", "txn_count", "avg_spend", "Country_ohe"],
                    outputCol="features", handleInvalid="keep")
]

In [0]:
def run_model(model, run_name):
    """Train one classifier behind the shared preprocessing and log its scores.

    Appends ``model`` to ``common_stages``, fits the resulting pipeline on the
    notebook-level ``train_df`` and scores ``test_df``, all inside a single
    MLflow run. The model's class name is logged as the ``model`` parameter,
    and the F1 and accuracy values as the ``f1`` and ``accuracy`` metrics.

    Args:
        model: Estimator used as the final pipeline stage.
        run_name: Name given to the MLflow run; also printed in the summary line.

    Returns:
        A ``(f1, accuracy)`` tuple computed on ``test_df``.
    """
    full_pipeline = Pipeline(stages=[*common_stages, model])

    with mlflow.start_run(run_name=run_name):
        scored = full_pipeline.fit(train_df).transform(test_df)

        # Evaluate in a fixed order: F1 first, then accuracy.
        f1, acc = (evaluator.evaluate(scored) for evaluator in (f1_eval, acc_eval))

        mlflow.log_param("model", type(model).__name__)
        for metric_name, metric_value in (("f1", f1), ("accuracy", acc)):
            mlflow.log_metric(metric_name, float(metric_value))

    print(run_name, "| F1:", f1, "| Acc:", acc)
    return f1, acc


### Train and Compare Multiple ML Models
Train Logistic Regression, Random Forest, and Naive Bayes models using the same pipeline and compare their performance.


In [0]:
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, NaiveBayes

# Candidate classifiers; all read the "features" / "label" columns produced by
# the shared preprocessing stages.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    family="multinomial",
    maxIter=50,
)
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="label",
    numTrees=200,
    maxDepth=8,
    seed=42,
)
nb = NaiveBayes(
    featuresCol="features",
    labelCol="label",
    smoothing=1.0,
    modelType="multinomial",
)

# Each entry is one MLflow run; values are (f1, accuracy) tuples. Dict entries
# are evaluated top to bottom, so the runs execute in the listed order.
results = {
    "LogisticRegression": run_model(lr, "Model_1_LogisticRegression"),
    "RandomForest": run_model(rf, "Model_2_RandomForest"),
    "NaiveBayes": run_model(nb, "Model_3_NaiveBayes"),
}

results

Model_1_LogisticRegression | F1: 0.8757246376811596 | Acc: 0.8695652173913043
Model_2_RandomForest | F1: 0.8235294117647058 | Acc: 0.8260869565217391
Model_3_NaiveBayes | F1: 0.2043478260869565 | Acc: 0.2608695652173913


{'LogisticRegression': (0.8757246376811596, 0.8695652173913043),
 'RandomForest': (0.8235294117647058, 0.8260869565217391),
 'NaiveBayes': (0.2043478260869565, 0.2608695652173913)}