In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_timestamp, col, count, when, max
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
import sys
import os

# Make sure the project root (the parent of the current working directory) is
# on the import path so that project-local packages can be imported.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
# Build (or reuse) the Spark session shared by every cell below.
spark_builder = SparkSession.builder.appName("EcommerceBehavior").master("local[*]")

# Session settings, applied in a fixed order.
for conf_key, conf_value in (
    ("spark.driver.memory", "4g"),
    ("spark.executor.memory", "4g"),
    ("spark.sql.shuffle.partitions", "8"),
    ("spark.hadoop.hadoop.native.lib", "false"),
):
    spark_builder = spark_builder.config(conf_key, conf_value)

spark = spark_builder.getOrCreate()

In [3]:
# Read the event CSV; its first row supplies the column names.
csv_reader = spark.read.option("header", True)
raw_events = csv_reader.csv("../data/scaledData-2019-Nov.csv")

# Parse event_time into a timestamp column (same column name, new type).
df = raw_events.withColumn("event_time", to_timestamp("event_time"))

# Sanity check: first five rows, then the resulting schema.
df.show(5)
df.printSchema()


+-------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code|   brand| price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+
|2019-11-19 03:35:46|      view|  30200005|2053013554449088861|                NULL|   elari|  77.2|512412397|f62be3c5-18af-4ab...|
|2019-11-26 09:16:08|      view|   1005115|2053013555631882655|electronics.smart...|   apple|916.37|568675496|c857db53-cd0a-480...|
|2019-11-10 12:50:50|      view|  15700275|2053013559733912211|                NULL|imperial|206.16|513262731|c637d18a-6fc5-4c1...|
|2019-11-04 09:23:52|      view|   1004589|2053013555631882655|electronics.smart...|    inoi| 61.36|562973725|e41d3c3f-830e-48d...|
|2019-11-29 12:11:17|  purchase|   5300157|2053013563173241677|             

In [4]:
# Event types kept for the rest of the analysis.
relevant_events = ["view", "cart", "purchase"]

df = (
    df
    # The CSV was read without schema inference, so cast price to float here.
    .withColumn("price", col("price").cast("float"))
    # A row is unusable without both a user id and a session id.
    .dropna(subset=["user_id", "user_session"])
    # Keep only the three event types listed above.
    .filter(col("event_type").isin(relevant_events))
)

# Preview the cleaned frame.
df.show(5)


+-------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code|   brand| price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+
|2019-11-19 03:35:46|      view|  30200005|2053013554449088861|                NULL|   elari|  77.2|512412397|f62be3c5-18af-4ab...|
|2019-11-26 09:16:08|      view|   1005115|2053013555631882655|electronics.smart...|   apple|916.37|568675496|c857db53-cd0a-480...|
|2019-11-10 12:50:50|      view|  15700275|2053013559733912211|                NULL|imperial|206.16|513262731|c637d18a-6fc5-4c1...|
|2019-11-04 09:23:52|      view|   1004589|2053013555631882655|electronics.smart...|    inoi| 61.36|562973725|e41d3c3f-830e-48d...|
|2019-11-29 12:11:17|  purchase|   5300157|2053013563173241677|             

In [5]:
# Funnel analysis: distinct users seen at each stage of the funnel.
event_type_col = col("event_type")

view_df = df.where(event_type_col == "view").select("user_id").distinct()
cart_df = df.where(event_type_col == "cart").select("user_id").distinct()
purchase_df = df.where(event_type_col == "purchase").select("user_id").distinct()

# Each count() triggers its own Spark job.
view_count = view_df.count()
cart_count = cart_df.count()
purchase_count = purchase_df.count()

# Drop-offs are plain differences between consecutive stage sizes.
funnel_data = dict([
    ("Views", view_count),
    ("Add to Cart (cart)", cart_count),
    ("Purchase", purchase_count),
    ("View to Cart Drop-off", view_count - cart_count),
    ("Cart to Purchase Drop-off", cart_count - purchase_count),
])

print(funnel_data)


{'Views': 1929582, 'Add to Cart (cart)': 249718, 'Purchase': 89603, 'View to Cart Drop-off': 1679864, 'Cart to Purchase Drop-off': 160115}


In [6]:
# One conditional counter per event type: count() skips the NULLs that
# when() yields for non-matching rows, so each column counts the events of
# a single type within the session.
event_counters = [
    count(when(col("event_type") == event_name, True)).alias(f"{event_name}_count")
    for event_name in ("view", "cart", "purchase")
]
session_df = df.groupBy("user_session").agg(*event_counters)

# Binary label: 1 for sessions containing at least one purchase, else 0.
has_purchase = col("purchase_count") > 0
session_df = session_df.withColumn("label", when(has_purchase, 1).otherwise(0))

session_df.show(5)


+--------------------+----------+----------+--------------+-----+
|        user_session|view_count|cart_count|purchase_count|label|
+--------------------+----------+----------+--------------+-----+
|879b893f-feb2-43d...|         2|         0|             0|    0|
|11ccc7df-5d85-499...|         4|         0|             0|    0|
|f2ab18d2-5759-402...|         1|         0|             0|    0|
|ee8dd117-fa84-47d...|         1|         0|             0|    0|
|077035b3-376b-48b...|         1|         0|             0|    0|
+--------------------+----------+----------+--------------+-----+
only showing top 5 rows



In [7]:
# Pack the per-session counters into the single vector column used as model input.
feature_columns = ["view_count", "cart_count"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
assembled_df = assembler.transform(session_df)

# Seeded 80/20 train/test split.
splits = assembled_df.randomSplit([0.8, 0.2], seed=42)
train_df, test_df = splits

# Fit a logistic regression on the training split.
column_roles = {"featuresCol": "features", "labelCol": "label"}
lr = LogisticRegression(**column_roles)
model = lr.fit(train_df)

# Score the held-out split.
predictions = model.transform(test_df)

# Area under the ROC curve on the held-out predictions.
evaluator = BinaryClassificationEvaluator(
    labelCol="label",
    metricName="areaUnderROC",
)
auc = evaluator.evaluate(predictions)
print(f"AUC: {auc:.4f}")

AUC: 0.7438


In [8]:
# Baseline random forest (50 trees) fitted on the same training split.
rf_settings = {"featuresCol": "features", "labelCol": "label", "numTrees": 50}
rf = RandomForestClassifier(**rf_settings)
rf_model = rf.fit(train_df)

# Score the held-out split and reuse the evaluator defined earlier.
rf_predictions = rf_model.transform(test_df)
rf_auc = evaluator.evaluate(rf_predictions)
print(f"Random Forest AUC: {rf_auc:.4f}")

Random Forest AUC: 0.7745


In [12]:
import os
import pandas as pd
import json
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Paths
# NOTE(review): absolute, machine-specific location; the best-model lookup
# later in the notebook reads all_model_results.csv from this same directory.
base_output_dir = "D:/ecomm-bigdata-project/output"
predictions_dir = os.path.join(base_output_dir, "predictions")
feature_dir = os.path.join(base_output_dir, "feature_importances")
metadata_dir = os.path.join(base_output_dir, "model_metadata")

# Create folders if they don't exist
for output_dir in (predictions_dir, feature_dir, metadata_dir):
    os.makedirs(output_dir, exist_ok=True)

# Define evaluator (no metricName is passed, so the evaluator's default metric is used)
evaluator = BinaryClassificationEvaluator(labelCol="label")

# Define parameter grid: one (numTrees, maxDepth, maxBins) tuple per model
param_grid = [
    (50, 5, 32),
    (100, 5, 32),
    (200, 5, 32),
    (50, 10, 32),
    (100, 10, 32),
    (200, 10, 32),
    (50, 20, 32),
    (100, 20, 32),
    (200, 20, 32),
    (50, 5, 64),
    (100, 5, 64),
    (200, 5, 64),
    (100, 10, 64),
    (200, 20, 64),
]

# The train/test splits are identical for every configuration below. Mark
# them for caching once, so the fits and scoring passes reuse the
# materialized rows instead of re-running the CSV read and session
# aggregation each time. cache() is lazy: the data is stored the first
# time an action touches it.
train_df.cache()
test_df.cache()

# Store all model results as (model_name, auc) pairs
results = []

# NOTE(review): every configuration is scored on test_df, so the best AUC
# selected from these results is an optimistic estimate. A validation split
# or cross-validation would give an unbiased comparison.
for i, (numTrees, maxDepth, maxBins) in enumerate(param_grid):
    model_name = f"rf_{numTrees}_{maxDepth}_{maxBins}"
    print(f"\n🚀 Training model {i+1}/{len(param_grid)}: {model_name}")

    # Train model
    rf = RandomForestClassifier(labelCol="label", featuresCol="features",
                                 numTrees=numTrees, maxDepth=maxDepth, maxBins=maxBins)
    model = rf.fit(train_df)

    # Predict
    predictions = model.transform(test_df)

    # Save predictions (collected to the driver as a pandas frame, then written as CSV)
    pred_path = os.path.join(predictions_dir, f"{model_name}_predictions.csv")
    predictions.select("user_session", "prediction", "probability").toPandas().to_csv(pred_path, index=False)
    print(f"✅ Predictions saved: {pred_path}")

    # Save feature importances: one row per entry of the model's importance
    # vector, written without feature names (row order is the only key).
    fi = model.featureImportances.toArray()
    fi_df = pd.DataFrame(fi, columns=["importance"])
    fi_path = os.path.join(feature_dir, f"{model_name}_feature_importances.csv")
    fi_df.to_csv(fi_path, index=False)
    print(f"✅ Feature importances saved: {fi_path}")

    # Evaluate model
    auc = evaluator.evaluate(predictions)
    print(f"AUC for {model_name}: {auc:.4f}")

    # Save metadata (hyper-parameters plus the AUC) as one JSON file per model
    metadata = {
        "model_name": model_name,
        "numTrees": numTrees,
        "maxDepth": maxDepth,
        "maxBins": maxBins,
        "AUC": auc
    }
    meta_path = os.path.join(metadata_dir, f"{model_name}_metadata.json")
    with open(meta_path, "w") as f:
        json.dump(metadata, f, indent=4)
    print(f"✅ Metadata saved: {meta_path}")

    # Store result for summary
    results.append((model_name, auc))

# Save all model results in one CSV
df_results = pd.DataFrame(results, columns=["model_name", "auc"])
results_path = os.path.join(base_output_dir, "all_model_results.csv")
df_results.to_csv(results_path, index=False)
print(f"\n🎯 All model AUC results saved to: {results_path}")



🚀 Training model 1/14: rf_50_5_32
✅ Predictions saved: D:/ecomm-bigdata-project/output\predictions\rf_50_5_32_predictions.csv
✅ Feature importances saved: D:/ecomm-bigdata-project/output\feature_importances\rf_50_5_32_feature_importances.csv
AUC for rf_50_5_32: 0.7745
✅ Metadata saved: D:/ecomm-bigdata-project/output\model_metadata\rf_50_5_32_metadata.json

🚀 Training model 2/14: rf_100_5_32
✅ Predictions saved: D:/ecomm-bigdata-project/output\predictions\rf_100_5_32_predictions.csv
✅ Feature importances saved: D:/ecomm-bigdata-project/output\feature_importances\rf_100_5_32_feature_importances.csv
AUC for rf_100_5_32: 0.7745
✅ Metadata saved: D:/ecomm-bigdata-project/output\model_metadata\rf_100_5_32_metadata.json

🚀 Training model 3/14: rf_200_5_32
✅ Predictions saved: D:/ecomm-bigdata-project/output\predictions\rf_200_5_32_predictions.csv
✅ Feature importances saved: D:/ecomm-bigdata-project/output\feature_importances\rf_200_5_32_feature_importances.csv
AUC for rf_200_5_32: 0.8123
✅

In [3]:
from src.model import pick_best_model

# Summary CSV produced by the random-forest sweep; pick_best_model returns
# a (model name, AUC) pair that is unpacked below.
results_csv = "D:/ecomm-bigdata-project/output/all_model_results.csv"
best_model_name, best_auc = pick_best_model(results_csv)


🎯 Best model: rf_50_20_32 with AUC: 0.8126
