In [1]:
# Make the project root importable so the `from src.... import ...` statements
# in the next cell resolve.
import sys
import os

# Assumes the notebook is started from a direct sub-directory of the project,
# so the parent of the current working directory is the project root.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Guard the append: re-running this cell must not stack duplicate entries
# on sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
# Spark session
from pyspark.sql import SparkSession, functions as F

# Import custom functions
from src.preprocessing import load_data, clean_data, feature_engineer, assemble_features
from src.save_outputs import save_predictions, save_feature_importances, save_model_metadata
from src.model import train_random_forest, pick_best_model_from_grid, evaluate_model, manual_grid_search_rf, manual_grid_search_dt, manual_grid_search_gbt, manual_grid_search_lr, manual_grid_search_nb


In [3]:
# Start (or reuse) a Spark session running in local mode.
spark = (
    SparkSession.builder
    .appName("Ecommerce Behavior Exploration")
    .master("local[*]")
    .getOrCreate()
)


In [4]:
# Read the raw event CSV through the project's loader.
# The path is relative to the notebook's working directory.
file_path = "../data/scaledData-2019-Nov.csv"
raw_df = load_data(file_path, spark)

# Preview the first rows, then the inferred column types.
raw_df.show(n=5)
raw_df.printSchema()


+--------------------+----------+----------+-------------------+--------------------+-------+------+---------+--------------------+
|          event_time|event_type|product_id|        category_id|       category_code|  brand| price|  user_id|        user_session|
+--------------------+----------+----------+-------------------+--------------------+-------+------+---------+--------------------+
|2019-11-17 12:57:...|      view|   1005239|2053013555631882655|electronics.smart...| xiaomi| 262.3|517220397|294a6e27-97e1-4f6...|
|2019-11-20 09:05:...|      view|   1004655|2053013555631882655|electronics.smart...|samsung|744.17|561605578|22682045-26c7-4e7...|
|2019-11-17 03:40:...|      view|   1801653|2053013554415534427|electronics.video.tv|    tcl|539.47|530939266|d90cf77a-cba9-4fe...|
|2019-11-14 09:29:...|      view|   1004250|2053013555631882655|electronics.smart...|  apple|778.36|571366119|b321c1d5-7de3-440...|
|2019-11-20 20:16:...|      view|   7102014|2053013555464110485|furniture.be

In [5]:
# Run the project's cleaning step on the raw events.
# NOTE(review): in the recorded outputs, event_time is 5 hours earlier after
# cleaning than before (e.g. 12:57 -> 07:57). This looks like a timezone
# conversion inside clean_data; confirm it is intended and does not depend
# on the machine's local timezone.
clean_df = clean_data(raw_df)

# Spot-check the first rows of the cleaned frame.
clean_df.show(n=5)


+-------------------+----------+----------+-------------------+--------------------+-------+------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code|  brand| price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+-------+------+---------+--------------------+
|2019-11-17 07:57:11|      view|   1005239|2053013555631882655|electronics.smart...| xiaomi| 262.3|517220397|294a6e27-97e1-4f6...|
|2019-11-20 04:05:49|      view|   1004655|2053013555631882655|electronics.smart...|samsung|744.17|561605578|22682045-26c7-4e7...|
|2019-11-16 22:40:32|      view|   1801653|2053013554415534427|electronics.video.tv|    tcl|539.47|530939266|d90cf77a-cba9-4fe...|
|2019-11-14 04:29:48|      view|   1004250|2053013555631882655|electronics.smart...|  apple|778.36|571366119|b321c1d5-7de3-440...|
|2019-11-20 15:16:22|      view|   7102014|2053013555464110485|furniture.bedroom...

In [6]:
# Derive model features from the cleaned events.
# The recorded preview shows one row per user_session with view / cart / purchase
# counts, session start and end, average price and session duration.
feature_df = feature_engineer(clean_df)

# Preview the first rows of the engineered features.
feature_df.show(n=5)


+--------------------+---------+-------------+-------------+-------------------+-------------------+------------------+----------------+
|        user_session|num_views|num_cart_adds|num_purchases|      session_start|        session_end|         avg_price|session_duration|
+--------------------+---------+-------------+-------------+-------------------+-------------------+------------------+----------------+
|e7b1d62a-2052-417...|        1|            0|            0|2019-11-25 23:17:45|2019-11-25 23:17:45|             20.32|               0|
|347a49d4-879c-40b...|        2|            0|            0|2019-11-16 06:55:01|2019-11-16 07:08:11|318.21000000000004|             790|
|a8c4169c-f7c5-4ac...|        1|            0|            0|2019-11-18 08:10:00|2019-11-18 08:10:00|            167.06|               0|
|5ea7b452-8a45-47a...|        3|            0|            0|2019-11-25 11:59:49|2019-11-25 12:19:57| 39.85333333333333|            1208|
|55f2520f-972e-4c2...|        1|         

In [7]:
# Per-event flag: 1 for a purchase event, 0 for anything else.
purchase_flag = F.when(F.col("event_type") == "purchase", 1).otherwise(0)

# Session-level target: the max of the flag is 1 as soon as the session
# contains at least one purchase.
labels = clean_df.groupBy("user_session").agg(F.max(purchase_flag).alias("label"))

# Attach the target to the engineered features, keeping every feature row.
# fillna(0) then replaces remaining nulls with 0 (a numeric fill value is
# applied to numeric columns).
final_df = feature_df.join(labels, on="user_session", how="left").fillna(0)


In [8]:
# Columns passed to the feature assembler.
# num_purchases is not in this list; the label is derived from purchase
# events, so including it would expose the target to the model.
feature_cols = [
    'num_views',
    'num_cart_adds',
    'session_duration',
    'avg_price',
]

# Build the model input from the selected columns.
# NOTE(review): this rebinds final_df, so the cell is not idempotent.
# Re-running it feeds the already-assembled frame back into assemble_features.
final_df = assemble_features(final_df, feature_cols)

In [9]:
# 80/20 train/test split with a fixed seed.
train_df, test_df = final_df.randomSplit(weights=[0.8, 0.2], seed=42)

In [None]:
# Random Forest grid search. The recorded log shows it iterating over
# numTrees / maxDepth / maxBins combinations and skipping those already trained.
# NOTE(review): the saved log stops mid-training, with no result for the last
# combination shown.
manual_grid_search_rf(train_df, test_df)

🚀 Starting Manual Random Forest grid search with model saves...
⏩ Skipping already trained: numTrees=50, maxDepth=5, maxBins=32
⏩ Skipping already trained: numTrees=100, maxDepth=5, maxBins=32
⏩ Skipping already trained: numTrees=200, maxDepth=5, maxBins=32
⏩ Skipping already trained: numTrees=50, maxDepth=10, maxBins=32
⏩ Skipping already trained: numTrees=100, maxDepth=10, maxBins=32
🔵 Training: numTrees=200, maxDepth=10, maxBins=32


In [12]:
# Logistic Regression grid search. The recorded log shows it iterating over
# maxIter / regParam / elasticNetParam and writing predictions and metadata
# for each trained combination.
# NOTE(review): the saved run was stopped by a KeyboardInterrupt, so the
# recorded grid is incomplete.
manual_grid_search_lr(train_df, test_df)

🚀 Starting Manual Logistic Regression grid search...
⏩ Skipping already trained: maxIter=50, regParam=0.0, elasticNetParam=0.0
🔵 Training: maxIter=100, regParam=0.1, elasticNetParam=0.0
🔹 AUC: 0.9579
✅ Predictions saved: D:/ecomm-bigdata-project/output/lr/lr_100_0.1_0.0\lr_100_0.1_0.0_predictions.csv
✅ Model metadata saved: D:/ecomm-bigdata-project/output/lr/lr_100_0.1_0.0\lr_100_0.1_0.0_metadata.json
🔵 Training: maxIter=100, regParam=0.01, elasticNetParam=0.5


ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "c:\Users\Karika\AppData\Local\Programs\Python\Python311\Lib\site-packages\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Karika\AppData\Local\Programs\Python\Python311\Lib\site-packages\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Karika\AppData\Local\Programs\Python\Python311\Lib\socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [13]:
# Decision Tree grid search. The recorded log shows it iterating over
# maxDepth / maxBins and saving predictions, feature importances and metadata
# for each combination.
# NOTE(review): recorded AUCs are 0.9993-0.9996; worth confirming that no
# feature leaks the label.
manual_grid_search_dt(train_df, test_df)

🚀 Starting Manual Decision Tree grid search...
🔵 Training: maxDepth=5, maxBins=32
🔹 AUC: 0.9993
✅ Predictions saved: D:/ecomm-bigdata-project/output/dt/dt_5_32\dt_5_32_predictions.csv
✅ Feature importances saved: D:/ecomm-bigdata-project/output/dt/dt_5_32\dt_5_32_feature_importances.csv
✅ Model metadata saved: D:/ecomm-bigdata-project/output/dt/dt_5_32\dt_5_32_metadata.json
🔵 Training: maxDepth=10, maxBins=32
🔹 AUC: 0.9996
✅ Predictions saved: D:/ecomm-bigdata-project/output/dt/dt_10_32\dt_10_32_predictions.csv
✅ Feature importances saved: D:/ecomm-bigdata-project/output/dt/dt_10_32\dt_10_32_feature_importances.csv
✅ Model metadata saved: D:/ecomm-bigdata-project/output/dt/dt_10_32\dt_10_32_metadata.json
🔵 Training: maxDepth=20, maxBins=32
🔹 AUC: 0.9995
✅ Predictions saved: D:/ecomm-bigdata-project/output/dt/dt_20_32\dt_20_32_predictions.csv
✅ Feature importances saved: D:/ecomm-bigdata-project/output/dt/dt_20_32\dt_20_32_feature_importances.csv
✅ Model metadata saved: D:/ecomm-bigdata

In [None]:
# Naive Bayes grid search. The recorded log shows it iterating over
# smoothing / modelType and skipping combinations already trained.
# NOTE(review): the one AUC recorded here is 0.5377, far below the LR and DT
# values recorded in this notebook.
manual_grid_search_nb(train_df, test_df)

🚀 Starting Manual Naive Bayes grid search...
⏩ Skipping already trained: smoothing=1.0, modelType=multinomial
⏩ Skipping already trained: smoothing=0.5, modelType=multinomial
🔵 Training: smoothing=2.0, modelType=multinomial
🔹 AUC: 0.5377


In [None]:
# Grid search helper for what is presumably the gradient-boosted-trees model
# (inferred from the name; no output was recorded for this cell - confirm).
manual_grid_search_gbt(train_df, test_df)

In [None]:
# Directory holding the Random Forest grid-search outputs.
# It defaults to the original hard-coded location, so existing runs behave the
# same. Set the ECOMM_RF_OUTPUT_DIR environment variable to point at another
# machine's output folder without editing the notebook.
base_rf_dir = os.environ.get("ECOMM_RF_OUTPUT_DIR", "D:/ecomm-bigdata-project/output/rf/")

# Summary file passed to the best-model selection step.
results_csv_path = os.path.join(base_rf_dir, "grid_search_results.csv")

# Pick the best Random Forest configuration from the grid-search summary.
best_model_outputs = pick_best_model_from_grid(results_csv_path, base_rf_dir)