In [1]:
# Setup sys.path to import from src/
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

In [2]:
# Spark session
from pyspark.sql import SparkSession, functions as F

# Import custom functions
from src.preprocessing import load_data, clean_data, feature_engineer, assemble_features
from src.model import train_random_forest, evaluate_model, pick_best_model


In [3]:
spark = SparkSession.builder \
    .appName("Ecommerce Behavior Exploration") \
    .master("local[*]") \
    .getOrCreate()


In [4]:
# Load raw dataset
file_path = "../data/scaledData-2019-Nov.csv"
raw_df = load_data(file_path, spark)

# Quick look at raw data
raw_df.show(5)
raw_df.printSchema()


+--------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+
|          event_time|event_type|product_id|        category_id|       category_code|   brand| price|  user_id|        user_session|
+--------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+
|2019-11-19 08:35:...|      view|  30200005|2053013554449088861|                NULL|   elari|  77.2|512412397|f62be3c5-18af-4ab...|
|2019-11-26 14:16:...|      view|   1005115|2053013555631882655|electronics.smart...|   apple|916.37|568675496|c857db53-cd0a-480...|
|2019-11-10 17:50:...|      view|  15700275|2053013559733912211|                NULL|imperial|206.16|513262731|c637d18a-6fc5-4c1...|
|2019-11-04 14:23:...|      view|   1004589|2053013555631882655|electronics.smart...|    inoi| 61.36|562973725|e41d3c3f-830e-48d...|
|2019-11-29 17:11:...|  purchase|   5300157|2053013563173241677|     

In [5]:
# Clean the dataset
clean_df = clean_data(raw_df)

# Quick check after cleaning
clean_df.show(5)


+-------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code|   brand| price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+
|2019-11-19 03:35:46|      view|  30200005|2053013554449088861|                NULL|   elari|  77.2|512412397|f62be3c5-18af-4ab...|
|2019-11-26 09:16:08|      view|   1005115|2053013555631882655|electronics.smart...|   apple|916.37|568675496|c857db53-cd0a-480...|
|2019-11-10 12:50:50|      view|  15700275|2053013559733912211|                NULL|imperial|206.16|513262731|c637d18a-6fc5-4c1...|
|2019-11-04 09:23:52|      view|   1004589|2053013555631882655|electronics.smart...|    inoi| 61.36|562973725|e41d3c3f-830e-48d...|
|2019-11-29 12:11:17|  purchase|   5300157|2053013563173241677|             

In [6]:
# Create new features if needed
feature_df = feature_engineer(clean_df)

# Quick preview
feature_df.show(5)


+--------------------+---------+-------------+-------------+-------------------+-------------------+------------------+----------------+
|        user_session|num_views|num_cart_adds|num_purchases|      session_start|        session_end|         avg_price|session_duration|
+--------------------+---------+-------------+-------------+-------------------+-------------------+------------------+----------------+
|08327d25-5fe5-4ec...|        1|            0|            0|2019-11-10 23:43:37|2019-11-10 23:43:37|              39.9|               0|
|cbd7e3a0-2c5e-493...|        3|            0|            0|2019-11-17 15:47:35|2019-11-17 15:49:03|              9.07|              88|
|a08f39b4-71f4-431...|        7|            0|            0|2019-11-24 09:34:25|2019-11-24 10:21:36|123.61142857142856|            2831|
|544365e8-d7e6-4ea...|        1|            1|            0|2019-11-09 11:44:13|2019-11-09 11:46:48|             47.47|             155|
|71697745-4b20-419...|        4|         

In [17]:
labels = clean_df.groupBy("user_session").agg(
    (F.max(F.when(F.col("event_type") == "purchase", 1).otherwise(0))).alias("label")
)

final_df = feature_df.join(labels, on="user_session", how="left").fillna(0)


In [18]:
# Feature columns you want to use
feature_cols = ['num_views', 'num_cart_adds', 'session_duration', 'avg_price']

# Assemble features
final_df = assemble_features(final_df, feature_cols)

In [21]:
train_df, test_df = final_df.randomSplit([0.8, 0.2], seed=42)


In [22]:
# Train Random Forest Model
rf_model = train_random_forest(train_df)


In [24]:
# Evaluate model
auc = evaluate_model(rf_model, test_df)
print(f"Random Forest AUC: {auc:.4f}")

ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it

In [None]:
train_df.groupBy("label").count().show()
test_df.groupBy("label").count().show()


+-----+-------+
|label|  count|
+-----+-------+
|    1|  85455|
|    0|3754149|
+-----+-------+

+-----+------+
|label| count|
+-----+------+
|    1| 21261|
|    0|938148|
+-----+------+

