In [2]:
# Display the path of the Python interpreter backing this kernel
# (handy for checking which environment the notebook is running in).
import sys
sys.executable


'c:\\Users\\Sanaz\\Desktop\\explainable-product-representations\\venv\\Scripts\\python.exe'

In [5]:
import pandas as pd
import numpy as np
import shap
import sklearn


In [6]:

from pathlib import Path

# Project layout. ".." is resolved against the kernel's working directory,
# so this assumes the notebook is launched from a direct subfolder of the
# project root — TODO confirm.
PROJECT_ROOT = Path("..").resolve()
DATA_RAW_DIR = PROJECT_ROOT.joinpath("data", "raw")
# NOTE(review): despite the name, this is the data/ folder itself rather than
# a dedicated processed/ subfolder — confirm this is intended.
DATA_PROCESSED_DIR = PROJECT_ROOT.joinpath("data")

# Echo the three resolved locations as the cell output.
(PROJECT_ROOT, DATA_RAW_DIR, DATA_PROCESSED_DIR)


(WindowsPath('C:/Users/Sanaz/Desktop/explainable-product-representations'),
 WindowsPath('C:/Users/Sanaz/Desktop/explainable-product-representations/data/raw'),
 WindowsPath('C:/Users/Sanaz/Desktop/explainable-product-representations/data'))

Load Raw Data

In [7]:
# Location of the raw event log inside the raw-data folder.
csv_file = DATA_RAW_DIR.joinpath("2019-Oct.csv")
print(csv_file)  # show the resolved path before reading

# Read the whole file into memory and preview the first rows.
df = pd.read_csv(filepath_or_buffer=csv_file)
df.head(5)


C:\Users\Sanaz\Desktop\explainable-product-representations\data\raw\2019-Oct.csv


Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-10-01 00:00:00 UTC,view,44600062,2103807459595387724,,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c
1,2019-10-01 00:00:00 UTC,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc
2,2019-10-01 00:00:01 UTC,view,17200506,2053013559792632471,furniture.living_room.sofa,,543.1,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8
3,2019-10-01 00:00:01 UTC,view,1307067,2053013558920217191,computers.notebook,lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713
4,2019-10-01 00:00:04 UTC,view,1004237,2053013555631882655,electronics.smartphone,apple,1081.98,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d


Keep Relevant Columns & Basic Cleaning

In [8]:
# Narrow the raw frame to the four columns used below, then discard rows
# that lack a user id, product id or event type. category_code is not part
# of the required set, so rows with a missing category are kept.
cols = ["user_id", "product_id", "event_type", "category_code"]
required = ["user_id", "product_id", "event_type"]
df_small = df.loc[:, cols].dropna(subset=required)

# Preview plus (rows, columns) of the reduced frame.
(df_small.head(), df_small.shape)


(     user_id  product_id event_type                        category_code
 0  541312140    44600062       view                                  NaN
 1  554748717     3900821       view  appliances.environment.water_heater
 2  519107250    17200506       view           furniture.living_room.sofa
 3  550050854     1307067       view                   computers.notebook
 4  535871217     1004237       view               electronics.smartphone,
 (42448764, 4))

Optional Subsample (for speed)

In [9]:
# Optional speed-up: cap the working set at n_samples randomly drawn rows.
# Comment this cell out to run on every row instead.
n_samples = 500_000

over_cap = len(df_small) > n_samples
# Fixed random_state keeps the draw reproducible; frames already at or
# below the cap are passed through unchanged.
df_small = df_small.sample(n=n_samples, random_state=42) if over_cap else df_small

df_small.shape


(500000, 4)

Aggregate Behavioral Counts per Product

In [10]:
# Number of rows for every (product_id, event_type) pair.
pair_counts = df_small.groupby(["product_id", "event_type"]).size()

# Spread event types into columns (pairs that never occurred become 0)
# and turn product_id back into an ordinary column.
behavior = pair_counts.unstack(fill_value=0).reset_index()

behavior.head()


event_type,product_id,cart,purchase,view
0,1000978,0,0,40
1,1001588,0,0,25
2,1001606,0,0,1
3,1002042,0,0,1
4,1002062,0,0,35


In [11]:
# Guarantee a column for each expected event type: any type that did not
# appear in the pivoted counts is added as an all-zero column.
event_cols = ["view", "cart", "purchase"]
absent = {name: 0 for name in event_cols if name not in behavior.columns}

# Fix the column order: product id first, then the three counts.
behavior = behavior.assign(**absent)[["product_id", *event_cols]]
behavior.head()


event_type,product_id,view,cart,purchase
0,1000978,40,0,0
1,1001588,25,0,0
2,1001606,1,0,0
3,1002042,1,0,0
4,1002062,35,0,0


Dominant Category per Product

In [12]:
def _most_frequent(codes):
    """Return the value that occurs most often in the Series *codes*."""
    return codes.value_counts().idxmax()


# One row per product holding its most frequent category_code.
# Rows without a category are removed first, so a product whose rows all
# lack a category does not appear in the result.
with_category = df_small.dropna(subset=["category_code"])
prod_cat = (
    with_category
    .groupby("product_id")["category_code"]
    .agg(_most_frequent)
    .reset_index()
)

prod_cat.head()


Unnamed: 0,product_id,category_code
0,1000978,electronics.smartphone
1,1001588,electronics.smartphone
2,1001606,electronics.smartphone
3,1002042,electronics.smartphone
4,1002062,electronics.smartphone


In [13]:
# Attach each product's dominant category to its behavioral counts.
# how="left" keeps every row of `behavior`; products without an entry in
# `prod_cat` get NaN in category_code.
# validate="one_to_one" makes pandas raise MergeError if product_id is
# duplicated on either side, instead of silently multiplying rows.
features = behavior.merge(
    prod_cat,
    on="product_id",
    how="left",
    validate="one_to_one",
)
features.head()


Unnamed: 0,product_id,view,cart,purchase,category_code
0,1000978,40,0,0,electronics.smartphone
1,1001588,25,0,0,electronics.smartphone
2,1001606,1,0,0,electronics.smartphone
3,1002042,1,0,0,electronics.smartphone
4,1002062,35,0,0,electronics.smartphone


Log-Transform Counts (to reduce skew)

In [14]:
# Add log(1 + count) versions of the behavioral counts to reduce skew.
# Each new column is named log_<original column>.
count_cols = ["view", "cart", "purchase"]
features = features.assign(
    **{f"log_{name}": np.log1p(features[name]) for name in count_cols}
)

features.head()


Unnamed: 0,product_id,view,cart,purchase,category_code,log_view,log_cart,log_purchase
0,1000978,40,0,0,electronics.smartphone,3.713572,0.0,0.0
1,1001588,25,0,0,electronics.smartphone,3.258097,0.0,0.0
2,1001606,1,0,0,electronics.smartphone,0.693147,0.0,0.0
3,1002042,1,0,0,electronics.smartphone,0.693147,0.0,0.0
4,1002062,35,0,0,electronics.smartphone,3.583519,0.0,0.0


Save Processed Features

In [15]:
# Create the output folder (and any missing parents); no error if it exists.
DATA_PROCESSED_DIR.mkdir(exist_ok=True, parents=True)

# Write the feature table as CSV without the row index.
out_file = DATA_PROCESSED_DIR.joinpath("product_behavior_features.csv")
features.to_csv(out_file, index=False)

# Echo where the file went and the (rows, columns) that were written.
(out_file, features.shape)


(WindowsPath('C:/Users/Sanaz/Desktop/explainable-product-representations/data/product_behavior_features.csv'),
 (63695, 8))

Quick Sanity Checks

In [16]:
# Summary statistics for the raw counts and their log-transformed versions.
summary_cols = ["view", "cart", "purchase", "log_view", "log_cart", "log_purchase"]
features[summary_cols].describe()


Unnamed: 0,view,cart,purchase,log_view,log_cart,log_purchase
count,63695.0,63695.0,63695.0,63695.0,63695.0,63695.0
mean,7.540435,0.170516,0.138959,1.34725,0.036327,0.049142
std,51.73094,4.314058,2.493339,0.880839,0.246177,0.243462
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.693147,0.0,0.0
50%,2.0,0.0,0.0,1.098612,0.0,0.0
75%,4.0,0.0,0.0,1.609438,0.0,0.0
max,4894.0,623.0,341.0,8.49597,6.43615,5.834811
