In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub

# Download the RecSys 2020 e-commerce dataset through kagglehub and keep the
# value it returns so later cells can refer to it.
dschettler8845_recsys_2020_ecommerce_dataset_path = kagglehub.dataset_download(
    'dschettler8845/recsys-2020-ecommerce-dataset'
)

print('Data source import complete.')


In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
import joblib


In [None]:
# Location of the dataset when running on Kaggle itself.
KAGGLE_INPUT_DIR = '/kaggle/input/recsys-2020-ecommerce-dataset'

# Prefer the Kaggle mount; when it does not exist, fall back to the value
# returned by `kagglehub.dataset_download` if the download cell was executed
# (looked up through globals() so this cell still runs if that cell was deleted).
DATA_DIR = KAGGLE_INPUT_DIR
if not os.path.isdir(DATA_DIR):
    DATA_DIR = globals().get(
        'dschettler8845_recsys_2020_ecommerce_dataset_path', KAGGLE_INPUT_DIR
    )


def _load_sample(filename, n_rows, seed=42):
    """Read one parquet split from DATA_DIR and return a reproducible random sample.

    Parameters
    ----------
    filename : str
        Parquet file name inside DATA_DIR.
    n_rows : int
        Number of rows to sample; capped at the size of the split so that a
        split smaller than `n_rows` does not raise.
    seed : int
        `random_state` passed to `DataFrame.sample`.
    """
    frame = pd.read_parquet(os.path.join(DATA_DIR, filename))
    return frame.sample(n=min(n_rows, len(frame)), random_state=seed)


train_df = _load_sample('train.parquet', 500000)
val_df = _load_sample('val.parquet', 100000)
test_df = _load_sample('test.parquet', 100000)

print("Sampled Train shape:", train_df.shape)
print("Columns:", train_df.columns)

Sampled Train shape: (500000, 19)
Columns: Index(['event_time', 'event_type', 'product_id', 'brand', 'price', 'user_id',
       'user_session', 'target', 'cat_0', 'cat_1', 'cat_2', 'cat_3',
       'timestamp', 'ts_hour', 'ts_minute', 'ts_weekday', 'ts_day', 'ts_month',
       'ts_year'],
      dtype='object')


In [None]:
# Interaction strength per event type; event types missing from the map count as 1.0.
weight_map = {'view':1.0, 'cart':3.0, 'purchase':5.0}
train_df['weight'] = [weight_map.get(event.lower(), 1.0) for event in train_df['event_type']]

# One row per (user, product) pair carrying the summed interaction weight.
agg = train_df.groupby(['user_id','product_id'], as_index=False)['weight'].sum()

# Contiguous integer ids, assigned in order of first appearance in `agg`.
unique_users = agg['user_id'].unique()
unique_items = agg['product_id'].unique()
user2idx = dict(zip(unique_users, range(len(unique_users))))
item2idx = dict(zip(unique_items, range(len(unique_items))))
idx2item = dict(enumerate(unique_items))

agg['u_idx'] = agg['user_id'].map(user2idx)
agg['i_idx'] = agg['product_id'].map(item2idx)

n_users = len(user2idx)
n_items = len(item2idx)

# Sparse users x items matrix of summed weights.
rows = agg['u_idx'].to_numpy()
cols = agg['i_idx'].to_numpy()
data_vals = agg['weight'].to_numpy()
user_item_mat = csr_matrix((data_vals, (rows, cols)), shape=(n_users, n_items))

# 20-component truncated SVD of the transposed matrix: one embedding row per item.
svd = TruncatedSVD(n_components=20, random_state=42)
item_emb = svd.fit_transform(user_item_mat.T)

In [None]:
content_cols = ['brand','cat_0','cat_1','cat_2','cat_3']

# Exactly one metadata row per product (the first one seen). De-duplicating on
# the full row could keep several rows for a product whose metadata differs
# between events, which would put the same product into the content index
# more than once.
item_metadata = (
    train_df[['product_id'] + content_cols]
    .drop_duplicates(subset='product_id')
    .set_index('product_id')
    .fillna('unknown')
)

# The encoder returns a sparse matrix by default. The `sparse=` keyword was
# deprecated in scikit-learn 1.2 and removed in 1.4 (replaced by
# `sparse_output=`), so relying on the default works on old and new versions.
ohe = OneHotEncoder()
X_content = ohe.fit_transform(item_metadata)

# Row position in X_content <-> product_id lookups for content-based indexing.
content_items = list(item_metadata.index)
idx2content = {i: pid for i, pid in enumerate(content_items)}
content2idx = {pid: i for i, pid in enumerate(content_items)}




In [None]:
def content_sim_on_fly(product_id, top_n=5):
    """Return up to `top_n` products whose one-hot metadata is most similar to `product_id`.

    Similarity is the cosine similarity between the product's row of
    `X_content` and every row of `X_content`, computed on demand.

    Parameters
    ----------
    product_id : hashable
        Product to find neighbours for; must be a key of `content2idx`.
    top_n : int
        Maximum number of neighbours to return.

    Returns
    -------
    list
        Product ids ordered by decreasing similarity. Empty when `product_id`
        is unknown or `top_n` is not positive. Never contains `product_id`
        itself and never contains duplicates.
    """
    if product_id not in content2idx:
        return []
    i_idx = content2idx[product_id]
    sims = cosine_similarity(X_content[i_idx], X_content).ravel()
    # Exclude the query row explicitly. Skipping "the first sorted element"
    # is wrong whenever other products have identical metadata (similarity
    # 1.0): the query may then not sort first and would be returned as its
    # own neighbour.
    sims[i_idx] = -np.inf

    neighbours = []
    for j in np.argsort(-sims, kind='stable'):
        if len(neighbours) >= top_n or sims[j] == -np.inf:
            break
        pid = idx2content[j]
        # Guards against a product id that occupies more than one row.
        if pid != product_id and pid not in neighbours:
            neighbours.append(pid)
    return neighbours

def recommend_for_user(user_id, top_n=10):
    """Collaborative recommendations from the SVD item embeddings.

    Each candidate item is scored by the weighted sum, over the items the user
    interacted with, of the cosine similarity between the two item embeddings;
    the weights are the user's entries in `user_item_mat`.

    Parameters
    ----------
    user_id : hashable
        User to recommend for; must be a key of `user2idx`.
    top_n : int
        Maximum number of items to return.

    Returns
    -------
    list
        Product ids ordered by decreasing score, excluding items the user
        already interacted with. Empty when the user is unknown.
    """
    if user_id not in user2idx:
        return []
    uidx = user2idx[user_id]
    row = user_item_mat.getrow(uidx).toarray().ravel()
    interacted_idx = np.flatnonzero(row > 0)

    # Unit-normalise the embeddings once (zero vectors are left as zeros, which
    # yields similarity 0 as a cosine would). The weighted sum of cosines then
    # collapses to one matrix-vector product:
    #   sum_i w_i * cos(e_i, e_j) == (sum_i w_i * u_i) . u_j
    # This replaces one full cosine_similarity call per interacted item.
    norms = np.linalg.norm(item_emb, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    unit_emb = item_emb / norms
    user_profile = row[interacted_idx] @ unit_emb[interacted_idx]
    scores = unit_emb @ user_profile

    # Never recommend something the user already interacted with.
    scores[interacted_idx] = -np.inf
    top_idxs = np.argsort(-scores)[:top_n]
    return [idx2item[i] for i in top_idxs if scores[i] != -np.inf]

def hybrid_recommend(user_id, top_n=10):
    """Blend collaborative and content-based candidates for a user.

    The collaborative list from `recommend_for_user` is taken first; the top
    five of those each contribute up to two content-based neighbours from
    `content_sim_on_fly`. The two lists are interleaved (collaborative item
    first at each position), de-duplicated in order, and cut to `top_n`.

    Interleaving matters: concatenating the lists and truncating to `top_n`
    would always discard every content-based candidate whenever the
    collaborative list already holds `top_n` items.

    Parameters
    ----------
    user_id : hashable
        User to recommend for.
    top_n : int
        Maximum number of items to return.

    Returns
    -------
    list
        Product ids; empty when no collaborative candidates exist.
    """
    collab_recs = recommend_for_user(user_id, top_n=top_n)
    if not collab_recs:
        return []

    # Products the user already interacted with: the collaborative list already
    # excludes them, so content neighbours must not bring them back.
    already_seen = set()
    if user_id in user2idx:
        user_row = user_item_mat.getrow(user2idx[user_id])
        already_seen = {idx2item[i] for i in user_row.indices[user_row.data > 0]}

    content_recs = []
    for item in collab_recs[:5]:
        content_recs += [
            pid for pid in content_sim_on_fly(item, top_n=2)
            if pid not in already_seen
        ]

    interleaved = []
    for pos in range(max(len(collab_recs), len(content_recs))):
        if pos < len(collab_recs):
            interleaved.append(collab_recs[pos])
        if pos < len(content_recs):
            interleaved.append(content_recs[pos])

    # dict.fromkeys keeps the first occurrence of each id and preserves order.
    hybrid_recs = list(dict.fromkeys(interleaved))
    return hybrid_recs[:top_n]

In [None]:
# Binary label: 1 when the event is a purchase, 0 for any other event type.
train_df['liked'] = train_df['event_type'].map(lambda event: int(event.lower() == 'purchase'))

features = ['user_id','product_id','price']
X = train_df.loc[:, features].copy()
y = train_df['liked']

# Replace raw user / product ids with integer codes; the fitted encoders are
# kept so the same mapping can be applied at prediction time.
le_user, le_item = LabelEncoder(), LabelEncoder()
X['user_id'] = le_user.fit_transform(X['user_id'])
X['product_id'] = le_item.fit_transform(X['product_id'])

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Hold-out evaluation: hard labels for the report, class-1 probability for ROC-AUC.
y_proba = clf.predict_proba(X_test)[:,1]
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

# ------------------------------
# Step 6: Final Hybrid + ML Ranking
# ------------------------------
def final_recommend(user_id, top_n=10):
    """Re-rank hybrid candidates with the trained classifier.

    Up to 20 candidates are requested from `hybrid_recommend`; each is scored
    with `clf.predict_proba` on the (encoded user, encoded product, price)
    features and the `top_n` highest-probability candidates are returned.

    Parameters
    ----------
    user_id : hashable
        User to recommend for.
    top_n : int
        Maximum number of products to return.

    Returns
    -------
    list
        Original product ids (not LabelEncoder codes), ordered by decreasing
        predicted probability. Empty when there are no candidates.
    """
    hybrid_items = hybrid_recommend(user_id, top_n=20)
    if not hybrid_items:
        # Nothing to rank; also avoids calling the classifier on an empty frame.
        return []

    # First observed price of every candidate, collected in a single pass over
    # train_df instead of one full scan per candidate.
    price_lookup = (
        train_df.loc[train_df['product_id'].isin(hybrid_items), ['product_id', 'price']]
        .drop_duplicates(subset='product_id')
        .set_index('product_id')['price']
    )

    # Same column names and order as the frame the classifier was fitted on.
    df_pred = pd.DataFrame({
        'user_id': le_user.transform([user_id] * len(hybrid_items)),
        'product_id': le_item.transform(hybrid_items),
        'price': price_lookup.reindex(hybrid_items).to_numpy(),
    })
    probs = clf.predict_proba(df_pred)[:, 1]

    # Rank positions rather than sorting the encoded frame, so the values
    # returned are the real product ids and not their label-encoded codes.
    ranked = np.argsort(-probs, kind='stable')[:top_n]
    return [hybrid_items[i] for i in ranked]

              precision    recall  f1-score   support

           0       0.68      0.79      0.73     63618
           1       0.49      0.35      0.41     36382

    accuracy                           0.63    100000
   macro avg       0.59      0.57      0.57    100000
weighted avg       0.61      0.63      0.61    100000

ROC-AUC: 0.6197042891285451


In [None]:
# Smoke test: print the final recommendations for the first five users in user2idx.
sample_users = [*user2idx][:5]
for u in sample_users:
    header = f"\nFinal recommendations for user {u}:"
    print(header)
    recs = final_recommend(u, top_n=10)
    print(recs)


Final recommendations for user 153449371:
[334, 21316, 10199, 33553, 43662, 33550, 26102, 38131, 51661, 50882]

Final recommendations for user 171409616:
[10511, 10055, 35363, 34110, 7727, 42761, 10616, 34911, 21743, 7287]

Final recommendations for user 220134341:
[20505, 46951, 1300, 26009, 19244, 5638, 4149, 48463, 50494, 45368]

Final recommendations for user 226242984:
[25867, 20700, 23602, 13719, 25665, 25123, 25603, 48632, 47076, 25896]

Final recommendations for user 237470903:
[44812, 9581, 23238, 10449, 25375, 20724, 35938, 26514, 18715, 52198]


In [None]:
# Every object the recommendation functions read at inference time, keyed by
# the file it is written to.
artifacts = {
    'item_emb.pkl': item_emb,
    'user_item_mat.pkl': user_item_mat,
    'user2idx.pkl': user2idx,
    'item2idx.pkl': item2idx,
    'idx2item.pkl': idx2item,
    'svd_model.pkl': svd,
    'ohe.pkl': ohe,
    'rf_recommender.pkl': clf,
    'le_user.pkl': le_user,
    'le_item.pkl': le_item,
    # Content-based artifacts: content_sim_on_fly reads these three objects,
    # and the fitted encoder alone is not enough to rebuild them.
    'X_content.pkl': X_content,
    'content2idx.pkl': content2idx,
    'idx2content.pkl': idx2content,
}
# NOTE(review): final_recommend also reads prices from train_df, which is not
# persisted here; the deployed app needs its own price source.
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

print("All .pkl files saved! Ready for Streamlit deployment.")

All .pkl files saved! Ready for Streamlit deployment.
