In [212]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
import tensorflow as tf
import random

In [3]:
interactions = pd.read_csv("processed_data/interactions_train.csv")
interactions_test = pd.read_csv("processed_data/interactions_valid.csv")
item_popularity = pd.read_csv("processed_data/popularity_item.csv")
user_popularity = pd.read_csv("processed_data/popularity_user.csv")
product_features = pd.read_csv("processed_data/product_features.csv")

In [3]:
len(set(interactions['visitorid'].unique()) & set(interactions_test['visitorid'].unique()))

889

In [4]:
interactions_test = interactions_test[interactions_test['visitorid'].isin(interactions['visitorid'].unique())]

In [5]:
interactions['visitorid'].nunique()

89010

## Baseline Model: Recommend the most popular 50 items

In [None]:
top_items = item_popularity.sort_values(by='number_of_views', ascending=False)
top_popular_items = top_items['itemid'].head(50).tolist()

In [188]:
def precision_at_k(actual, predicted, k=10):
    predicted = predicted[:k]
    hits = len(set(predicted) & actual)
    return hits / k

def recall_at_k(actual, predicted, k=10):
    predicted = predicted[:k]
    hits = len(set(predicted) & actual)
    if len(actual)==0:
        return 0
    return hits/len(actual)

In [None]:
test_interactions = interactions_test.groupby('visitorid')['itemid'].apply(set).reset_index()
test_interactions.columns = ['visitorid', 'actual_items']
train_interactions = interactions.groupby('visitorid')['itemid'].apply(set)
test_interactions['actual_items_unseen'] = test_interactions['visitorid'].apply(lambda x: train_interactions[x] if x in train_interactions.index else {})
test_interactions['actual_items_unseen'] = test_interactions.apply(lambda row: row['actual_items'] - row['actual_items_unseen'], axis=1)

In [196]:
for k in [1,5,10,20,50]:
    test_interactions['precision'] = test_interactions['actual_items'].apply(lambda x: precision_at_k(x, top_popular_items,k))
    test_interactions['recall'] = test_interactions['actual_items'].apply(lambda x: recall_at_k(x, top_popular_items,k))
    print(f'Precision@{k}:', np.mean(test_interactions['precision']),f'Recall@{k}:', np.mean(test_interactions['recall']))

Precision@1: 0.021372328458942633 Recall@1: 0.010025006395473974
Precision@5: 0.010123734533183354 Recall@5: 0.027769271722697685
Precision@10: 0.00719910011248594 Recall@10: 0.04083174605626313
Precision@20: 0.005961754780652419 Recall@20: 0.061960463123044346
Precision@50: 0.0040269966254218225 Recall@50: 0.10431156169934218


In [197]:
for k in [1,5,10,20,50]:
    test_interactions['precision'] = test_interactions.apply(lambda row: precision_at_k(row['actual_items_unseen'], list(set(top_popular_items)-(row['actual_items']-row['actual_items_unseen'])),k),axis=1)
    test_interactions['recall'] = test_interactions.apply(lambda row: recall_at_k(row['actual_items_unseen'], list(set(top_popular_items)-(row['actual_items']-row['actual_items_unseen'])),k),axis=1)
    print(f'Precision@{k} on unseen:', np.mean(test_interactions['precision']),f'Recall@{k} on unseen:', np.mean(test_interactions['recall']))

Precision@1 on unseen: 0.0 Recall@1 on unseen: 0.0
Precision@5 on unseen: 0.0044994375703037125 Recall@5 on unseen: 0.009488969742144937
Precision@10 on unseen: 0.003262092238470192 Recall@10 on unseen: 0.013543191225329104
Precision@20 on unseen: 0.003262092238470192 Recall@20 on unseen: 0.02466121009828822
Precision@50 on unseen: 0.0024296962879640045 Recall@50 on unseen: 0.05054421084327059


## Recently Viewed items

In [None]:
recently_interacted = interactions.sort_values(by=['visitorid', 'timestamp'], ascending=[True, False])
top_recently_interacted = recently_interacted.groupby('visitorid')['itemid'].apply(set).reset_index()

In [30]:
test_interactions = test_interactions.merge(top_recently_interacted, on='visitorid',how='left')
test_interactions.rename(columns={'itemid':'RVI_predicted'}, inplace= True) 

In [50]:
def evaluate_baseline_model(test_data,k=10):
    precisions = []
    recalls = []

    test_data['predicted'] = test_data['RVI_predicted'].apply(lambda x: list(x)[:k] if x else [])
    test_data['precision'] = test_data.apply(lambda row: precision_at_k(row['actual_items'], row['predicted'], k), axis=1)
    test_data['recall'] = test_data.apply(lambda row: recall_at_k(row['actual_items'], row['predicted'], k), axis=1)

    return {
        f'Precision@{k}': np.mean(test_data['precision']),
        f'Recall@{k}': np.mean(test_data['recall']),
    }

In [54]:
for k in [1,5,10,20,50]:
    print(evaluate_baseline_model(test_interactions, k=k))

{'Precision@1': 0.3138357705286839, 'Recall@1': 0.2880005664876306}
{'Precision@5': 0.07896512935883016, 'Recall@5': 0.336465896308416}
{'Precision@10': 0.04139482564679415, 'Recall@10': 0.3424523717631232}
{'Precision@20': 0.02131608548931383, 'Recall@20': 0.34402791234227204}
{'Precision@50': 0.009043869516310463, 'Recall@50': 0.34574689844583406}


In [56]:
from collections import defaultdict

usersPerItem = defaultdict(set) # Maps an item to the users who rated it
itemsPerUser = defaultdict(set)

for idx, row in interactions.iterrows():
    user,item = row['visitorid'], row['itemid']
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)

## User user collaborative Filtering

In [168]:
user_item = pd.DataFrame(interactions.groupby(['visitorid','itemid'])['timestamp'].count().reset_index())
user_wise_df = user_item.pivot(index='visitorid', columns='itemid', values='timestamp').fillna(0)


In [86]:
user_id_to_index = {user_id: idx for idx, user_id in enumerate(user_wise_df.index)}
index_to_user_id = {idx: user_id for user_id, idx in user_id_to_index.items()}
item_id_to_index = {item_id: idx for idx, item_id in enumerate(user_wise_df.columns)}
index_to_item_id = {idx: item_id for item_id, idx in item_id_to_index.items()}

In [169]:
from scipy import sparse
sparse_user_wise_df = sparse.csr_matrix(user_wise_df )
similarities = cosine_similarity(sparse_user_wise_df,dense_output=False)

In [None]:
import heapq
def evaluate_user_collaborative(test_interactions,k=10):
    recalls = []
    precisions = []
    recalls_unseen = []
    precisions_unseen = []
    
    
    for user, actual_items, actual_items_unseen in zip(test_interactions['visitorid'], test_interactions['actual_items'], test_interactions['actual_items_unseen']):
        specific_user_index = user_id_to_index[user]
        # Get nearest neighbors
        user_similarities = similarities[specific_user_index].toarray().flatten()
        top_similar_indices = heapq.nlargest(10, range(len(user_similarities)), key=lambda i: user_similarities[i])
        similar_users = [index_to_user_id[idx] for idx in top_similar_indices][:10]

        # Vectorized prediction for unseen items
        predictions = user_wise_df.loc[similar_users].sum(axis=0)
        predictions = predictions[predictions!=0]
        unseen_items = list(set(predictions.index) - set(itemsPerUser[user]))
        predictions_unseen = predictions[unseen_items]
        
        # Select top-k items
        top_k_items = predictions.nlargest(k).index.tolist()
        top_k_items_unseen = predictions_unseen.nlargest(k).index.tolist()
        
        # Compute precision and recall
        recalls.append(recall_at_k(actual_items, top_k_items, k))
        precisions.append(precision_at_k(actual_items, top_k_items, k))
        if len(actual_items_unseen)!=0:
            recalls_unseen.append(recall_at_k(actual_items_unseen, top_k_items_unseen, k))
            precisions_unseen.append(precision_at_k(actual_items_unseen, top_k_items_unseen, k))
    
    return {
        f'Precision@{k}': np.mean(precisions),
        f'Recall@{k}': np.mean(recalls),
        f'Precision@{k} on unseen': np.mean(precisions_unseen),
        f'Recall@{k} on unseen': np.mean(recalls_unseen),
    }

In [171]:
for k in [1,5,10,20,50]:
    print(evaluate_user_collaborative(test_interactions, k=k))

{'Precision@1': 0.3340832395950506, 'Recall@1': 0.30273331996386854, 'Precision@1 on unseen': 0.0, 'Recall@1 on unseen': 0.0}
{'Precision@5': 0.08053993250843644, 'Recall@5': 0.3400933843558745, 'Precision@5 on unseen': 0.0009966777408637875, 'Recall@5 on unseen': 0.002558139534883721}
{'Precision@10': 0.04173228346456693, 'Recall@10': 0.34427281322596703, 'Precision@10 on unseen': 0.0009966777408637873, 'Recall@10 on unseen': 0.0062126245847176085}
{'Precision@20': 0.021766029246344205, 'Recall@20': 0.348719609243348, 'Precision@20 on unseen': 0.0005813953488372094, 'Recall@20 on unseen': 0.006295681063122924}
{'Precision@50': 0.009156355455568055, 'Recall@50': 0.34993121601227506, 'Precision@50 on unseen': 0.0002325581395348837, 'Recall@50 on unseen': 0.006295681063122924}


## Item item collaborative Filtering

In [172]:
sparse_item_wise_df = sparse.csr_matrix(user_wise_df.T) 
item_similarities = cosine_similarity(sparse_item_wise_df, dense_output=False)

In [173]:
def evaluate_item_collaborative(test_interactions, k=10):
    recalls = []
    precisions = []
    recalls_unseen = []
    precisions_unseen = []
    
    for user, actual_items, actual_items_unseen in zip(test_interactions['visitorid'], 
                                                           test_interactions['actual_items'], 
                                                           test_interactions['actual_items_unseen']):
        seen_items = itemsPerUser[user]  
        
        # Generate predictions
        predictions = pd.Series(dtype=float)
        for item in seen_items:
            specific_item_index = item_id_to_index[item]
            item_similarities_row = item_similarities[specific_item_index].toarray().flatten()
            
            # Aggregate scores for items similar to the current item
            similar_item_indices = np.argsort(item_similarities_row)[-10:]  # Top 10 similar items
            for idx in similar_item_indices:
                similar_item = index_to_item_id[idx]
                if similar_item not in predictions:
                    predictions[similar_item] = 0
                predictions[similar_item] += item_similarities_row[idx]
        
        # Exclude items the user has already interacted with for unseen predictions
        unseen_items = list(set(predictions.index) - set(seen_items))
        predictions_unseen = predictions.loc[unseen_items]
        
        # Select top-k items
        top_k_items = predictions.nlargest(k).index.tolist()
        top_k_items_unseen = predictions_unseen.nlargest(k).index.tolist()
        
        # Compute precision and recall
        recalls.append(recall_at_k(actual_items, top_k_items, k))
        precisions.append(precision_at_k(actual_items, top_k_items, k))
        if len(actual_items_unseen) != 0:
            recalls_unseen.append(recall_at_k(actual_items_unseen, top_k_items_unseen, k))
            precisions_unseen.append(precision_at_k(actual_items_unseen, top_k_items_unseen, k))
    
    return {
        f'Precision@{k}': np.mean(precisions),
        f'Recall@{k}': np.mean(recalls),
        f'Precision@{k} on unseen': np.mean(precisions_unseen),
        f'Recall@{k} on unseen': np.mean(recalls_unseen),
    }

In [174]:
for k in [1,5,10,20,50]:
    print(evaluate_item_collaborative(test_interactions, k=k))

{'Precision@1': 0.312710911136108, 'Recall@1': 0.2877448354669952, 'Precision@1 on unseen': 0.011627906976744186, 'Recall@1 on unseen': 0.005592469545957917}
{'Precision@5': 0.0827896512935883, 'Recall@5': 0.3504008021724557, 'Precision@5 on unseen': 0.009634551495016613, 'Recall@5 on unseen': 0.027948504983388703}
{'Precision@10': 0.04488188976377953, 'Recall@10': 0.36637081052970605, 'Precision@10 on unseen': 0.006644518272425249, 'Recall@10 on unseen': 0.041791251384274634}
{'Precision@20': 0.023172103487064118, 'Recall@20': 0.3696254882974273, 'Precision@20 on unseen': 0.00398671096345515, 'Recall@20 on unseen': 0.04331180017226529}
{'Precision@50': 0.009943757030371203, 'Recall@50': 0.3752370353834019, 'Precision@50 on unseen': 0.0018604651162790697, 'Recall@50 on unseen': 0.04779235484296628}


## Singular Value decomoposition

In [175]:
from scipy.sparse.linalg import svds 
# Singular Value Decomposition
U, s, Vt = svds(sparse_user_wise_df, k = 50) # k is the number of latent features

# Construct diagonal array in SVD
sigma = np.diag(s)
predicted_interactions = np.dot(np.dot(U, sigma), Vt) 

In [176]:
def evaluate_svd(test_interactions,k=10):
    recalls = []
    precisions = []
    recalls_unseen = []
    precisions_unseen = []
    
    
    for user, actual_items, actual_items_unseen in zip(test_interactions['visitorid'], test_interactions['actual_items'], test_interactions['actual_items_unseen']):
        specific_user_index = user_id_to_index[user]

        # Vectorized prediction for unseen items
        predictions = predicted_interactions[specific_user_index]
        seen_items = [item_id_to_index[id] for id in itemsPerUser[user]]
        predictions_unseen = [v for i,v in enumerate(predictions) if i not in seen_items]
        predictions = np.argsort(predictions)[-k:][::-1]
        predictions_unseen = np.argsort(predictions_unseen)[-k:][::-1]
        
        # Select top-k items
        top_k_items = [index_to_item_id[pred] for pred in predictions]
        top_k_items_unseen = [index_to_item_id[pred] for pred in predictions_unseen]
        
        # Compute precision and recall
        recalls.append(recall_at_k(actual_items, top_k_items, k))
        precisions.append(precision_at_k(actual_items, top_k_items, k))
        if len(actual_items_unseen)!=0:
            recalls_unseen.append(recall_at_k(actual_items_unseen, top_k_items_unseen, k))
            precisions_unseen.append(precision_at_k(actual_items_unseen, top_k_items_unseen, k))
    
    return {
        f'Precision@{k}': np.mean(precisions),
        f'Recall@{k}': np.mean(recalls),
        f'Precision@{k} on unseen': np.mean(precisions_unseen),
        f'Recall@{k} on unseen': np.mean(recalls_unseen)}

In [177]:
for k in [1,5,10,20,50]:
    print(evaluate_svd(test_interactions,k))

{'Precision@1': 0.07086614173228346, 'Recall@1': 0.05615159649703595, 'Precision@1 on unseen': 0.011627906976744186, 'Recall@1 on unseen': 0.006730650916697429}
{'Precision@5': 0.02249718785151856, 'Recall@5': 0.08523748784537476, 'Precision@5 on unseen': 0.004983388704318937, 'Recall@5 on unseen': 0.015727773744909607}
{'Precision@10': 0.015185601799775027, 'Recall@10': 0.10810644832171178, 'Precision@10 on unseen': 0.002990033222591362, 'Recall@10 on unseen': 0.017758882171366872}
{'Precision@20': 0.009392575928009, 'Recall@20': 0.12946307125160952, 'Precision@20 on unseen': 0.0020764119601328905, 'Recall@20 on unseen': 0.026705667739939466}
{'Precision@50': 0.005309336332958381, 'Recall@50': 0.17765943953758645, 'Precision@50 on unseen': 0.0011960132890365448, 'Recall@50 on unseen': 0.03747029071386476}


## Removing duplicate events , but weighing the events

In [178]:
event_map={'view':1, 'addtocart':2, 'transaction':3}
train_interactions = interactions[['visitorid','itemid','event']]
train_interactions = train_interactions.drop_duplicates()
train_interactions['weight'] = train_interactions['event'].apply(lambda x:event_map[x])
train_interactions = train_interactions.sort_values(by=['visitorid','itemid','weight'], ascending=[True, True, False])
train_interactions = train_interactions.drop_duplicates(subset=['visitorid','itemid'], keep='first')


### User-user collaborative

In [179]:
user_wise_df = train_interactions.pivot(index='visitorid', columns='itemid', values='weight').fillna(0)
sparse_user_wise_df = sparse.csr_matrix(user_wise_df )
similarities = cosine_similarity(sparse_user_wise_df,dense_output=False)

In [180]:
for k in [1,5,10,20,50]:
    print(evaluate_user_collaborative(test_interactions, k=k))

{'Precision@1': 0.3228346456692913, 'Recall@1': 0.2949885728569643, 'Precision@1 on unseen': 0.008305647840531562, 'Recall@1 on unseen': 0.0054490083559851}
{'Precision@5': 0.08233970753655792, 'Recall@5': 0.34256559259118996, 'Precision@5 on unseen': 0.0023255813953488376, 'Recall@5 on unseen': 0.006487214336051545}
{'Precision@10': 0.04274465691788527, 'Recall@10': 0.34679475615692923, 'Precision@10 on unseen': 0.0016611295681063123, 'Recall@10 on unseen': 0.010141699385885432}
{'Precision@20': 0.021822272215973, 'Recall@20': 0.35067503410919815, 'Precision@20 on unseen': 0.0009136212624584718, 'Recall@20 on unseen': 0.010169854124327913}
{'Precision@50': 0.009223847019122608, 'Recall@50': 0.35250533494821673, 'Precision@50 on unseen': 0.00046511627906976747, 'Recall@50 on unseen': 0.012098610122851352}


### Item-item collaborative

In [181]:
sparse_item_wise_df = sparse.csr_matrix(user_wise_df.T )
item_similarities = cosine_similarity(sparse_item_wise_df,dense_output=False)

In [182]:
for k in [1,5,10,20,50]:
    print(evaluate_item_collaborative(test_interactions, k=k))

{'Precision@1': 0.31833520809898763, 'Recall@1': 0.2905352412745642, 'Precision@1 on unseen': 0.014950166112956811, 'Recall@1 on unseen': 0.008084163898117386}
{'Precision@5': 0.08188976377952754, 'Recall@5': 0.34516814430454257, 'Precision@5 on unseen': 0.008305647840531562, 'Recall@5 on unseen': 0.02004825185888309}
{'Precision@10': 0.043869516310461196, 'Recall@10': 0.3602970161825708, 'Precision@10 on unseen': 0.006312292358803987, 'Recall@10 on unseen': 0.03401231105550043}
{'Precision@20': 0.023003374578177728, 'Recall@20': 0.36435121860386616, 'Precision@20 on unseen': 0.0034883720930232558, 'Recall@20 on unseen': 0.03543119256159123}
{'Precision@50': 0.009943757030371203, 'Recall@50': 0.3682150283103213, 'Precision@50 on unseen': 0.0017607973421926912, 'Recall@50 on unseen': 0.03810046727317174}


### SVD

In [162]:
U, s, Vt = svds(sparse_user_wise_df, k = 50) 
sigma = np.diag(s)
predicted_interactions = np.dot(np.dot(U, sigma), Vt) 

In [167]:
for k in [1,5,10,20,50]:
    print(evaluate_svd(test_interactions, k=k))

{'Precision@1': 0.07311586051743532, 'Recall@1': 0.0627992523661815, 'Precision@1 on unseen': 0.008305647840531562, 'Recall@1 on unseen': 0.006367663344407531}
{'Precision@5': 0.024296962879640047, 'Recall@5': 0.09284050532644458, 'Precision@5 on unseen': 0.0033222591362126247, 'Recall@5 on unseen': 0.011955099164401491}
{'Precision@10': 0.015635545556805398, 'Recall@10': 0.11392385270362301, 'Precision@10 on unseen': 0.001993355481727575, 'Recall@10 on unseen': 0.013646990391176436}
{'Precision@20': 0.010123734533183352, 'Recall@20': 0.14168945755906118, 'Precision@20 on unseen': 0.0015780730897009966, 'Recall@20 on unseen': 0.018842233301282846}
{'Precision@50': 0.005601799775028122, 'Recall@50': 0.1835851739194559, 'Precision@50 on unseen': 0.0010963455149501661, 'Recall@50 on unseen': 0.030438828147571692}
