In [72]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
import tensorflow as tf
import random
from metrics import *

In [73]:
# Load the train / validation interaction logs and the pre-computed popularity
# and product-feature tables. `interactions_test` is read from the *valid* split.
(
    interactions,
    interactions_test,
    item_popularity,
    user_popularity,
    product_features,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "processed_data/interactions_train.csv",
        "processed_data/interactions_valid.csv",
        "processed_data/popularity_item.csv",
        "processed_data/popularity_user.csv",
        "processed_data/product_features.csv",
    )
)

In [74]:
# Sanity check: (n_rows, n_columns) of the training interactions frame.
interactions.shape

(127780, 22)

In [75]:
# Number of distinct visitor ids present in the training interactions.
interactions['visitorid'].nunique()

77563

## Recently Viewed items

In [76]:
# Order the training log per visitor with the newest event first
# (visitor ids ascending, timestamps descending).
recently_interacted = interactions.sort_values(
    by=['visitorid', 'timestamp'],
    ascending=[True, False],
)

# One row per visitor: their item ids as a list, most recent first, with each
# item kept only at its first (i.e. latest) position.
top_recently_interacted = (
    recently_interacted
    .groupby('visitorid')['itemid']
    .apply(lambda viewed: viewed.unique().tolist())
    .reset_index()
)

In [77]:
# Ground truth for evaluation: for each visitor in the held-out split, the set
# of item ids they interacted with.
test_interactions = (
    interactions_test
    .groupby('visitorid')['itemid']
    .apply(set)
    .reset_index()
    .rename(columns={'itemid': 'relevant_items'})
)

In [78]:
# Attach each test visitor's recency-ordered training items as the
# "recently viewed items" (RVI) prediction.
#
# The rename is applied to the right-hand frame before merging, and any
# RVI_predicted column left over from a previous execution is dropped first.
# Re-running this cell therefore yields exactly one RVI_predicted column,
# instead of a second, duplicate one that would make
# `row['RVI_predicted']` return a Series.
#
# NOTE(review): visitors that do not occur in the training data get NaN in
# RVI_predicted after the left join — confirm the metric helpers tolerate that.
rvi_predictions = top_recently_interacted.rename(columns={'itemid': 'RVI_predicted'})
test_interactions = (
    test_interactions
    .drop(columns=['RVI_predicted'], errors='ignore')
    .merge(rvi_predictions, on='visitorid', how='left')
)

In [79]:
# Score the recently-viewed-items ranking at several cutoffs. The per-row
# 'precision' / 'recall' columns are overwritten on every pass, so after the
# loop they hold the values for the last cutoff.
for k in (1, 5, 10, 20, 50):
    test_interactions['precision'] = [
        precision_at_k(relevant, predicted, k)
        for relevant, predicted in zip(
            test_interactions['relevant_items'], test_interactions['RVI_predicted']
        )
    ]
    test_interactions['recall'] = [
        recall_at_k(relevant, predicted, k)
        for relevant, predicted in zip(
            test_interactions['relevant_items'], test_interactions['RVI_predicted']
        )
    ]
    print(
        f'Precision@{k}:', np.mean(test_interactions['precision']),
        f'Recall@{k}:', np.mean(test_interactions['recall']),
    )

Precision@1: 0.35680304471931495 Recall@1: 0.320797951430452
Precision@5: 0.08277830637488107 Recall@5: 0.3437992138579941
Precision@10: 0.04281636536631779 Recall@10: 0.34643372271990236
Precision@20: 0.021836346336822077 Recall@20: 0.3470162068769855
Precision@50: 0.00907706945765937 Recall@50: 0.34753733763286476


## Baseline Model: Recommend the most popular 50 items

In [80]:
# Rank items by view count, highest first, and keep the ids of the first 50 as
# one non-personalised recommendation list shared by every visitor.
top_items = item_popularity.sort_values('number_of_views', ascending=False)
top_popular_items = top_items.iloc[:50]['itemid'].tolist()

In [81]:
# Score the popularity baseline: every visitor is compared against the same
# `top_popular_items` list, truncated at each cutoff by the metric helpers.
for k in (1, 5, 10, 20, 50):
    test_interactions['precision'] = [
        precision_at_k(relevant, top_popular_items, k)
        for relevant in test_interactions['relevant_items']
    ]
    test_interactions['recall'] = [
        recall_at_k(relevant, top_popular_items, k)
        for relevant in test_interactions['relevant_items']
    ]
    print(
        f'Precision@{k}:', np.mean(test_interactions['precision']),
        f'Recall@{k}:', np.mean(test_interactions['recall']),
    )

Precision@1: 0.0019029495718363464 Recall@1: 0.0019029495718363464
Precision@5: 0.007040913415794482 Recall@5: 0.019143018741983674
Precision@10: 0.006089438629876308 Recall@10: 0.03176126511633503
Precision@20: 0.005613701236917222 Recall@20: 0.058162504046047864
Precision@50: 0.0038249286393910566 Recall@50: 0.09566376061630838


## User-user collaborative filtering

In [82]:
from collections import defaultdict

usersPerItem = defaultdict(set)  # item id -> set of visitor ids that interacted with it
itemsPerUser = defaultdict(set)  # visitor id -> set of item ids the visitor interacted with

# Walk the two id columns directly. DataFrame.iterrows() would materialise a
# full Series for every row of the frame just to read two fields from it.
for user, item in zip(interactions['visitorid'], interactions['itemid']):
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)

In [83]:
# Number of logged events (non-null timestamps) per (visitor, item) pair.
user_item = (
    interactions
    .groupby(['visitorid', 'itemid'])['timestamp']
    .count()
    .reset_index()
)

# Dense visitor x item matrix of those counts; pairs that never occur become 0.
user_wise_df = (
    user_item
    .pivot(index='visitorid', columns='itemid', values='timestamp')
    .fillna(0)
)


In [84]:
# Row lookup for the visitor x item matrix: visitor id -> row position, and back.
user_id_to_index = dict(zip(user_wise_df.index, range(len(user_wise_df.index))))
index_to_user_id = {position: visitor for visitor, position in user_id_to_index.items()}

# Column lookup: item id -> column position, and back.
item_id_to_index = dict(zip(user_wise_df.columns, range(len(user_wise_df.columns))))
index_to_item_id = {position: item for item, position in item_id_to_index.items()}

In [85]:
from scipy import sparse
# Convert the visitor x item matrix to CSR and compute the pairwise cosine
# similarity between its rows (visitors); dense_output=False keeps the
# similarity matrix sparse.
sparse_user_wise_df = sparse.csr_matrix(user_wise_df )
similarities = cosine_similarity(sparse_user_wise_df,dense_output=False)

In [86]:
import heapq
def evaluate_user_collaborative(test_interactions, k=10, n_neighbors=10):
    """Score user-user collaborative filtering with mean Precision@k / Recall@k.

    For every test visitor, the ``n_neighbors`` rows of the module-level
    ``similarities`` matrix with the highest cosine similarity are selected,
    the matching rows of ``user_wise_df`` are summed per item, and the ``k``
    items with the largest non-zero totals form the recommendation list.

    No step removes the visitor from their own neighbour list, and no step
    filters out items the visitor has already interacted with.

    Parameters
    ----------
    test_interactions : pandas.DataFrame
        Must provide the columns ``visitorid`` and ``relevant_items``.
    k : int, default 10
        Length of the recommendation list handed to the metric helpers.
    n_neighbors : int, default 10
        Number of most-similar users whose interactions are aggregated. The
        default reproduces the previously hard-coded neighbourhood size.

    Returns
    -------
    dict
        ``{'Precision@<k>': mean precision, 'Recall@<k>': mean recall}``.

    Raises
    ------
    KeyError
        If a test visitor is missing from ``user_id_to_index``.
    """
    recalls = []
    precisions = []

    for user, actual_items in zip(test_interactions['visitorid'], test_interactions['relevant_items']):
        specific_user_index = user_id_to_index[user]

        # Dense similarity scores of this visitor against every visitor.
        user_similarities = similarities[specific_user_index].toarray().ravel()

        # Stable sort on the negated scores: highest similarity first, ties
        # resolved by the lower row position. This is the order
        # heapq.nlargest(..., key=...) produced, without a Python-level key
        # call per candidate.
        top_similar_indices = np.argsort(-user_similarities, kind='stable')[:n_neighbors]
        similar_users = [index_to_user_id[idx] for idx in top_similar_indices]

        # Per-item sum over the neighbours' rows; items none of them touched are dropped.
        predictions = user_wise_df.loc[similar_users].sum(axis=0)
        predictions = predictions[predictions != 0]

        # Select top-k items
        top_k_items = predictions.nlargest(k).index.tolist()

        # Compute precision and recall
        recalls.append(recall_at_k(actual_items, top_k_items, k))
        precisions.append(precision_at_k(actual_items, top_k_items, k))

    return {
        f'Precision@{k}': np.mean(precisions),
        f'Recall@{k}': np.mean(recalls)
    }

In [87]:
# Report user-user collaborative filtering metrics at each cutoff.
for k in (1, 5, 10, 20, 50):
    print(evaluate_user_collaborative(test_interactions, k=k))

{'Precision@1': 0.3425309229305423, 'Recall@1': 0.3042906785401655}
{'Precision@5': 0.08258801141769744, 'Recall@5': 0.34242482595400914}
{'Precision@10': 0.042721217887725975, 'Recall@10': 0.3490346684320723}
{'Precision@20': 0.022074215033301616, 'Recall@20': 0.352761374148759}
{'Precision@50': 0.009267364414843007, 'Recall@50': 0.35365899155425196}


## Item-item collaborative filtering

In [88]:
# Transpose the visitor x item matrix so that rows are items, convert to CSR,
# and compute the pairwise cosine similarity between items (kept sparse).
sparse_item_wise_df = sparse.csr_matrix(user_wise_df.T) 
item_similarities = cosine_similarity(sparse_item_wise_df, dense_output=False)

In [89]:
def evaluate_item_collaborative(test_interactions, k=10, n_similar=10):
    """Score item-item collaborative filtering with mean Precision@k / Recall@k.

    For every test visitor, each item in ``itemsPerUser[visitor]`` contributes
    the similarity scores of its ``n_similar`` highest-scoring rows in the
    module-level ``item_similarities`` matrix. Scores are summed per candidate
    item and the ``k`` candidates with the largest totals are recommended.

    The candidate window is taken from the tail of an ascending argsort
    regardless of the score values, so candidates with zero similarity can be
    included. No step filters out items the visitor has already seen.

    Parameters
    ----------
    test_interactions : pandas.DataFrame
        Must provide the columns ``visitorid`` and ``relevant_items``.
    k : int, default 10
        Length of the recommendation list handed to the metric helpers.
    n_similar : int, default 10
        Number of most-similar items collected per seen item. The default
        reproduces the previously hard-coded value.

    Returns
    -------
    dict
        ``{'Precision@<k>': mean precision, 'Recall@<k>': mean recall}``.

    Raises
    ------
    KeyError
        If a seen item is missing from ``item_id_to_index``.
    """
    recalls = []
    precisions = []

    for user, actual_items in zip(test_interactions['visitorid'], test_interactions['relevant_items']):
        # .get() avoids inserting an empty entry into the shared defaultdict
        # when the visitor has no training history.
        seen_items = itemsPerUser.get(user, ())

        # Accumulate in a plain dict (insertion-ordered) instead of enlarging a
        # pandas Series one label at a time, which copies the Series on every
        # new candidate.
        scores = {}
        for item in seen_items:
            specific_item_index = item_id_to_index[item]
            item_similarities_row = item_similarities[specific_item_index].toarray().ravel()

            # Tail of the ascending argsort = the n_similar highest scores.
            # The explicit start index keeps n_similar == 0 from selecting everything.
            order = np.argsort(item_similarities_row)
            window_start = max(len(order) - n_similar, 0)
            for idx in order[window_start:]:
                similar_item = index_to_item_id[idx]
                scores[similar_item] = scores.get(similar_item, 0) + item_similarities_row[idx]

        predictions = pd.Series(scores, dtype=float)

        # Select top-k items
        top_k_items = predictions.nlargest(k).index.tolist()

        # Compute precision and recall
        recalls.append(recall_at_k(actual_items, top_k_items, k))
        precisions.append(precision_at_k(actual_items, top_k_items, k))

    return {
        f'Precision@{k}': np.mean(precisions),
        f'Recall@{k}': np.mean(recalls)
    }

In [90]:
# Report item-item collaborative filtering metrics at each cutoff.
for k in (1, 5, 10, 20, 50):
    print(evaluate_item_collaborative(test_interactions, k=k))

{'Precision@1': 0.33111322549952427, 'Recall@1': 0.3026826302991859}
{'Precision@5': 0.08334919124643196, 'Recall@5': 0.35344415486516173}
{'Precision@10': 0.04490960989533778, 'Recall@10': 0.3681979172848415}
{'Precision@20': 0.023739295908658423, 'Recall@20': 0.37434201512250265}
{'Precision@50': 0.010123691722169361, 'Recall@50': 0.37789871862774127}


## Singular Value Decomposition

In [91]:
from scipy.sparse.linalg import svds 
# Truncated SVD of the sparse visitor x item matrix, keeping 50 latent factors.
U, s, Vt = svds(sparse_user_wise_df, k=50)

# Rebuild the dense low-rank approximation U . diag(s) . Vt; its entries are
# used as predicted interaction scores.
sigma = np.diag(s)
predicted_interactions = (U @ sigma) @ Vt

In [92]:
def evaluate_svd(test_interactions,k=10):
    """Score the SVD reconstruction with mean Precision@k / Recall@k.

    For each test visitor, the matching row of the module-level
    ``predicted_interactions`` matrix is ranked and the ``k`` highest-scoring
    columns, best first, are mapped back to item ids via ``index_to_item_id``.
    No step filters out items the visitor has already interacted with.

    Returns a dict with the keys ``'Precision@<k>'`` and ``'Recall@<k>'``.
    Raises ``KeyError`` if a test visitor is missing from ``user_id_to_index``.
    """
    precisions, recalls = [], []
    visitor_ids = test_interactions['visitorid']
    relevant_sets = test_interactions['relevant_items']

    for user, actual_items in zip(visitor_ids, relevant_sets):
        # Reconstructed scores of this visitor for every item column.
        user_scores = predicted_interactions[user_id_to_index[user]]

        # Ascending argsort: the last k positions are the best; flip to best-first.
        ascending_columns = np.argsort(user_scores)
        best_columns = ascending_columns[-k:][::-1]
        top_k_items = [index_to_item_id[column] for column in best_columns]

        recalls.append(recall_at_k(actual_items, top_k_items, k))
        precisions.append(precision_at_k(actual_items, top_k_items, k))

    mean_precision = np.mean(precisions)
    mean_recall = np.mean(recalls)
    return {f'Precision@{k}': mean_precision, f'Recall@{k}': mean_recall}

In [93]:
# Report SVD metrics at each cutoff.
for k in (1, 5, 10, 20, 50):
    print(evaluate_svd(test_interactions, k=k))

{'Precision@1': 0.06279733587059944, 'Recall@1': 0.05049931581683325}
{'Precision@5': 0.02226450999048525, 'Recall@5': 0.08307321344161409}
{'Precision@10': 0.014081826831588965, 'Recall@10': 0.09942914907829946}
{'Precision@20': 0.00884871550903901, 'Recall@20': 0.11778291539052228}
{'Precision@50': 0.005252140818268316, 'Recall@50': 0.16402967672534172}


## Removing duplicate events, but weighting the events

In [94]:
# Strength assigned to each event type; a larger number is a stronger signal.
event_map = {'view': 1, 'addtocart': 2, 'transaction': 3}

# Collapse the log to one row per (visitor, item), keeping the strongest event:
#   1. drop exact (visitor, item, event) repeats,
#   2. attach the event weight (an event missing from event_map raises KeyError),
#   3. sort so the heaviest event of each pair comes first,
#   4. keep only that first row per pair.
train_interactions = (
    interactions[['visitorid', 'itemid', 'event']]
    .drop_duplicates()
    .assign(weight=lambda frame: frame['event'].apply(lambda name: event_map[name]))
    .sort_values(by=['visitorid', 'itemid', 'weight'], ascending=[True, True, False])
    .drop_duplicates(subset=['visitorid', 'itemid'], keep='first')
)


### User-user collaborative

In [95]:
# Rebuild the visitor x item matrix from the de-duplicated interactions, using
# the event weight as the cell value (missing pairs become 0), then recompute
# the sparse user-user cosine similarities.
# This rebinds the module-level user_wise_df / sparse_user_wise_df /
# similarities that evaluate_user_collaborative reads, so the evaluation that
# follows scores the weighted variant.
# NOTE(review): user_id_to_index / item_id_to_index are not rebuilt here —
# presumably this pivot has the same row and column ids as the earlier one; confirm.
user_wise_df = train_interactions.pivot(index='visitorid', columns='itemid', values='weight').fillna(0)
sparse_user_wise_df = sparse.csr_matrix(user_wise_df )
similarities = cosine_similarity(sparse_user_wise_df,dense_output=False)

In [96]:
# Report user-user metrics on the event-weighted matrix at each cutoff.
for k in (1, 5, 10, 20, 50):
    print(evaluate_user_collaborative(test_interactions, k=k))

{'Precision@1': 0.3339676498572788, 'Recall@1': 0.2989444720234238}
{'Precision@5': 0.0835394862036156, 'Recall@5': 0.3468876027680471}
{'Precision@10': 0.04348239771646051, 'Recall@10': 0.350621361792169}
{'Precision@20': 0.022407231208372975, 'Recall@20': 0.3547773105440117}
{'Precision@50': 0.00934348239771646, 'Recall@50': 0.3557808065139446}


### Item-item collaborative

In [97]:
# Recompute the sparse item-item cosine similarities from the transposed
# event-weighted matrix. This rebinds the module-level item_similarities that
# evaluate_item_collaborative reads.
sparse_item_wise_df = sparse.csr_matrix(user_wise_df.T )
item_similarities = cosine_similarity(sparse_item_wise_df,dense_output=False)

In [98]:
# Report item-item metrics on the event-weighted matrix at each cutoff.
for k in (1, 5, 10, 20, 50):
    print(evaluate_item_collaborative(test_interactions, k=k))

{'Precision@1': 0.3301617507136061, 'Recall@1': 0.30358286526912415}
{'Precision@5': 0.08315889628924834, 'Recall@5': 0.3506256396612193}
{'Precision@10': 0.0445290199809705, 'Recall@10': 0.36433959543866756}
{'Precision@20': 0.02340627973358706, 'Recall@20': 0.3684295187197503}
{'Precision@50': 0.00993339676498573, 'Recall@50': 0.3704093140014455}


### SVD

In [99]:
# Truncated SVD (50 latent factors) of the event-weighted sparse matrix, then
# the dense reconstruction U . diag(s) . Vt. This rebinds the module-level
# predicted_interactions that evaluate_svd reads.
U, s, Vt = svds(sparse_user_wise_df, k=50)
sigma = np.diag(s)
predicted_interactions = (U @ sigma) @ Vt

In [100]:
# Report SVD metrics on the event-weighted matrix at each cutoff.
for k in (1, 5, 10, 20, 50):
    print(evaluate_svd(test_interactions, k=k))

{'Precision@1': 0.0818268315889629, 'Recall@1': 0.06777940200006045}
{'Precision@5': 0.023786869647954328, 'Recall@5': 0.09015885514287615}
{'Precision@10': 0.015318744053282589, 'Recall@10': 0.10703198908121396}
{'Precision@20': 0.010180780209324452, 'Recall@20': 0.14002662101497954}
{'Precision@50': 0.005937202664129401, 'Recall@50': 0.1879631024226981}
