In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
import tensorflow as tf
import random

In [2]:
interactions = pd.read_csv("processed_data/interactions_train.csv")
interactions_test = pd.read_csv("processed_data/interactions_valid.csv")
item_popularity = pd.read_csv("processed_data/popularity_item.csv")
user_popularity = pd.read_csv("processed_data/popularity_user.csv")
product_features = pd.read_csv("processed_data/product_features.csv")

In [9]:
interactions.shape

(145807, 23)

In [3]:
len(set(interactions['visitorid'].unique()) & set(interactions_test['visitorid'].unique()))

889

In [19]:
interactions_test = interactions_test[interactions_test['visitorid'].isin(interactions['visitorid'].unique())]

In [5]:
interactions['visitorid'].nunique()

89010

## Recently Viewed items

In [20]:
recently_interacted = interactions.sort_values(by=['visitorid', 'timestamp'], ascending=[True, False])
top_recently_interacted = recently_interacted.groupby('visitorid')['itemid'].apply(set).reset_index()

In [21]:
all_interactions = pd.concat([interactions, interactions_test])
relevant_items = all_interactions.groupby('visitorid')['itemid'].apply(set).reset_index()
relevant_items.columns = ['visitorid', 'relevant_items']
test_interactions = interactions_test.groupby('visitorid')['itemid'].apply(set).reset_index()
test_interactions.columns = ['visitorid', 'test_items']
test_interactions = test_interactions.merge(relevant_items, on='visitorid', how='left')
test_interactions.drop(columns=['test_items'], inplace=True)

In [22]:
test_interactions = test_interactions.merge(top_recently_interacted, on='visitorid',how='left')
test_interactions.rename(columns={'itemid':'RVI_predicted'}, inplace= True) 

In [8]:
def precision_at_k(relevant, predicted, k=10):
    if isinstance(predicted, set):
        predicted = list(predicted)
    predicted = predicted[:k]
    hits = len(set(predicted) & relevant)
    return hits / k

def recall_at_k(relevant, predicted, k=10):
    if isinstance(predicted, set):
        predicted = list(predicted)
    predicted = predicted[:k]
    hits = len(set(predicted) & relevant)
    if len(relevant)==0:
        return 0
    return hits/len(relevant)

In [24]:
for k in [1,5,10,20,50]:
    test_interactions['precision'] = test_interactions.apply(lambda row: precision_at_k(row['relevant_items'], row['RVI_predicted'], k), axis=1)
    test_interactions['recall'] = test_interactions.apply(lambda row: recall_at_k(row['relevant_items'], row['RVI_predicted'], k), axis=1)
    print(f'Precision@{k}:', np.mean(test_interactions['precision']),f'Recall@{k}:', np.mean(test_interactions['recall']))

Precision@1: 1.0 Recall@1: 0.5168365248264494
Precision@5: 0.3631046119235096 Recall@5: 0.6474314888574894
Precision@10: 0.2205849268841395 Recall@10: 0.6681412713221907
Precision@20: 0.13053993250843643 Recall@20: 0.6795546418410727
Precision@50: 0.06357705286839145 Recall@50: 0.6867884687660509


## Baseline Model: Recommend the most popular 50 items

In [25]:
top_items = item_popularity.sort_values(by='number_of_views', ascending=False)
top_popular_items = top_items['itemid'].head(50).tolist()

In [26]:
train_interactions = interactions.groupby('visitorid')['itemid'].apply(set)
test_interactions['items_seen'] = test_interactions['visitorid'].apply(lambda x: train_interactions[x] if x in train_interactions.index else {})
test_interactions['relevant_items_unseen'] = test_interactions.apply(lambda row: row['relevant_items'] - row['items_seen'], axis=1)

In [28]:
for k in [1,5,10,20,50]:
    test_interactions['precision'] = test_interactions['relevant_items'].apply(lambda x: precision_at_k(x, top_popular_items,k))
    test_interactions['recall'] = test_interactions['relevant_items'].apply(lambda x: recall_at_k(x, top_popular_items,k))
    print(f'Precision@{k}:', np.mean(test_interactions['precision']),f'Recall@{k}:', np.mean(test_interactions['recall']))

Precision@1: 0.03262092238470191 Recall@1: 0.005971973170736153
Precision@5: 0.01822272215973004 Recall@5: 0.021588325564663398
Precision@10: 0.01372328458942632 Recall@10: 0.035443115935966496
Precision@20: 0.011754780652418449 Recall@20: 0.060273105797464574
Precision@50: 0.008391451068616424 Recall@50: 0.10371207243828037


In [29]:
for k in [1,5,10,20,50]:
    test_interactions['precision'] = test_interactions.apply(lambda row: precision_at_k(row['relevant_items_unseen'], list(set(top_popular_items)-row['items_seen']),k),axis=1)
    test_interactions['recall'] = test_interactions.apply(lambda row: recall_at_k(row['relevant_items_unseen'], list(set(top_popular_items)-row['items_seen']),k),axis=1)
    print(f'Precision@{k} on unseen:', np.mean(test_interactions['precision']),f'Recall@{k} on unseen:', np.mean(test_interactions['recall']))

Precision@1 on unseen: 0.0 Recall@1 on unseen: 0.0
Precision@5 on unseen: 0.0044994375703037125 Recall@5 on unseen: 0.009488969742144937
Precision@10 on unseen: 0.003374578177727784 Recall@10 on unseen: 0.01358818560103214
Precision@20 on unseen: 0.00343082114735658 Recall@20 on unseen: 0.02486356575621193
Precision@50 on unseen: 0.0024296962879640045 Recall@50 on unseen: 0.05054421084327059


## User user collaborative Filtering

In [None]:
from collections import defaultdict

usersPerItem = defaultdict(set) # Maps an item to the users who has interacted with it
itemsPerUser = defaultdict(set)

for idx, row in interactions.iterrows():
    user,item = row['visitorid'], row['itemid']
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)

In [31]:
user_item = pd.DataFrame(interactions.groupby(['visitorid','itemid'])['timestamp'].count().reset_index())
user_wise_df = user_item.pivot(index='visitorid', columns='itemid', values='timestamp').fillna(0)


In [32]:
user_id_to_index = {user_id: idx for idx, user_id in enumerate(user_wise_df.index)}
index_to_user_id = {idx: user_id for user_id, idx in user_id_to_index.items()}
item_id_to_index = {item_id: idx for idx, item_id in enumerate(user_wise_df.columns)}
index_to_item_id = {idx: item_id for item_id, idx in item_id_to_index.items()}

In [33]:
from scipy import sparse
sparse_user_wise_df = sparse.csr_matrix(user_wise_df )
similarities = cosine_similarity(sparse_user_wise_df,dense_output=False)

In [34]:
import heapq
def evaluate_user_collaborative(test_interactions,k=10):
    recalls = []
    precisions = []
    recalls_unseen = []
    precisions_unseen = []
    
    
    for user, actual_items, actual_items_unseen in zip(test_interactions['visitorid'], test_interactions['relevant_items'], test_interactions['relevant_items_unseen']):
        specific_user_index = user_id_to_index[user]
        # Get nearest neighbors
        user_similarities = similarities[specific_user_index].toarray().flatten()
        top_similar_indices = heapq.nlargest(10, range(len(user_similarities)), key=lambda i: user_similarities[i])
        similar_users = [index_to_user_id[idx] for idx in top_similar_indices][:10]

        # Vectorized prediction for unseen items
        predictions = user_wise_df.loc[similar_users].sum(axis=0)
        predictions = predictions[predictions!=0]
        unseen_items = list(set(predictions.index) - set(itemsPerUser[user]))
        predictions_unseen = predictions[unseen_items]
        
        # Select top-k items
        top_k_items = predictions.nlargest(k).index.tolist()
        top_k_items_unseen = predictions_unseen.nlargest(k).index.tolist()
        
        # Compute precision and recall
        recalls.append(recall_at_k(actual_items, top_k_items, k))
        precisions.append(precision_at_k(actual_items, top_k_items, k))
        if len(actual_items_unseen)!=0:
            recalls_unseen.append(recall_at_k(actual_items_unseen, top_k_items_unseen, k))
            precisions_unseen.append(precision_at_k(actual_items_unseen, top_k_items_unseen, k))
    
    return {
        f'Precision@{k}': np.mean(precisions),
        f'Recall@{k}': np.mean(recalls),
        f'Precision@{k} on unseen': np.mean(precisions_unseen),
        f'Recall@{k} on unseen': np.mean(recalls_unseen),
    }

In [35]:
for k in [1,5,10,20,50]:
    print(evaluate_user_collaborative(test_interactions, k=k))

{'Precision@1': 0.9763779527559056, 'Recall@1': 0.5046773304400334, 'Precision@1 on unseen': 0.0, 'Recall@1 on unseen': 0.0}
{'Precision@5': 0.3583802024746907, 'Recall@5': 0.6430921972671556, 'Precision@5 on unseen': 0.0009966777408637875, 'Recall@5 on unseen': 0.002558139534883721}
{'Precision@10': 0.21833520809898765, 'Recall@10': 0.6662730413291676, 'Precision@10 on unseen': 0.0009966777408637873, 'Recall@10 on unseen': 0.0062126245847176085}
{'Precision@20': 0.1294713160854893, 'Recall@20': 0.6805909093664554, 'Precision@20 on unseen': 0.0005813953488372094, 'Recall@20 on unseen': 0.006295681063122924}
{'Precision@50': 0.063644544431946, 'Recall@50': 0.6888902893718084, 'Precision@50 on unseen': 0.0002325581395348837, 'Recall@50 on unseen': 0.006295681063122924}


## Item item collaborative Filtering

In [36]:
sparse_item_wise_df = sparse.csr_matrix(user_wise_df.T) 
item_similarities = cosine_similarity(sparse_item_wise_df, dense_output=False)

In [38]:
def evaluate_item_collaborative(test_interactions, k=10):
    recalls = []
    precisions = []
    recalls_unseen = []
    precisions_unseen = []
    
    for user, actual_items, actual_items_unseen in zip(test_interactions['visitorid'], test_interactions['relevant_items'], test_interactions['relevant_items_unseen']):
        seen_items = itemsPerUser[user]  
        
        # Generate predictions
        predictions = pd.Series(dtype=float)
        for item in seen_items:
            specific_item_index = item_id_to_index[item]
            item_similarities_row = item_similarities[specific_item_index].toarray().flatten()
            
            # Aggregate scores for items similar to the current item
            similar_item_indices = np.argsort(item_similarities_row)[-10:]  # Top 10 similar items
            for idx in similar_item_indices:
                similar_item = index_to_item_id[idx]
                if similar_item not in predictions:
                    predictions[similar_item] = 0
                predictions[similar_item] += item_similarities_row[idx]
        
        # Exclude items the user has already interacted with for unseen predictions
        unseen_items = list(set(predictions.index) - set(seen_items))
        predictions_unseen = predictions.loc[unseen_items]
        
        # Select top-k items
        top_k_items = predictions.nlargest(k).index.tolist()
        top_k_items_unseen = predictions_unseen.nlargest(k).index.tolist()
        
        # Compute precision and recall
        recalls.append(recall_at_k(actual_items, top_k_items, k))
        precisions.append(precision_at_k(actual_items, top_k_items, k))
        if len(actual_items_unseen) != 0:
            recalls_unseen.append(recall_at_k(actual_items_unseen, top_k_items_unseen, k))
            precisions_unseen.append(precision_at_k(actual_items_unseen, top_k_items_unseen, k))
    
    return {
        f'Precision@{k}': np.mean(precisions),
        f'Recall@{k}': np.mean(recalls),
        f'Precision@{k} on unseen': np.mean(precisions_unseen),
        f'Recall@{k} on unseen': np.mean(recalls_unseen),
    }

In [39]:
for k in [1,5,10,20,50]:
    print(evaluate_item_collaborative(test_interactions, k=k))

{'Precision@1': 0.9977502812148481, 'Recall@1': 0.5165735706827304, 'Precision@1 on unseen': 0.011627906976744186, 'Recall@1 on unseen': 0.005592469545957917}
{'Precision@5': 0.3633295838020248, 'Recall@5': 0.6534620981589057, 'Precision@5 on unseen': 0.009634551495016613, 'Recall@5 on unseen': 0.027948504983388703}
{'Precision@10': 0.2195725534308212, 'Recall@10': 0.6797623772671364, 'Precision@10 on unseen': 0.006644518272425249, 'Recall@10 on unseen': 0.041791251384274634}
{'Precision@20': 0.1262654668166479, 'Recall@20': 0.6917278737386943, 'Precision@20 on unseen': 0.00398671096345515, 'Recall@20 on unseen': 0.04331180017226529}
{'Precision@50': 0.060854893138357714, 'Recall@50': 0.7004087573176127, 'Precision@50 on unseen': 0.0018604651162790697, 'Recall@50 on unseen': 0.04779235484296628}


## Singular Value decomoposition

In [40]:
from scipy.sparse.linalg import svds 
# Singular Value Decomposition
U, s, Vt = svds(sparse_user_wise_df, k = 50) # k is the number of latent features

# Construct diagonal array in SVD
sigma = np.diag(s)
predicted_interactions = np.dot(np.dot(U, sigma), Vt) 

In [41]:
def evaluate_svd(test_interactions,k=10):
    recalls = []
    precisions = []
    recalls_unseen = []
    precisions_unseen = []
    
    
    for user, actual_items, actual_items_unseen in zip(test_interactions['visitorid'], test_interactions['relevant_items'], test_interactions['relevant_items_unseen']):
        specific_user_index = user_id_to_index[user]

        # Vectorized prediction for unseen items
        predictions = predicted_interactions[specific_user_index]
        seen_items = [item_id_to_index[id] for id in itemsPerUser[user]]
        predictions_unseen = [v for i,v in enumerate(predictions) if i not in seen_items]
        predictions = np.argsort(predictions)[-k:][::-1]
        predictions_unseen = np.argsort(predictions_unseen)[-k:][::-1]
        
        # Select top-k items
        top_k_items = [index_to_item_id[pred] for pred in predictions]
        top_k_items_unseen = [index_to_item_id[pred] for pred in predictions_unseen]
        
        # Compute precision and recall
        recalls.append(recall_at_k(actual_items, top_k_items, k))
        precisions.append(precision_at_k(actual_items, top_k_items, k))
        if len(actual_items_unseen)!=0:
            recalls_unseen.append(recall_at_k(actual_items_unseen, top_k_items_unseen, k))
            precisions_unseen.append(precision_at_k(actual_items_unseen, top_k_items_unseen, k))
    
    return {
        f'Precision@{k}': np.mean(precisions),
        f'Recall@{k}': np.mean(recalls),
        f'Precision@{k} on unseen': np.mean(precisions_unseen),
        f'Recall@{k} on unseen': np.mean(recalls_unseen)}

In [42]:
for k in [1,5,10,20,50]:
    print(evaluate_svd(test_interactions,k))

{'Precision@1': 0.1732283464566929, 'Recall@1': 0.06925190784800801, 'Precision@1 on unseen': 0.011627906976744186, 'Recall@1 on unseen': 0.006730650916697429}
{'Precision@5': 0.07289088863892013, 'Recall@5': 0.11264905040616921, 'Precision@5 on unseen': 0.004651162790697675, 'Recall@5 on unseen': 0.015174063888874172}
{'Precision@10': 0.05016872890888639, 'Recall@10': 0.14208093373429817, 'Precision@10 on unseen': 0.002823920265780731, 'Recall@10 on unseen': 0.017205172315331432}
{'Precision@20': 0.03357705286839145, 'Recall@20': 0.16901294820764554, 'Precision@20 on unseen': 0.001993355481727575, 'Recall@20 on unseen': 0.02615195788390403}
{'Precision@50': 0.02125984251968504, 'Recall@50': 0.2375755882538524, 'Precision@50 on unseen': 0.0011627906976744186, 'Recall@50 on unseen': 0.036916580857829326}


## Removing duplicate events , but weighing the events

In [43]:
event_map={'view':1, 'addtocart':2, 'transaction':3}
train_interactions = interactions[['visitorid','itemid','event']]
train_interactions = train_interactions.drop_duplicates()
train_interactions['weight'] = train_interactions['event'].apply(lambda x:event_map[x])
train_interactions = train_interactions.sort_values(by=['visitorid','itemid','weight'], ascending=[True, True, False])
train_interactions = train_interactions.drop_duplicates(subset=['visitorid','itemid'], keep='first')


### User-user collaborative

In [44]:
user_wise_df = train_interactions.pivot(index='visitorid', columns='itemid', values='weight').fillna(0)
sparse_user_wise_df = sparse.csr_matrix(user_wise_df )
similarities = cosine_similarity(sparse_user_wise_df,dense_output=False)

In [45]:
for k in [1,5,10,20,50]:
    print(evaluate_user_collaborative(test_interactions, k=k))

{'Precision@1': 0.9955005624296963, 'Recall@1': 0.5147742826067268, 'Precision@1 on unseen': 0.008305647840531562, 'Recall@1 on unseen': 0.0054490083559851}
{'Precision@5': 0.36197975253093373, 'Recall@5': 0.6471755092391303, 'Precision@5 on unseen': 0.0023255813953488376, 'Recall@5 on unseen': 0.006487214336051545}
{'Precision@10': 0.21968503937007874, 'Recall@10': 0.6697380373656902, 'Precision@10 on unseen': 0.0016611295681063123, 'Recall@10 on unseen': 0.010141699385885432}
{'Precision@20': 0.1295838020247469, 'Recall@20': 0.6820373024496834, 'Precision@20 on unseen': 0.0009136212624584718, 'Recall@20 on unseen': 0.010169854124327913}
{'Precision@50': 0.06251968503937008, 'Recall@50': 0.6892388035132668, 'Precision@50 on unseen': 0.00046511627906976747, 'Recall@50 on unseen': 0.012098610122851352}


### Item-item collaborative

In [46]:
sparse_item_wise_df = sparse.csr_matrix(user_wise_df.T )
item_similarities = cosine_similarity(sparse_item_wise_df,dense_output=False)

In [47]:
for k in [1,5,10,20,50]:
    print(evaluate_item_collaborative(test_interactions, k=k))

{'Precision@1': 0.9988751406074241, 'Recall@1': 0.5168153010643253, 'Precision@1 on unseen': 0.014950166112956811, 'Recall@1 on unseen': 0.008084163898117386}
{'Precision@5': 0.3655793025871767, 'Recall@5': 0.6537380241549404, 'Precision@5 on unseen': 0.008305647840531562, 'Recall@5 on unseen': 0.02004825185888309}
{'Precision@10': 0.2210348706411699, 'Recall@10': 0.678370885814794, 'Precision@10 on unseen': 0.006312292358803987, 'Recall@10 on unseen': 0.03401231105550043}
{'Precision@20': 0.12632170978627671, 'Recall@20': 0.6892910624828924, 'Precision@20 on unseen': 0.0034883720930232558, 'Recall@20 on unseen': 0.03543119256159123}
{'Precision@50': 0.06150731158605175, 'Recall@50': 0.6978064122211955, 'Precision@50 on unseen': 0.0017607973421926912, 'Recall@50 on unseen': 0.03810046727317174}


### SVD

In [48]:
U, s, Vt = svds(sparse_user_wise_df, k = 50) 
sigma = np.diag(s)
predicted_interactions = np.dot(np.dot(U, sigma), Vt) 

In [49]:
for k in [1,5,10,20,50]:
    print(evaluate_svd(test_interactions, k=k))

{'Precision@1': 0.2047244094488189, 'Recall@1': 0.0863497992722168, 'Precision@1 on unseen': 0.008305647840531562, 'Recall@1 on unseen': 0.006367663344407531}
{'Precision@5': 0.0843644544431946, 'Recall@5': 0.13563909071733965, 'Precision@5 on unseen': 0.0033222591362126247, 'Recall@5 on unseen': 0.011955099164401491}
{'Precision@10': 0.05568053993250844, 'Recall@10': 0.1684084361993182, 'Precision@10 on unseen': 0.0021594684385382065, 'Recall@10 on unseen': 0.014200700247211874}
{'Precision@20': 0.03751406074240721, 'Recall@20': 0.21385005436562904, 'Precision@20 on unseen': 0.0016611295681063123, 'Recall@20 on unseen': 0.020503362869389155}
{'Precision@50': 0.023397075365579303, 'Recall@50': 0.28919804674176763, 'Precision@50 on unseen': 0.0010963455149501661, 'Recall@50 on unseen': 0.03126939293162485}
