In [1]:
import pandas as pd
import random
from caboose_nbr.tifuknn import TIFUKNN
from caboose_nbr.evaluate_recommendation import evaluate

In [2]:
train_baskets = pd.read_csv('data/instacart_30k/train_baskets.csv.gz')
test_baskets = pd.read_csv('data/instacart_30k/test_baskets.csv')
valid_baskets = pd.read_csv('data/instacart_30k/valid_baskets.csv')

In [3]:
all_users = train_baskets['user_id'].tolist()
sample_users = random.sample(all_users,1000)

In [4]:
train_baskets = train_baskets[train_baskets['user_id'].isin(sample_users)]
test_baskets = test_baskets[test_baskets['user_id'].isin(sample_users)]
valid_baskets = valid_baskets[valid_baskets['user_id'].isin(sample_users)]

In [5]:
train_baskets.shape

(408146, 10)

In [6]:
train_baskets[['user_id','item_id']].drop_duplicates().shape

(122473, 2)

In [7]:
tifu_sklearn_cosine = TIFUKNN(train_baskets, test_baskets, valid_baskets, mode = 'sklearn', distance_metric = 'cosine')
tifu_sklearn_cosine.train()
tifu_sklearn_cosine_preds = tifu_sklearn_cosine.predict()

number of test users: 953
filtered items: 19825
initial data processing
item count: 8207
compute user reps 953
(953, 8207)
start of knn
knn finished


In [8]:
tifu_caboose = TIFUKNN(train_baskets, test_baskets, valid_baskets,'caboose')
tifu_caboose.train()
tifu_caboose_preds = tifu_caboose.predict()

number of test users: 953
filtered items: 19825
initial data processing
item count: 8207
compute user reps 953
(953, 8207)
start of knn
knn finished


In [9]:
bad_users_items = []
for user in tifu_caboose_preds:
    if tifu_caboose_preds[user][:50] != tifu_sklearn_cosine_preds[user][:50]:
        bad_users_items.append(user)
print('total users:',len(tifu_caboose_preds))
print('bad users (users who have a difference in their top 50 predicted items for caboose vs. cosine sklearn:)',len(bad_users_items))

total users: 953
bad users (users who have a difference in their top 50 predicted items for caboose vs. cosine sklearn:) 156


In [10]:
low_neighbors = 0
same_neighbors = []
for user in bad_users_items:
    if len(tifu_caboose.all_user_nns[user]) < tifu_caboose.k:
        low_neighbors +=1
    else:
        same_neighbors.append(user)
print('bad users with less than k neighbors in caboose:',low_neighbors)

bad users with less than k neighbors in caboose: 153


In [11]:
for user in same_neighbors:
    print('------')
    print('user:',user)
    caboose_nns = set([x[0] for x in tifu_caboose.all_user_nns[user]])
    sklearn_nns = set(tifu_sklearn_cosine.all_user_nns[user])
    print(caboose_nns.difference(sklearn_nns))
    print(sklearn_nns.difference(caboose_nns))

------
user: 68927
{71}
{849}
------
user: 111936
{299}
{639}
------
user: 116843
{489}
{501}


In [12]:
user_test_baskets_df = test_baskets.groupby('user_id')['item_id'].apply(list).reset_index()
user_test_baskets_dict = dict(zip( user_test_baskets_df['user_id'],user_test_baskets_df['item_id']))


In [13]:
_,_ = evaluate(user_test_baskets_dict,tifu_sklearn_cosine_preds)

10
recall: 0.2610533319768127
ndcg: 0.3809740919579771
20
recall: 0.3747476855104333
ndcg: 0.3070318247616832


In [14]:
_,_ = evaluate(user_test_baskets_dict,tifu_caboose_preds)

10
recall: 0.2611279501416722
ndcg: 0.38087836350565824
20
recall: 0.3744945438930643
ndcg: 0.30696641577457373
