In [1]:
import pandas as pd
import random
from caboose_nbr.tifuknn import TIFUKNN

In [2]:
train_baskets = pd.read_csv('data/instacart_30k/train_baskets.csv.gz')
test_baskets = pd.read_csv('data/instacart_30k/test_baskets.csv')
valid_baskets = pd.read_csv('data/instacart_30k/valid_baskets.csv')

In [3]:
all_users = train_baskets['user_id'].tolist()
sample_users = random.sample(all_users,1000)

In [4]:
train_baskets = train_baskets[train_baskets['user_id'].isin(sample_users)]
test_baskets = test_baskets[test_baskets['user_id'].isin(sample_users)]
valid_baskets = valid_baskets[valid_baskets['user_id'].isin(sample_users)]

In [5]:
train_baskets.shape

(408407, 10)

In [6]:
train_baskets[['user_id','item_id']].drop_duplicates().shape

(126219, 2)

In [7]:
tifu_sklearn_cosine = TIFUKNN(train_baskets, test_baskets, valid_baskets, mode = 'sklearn', distance_metric = 'cosine')
tifu_sklearn_cosine.train()
tifu_sklearn_cosine_preds = tifu_sklearn_cosine.predict()

number of test users: 966
filtered items: 20056
initial data processing
item count: 8387
compute user reps 966
(966, 8387)
start of knn
knn finished


In [8]:
tifu_caboose = TIFUKNN(train_baskets, test_baskets, valid_baskets,'caboose')
tifu_caboose.train()
tifu_caboose_preds = tifu_caboose.predict()

number of test users: 966
filtered items: 20056
initial data processing
item count: 8387
compute user reps 966
(966, 8387)
start of knn
knn finished


In [10]:
bad_users_items = []
for user in tifu_caboose_preds:
    if tifu_caboose_preds[user][:50] != tifu_sklearn_cosine_preds[user][:50]:
        bad_users_items.append(user)
print('total users:',len(tifu_caboose_preds))
print('bad users (users who have a difference in their top 50 predicted items for caboose vs. cosine sklearn:)',len(bad_users_items))

total users: 966
bad users (users who have a difference in their top 50 predicted items for caboose vs. cosine sklearn:) 189


In [15]:
low_neighbors = 0
same_neighbors = []
for user in bad_users_items:
    if len(tifu_caboose.all_user_nns[user]) < tifu_caboose.k:
        low_neighbors +=1
    else:
        same_neighbors.append(user)
print('bad users with less than k neighbors in caboose:',low_neighbors)

bad users with less than k neighbors in caboose: 184


In [13]:
user = 43927
caboose_nns = set([x[0] for x in tifu_caboose.all_user_nns[user]])
sklearn_nns = set(tifu_sklearn_cosine.all_user_nns[user])
print(caboose_nns.difference(sklearn_nns))
print(sklearn_nns.difference(caboose_nns))

{238}
{458}


In [14]:
user = 110210
caboose_nns = set([x[0] for x in tifu_caboose.all_user_nns[user]])
sklearn_nns = set(tifu_sklearn_cosine.all_user_nns[user])
print(caboose_nns.difference(sklearn_nns))
print(sklearn_nns.difference(caboose_nns))

{504}
{253}


In [None]:
user = 144107
caboose_nns = set([x[0] for x in tifu_caboose.all_user_nns[user]])
sklearn_nns = set(tifu_sklearn_cosine.all_user_nns[user])
print(caboose_nns.difference(sklearn_nns))
print(sklearn_nns.difference(caboose_nns))