In [1]:
import pandas as pd
import random
from caboose_nbr.tifuknn import TIFUKNN
from caboose_nbr.evaluate_recommendation import evaluate

In [2]:
train_baskets = pd.read_csv('data/instacart_30k/train_baskets.csv.gz')
test_baskets = pd.read_csv('data/instacart_30k/test_baskets.csv')
valid_baskets = pd.read_csv('data/instacart_30k/valid_baskets.csv')

In [3]:
all_users = train_baskets['user_id'].tolist()
sample_users = random.sample(all_users,1000)

In [4]:
train_baskets = train_baskets[train_baskets['user_id'].isin(sample_users)]
test_baskets = test_baskets[test_baskets['user_id'].isin(sample_users)]
valid_baskets = valid_baskets[valid_baskets['user_id'].isin(sample_users)]

In [5]:
train_baskets.shape

(411999, 10)

In [6]:
train_baskets[['user_id','item_id']].drop_duplicates().shape

(125386, 2)

In [7]:
tifu_sklearn_cosine = TIFUKNN(train_baskets, test_baskets, valid_baskets, mode = 'sklearn', distance_metric = 'cosine')
tifu_sklearn_cosine.train()
tifu_sklearn_cosine_preds = tifu_sklearn_cosine.predict()

number of test users: 957
filtered items: 19829
initial data processing
item count: 8420
compute basket reps
1000  baskets passed
2000  baskets passed
3000  baskets passed
4000  baskets passed
5000  baskets passed
6000  baskets passed
7000  baskets passed
8000  baskets passed
9000  baskets passed
10000  baskets passed
11000  baskets passed
12000  baskets passed
13000  baskets passed
14000  baskets passed
15000  baskets passed
16000  baskets passed
17000  baskets passed
18000  baskets passed
19000  baskets passed
20000  baskets passed
21000  baskets passed
22000  baskets passed
23000  baskets passed
24000  baskets passed
25000  baskets passed
26000  baskets passed
27000  baskets passed
28000  baskets passed
29000  baskets passed
compute user reps 957
(957, 8420)
start of knn
knn finished


In [8]:
tifu_caboose = TIFUKNN(train_baskets, test_baskets, valid_baskets,'caboose')
tifu_caboose.train()
tifu_caboose_preds = tifu_caboose.predict()

number of test users: 957
filtered items: 19829
initial data processing
item count: 8420
compute basket reps
1000  baskets passed
2000  baskets passed
3000  baskets passed
4000  baskets passed
5000  baskets passed
6000  baskets passed
7000  baskets passed
8000  baskets passed
9000  baskets passed
10000  baskets passed
11000  baskets passed
12000  baskets passed
13000  baskets passed
14000  baskets passed
15000  baskets passed
16000  baskets passed
17000  baskets passed
18000  baskets passed
19000  baskets passed
20000  baskets passed
21000  baskets passed
22000  baskets passed
23000  baskets passed
24000  baskets passed
25000  baskets passed
26000  baskets passed
27000  baskets passed
28000  baskets passed
29000  baskets passed
compute user reps 957
(957, 8420)
start of knn
knn finished


In [9]:
interactions_to_forget = train_baskets[['user_id','item_id']].drop_duplicates().sample(5000)

In [None]:
tifu_caboose.train_baskets[(train_baskets['user_id'] == 19482) & (train_baskets['item_id'] == 32433) ].head()

In [None]:
import numpy as np
import math
sorted_baskets = tifu_caboose.train_baskets.sort_values(['user_id','order_number'])
sorted_baskets = sorted_baskets[['user_id','basket_id']].drop_duplicates()
user_baskets_df = sorted_baskets.groupby('user_id')['basket_id'].apply(list).reset_index()
user_baskets_dict = dict(zip(user_baskets_df['user_id'],user_baskets_df['basket_id']))
basket_items_df = tifu_caboose.train_baskets[['basket_id','item_id']].drop_duplicates().groupby('basket_id')['item_id'] \
            .apply(list).reset_index()
basket_items_dict = dict(zip(basket_items_df['basket_id'],basket_items_df['item_id']))

rep = np.array([0.0]* len(tifu_caboose.item_id_mapper))

user = 19482
baskets = user_baskets_dict[user]
group_size = math.ceil(len(baskets)/tifu_caboose.m)
addition = (group_size * tifu_caboose.m) - len(baskets)

basket_groups = []
basket_groups.append(baskets[:group_size-addition])
for i in range(tifu_caboose.m-1):
    basket_groups.append(baskets[group_size-addition+(i* group_size):group_size-addition+((i+1)* group_size)])

for i in range(tifu_caboose.m):
    print('group')
    group_rep = np.array([0.0]* len(tifu_caboose.item_id_mapper))
    for j in range(1,len(basket_groups[i])+1):
        basket = basket_groups[i][j-1]

        this_rep = [0]* len(tifu_caboose.item_id_mapper)
        for item in basket_items_dict[basket]:
            if item in tifu_caboose.item_id_mapper:
                if item == 32433:
                    print('miad!')
                this_rep[tifu_caboose.item_id_mapper[item]] = 1

        basket_rep = np.array(this_rep) * math.pow(tifu_caboose.rb, group_size-j)
        group_rep += basket_rep
    group_rep /= group_size
    print(group_rep[tifu_caboose.item_id_mapper[32433]])
    rep += np.array(group_rep) * math.pow(tifu_caboose.rg, tifu_caboose.m-i)
    print(rep[tifu_caboose.item_id_mapper[32433]])

rep /= tifu_caboose.m

In [None]:
for user,item in interactions_to_forget.values.tolist():
    if item in tifu_caboose.item_id_mapper and user in tifu_caboose.user_map:
        print(user,item)
        print(tifu_caboose.user_reps[tifu_caboose.user_map[user]][tifu_caboose.item_id_mapper[item]])
        #tifu_caboose.user_reps[tifu_caboose.user_map[user]][tifu_caboose.item_id_mapper[item]] = 0
        #tifu_caboose.caboose.forget(tifu_caboose.user_map[user],tifu_caboose.item_id_mapper[item])

In [10]:
tifu_sklearn_cosine.forget_interactions(interactions_to_forget.values.tolist())
tifu_sklearn_cosine_preds_after_forget = tifu_sklearn_cosine.predict()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.train_baskets.drop('user_item', axis=1, inplace=True)


initial data processing
item count: 8209
compute basket reps
1000  baskets passed
2000  baskets passed
3000  baskets passed
4000  baskets passed
5000  baskets passed
6000  baskets passed
7000  baskets passed
8000  baskets passed
9000  baskets passed
10000  baskets passed
11000  baskets passed
12000  baskets passed
13000  baskets passed
14000  baskets passed
15000  baskets passed
16000  baskets passed
17000  baskets passed
18000  baskets passed
19000  baskets passed
20000  baskets passed
21000  baskets passed
22000  baskets passed
23000  baskets passed
24000  baskets passed
25000  baskets passed
26000  baskets passed
27000  baskets passed
28000  baskets passed
29000  baskets passed
compute user reps 957
(957, 8420)
start of knn
knn finished


In [11]:
tifu_caboose.forget_interactions(interactions_to_forget.values.tolist())
tifu_caboose_preds_after_forget = tifu_caboose.predict()

In [13]:
user_test_baskets_df = test_baskets.groupby('user_id')['item_id'].apply(list).reset_index()
user_test_baskets_dict = dict(zip( user_test_baskets_df['user_id'],user_test_baskets_df['item_id']))

In [14]:
_,_ = evaluate(user_test_baskets_dict,tifu_sklearn_cosine_preds)

10
recall: 0.28589681113851917
ndcg: 0.4309156000846968
20
recall: 0.4089619259853871
ndcg: 0.3476394477420133


In [15]:
_,_ = evaluate(user_test_baskets_dict,tifu_caboose_preds)

10
recall: 0.28589681113851917
ndcg: 0.4309156000846968
20
recall: 0.4089619259853871
ndcg: 0.34763865376373787


In [16]:
_,_ = evaluate(user_test_baskets_dict,tifu_sklearn_cosine_preds_after_forget)

10
recall: 0.2804697395752089
ndcg: 0.42409397175943037
20
recall: 0.4002570049935217
ndcg: 0.34193608168412404


In [19]:
_,_ = evaluate(user_test_baskets_dict,tifu_caboose_preds_after_forget)

10
recall: 0.28069658057284086
ndcg: 0.4242549234588572
20
recall: 0.4008608428074309
ndcg: 0.34218268604635294
