In [1]:
import pandas as pd
import random
from caboose_nbr.upcfr import UPCFr
from caboose_nbr.evaluate_recommendation import evaluate
from caboose_nbr.evaluate_recommendation import evaluate

In [2]:
train_baskets = pd.read_csv('data/instacart_30k/train_baskets.csv.gz')
test_baskets = pd.read_csv('data/instacart_30k/test_baskets.csv')
valid_baskets = pd.read_csv('data/instacart_30k/valid_baskets.csv')

In [3]:
all_users = train_baskets['user_id'].tolist()
sample_users = random.sample(all_users,1000)

In [4]:
train_baskets = train_baskets[train_baskets['user_id'].isin(sample_users)]
test_baskets = test_baskets[test_baskets['user_id'].isin(sample_users)]
valid_baskets = valid_baskets[valid_baskets['user_id'].isin(sample_users)]

In [5]:
train_baskets.shape

(400050, 10)

In [6]:
train_baskets[['user_id','item_id']].drop_duplicates().shape

(123050, 2)

In [7]:
upcfr_similaripy = UPCFr(train_baskets, test_baskets, valid_baskets, mode = 'similaripy')
upcfr_similaripy.train()
upcfr_similaripy_preds = upcfr_similaripy.predict()

number of test users: 959
filtered items: 9290
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data['uid'] = self.data['user_id'].rank(method='dense')-1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data['pid'] = self.data['item_id'].rank(method='dense')-1


uid       959
pid      9290
Score    1650
dtype: int64
(108124, 3)
959 9290


Done: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 959/959 [00:00<00:00, 16186.14it/s]


usersim shape (959, 959)


Done: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 959/959 [00:00<00:00, 4507.87it/s]


In [8]:
upcfr_caboose = UPCFr(train_baskets, test_baskets, valid_baskets, mode = 'caboose')
upcfr_caboose.train()
upcfr_caboose_preds = upcfr_caboose.predict()

number of test users: 959
filtered items: 9290
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data['uid'] = self.data['user_id'].rank(method='dense')-1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data['pid'] = self.data['item_id'].rank(method='dense')-1


uid       959
pid      9290
Score    1650
dtype: int64
(108124, 3)
959 9290


--Creating transpose of R...
--Computing row norms...
--Configuring for top-k -- num_threads: 4; pinning? false;
--Scheduling parallel top-k computation...


In [9]:
def sanity_check(default_preds, caboose_preds):
    bad_preds  = []
    for key in default_preds:
        if default_preds[key] != caboose_preds[key]:
            bad_preds.append(key)
    print('bad predictions:',len(bad_preds))

In [10]:
sanity_check(upcfr_similaripy_preds,upcfr_caboose_preds)

bad predictions: 21


In [11]:
user_test_baskets_df = test_baskets.groupby('user_id')['item_id'].apply(list).reset_index()
user_test_baskets_dict = dict(zip( user_test_baskets_df['user_id'],user_test_baskets_df['item_id']))


In [12]:
_,_ = evaluate(user_test_baskets_dict,upcfr_similaripy_preds)

10
recall: 0.0933517148009453
ndcg: 0.15772899847965743
20
recall: 0.1400875733462191
ndcg: 0.12628324010323905


In [13]:
_,_ = evaluate(user_test_baskets_dict,upcfr_caboose_preds)

10
recall: 0.0933517148009453
ndcg: 0.15772899847965743
20
recall: 0.1400875733462191
ndcg: 0.12628324010323905


In [14]:
interactions_to_forget = train_baskets[['user_id','item_id']].drop_duplicates().sample(5000)

In [15]:
upcfr_similaripy.forget_interactions(interactions_to_forget.values.tolist())
upcfr_similaripy_preds_after_forget = upcfr_similaripy.predict()

uid       959
pid      9254
Score    1634
dtype: int64
(103748, 3)
959 9290


Done: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 959/959 [00:00<00:00, 7858.02it/s]


usersim shape (959, 959)


Done: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 959/959 [00:00<00:00, 4008.22it/s]


In [16]:
upcfr_caboose.forget_interactions(interactions_to_forget.values.tolist())
upcfr_caboose_preds_after_forget = upcfr_caboose.predict()

uid       959
pid      9254
Score    1634
dtype: int64
(103748, 3)
959 9290


In [17]:
sanity_check(upcfr_similaripy_preds_after_forget,upcfr_caboose_preds_after_forget)

bad predictions: 33


In [18]:
_,_ = evaluate(user_test_baskets_dict,upcfr_similaripy_preds_after_forget)

10
recall: 0.09227087857316017
ndcg: 0.15601259476666493
20
recall: 0.140187393624672
ndcg: 0.1260321065956166


In [19]:
_,_ = evaluate(user_test_baskets_dict,upcfr_caboose_preds_after_forget)

10
recall: 0.09227087857316017
ndcg: 0.15601259476666493
20
recall: 0.140187393624672
ndcg: 0.1260321065956166
