In [1]:
import pandas as pd
import numpy as np
from caboose_nbr.upcfr import UPCFr

In [2]:
train_baskets = pd.read_csv('data/instacart_30k/train_baskets.csv.gz')
test_baskets = pd.read_csv('data/instacart_30k/test_baskets.csv')
valid_baskets = pd.read_csv('data/instacart_30k/valid_baskets.csv')

In [3]:
aisles = pd.read_csv('data/instacart_30k/aisles.csv')
products = pd.read_csv('data/instacart_30k/products.csv')
products_with_aisles = products.merge(aisles, on='aisle_id')

In [4]:
train_baskets_with_aisles = train_baskets.merge(products_with_aisles, left_on="item_id", right_on="product_id")

In [5]:
# Works for seed=42 and sample_size=100,250,1000

seed = 1312
sample_size = 250
sensitive_aisles = [82, 92, 102, 56]

np.random.seed(seed)

baby_baskets = train_baskets_with_aisles[train_baskets_with_aisles.aisle_id.isin(sensitive_aisles)]
all_baby_users = baby_baskets.user_id.unique()
baby_users = np.array(np.random.choice(all_baby_users, sample_size))
baby_user_baskets = train_baskets_with_aisles[train_baskets_with_aisles.user_id.isin(baby_users)]

other_aisles = [aisle for aisle in baby_user_baskets.aisle_id.unique() if aisle not in sensitive_aisles]

all_nonbaby_users  = train_baskets_with_aisles[(train_baskets_with_aisles.aisle_id.isin(other_aisles)) \
                          & (~train_baskets_with_aisles.user_id.isin(all_baby_users))].user_id.unique()


nonbaby_users = np.array(np.random.choice(all_nonbaby_users, sample_size))

In [6]:
users = np.concatenate((baby_users, nonbaby_users))
sampled_train_baskets = train_baskets[train_baskets['user_id'].isin(users)]
sampled_test_baskets = test_baskets[test_baskets['user_id'].isin(users)]
sampled_valid_baskets = valid_baskets[valid_baskets['user_id'].isin(users)]

In [7]:
sampled_train_baskets[['user_id','item_id']].drop_duplicates().shape

(42192, 2)

In [8]:
upcfr_caboose = UPCFr(sampled_train_baskets, sampled_test_baskets, sampled_valid_baskets, 'caboose')
upcfr_caboose.train()

number of test users: 496
filtered items: 3980
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data['uid'] = self.data['user_id'].rank(method='dense')-1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data['pid'] = self.data['item_id'].rank(method='dense')-1
--Creating transpose of R...
--Computing row norms...
--Configuring for top-k -- num_threads: 8; pinning? false;
--Scheduling parallel top-k computation...


In [9]:
baby_items = set(baby_baskets.item_id.unique())

for user in baby_users:
    
    predictions = upcfr_caboose.predict_for_user(user, 10)

    predicted_baby_items = set(predictions) & baby_items
    has_baby_items = len(predicted_baby_items) > 0
    
    if has_baby_items:
        #chosen_users_items = sampled_train_baskets[sampled_train_baskets.user_id==user].item_id.unique()
        chosen_users_items = upcfr_caboose.data[upcfr_caboose.data.user_id==user].item_id.unique()        
        chosen_users_baby_items = set(chosen_users_items) & baby_items

        len_before = len(upcfr_caboose.data)
        to_forget = [(user, item) for item in chosen_users_baby_items]        
        upcfr_caboose.forget_interactions(to_forget)
        predictions_after_forget = upcfr_caboose.predict_for_user(user, 10)

        remaining_baby_items = set(predictions_after_forget) & baby_items    
        print(len_before, '-->', len(upcfr_caboose.data))
        print(f'User {user}: ({len(chosen_users_baby_items)},{len(predicted_baby_items)},{len(to_forget)}) --> {len(remaining_baby_items)}')
        print('---')

98359 --> 98356
User 206056: (2,1,2) --> 1
---
98356 --> 98355
User 76573: (1,1,1) --> 1
---
98355 --> 98353
User 129043: (2,1,2) --> 1
---
98353 --> 98266
User 158226: (48,6,48) --> 0
---
98266 --> 98262
User 205326: (2,1,2) --> 0
---
98262 --> 98250
User 80520: (4,1,4) --> 0
---
98250 --> 98249
User 157817: (1,1,1) --> 1
---
98249 --> 98245
User 196384: (3,1,3) --> 0
---
98245 --> 98243
User 44924: (1,1,1) --> 1
---
98243 --> 98131
User 165126: (21,4,21) --> 0
---
98131 --> 98122
User 28902: (9,1,9) --> 0
---
98122 --> 98118
User 177746: (3,1,3) --> 0
---
98118 --> 98057
User 69141: (15,1,15) --> 0
---
98057 --> 98050
User 187202: (1,1,1) --> 1
---
98050 --> 98048
User 107791: (2,1,2) --> 1
---


In [10]:
baby_items = set(baby_baskets.item_id.unique())

for user in baby_users:
    
    predictions = upcfr_caboose.predict_for_user(user, 10)

    predicted_baby_items = set(predictions) & baby_items
    has_baby_items = len(predicted_baby_items) > 0
    
    if has_baby_items:
        chosen_users_items = sampled_train_baskets[sampled_train_baskets.user_id==user].item_id.unique()
        chosen_users_baby_items = set(chosen_users_items) & baby_items

        to_forget = [(user, item) for item in chosen_users_baby_items]
        upcfr_caboose.forget_interactions(to_forget)
        predictions_after_forget = upcfr_caboose.predict_for_user(user, 10)

        remaining_baby_items = set(predictions_after_forget) & baby_items
        #print('Before', predictions)      
        #print('After', predictions_after_forget)     
        print(f'User {user}: ({len(predicted_baby_items)},{len(to_forget)}) --> {len(remaining_baby_items)}')
        print('---')

User 206056: (1,4) --> 1
---
User 76573: (1,1) --> 1
---
User 129043: (1,2) --> 1
---
User 157817: (1,1) --> 1
---
User 44924: (1,2) --> 1
---
User 187202: (1,1) --> 1
---
User 107791: (1,2) --> 1
---
