In [1]:
import pandas as pd
import numpy as np
from caboose_nbr.pernir import Pernir

In [2]:
train_baskets = pd.read_csv('data/instacart_30k/train_baskets.csv.gz')
test_baskets = pd.read_csv('data/instacart_30k/test_baskets.csv')
valid_baskets = pd.read_csv('data/instacart_30k/valid_baskets.csv')

In [3]:
aisles = pd.read_csv('data/instacart_30k/aisles.csv')
products = pd.read_csv('data/instacart_30k/products.csv')
products_with_aisles = products.merge(aisles, on='aisle_id')

In [4]:
train_baskets_with_aisles = train_baskets.merge(products_with_aisles, left_on="item_id", right_on="product_id")

In [5]:
# Works for seed=42 and sample_size=100,250,1000

seed = 1312
sample_size = 100
sensitive_aisles = [82, 92, 102, 56]

np.random.seed(seed)

baby_baskets = train_baskets_with_aisles[train_baskets_with_aisles.aisle_id.isin(sensitive_aisles)]
all_baby_users = baby_baskets.user_id.unique()
baby_users = np.array(np.random.choice(all_baby_users, sample_size))
baby_user_baskets = train_baskets_with_aisles[train_baskets_with_aisles.user_id.isin(baby_users)]

other_aisles = [aisle for aisle in baby_user_baskets.aisle_id.unique() if aisle not in sensitive_aisles]

all_nonbaby_users  = train_baskets_with_aisles[(train_baskets_with_aisles.aisle_id.isin(other_aisles)) \
                          & (~train_baskets_with_aisles.user_id.isin(all_baby_users))].user_id.unique()


nonbaby_users = np.array(np.random.choice(all_nonbaby_users, sample_size))

In [6]:
users = np.concatenate((baby_users, nonbaby_users))
sampled_train_baskets = train_baskets[train_baskets['user_id'].isin(users)]
sampled_test_baskets = test_baskets[test_baskets['user_id'].isin(users)]
sampled_valid_baskets = valid_baskets[valid_baskets['user_id'].isin(users)]

In [7]:
sampled_train_baskets[['user_id','item_id']].drop_duplicates().shape

(15486, 2)

In [8]:
pernir_caboose = Pernir(sampled_train_baskets, sampled_test_baskets, 'caboose')
pernir_caboose.train()

start of knn
knn finished


--Creating transpose of R...
--Computing row norms...
--Configuring for top-k -- num_threads: 8; pinning? false;
--Scheduling parallel top-k computation...


In [9]:
baby_items = set(baby_baskets.item_id.unique())

for user in baby_users:
    
    predictions, scores = pernir_caboose.predict_for_user(user, 10)
    nb = pernir_caboose.user_neighbors[user]
    predicted_baby_items = set(predictions) & baby_items
    has_baby_items = len(predicted_baby_items) > 0
    
    if has_baby_items:
        chosen_users_items = sampled_train_baskets[sampled_train_baskets.user_id==user].item_id.unique()
        chosen_users_baby_items = set(chosen_users_items) & baby_items

        to_forget = [(user, item) for item in chosen_users_baby_items]
        pernir_caboose.forget_interactions(to_forget)
        #predictions_after_forget = pernir_caboose.predict_for_user(user, 10)
        predictions_after_forget, scores_after = pernir_caboose.predict_for_user(user, 10)
        na = pernir_caboose.user_neighbors[user]
        remaining_baby_items = set(predictions_after_forget) & baby_items
        #print('Before', predictions)
        #print('Before', nb)        
        #print('After', predictions_after_forget)
        #print('After', na)        
        print(f'User {user}: ({len(predicted_baby_items)},{len(to_forget)}) --> {len(remaining_baby_items)}')
        print('---')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.train_baskets['user_item'] = self.train_baskets.apply(lambda x: [x['user_id'],x['item_id']],axis = 1)


Before [158226, 114276, 5218, 202924, 45893, 174739, 181611, 205905, 152313, 76694]
After [158226, 5218, 202924, 45893, 114276, 76694, 174739, 181611, 205905, 152313]
User 206056: (1,4) --> 1
---
Before [43467, 5218, 158226, 40404, 63165, 125951, 193304, 205326]
After [43467, 193304, 158226, 125951, 205326, 5218, 40404, 63165]
User 168374: (1,1) --> 1
---
Before [81175, 193304, 43467, 152923, 67810, 3194, 103467, 91692, 137197, 26310]
After [81175, 193304, 43467, 152923, 103467, 91692, 67810, 137197, 26310, 3194]
User 160617: (1,29) --> 1
---
Before [124638, 33263, 33477, 72097, 152782, 170657, 118823, 72539, 65410, 189344]
After [124638, 33477, 33263, 118823, 65410, 152782, 72097, 72539, 189344, 170657]
User 194289: (1,1) --> 1
---
Before [140883, 54522, 45603, 76943, 67341, 167309, 76573, 192448, 27377, 28905]
After [140883, 76943, 54522, 45603, 28905, 192448, 67341, 76573, 27377, 167309]
User 105207: (2,6) --> 2
---
Before [122062, 76573, 45603, 141160, 131949, 27377, 48209, 67341, 