In [1]:
import pandas as pd
import random
from caboose_nbr.tifuknn import TIFUKNN
from caboose_nbr.evaluate_recommendation import evaluate

In [2]:
# Read the three pre-made basket splits in train / test / valid order.
# Only the training file has a .gz suffix; pandas infers the compression
# from the file extension by default.
train_baskets, test_baskets, valid_baskets = map(
    pd.read_csv,
    (
        'data/instacart_30k/train_baskets.csv.gz',
        'data/instacart_30k/test_baskets.csv',
        'data/instacart_30k/valid_baskets.csv',
    ),
)

In [8]:
# Product catalogue plus the aisle lookup table it references.
aisles = pd.read_csv('data/instacart_30k/aisles.csv')
products = pd.read_csv('data/instacart_30k/products.csv')
# Default (inner) join on the shared aisle_id column, products on the left.
products_with_aisles = pd.merge(products, aisles, on='aisle_id')

In [11]:
# Attach product/aisle columns to every training row: item_id on the basket
# side is matched against product_id on the catalogue side. This is the
# default inner join, so rows without a catalogue match are dropped.
train_baskets_with_aisles = pd.merge(
    train_baskets,
    products_with_aisles,
    left_on="item_id",
    right_on="product_id",
)

In [60]:
import numpy as np
# Seed the legacy global RNG so the two cohort draws below are repeatable.
np.random.seed(42)

sample_size = 100
# Aisle ids treated as sensitive. Presumably the baby-related aisles, judging by
# the variable names used below -- TODO confirm against aisles.csv.
sensitive_aisles = [82, 92, 102, 56]

# Training rows that fall in a sensitive aisle, and every user who has at least one.
baby_baskets = train_baskets_with_aisles[train_baskets_with_aisles.aisle_id.isin(sensitive_aisles)]
all_baby_users = baby_baskets.user_id.unique()
# Fix: this used to filter on `baby_users`, which is only assigned at the bottom of
# this cell, so a fresh kernel raised NameError (it only ever worked off a stale
# variable from a previous execution). Use the full population of sensitive-aisle users.
baby_user_baskets = train_baskets_with_aisles[train_baskets_with_aisles.user_id.isin(all_baby_users)]

# Non-sensitive aisles that the sensitive-aisle users also bought from.
other_aisles = [aisle for aisle in baby_user_baskets.aisle_id.unique() if aisle not in sensitive_aisles]

# Users who bought from those aisles but never from a sensitive one.
in_other_aisles = train_baskets_with_aisles.aisle_id.isin(other_aisles)
never_sensitive = ~train_baskets_with_aisles.user_id.isin(all_baby_users)
all_nonbaby_users = train_baskets_with_aisles[in_other_aisles & never_sensitive].user_id.unique()

# Draw `sample_size` users for each cohort (np.random.choice already returns an ndarray).
# NOTE(review): np.random.choice samples WITH replacement by default, so a cohort may
# contain repeated user ids. Pass replace=False if distinct users are required. It is
# left unchanged here so the seeded draw -- and the user id hard-coded in a later cell --
# stays the same.
baby_users = np.random.choice(all_baby_users, sample_size)
nonbaby_users = np.random.choice(all_nonbaby_users, sample_size)

In [66]:
# Both cohorts combined into one array of user ids.
users = np.concatenate([baby_users, nonbaby_users])

# Keep only the rows of each split that belong to a sampled user.
sampled_train_baskets, sampled_test_baskets, sampled_valid_baskets = (
    split[split['user_id'].isin(users)]
    for split in (train_baskets, test_baskets, valid_baskets)
)

In [68]:
# Sanity check: (rows, columns) of the sampled training data.
sampled_train_baskets.shape

(39814, 10)

In [69]:
# Number of distinct (user_id, item_id) pairs in the sampled training data
# (first element of the displayed shape).
sampled_train_baskets.loc[:, ['user_id', 'item_id']].drop_duplicates().shape

(15620, 2)

In [70]:
# Build the TIFU-KNN recommender on the sampled splits and fit it.
# Arguments stay positional because the constructor's parameter names are not
# visible here; the final 'caboose' string presumably selects the kNN backend
# -- TODO confirm against caboose_nbr.tifuknn.
tifu_caboose = TIFUKNN(
    sampled_train_baskets,
    sampled_test_baskets,
    sampled_valid_baskets,
    'caboose',
)
tifu_caboose.train()

number of test users: 200
filtered items: 6762
initial data processing
item count: 1556
compute basket reps
compute user reps 200
(200, 1556)
start of knn
knn finished


--Creating transpose of R...
--Computing row norms...
--Configuring for top-k -- num_threads: 8; pinning? false;
--Scheduling parallel top-k computation...


In [72]:
# Recommendations from the trained model. Later cells index the result by user id
# and slice the per-user value (tifu_caboose_preds[user][:10]), so it behaves as a
# mapping from user id to a ranked sequence of item ids.
tifu_caboose_preds = tifu_caboose.predict()

{1259: [23909,
  46676,
  13176,
  13944,
  19057,
  37646,
  33754,
  31422,
  19488,
  47119,
  5047,
  16696,
  40571,
  15693,
  40910,
  34050,
  24852,
  47865,
  21903,
  21137,
  47209,
  24964,
  47626,
  27966,
  47766,
  26209,
  27845,
  27086,
  45007,
  22935,
  16797,
  43961,
  33000,
  48679,
  28985,
  26604,
  21938,
  5876,
  44359,
  17794,
  28204,
  31717,
  45066,
  49683,
  33731,
  24184,
  5077,
  49235,
  4920,
  8518,
  19660,
  40706,
  30391,
  30489,
  16759,
  42265,
  17600,
  39275,
  24489,
  22035,
  46979,
  29487,
  5450,
  43352,
  19048,
  34126,
  39877,
  14678,
  18523,
  29447,
  14992,
  34969,
  22825,
  9839,
  11520,
  42736,
  36865,
  42828,
  17948,
  31506,
  43768,
  27344,
  3952,
  35951,
  26790,
  8021,
  27104,
  24838,
  46667,
  4605,
  44632,
  8424,
  10749,
  25890,
  39475,
  6348,
  329,
  38689,
  42701,
  7948,
  34243,
  10673,
  8277,
  18027,
  28849,
  27336,
  46906,
  41665,
  43122,
  41950,
  33787,
  18656,
  

In [76]:
# Every distinct item id that appears in a sensitive-aisle training row.
baby_items = set(baby_baskets['item_id'].unique())

In [82]:
# For each sampled sensitive-aisle user, report when any of their first ten
# recommendations is itself a sensitive-aisle item.
for user in baby_users:
    predictions = tifu_caboose_preds[user][:10]
    predicted_baby_items = set(predictions) & baby_items
    has_baby_items = bool(predicted_baby_items)
    if not has_baby_items:
        continue
    print(user, predictions, predicted_baby_items)

188793 [26620, 32864, 34584, 13176, 24489, 41149, 40449, 8177, 28373, 2228] {40449}
182291 [13176, 37824, 47209, 27845, 5491, 30489, 24561, 17630, 14778, 30967] {37824, 5491}
19343 [16757, 5564, 20957, 44310, 35689, 43858, 17835, 7806, 14399, 18811] {20957}
172577 [12614, 28199, 2664, 24390, 47626, 25146, 24221, 17461, 39993, 16560] {2664, 16560}
180914 [13870, 34197, 30391, 45066, 27344, 2611, 32303, 30192, 49044, 22969] {30192, 22969, 2611, 49044}
9446 [13263, 46720, 24964, 38164, 13176, 5785, 32989, 43961, 28465, 24799] {32989}
109645 [29594, 24489, 33290, 24838, 31506, 7948, 45767, 47167, 40992, 39812] {47167, 45767}
35700 [24852, 47766, 21903, 6948, 17600, 44570, 40174, 45767, 28849, 3376] {45767}
177090 [21386, 29479, 39561, 8204, 13176, 16508, 16020, 16797, 37096, 28993] {16508, 29479}
5210 [13176, 7948, 47209, 27966, 41950, 6986, 21137, 16759, 22935, 2855] {6986}
129101 [24852, 43875, 17795, 37096, 287, 34013, 16797, 47766, 21903, 13176] {43875}
126163 [1463, 24852, 2611, 5785,

In [88]:
# One of the users flagged by the report loop above (taken from its recorded output).
chosen_user = 19343

# Distinct items this user bought in the sampled training data ...
is_chosen_user = sampled_train_baskets['user_id'] == chosen_user
chosen_users_items = sampled_train_baskets.loc[is_chosen_user, 'item_id'].unique()
# ... narrowed down to the sensitive-aisle ones.
chosen_users_baby_items = set(chosen_users_items) & baby_items

# Ask the model to forget every (user, sensitive item) interaction of this user.
# NOTE(review): this changes `tifu_caboose` in place, so re-running the cell
# acts on an already-updated model.
to_forget = [(chosen_user, item) for item in chosen_users_baby_items]
tifu_caboose.forget_interactions(to_forget)

In [89]:
# Re-run prediction on the same model object after forget_interactions was called on it.
tifu_caboose_preds_after_forget = tifu_caboose.predict()

In [93]:
# First ten recommendations for the chosen user after forgetting (same cut-off as the report loop).
predictions_after_forget = tifu_caboose_preds_after_forget[chosen_user][:10]

In [94]:
# Sensitive-aisle items still present in the chosen user's top ten; an empty set
# means none of them are recommended any more.
set(predictions_after_forget) & baby_items

set()