In [1]:
import pandas as pd
import numpy as np

from retrieval_importance import learn_importance, encode_retrievals
from applications.nbr.tifuknn import TIFUKNN

In [2]:
def hitrate_at_n(y_true, y_pred, n):
    """Return 1.0 if any of the first ``n`` predictions is a true item, else 0.0.

    Args:
        y_true: iterable of relevant (ground-truth) items.
        y_pred: sliceable sequence of predicted items, best first.
        n: number of leading predictions to consider.

    Returns:
        1.0 when the top-``n`` predictions and ``y_true`` share at least one
        item, otherwise 0.0.
    """
    # A non-empty intersection is truthy, an empty one is falsy.
    return 1.0 if set(y_pred[:n]) & set(y_true) else 0.0
    

def recall_at_n(y_true, y_pred, n):
    """Return the fraction of distinct true items found in the first ``n`` predictions.

    Args:
        y_true: iterable of relevant (ground-truth) items; duplicates are
            counted once.
        y_pred: sliceable sequence of predicted items, best first.
        n: number of leading predictions to consider.

    Returns:
        ``|top-n predictions ∩ y_true| / |set(y_true)|`` as a float, or 0.0
        when ``y_true`` is empty.
    """
    relevant = set(y_true)
    if not relevant:
        # Nothing to recall: avoid a ZeroDivisionError for an empty ground truth
        # (e.g. a user without any rows in the evaluation baskets).
        return 0.0
    hits = len(set(y_pred[:n]) & relevant)
    return hits / len(relevant)
    

def precision_at_n(y_true, y_pred, n):
    """Return the share of the top-``n`` slots occupied by true items.

    Args:
        y_true: iterable of relevant (ground-truth) items.
        y_pred: sliceable sequence of predicted items, best first.
        n: number of leading predictions to consider; also the denominator,
            even when fewer than ``n`` predictions are available.

    Returns:
        Number of distinct items shared by the first ``n`` predictions and
        ``y_true``, divided by ``n``.
    """
    hits = set(y_pred[:n]) & set(y_true)
    return len(hits) / n
    

In [3]:
def utility(retrieval, user_vector):
    """Score a vector of item weights by recall@5 against the retrieval's next basket.

    Args:
        retrieval: mapping that provides the ground-truth items under the
            ``'next_basket'`` key.
        user_vector: iterable of weights; the position of each weight is used
            as the item identifier.

    Returns:
        ``recall_at_n`` of the positions with a strictly positive weight,
        ordered by descending weight, cut off at 5.
    """
    # Keep only positions with a positive weight, remembering the position.
    weighted_items = [(position, score) for position, score in enumerate(user_vector) if score > 0.0]
    # Stable descending sort: equal weights keep their original relative order.
    weighted_items.sort(key=lambda entry: entry[1], reverse=True)
    ranked_items = [position for position, _ in weighted_items]

    return recall_at_n(retrieval['next_basket'], ranked_items, 5)

In [4]:
def compare(tifu, tifu_clean, evaluation_baskets, num_eval_users, name):
    """Print mean hitrate@5, precision@5 and recall@5 for two models side by side.

    Args:
        tifu: model whose results are reported as "dirty"; provides
            ``predict()`` and ``user_keys``.
        tifu_clean: model whose results are reported as "clean"; provides
            ``predict()``.
        evaluation_baskets: DataFrame with ``user_id`` and ``item_id`` columns
            holding the ground-truth items.
        num_eval_users: the users at positions ``0 .. num_eval_users - 1`` of
            ``tifu.user_keys`` are evaluated.
        name: label prefixed to every printed line.

    Returns:
        None; the three summary lines are written to stdout.
    """
    dirty_predictions = tifu.predict()
    clean_predictions = tifu_clean.predict()

    # Metric label -> scoring function, in the order the scores are computed.
    scorers = (
        ("hitrate", hitrate_at_n),
        ("recall", recall_at_n),
        ("precision", precision_at_n),
    )
    dirty = {label: [] for label, _ in scorers}
    clean = {label: [] for label, _ in scorers}

    for user_index in range(num_eval_users):
        user = tifu.user_keys[user_index]
        user_rows = evaluation_baskets[evaluation_baskets.user_id == user]
        actual_items = list(user_rows.item_id)

        # Both prediction tables are looked up under the key of the first model.
        dirty_ranked = dirty_predictions[user]
        clean_ranked = clean_predictions[user]

        for ranked, scores in ((dirty_ranked, dirty), (clean_ranked, clean)):
            for label, scorer in scorers:
                scores[label].append(scorer(actual_items, ranked, 5))

    print(f'{name}, hitrate@5_dirty={np.mean(dirty["hitrate"])}, hitrate@5_clean={np.mean(clean["hitrate"])}')
    print(f'{name}, precision@5_dirty={np.mean(dirty["precision"])}, precision@5_clean={np.mean(clean["precision"])}')
    print(f'{name}, recall@5_dirty={np.mean(dirty["recall"])}, recall@5_clean={np.mean(clean["recall"])}')

In [5]:
def experiment(seed, num_users, num_eval_users, threshold, all_train_baskets, all_validation_baskets, all_test_baskets):
    """Run one pruning experiment on a random user sample and print the comparison.

    The function seeds NumPy, samples users, fits a TIFUKNN model on their
    training baskets, builds one retrieval record per evaluation user, learns
    importance values with ``learn_importance``, keeps the users whose value is
    at least ``threshold`` (plus every evaluation user), refits a second model
    on the reduced data and prints metrics for both models via ``compare``.

    Args:
        seed: value passed to ``np.random.seed``; also printed in the header.
        num_users: number of draws from the distinct training user ids.
        num_eval_users: the first ``num_eval_users`` entries of
            ``tifu.user_keys`` are used for learning and evaluation.
        threshold: minimum learned value for a user to be retained.
        all_train_baskets: DataFrame with (at least) a ``user_id`` column.
        all_validation_baskets: DataFrame with ``user_id`` and ``item_id`` columns.
        all_test_baskets: DataFrame with ``user_id`` and ``item_id`` columns.

    Returns:
        None; results are printed.
    """
    np.random.seed(seed)

    candidate_users = list(all_train_baskets.user_id.unique())
    # NOTE(review): np.random.choice draws with replacement by default, so the
    # sample can contain fewer than num_users distinct ids — confirm intended.
    sampled_users = np.random.choice(candidate_users, num_users)

    def only_sampled(baskets):
        # Restrict a basket table to the rows of the sampled users.
        return baskets[baskets.user_id.isin(sampled_users)]

    train_baskets = only_sampled(all_train_baskets)
    validation_baskets = only_sampled(all_validation_baskets)
    test_baskets = only_sampled(all_test_baskets)

    tifu = TIFUKNN(train_baskets, distance_metric='cosine', k=10, kplus=40)
    tifu.train()

    retrievals = []
    for user_index in range(num_eval_users):
        validation_user = tifu.user_keys[user_index]
        user_rows = validation_baskets[validation_baskets.user_id == validation_user]
        # Translate raw item ids through the model's mapper, dropping unknown items.
        next_basket = [
            tifu.item_id_mapper[item]
            for item in list(user_rows.item_id)
            if item in tifu.item_id_mapper
        ]

        # The first entry of the neighbor list is skipped.
        neighbor_indices = tifu.nn_indices[user_index][1:]
        neighbors = [tifu.user_keys[neighbor_index] for neighbor_index in neighbor_indices]
        neighbor_representations = [tifu.user_reps[neighbor_index] for neighbor_index in neighbor_indices]

        # Users without any usable validation item contribute no retrieval.
        if next_basket:
            retrievals.append({
                'user': validation_user,
                'next_basket': next_basket,
                'neighbors': neighbors,
                'neighbor_representations': neighbor_representations,
            })

    encoded_retrievals, mapping = encode_retrievals(retrievals, "neighbors", "neighbor_representations", utility)

    v = learn_importance(encoded_retrievals, k=10, learning_rate=0.1, num_steps=500)

    # NOTE(review): positions in v are used directly as indices into
    # tifu.user_keys and `mapping` is never consulted — confirm that v is
    # ordered like tifu.user_keys.
    users_to_retain = {
        tifu.user_keys[position]
        for position, importance in enumerate(v)
        if importance >= threshold
    }
    # Evaluation users are always kept so both models can predict for them.
    users_to_retain.update(tifu.user_keys[user_index] for user_index in range(num_eval_users))

    cleaned_train_baskets = train_baskets[train_baskets.user_id.isin(users_to_retain)]

    print(f'-----SEED={seed}-----')
    print(f'datasize_dirty={len(train_baskets)}, datasize_clean={len(cleaned_train_baskets)}')

    tifu_clean = TIFUKNN(cleaned_train_baskets, distance_metric='cosine', k=10)
    tifu_clean.train()

    for label, baskets in (('validation', validation_baskets), ('test', test_baskets)):
        compare(tifu, tifu_clean, baskets, num_eval_users, label)
    

In [6]:
# Read the train / validation / test basket tables, in that order.
all_train_baskets, all_validation_baskets, all_test_baskets = [
    pd.read_csv(path)
    for path in (
        "applications/nbr/data/instacart_30k/train_baskets.csv.gz",
        "applications/nbr/data/instacart_30k/valid_baskets.csv",
        "applications/nbr/data/instacart_30k/test_baskets.csv",
    )
]

In [7]:
# Repeat the experiment for five seeds with a retention threshold of 0.5.
for seed in (42, 16, 1812, 1312, 35):
    experiment(
        seed,
        num_users=1000,
        num_eval_users=100,
        threshold=0.5,
        all_train_baskets=all_train_baskets,
        all_validation_baskets=all_validation_baskets,
        all_test_baskets=all_test_baskets,
    )

-----SEED=42-----
datasize_dirty=157463, datasize_clean=88093
validation, hitrate@5_dirty=0.88, hitrate@5_clean=0.88
validation, precision@5_dirty=0.43200000000000005, precisions@5_clean=0.43
validation, recall@5_dirty=0.22947992385475557, recall@5_clean=0.2289561143309461
test, hitrate@5_dirty=0.78, hitrate@5_clean=0.78
test, precision@5_dirty=0.374, precisions@5_clean=0.37
test, recall@5_dirty=0.19696956202884747, recall@5_clean=0.19539698138368616
-----SEED=16-----
datasize_dirty=156897, datasize_clean=83947
validation, hitrate@5_dirty=0.92, hitrate@5_clean=0.9
validation, precision@5_dirty=0.4520000000000001, precisions@5_clean=0.4500000000000001
validation, recall@5_dirty=0.22461817049878455, recall@5_clean=0.22486593550517028
test, hitrate@5_dirty=0.84, hitrate@5_clean=0.85
test, precision@5_dirty=0.402, precisions@5_clean=0.4059999999999999
test, recall@5_dirty=0.20603470513012506, recall@5_clean=0.20762157381699378
-----SEED=1812-----
datasize_dirty=163196, datasize_clean=84146

In [8]:
# Repeat the experiment for five seeds with a retention threshold of 0.2.
for seed in (42, 16, 1812, 1312, 35):
    experiment(
        seed,
        num_users=1000,
        num_eval_users=100,
        threshold=0.2,
        all_train_baskets=all_train_baskets,
        all_validation_baskets=all_validation_baskets,
        all_test_baskets=all_test_baskets,
    )

-----SEED=42-----
datasize_dirty=157463, datasize_clean=141432
validation, hitrate@5_dirty=0.88, hitrate@5_clean=0.88
validation, precision@5_dirty=0.43200000000000005, precisions@5_clean=0.43000000000000005
validation, recall@5_dirty=0.22947992385475557, recall@5_clean=0.22881325718808893
test, hitrate@5_dirty=0.78, hitrate@5_clean=0.78
test, precision@5_dirty=0.374, precisions@5_clean=0.37199999999999994
test, recall@5_dirty=0.19696956202884747, recall@5_clean=0.19606047111975652
-----SEED=16-----
datasize_dirty=156897, datasize_clean=140116
validation, hitrate@5_dirty=0.92, hitrate@5_clean=0.91
validation, precision@5_dirty=0.4520000000000001, precisions@5_clean=0.4520000000000001
validation, recall@5_dirty=0.22461817049878455, recall@5_clean=0.22406261494322902
test, hitrate@5_dirty=0.84, hitrate@5_clean=0.84
test, precision@5_dirty=0.402, precisions@5_clean=0.402
test, recall@5_dirty=0.20603470513012506, recall@5_clean=0.20603470513012506
-----SEED=1812-----
datasize_dirty=163196,