In [1]:
# TODO add to dev dependencies
# !pip install pandas==1.5.3
# !pip install scipy==1.10.1
# !pip install scikit-learn==1.2.2
# !pip install matplotlib

In [2]:
import pandas as pd
import numpy as np

from retrieval_importance import learn_importance, encode_retrievals
from applications.nbr.tifuknn import TIFUKNN

In [3]:
def recall_at_n(y_true, y_pred, n):
    """Compute recall@n: the share of distinct true items found in the first n predictions.

    Args:
        y_true: iterable of ground-truth items; duplicates are counted once.
        y_pred: sliceable sequence of predicted items; only ``y_pred[:n]`` is inspected.
        n: number of leading predictions to consider.

    Returns:
        A float in [0, 1]. When ``y_true`` is empty the result is 0.0 instead of
        raising ``ZeroDivisionError``, so a single user without ground-truth
        items cannot abort a whole evaluation loop.
    """
    relevant = set(y_true)
    if not relevant:
        # Recall is undefined without ground truth; score it as a miss.
        return 0.0

    hits = set(y_pred[:n]) & relevant
    return len(hits) / len(relevant)

In [4]:
def utility(retrieval, prediction):
    """Score a prediction vector against a retrieval's next basket using recall@5.

    Positions in ``prediction`` are treated as item identifiers and the values
    at those positions as weights. Only items with a strictly positive weight
    are ranked; the ranking is by descending weight, with ties kept in their
    original positional order.

    Args:
        retrieval: mapping that provides the ground-truth items under the
            ``'next_basket'`` key.
        prediction: iterable of numeric weights, one per item position.

    Returns:
        recall@5 of the ranked items, rounded to two decimals with ``np.around``.
    """
    weighted_items = []
    for item, weight in enumerate(prediction):
        if weight > 0.0:
            weighted_items.append((weight, item))

    # list.sort is stable, so equally weighted items keep their positional order.
    weighted_items.sort(key=lambda pair: pair[0], reverse=True)
    ranked_items = [item for _, item in weighted_items]

    return np.around(recall_at_n(retrieval['next_basket'], ranked_items, 5), decimals=2)

In [5]:
def compare(tifu, tifu_clean, evaluation_baskets, num_eval_users, name):
    """Print a recall@5 comparison between a model and its 'clean' counterpart.

    The first ``num_eval_users`` entries of ``tifu.user_keys`` are evaluated.
    The same user keys are used to look up predictions from both models.

    Args:
        tifu: model trained on the full ("dirty") data; must expose
            ``predict()`` and ``user_keys``.
        tifu_clean: model trained on the pruned data; must expose ``predict()``.
        evaluation_baskets: DataFrame with ``user_id`` and ``item_id`` columns
            holding the ground-truth items per user.
        num_eval_users: how many leading users of ``tifu.user_keys`` to score.
        name: label printed at the start of both output lines.

    Returns:
        None. Two lines are printed: the number of users for whom the clean
        model scored lower / higher, and the mean recall@5 of each model.
    """
    dirty_by_user = tifu.predict()
    clean_by_user = tifu_clean.predict()

    scores, scores_clean = [], []

    for position in range(num_eval_users):
        user = tifu.user_keys[position]
        belongs_to_user = evaluation_baskets.user_id == user
        actual_items = list(evaluation_baskets[belongs_to_user].item_id)

        # Both lookups happen before any scoring, as a missing user should
        # surface before the metric is computed.
        dirty_prediction = dirty_by_user[user]
        clean_prediction = clean_by_user[user]

        scores.append(recall_at_n(actual_items, dirty_prediction, 5))
        scores_clean.append(recall_at_n(actual_items, clean_prediction, 5))

    # "better"/"worse" are from the clean model's point of view; ties count as neither.
    paired_scores = list(zip(scores, scores_clean))
    users_better = sum(1 for dirty, clean in paired_scores if clean > dirty)
    users_worse = sum(1 for dirty, clean in paired_scores if dirty > clean)

    print(f'{name}, users_worse={users_worse}, users_better={users_better}')
    print(f'{name}, recall@5_dirty={np.mean(scores)}, recall@5_clean={np.mean(scores_clean)}')

In [6]:
def experiment(seed, num_users, num_eval_users, all_train_baskets, all_validation_baskets, all_test_baskets):
    """Run one seeded prune-and-compare experiment and print its results.

    Steps, in order:
      1. Seed NumPy's global RNG and draw ``num_users`` user ids from the
         training baskets.
      2. Restrict the train/validation/test frames to the drawn users and train
         a TIFUKNN model on the training subset.
      3. For the first ``num_eval_users`` entries of ``tifu.user_keys``, collect
         the user's validation basket (mapped through ``tifu.item_id_mapper``)
         together with the keys and representations of its retrieved
         neighbours. Users whose mapped basket is empty are left out.
      4. Learn importance values with ``learn_importance`` and keep the users
         whose value is >= 0.5; the evaluation users are always kept.
      5. Train a second model on the pruned training data and print the
         validation and test comparisons via ``compare``.

    Args:
        seed: seed for ``np.random.seed``.
        num_users: number of user ids to draw.
        num_eval_users: number of leading users of the trained model to use for
            importance learning and evaluation.
        all_train_baskets: DataFrame with a ``user_id`` column (training data).
        all_validation_baskets: DataFrame with ``user_id`` and ``item_id`` columns.
        all_test_baskets: DataFrame with ``user_id`` and ``item_id`` columns.

    Returns:
        None. All results are printed.
    """
    np.random.seed(seed)

    candidate_user_ids = list(all_train_baskets.user_id.unique())
    # NOTE(review): np.random.choice samples with replacement by default, so
    # fewer than `num_users` distinct users may be selected — confirm intended.
    sampled_users = np.random.choice(candidate_user_ids, num_users)

    def restrict_to_sample(baskets):
        # Keep only the rows that belong to one of the sampled users.
        return baskets[baskets.user_id.isin(sampled_users)]

    train_baskets = restrict_to_sample(all_train_baskets)
    validation_baskets = restrict_to_sample(all_validation_baskets)
    test_baskets = restrict_to_sample(all_test_baskets)

    tifu = TIFUKNN(train_baskets, distance_metric='cosine', k=10, kplus=40)
    tifu.train()

    retrievals = []
    for user_index in range(num_eval_users):
        validation_user = tifu.user_keys[user_index]
        user_rows = validation_baskets[validation_baskets.user_id == validation_user]
        # Items the trained model has no mapping for are dropped from the basket.
        next_basket = [
            tifu.item_id_mapper[item]
            for item in list(user_rows.item_id)
            if item in tifu.item_id_mapper
        ]

        retrieved = [
            (tifu.user_keys[neighbor_index], tifu.user_reps[neighbor_index])
            for neighbor_index in tifu.nn_indices[user_index]
        ]

        if not next_basket:
            continue

        retrievals.append({
            'user': validation_user,
            'next_basket': next_basket,
            'neighbors': [key for key, _ in retrieved],
            'neighbor_representations': [representation for _, representation in retrieved],
        })

    # NOTE(review): `mapping` is never used. The filter below treats position i
    # of `v` as tifu.user_keys[i]; verify this matches how encode_retrievals
    # numbers the retrieved neighbours.
    encoded_retrievals, mapping = encode_retrievals(retrievals, "neighbors", "neighbor_representations", utility)

    v = learn_importance(encoded_retrievals, k=10, learning_rate=0.1, num_steps=500)

    users_to_retain = {tifu.user_keys[index] for index, value in enumerate(v) if value >= 0.5}
    # Evaluation users are always retained because `compare` looks them up in
    # the predictions of both models.
    users_to_retain.update(tifu.user_keys[user_index] for user_index in range(num_eval_users))

    cleaned_train_baskets = train_baskets[train_baskets.user_id.isin(users_to_retain)]

    print(f'-----SEED={seed}-----')

    print(f'datasize_dirty={len(train_baskets)}, datasize_clean={len(cleaned_train_baskets)}')

    # NOTE(review): unlike the first model, this one is built without kplus=40
    # — confirm the difference is intentional.
    tifu_clean = TIFUKNN(cleaned_train_baskets, distance_metric='cosine', k=10)
    tifu_clean.train()

    for split_name, split_baskets in (('validation', validation_baskets), ('test', test_baskets)):
        compare(tifu, tifu_clean, split_baskets, num_eval_users, split_name)
    

In [7]:
# Load the three Instacart basket splits once; every experiment run reuses them.
# The training split is gzip-compressed; pandas infers this from the ".gz" suffix.
all_train_baskets = pd.read_csv(
    filepath_or_buffer="applications/nbr/data/instacart_30k/train_baskets.csv.gz",
)
all_validation_baskets = pd.read_csv(
    filepath_or_buffer="applications/nbr/data/instacart_30k/valid_baskets.csv",
)
all_test_baskets = pd.read_csv(
    filepath_or_buffer="applications/nbr/data/instacart_30k/test_baskets.csv",
)

In [8]:
# Repeat the experiment under several seeds so that a single user sample does
# not drive the conclusion.
SEEDS = (42, 16, 1812, 1312, 35)

for seed in SEEDS:
    experiment(
        seed,
        num_users=1000,
        num_eval_users=100,
        all_train_baskets=all_train_baskets,
        all_validation_baskets=all_validation_baskets,
        all_test_baskets=all_test_baskets,
    )

-----SEED=42-----
datasize_dirty=157463, datasize_clean=88833
validation, users_worse=2, users_better=5
validation, recall@5_dirty=0.22881325718808893, recall@5_clean=0.23262278099761274
test, users_worse=2, users_better=2
test, recall@5_dirty=0.19534618540547083, recall@5_clean=0.19464698138368614
-----SEED=16-----
datasize_dirty=156897, datasize_clean=88453
validation, users_worse=5, users_better=1
validation, recall@5_dirty=0.22608281696343105, recall@5_clean=0.22392413656337135
test, users_worse=3, users_better=3
test, recall@5_dirty=0.20603470513012506, recall@5_clean=0.2047882404836604
-----SEED=1812-----
datasize_dirty=163196, datasize_clean=89226
validation, users_worse=5, users_better=4
validation, recall@5_dirty=0.19449518914579975, recall@5_clean=0.19591847653092923
test, users_worse=4, users_better=4
test, recall@5_dirty=0.18450692303365224, recall@5_clean=0.18564188121272807
-----SEED=1312-----
datasize_dirty=162644, datasize_clean=87998
validation, users_worse=7, users_be