In [1]:
import pyximport
import numpy as np
pyximport.install(setup_args={"include_dirs": np.get_include()},
                  reload_support=True)
from algorithms.knn_neighborhood import UserKNN
from surprise import Dataset, Reader, accuracy
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
from collections import defaultdict
from datetime import datetime as dt

In [2]:
def get_top_n(predictions, n=10):
    """Build each user's top-``n`` recommendation list from a set of predictions.

    Args:
        predictions: Iterable of 5-tuples ``(uid, iid, true_r, est, details)``;
            only the user id, item id and estimated rating are used.
        n: Maximum number of items to keep per user (default 10).

    Returns:
        A ``defaultdict(list)`` mapping each user id to the ids of the (at
        most) ``n`` items with the highest estimated rating, best first.
        Items with equal estimates keep the order in which they appeared in
        ``predictions``.
    """
    ranked = defaultdict(list)

    # Group every (item, estimate) pair under the user it was predicted for.
    for user, item, _true_rating, estimate, _details in predictions:
        ranked[user].append((item, estimate))

    # Order each user's candidates by estimate and keep only the item ids.
    for user in ranked:
        best_first = sorted(ranked[user], key=lambda pair: pair[1], reverse=True)
        ranked[user] = [item for item, _estimate in best_first[:n]]

    return ranked

In [3]:
# Load the MovieLens 100K ratings. u.data is tab-separated and ships WITHOUT a
# header row, so the column names are supplied explicitly; letting pandas
# infer a header would silently swallow the first rating as column labels.
RATINGS_PATH = "data/ml-100k/u.data"
SPLIT_SEED = 42  # fixed so the train/test split survives Restart & Run All

data_df = pd.read_csv(
    RATINGS_PATH,
    sep="\t",
    header=None,
    names=["user_id", "item_id", "rating", "timestamp"],
)
data_df = data_df.drop(columns=["timestamp"])

# Re-index users and items to contiguous 0-based ids, numbered in order of
# first appearance in the file. Later cells use the user id as an array index.
data_df["user_id"] = data_df["user_id"].map({raw: idx for idx, raw in enumerate(data_df["user_id"].unique())})
data_df["item_id"] = data_df["item_id"].map({raw: idx for idx, raw in enumerate(data_df["item_id"].unique())})

train_df, test_df = train_test_split(data_df, test_size=0.2, random_state=SPLIT_SEED)

# Wrap the frame in a Surprise Dataset and build the train/test sets from our
# own split. Each raw rating is a (user, item, rating, timestamp) tuple; the
# timestamp slot is unused and set to None.
reader = Reader(rating_scale=(1, 5))
dataset = Dataset.load_from_df(data_df, reader=reader)
raw_trainset = [(ruid, riid, r, None) for ruid, riid, r in train_df.to_records(index=False)]
raw_testset = [(ruid, riid, r, None) for ruid, riid, r in test_df.to_records(index=False)]
trainset = Dataset.construct_trainset(dataset, raw_trainset)
testset = Dataset.construct_testset(dataset, raw_testset)

In [5]:
# Precompute helper structures from the full training set, each through a
# throwaway UserKNN() instance. `sim` is handed to every model below via
# `precomputed_sim`, so it is computed only once.
# NOTE(review): `pop` and `gain` are not referenced in the cells visible here;
# presumably they are consumed further down. Confirm, or drop them.
sim = UserKNN().compute_similarities(trainset, min_support=1)
pop = UserKNN().compute_popularities(trainset)
gain = UserKNN().compute_gain(trainset)

# Baseline: fit on the complete training set and score the held-out test set.
# `mae` and `topn` are the reference values that the leave-one-user-out cell
# compares against.
model = UserKNN(k=30, precomputed_sim=sim)
model.fit(trainset)
predictions = model.test(testset)
mae = accuracy.mae(predictions, verbose=False)

# Top-10 recommended item ids per test user under the baseline model.
topn = get_top_n(predictions, n=10)

In [None]:
# Leave-one-user-out influence analysis.
# For every user in the training split: drop that user's ratings, refit the
# model, and record
#   * maes[user]      - test MAE of the model trained without that user
#   * influence[user] - mean Jaccard distance between each test user's top-10
#                       list with and without that user in the training data
#
# Both arrays are indexed by the remapped user id, which was assigned over the
# full data set. They are therefore sized by the largest id present rather
# than by the number of distinct training users, which would overflow as soon
# as any user is missing from the training split. Slots that are never
# visited stay NaN.
train_user_ids = train_df["user_id"].unique()
n_train_users = len(train_user_ids)
n_slots = int(train_df["user_id"].max()) + 1
maes = np.full(n_slots, np.nan)
influence = np.full(n_slots, np.nan)

start_time = dt.now()
for step, ruid in enumerate(train_user_ids, start=1):
    # Separate names from the notebook-level `raw_trainset` and from the outer
    # `ruid`, so this cell neither clobbers nor visually shadows them.
    train_without_u_df = train_df[train_df["user_id"] != ruid]
    raw_trainset_without_u = [(u, it, r, None) for u, it, r in train_without_u_df.to_records(index=False)]
    trainset_without_u = Dataset.construct_trainset(dataset, raw_trainset_without_u)

    # NOTE(review): `sim` was computed on the full trainset, which still
    # contains this user. Surprise assigns inner ids per trainset, so the
    # reduced trainset may number users differently. Confirm that UserKNN's
    # `precomputed_sim` is valid for a trainset with a different id mapping.
    model_without_u = UserKNN(k=30, precomputed_sim=sim)
    model_without_u.fit(trainset_without_u)
    predictions_without_u = model_without_u.test(testset)
    maes[ruid] = accuracy.mae(predictions_without_u, verbose=False)

    # Jaccard distance between the baseline and the reduced-model top-10 list
    # of every test user. `.get` keeps the lookup from inserting keys into the
    # baseline defaultdict.
    topn_without_u = get_top_n(predictions_without_u, n=10)
    jdists = []
    for uid, items_without_u in topn_without_u.items():
        reduced_items = set(items_without_u)
        baseline_items = set(topn.get(uid, ()))
        jdists.append(1 - len(reduced_items & baseline_items) / len(reduced_items | baseline_items))
    influence[ruid] = np.mean(jdists)

    print("[%f%% Done] Time elapsed %s, Influence %f, MAE Diff: %f" % (100 * step / n_train_users, dt.now() - start_time, influence[ruid], maes[ruid] - mae))