In [1]:
import pyximport
import numpy as np
pyximport.install(setup_args={"include_dirs": np.get_include()},
                  reload_support=True)
from algorithms.knn_neighborhood import UserKNN
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import train_test_split, KFold
import matplotlib.pyplot as plt
import pandas as pd
from collections import defaultdict, Counter
from datetime import datetime as dt
from scipy.stats import spearmanr

  tree = Parsing.p_module(s, pxd, full_module_name)


In [2]:
# Seed shared by the hold-out split and the fold generator so that a
# "Restart & Run All" reproduces the same partitions.
SEED = 42

# MovieLens-100k `u.data` is tab-separated and has NO header row. The column
# names are therefore supplied explicitly; with the default `header=0`, pandas
# would consume the first rating as column names and silently drop it.
RATING_COLUMNS = ["user_id", "item_id", "rating", "timestamp"]
ratings_raw = pd.read_csv("data/ml-100k/u.data", sep="\t", header=None, names=RATING_COLUMNS)

# The timestamp is not used; Dataset.load_from_df below receives exactly the
# three remaining columns (user, item, rating) in this order.
data_df = ratings_raw.drop(columns=["timestamp"])

# Re-index users and items to contiguous ids 0..n-1, numbered by first appearance.
data_df["user_id"] = data_df["user_id"].map(
    {raw_id: new_id for new_id, raw_id in enumerate(data_df["user_id"].unique())})
data_df["item_id"] = data_df["item_id"].map(
    {raw_id: new_id for new_id, raw_id in enumerate(data_df["item_id"].unique())})

reader = Reader(rating_scale=(1, 5))
dataset = Dataset.load_from_df(data_df, reader=reader)

# 80/20 hold-out split (used by the exploratory cell) and a 5-fold splitter
# (used by the cross-validation cell); both are seeded for reproducibility.
trainset, testset = train_test_split(dataset, test_size=0.2, random_state=SEED)
folds = KFold(n_splits=5, random_state=SEED)

In [None]:
def count_known_secrets(fitted_model):
    """Build the user-by-user count matrix of a fitted and tested model.

    Entry ``[alice, bob]`` is the number of pairs in
    ``fitted_model.known_secrets[alice]`` whose first element is ``bob``.
    The matrix is square with one row/column per user of the model's trainset.
    """
    n_users = fitted_model.trainset.n_users
    counts = np.zeros((n_users, n_users))
    for alice in fitted_model.trainset.all_users():
        for bob, _secret in fitted_model.known_secrets[alice]:
            counts[alice, bob] += 1
    return counts


for k in [1, 5, 10, 15, 20, 25, 30]:
    print("k: %d" % k)

    # Pass 1: run both variants once (predictions are discarded) to find the
    # (alice, bob) pairs that occur more often with reuse than without.
    model = UserKNN(k=k)
    model.fit(trainset)
    model.test(testset)
    D_noreuse = count_known_secrets(model)

    model = UserKNN(k=k, tau_4=0.5, reuse=True)
    model.fit(trainset)
    model.test(testset)
    D_reuse = count_known_secrets(model)

    protected_neighbors = defaultdict(set)
    for alice, bob in zip(*np.where(D_noreuse - D_reuse < 0)):
        protected_neighbors[alice].add(bob)

    # Pass 2: report the MAE of the plain model and of the reuse model that is
    # given the protected pairs found above.
    model = UserKNN(k=k)
    model.fit(trainset)
    predictions = model.test(testset)
    print(accuracy.mae(predictions, verbose=False))
    D_noreuse = count_known_secrets(model)

    model = UserKNN(k=k, tau_4=0.5, reuse=True, protected_neighbors=protected_neighbors)
    model.fit(trainset)
    predictions = model.test(testset)
    print(accuracy.mae(predictions, verbose=False))
    D_reuse = count_known_secrets(model)

    print("=======")

# Fraction of (alice, bob) pairs that still occur more often with reuse than
# without, for the last k of the sweep.
np.count_nonzero(D_noreuse - D_reuse < 0) / D_noreuse.size

In [3]:
def _secret_count_matrix(model, n_users):
    """Return an (n_users, n_users) matrix counting the entries of `model.known_secrets`.

    Entry ``[alice, bob]`` is the number of pairs in ``model.known_secrets[alice]``
    whose first element is ``bob``.
    """
    counts = np.zeros((n_users, n_users))
    for alice in model.trainset.all_users():
        for bob, _secret in model.known_secrets[alice]:
            counts[alice, bob] += 1
    return counts


def get_protected_neighbors(model_baseline, model):
    """Collect the (alice, bob) pairs that occur more often in `model` than in the baseline.

    Parameters
    ----------
    model_baseline, model
        Fitted and tested models exposing ``trainset`` (with ``n_users`` and
        ``all_users()``) and ``known_secrets``. Both are expected to be fitted
        on the same trainset so that their user ids are comparable.

    Returns
    -------
    defaultdict(set)
        Maps each ``alice`` to the set of ``bob`` for which `model` has strictly
        more ``known_secrets`` entries than `model_baseline`.
    """
    # Size the matrices from the models themselves rather than from a
    # notebook-global `trainset`, so the result does not depend on which
    # trainset happens to be bound when the function is called.
    n_users = model_baseline.trainset.n_users
    D_baseline = _secret_count_matrix(model_baseline, n_users)
    D = _secret_count_matrix(model, n_users)

    protected_neighbors = defaultdict(set)
    for alice, bob in zip(*np.where(D_baseline - D < 0)):
        protected_neighbors[alice].add(bob)

    return protected_neighbors

In [12]:
# Names of the result series, in the order they are unpacked at the end of the cell.
RESULT_NAMES = ["userknn", "userknnreuse", "pop", "popreuse", "gain", "gainreuse"]

# Extra UserKNN keyword arguments of each variant compared against the plain
# UserKNN baseline (which is stored under the name "userknn").
VARIANT_PARAMS = {
    "userknnreuse": dict(reuse=True),
    "pop": dict(tau_2=0.5),
    "popreuse": dict(tau_2=0.5, reuse=True),
    "gain": dict(tau_4=0.5),
    "gainreuse": dict(tau_4=0.5, reuse=True),
}


def mean_exposure(fitted_model):
    """Sum of the model's `exposure_u` values divided by the number of trainset users."""
    return np.sum(list(fitted_model.exposure_u.values())) / fitted_model.trainset.n_users


def fit_and_score(fitted_on, tested_on, **model_kwargs):
    """Fit a UserKNN on `fitted_on`, test on `tested_on`; return (model, MAE, mean exposure)."""
    knn = UserKNN(**model_kwargs)
    knn.fit(fitted_on)
    predictions = knn.test(tested_on)
    return knn, accuracy.mae(predictions, verbose=False), mean_exposure(knn)


Ks = np.arange(1, 31, 2)
mae_by_fold = {name: [] for name in RESULT_NAMES}
exposure_by_fold = {name: [] for name in RESULT_NAMES}

# The loop variables are deliberately called `trainset` / `testset`: they rebind
# the notebook-level names for the current fold.
for trainset, testset in folds.split(dataset):
    fold_mae = {name: [] for name in RESULT_NAMES}
    fold_exposure = {name: [] for name in RESULT_NAMES}

    for k in Ks:
        print(k)
        baseline, mae, exposure = fit_and_score(trainset, testset, k=k)
        fold_mae["userknn"].append(mae)
        fold_exposure["userknn"].append(exposure)

        for name, params in VARIANT_PARAMS.items():
            # First run: only used to find the pairs that occur more often
            # than in the baseline; its predictions are discarded.
            probe, _, _ = fit_and_score(trainset, testset, k=k, **params)
            # Second run: same variant, now given those pairs as protected
            # neighbors; this is the run that is reported.
            model, mae, exposure = fit_and_score(
                trainset, testset, k=k,
                protected_neighbors=get_protected_neighbors(baseline, probe),
                **params)
            fold_mae[name].append(mae)
            fold_exposure[name].append(exposure)

    for name in RESULT_NAMES:
        mae_by_fold[name].append(fold_mae[name])
        exposure_by_fold[name].append(fold_exposure[name])

    print("fold finished")
    # NOTE(review): this stops after the first fold, so the "means" below are
    # over a single fold -- presumably to keep the runtime short; remove the
    # break to average over all folds.
    break

# Average over folds: one value per k for every series.
userknn, userknnreuse, pop, popreuse, gain, gainreuse = (
    np.mean(mae_by_fold[name], axis=0) for name in RESULT_NAMES)
userknn_exp, userknnreuse_exp, pop_exp, popreuse_exp, gain_exp, gainreuse_exp = (
    np.mean(exposure_by_fold[name], axis=0) for name in RESULT_NAMES)

1
3
5
7
9
11
13
15
17
19
21
23
25
27
29
fold finished


In [13]:
%matplotlib qt

In [14]:
# Mean absolute error as a function of the number of neighbors.
# Dashed, half-transparent lines are the variants without reuse; solid lines
# are the same variants with reuse. Each color identifies one strategy.
mae_curves = [
    (userknn, dict(color="C0", linestyle="dashed", label="UserKNN", alpha=0.5)),
    (pop, dict(color="C1", linestyle="dashed", label="Popularity", alpha=0.5)),
    (gain, dict(color="C2", linestyle="dashed", label="Gain", alpha=0.5)),
    (userknnreuse, dict(color="C0", linestyle="solid", label="UserKNN + Reuse")),
    (popreuse, dict(color="C1", linestyle="solid", label="Popularity + Reuse")),
    (gainreuse, dict(color="C2", linestyle="solid", label="Gain + Reuse")),
]

plt.figure()
for mae_values, line_style in mae_curves:
    plt.plot(Ks, mae_values, **line_style)
plt.xlabel("Nr. of neighbors")
plt.ylabel("Mean absolute error")
plt.legend(ncol=2)
plt.tight_layout()
plt.show()

In [15]:
# Mean absolute error against exposure; each curve is traced out by the
# different values of k. Dashed, half-transparent lines are the variants
# without reuse; solid lines are the same variants with reuse.
exposure_curves = [
    (userknn_exp, userknn, dict(color="C0", linestyle="dashed", label="UserKNN", alpha=0.5)),
    (pop_exp, pop, dict(color="C1", linestyle="dashed", label="Popularity", alpha=0.5)),
    (gain_exp, gain, dict(color="C2", linestyle="dashed", label="Gain", alpha=0.5)),
    (userknnreuse_exp, userknnreuse, dict(color="C0", linestyle="solid", label="UserKNN + Reuse")),
    (popreuse_exp, popreuse, dict(color="C1", linestyle="solid", label="Popularity + Reuse")),
    (gainreuse_exp, gainreuse, dict(color="C2", linestyle="solid", label="Gain + Reuse")),
]

plt.figure()
for exposure_values, mae_values, line_style in exposure_curves:
    plt.plot(exposure_values, mae_values, **line_style)
plt.xlabel("Exposure")
plt.ylabel("Mean absolute error")
plt.legend(ncol=2)
plt.tight_layout()
plt.show()