In [94]:
import pyximport
import numpy as np
pyximport.install(setup_args={"include_dirs": np.get_include()},
                  reload_support=True)
from algorithms.knn_neighborhood import UserKNN
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import train_test_split, KFold
import matplotlib.pyplot as plt
import pandas as pd
from collections import defaultdict
from scipy.stats import skew
from sklearn.preprocessing import MinMaxScaler

In [40]:
data_df = pd.read_csv("data/ml-100k/u.data", sep="\t", names=["user_id", "item_id", "rating", "timestamp"], usecols=["user_id", "item_id", "rating"])
reader = Reader(rating_scale=(1, 5))
dataset = Dataset.load_from_df(data_df, reader)
folds = KFold(n_splits=5)
#trainset, testset = train_test_split(dataset, test_size=0.2)

In [111]:
def rank_utility(model):
    sorted_uids = [uid for uid, _ in sorted(model.mae_u.items(), key=lambda t: t[1])[::-1]]
    return {uid: r for r, uid in enumerate(sorted_uids)}

def rank_privacy(model):
    sorted_uids = [uid for uid, _ in sorted(model.exposure_u.items(), key=lambda t: t[1])[::-1]]
    return {uid: r for r, uid in enumerate(sorted_uids)}

def tradeoff(model, lam):
    utility_ranks = rank_utility(model)
    privacy_ranks = rank_privacy(model)
    uids = set(utility_ranks.keys()).intersection(privacy_ranks.keys())
    avg_ranks = {uid: lam * utility_ranks[uid] + (1 - lam) * privacy_ranks[uid] for uid in uids}
    return avg_ranks

def delta_tradeoff(model1, model2, lam):
    ranks1 = tradeoff(model1, lam)
    ranks2 = tradeoff(model2, lam)
    uids = set(ranks1.keys()).intersection(ranks2.keys())
    return [np.abs(ranks2[uid] - ranks1[uid]) for uid in uids]

In [112]:
def gini(x):
    x = MinMaxScaler().fit_transform(np.array(x).reshape(-1, 1))
    # Mean absolute difference
    mad = np.abs(np.subtract.outer(x, x)).mean()
    # Relative mean absolute difference
    rmad = mad/np.mean(x)
    # Gini coefficient
    g = 0.5 * rmad
    return g

def hoover(x):
    x = MinMaxScaler().fit_transform(np.array(x).reshape(-1, 1))
    x_ = sorted(x)
    H = 0.5 * np.sum([np.abs(x_[i] - np.mean(x_)) for i in range(len(x_))]) / np.sum(x_)
    return H

In [113]:
Ks = np.arange(1, 30, 2)

userknn_delta_avg, userknn_reuse_delta_avg = [], []
popularity_delta_avg, popularity_reuse_delta_avg = [], []
gain_delta_avg, gain_reuse_delta_avg = [], []

for trainset, testset in folds.split(dataset):
    sim = UserKNN().compute_similarities(trainset, min_support=1)
    pop_r = UserKNN().compute_popularities(trainset)
    gain_r = UserKNN().compute_gain(trainset)

    userknn_delta, userknn_reuse_delta = [], []
    popularity_delta, popularity_reuse_delta = [], []
    gain_delta, gain_reuse_delta = [], []
    for k in Ks:
        print("=== k: %d ===" % k)
        userknn = UserKNN(k=k, precomputed_sim=sim)
        userknn.fit(trainset)
        _ = userknn.test(testset)
        delta = delta_tradeoff(userknn, userknn, lam=0.5)
        print("UserKNN Delta: %f" % hoover(delta))
        userknn_delta.append(hoover(delta))

        userknn_reuse = UserKNN(k=k, precomputed_sim=sim, reuse=True)
        userknn_reuse.fit(trainset)
        _ = userknn_reuse.test(testset)
        delta = delta_tradeoff(userknn, userknn_reuse, lam=0.5)
        print("UserKNN + Reuse Delta: %f" % hoover(delta))
        userknn_reuse_delta.append(hoover(delta))

        popularity = UserKNN(k=k, precomputed_sim=sim, precomputed_pop=pop_r, tau_2=0.5)
        popularity.fit(trainset)
        _ = popularity.test(testset)
        delta = delta_tradeoff(userknn, popularity, lam=0.5)
        print("Popularity Delta: %f" % hoover(delta))
        popularity_delta.append(hoover(delta))

        popularity_reuse = UserKNN(k=k, precomputed_sim=sim, precomputed_pop=pop_r, tau_2=0.5, reuse=True)
        popularity_reuse.fit(trainset)
        _ = popularity_reuse.test(testset)
        delta = delta_tradeoff(userknn, popularity_reuse, lam=0.5)
        print("Popularity + Reuse Delta: %f" % hoover(delta))
        popularity_reuse_delta.append(hoover(delta))

        gain = UserKNN(k=k, precomputed_sim=sim, precomputed_gain=gain_r, tau_4=0.5)
        gain.fit(trainset)
        _ = gain.test(testset)
        delta = delta_tradeoff(userknn, gain, lam=0.5)
        print("Gain Delta: %f" % hoover(delta))
        gain_delta.append(hoover(delta))

        gain_reuse = UserKNN(k=k, precomputed_sim=sim, precomputed_gain=gain_r, tau_4=0.5, reuse=True)
        gain_reuse.fit(trainset)
        _ = gain_reuse.test(testset)
        delta = delta_tradeoff(userknn, gain_reuse, lam=0.5)
        print("Gain + Reuse Delta: %f" % hoover(delta))
        gain_reuse_delta.append(hoover(delta))
    
    userknn_delta_avg.append(userknn_delta)
    userknn_reuse_delta_avg.append(userknn_reuse_delta)
    popularity_delta_avg.append(popularity_delta)
    popularity_reuse_delta_avg.append(popularity_reuse_delta)
    gain_delta_avg.append(gain_delta)
    gain_reuse_delta_avg.append(gain_reuse_delta)
    
    break
    
userknn_delta_avg = np.mean(userknn_delta_avg, axis=0)
userknn_reuse_delta_avg = np.mean(userknn_reuse_delta_avg, axis=0)
popularity_delta_avg = np.mean(popularity_delta_avg, axis=0)
popularity_reuse_delta_avg = np.mean(popularity_reuse_delta_avg, axis=0)
gain_delta_avg = np.mean(gain_delta_avg, axis=0)
gain_reuse_delta_avg = np.mean(gain_reuse_delta_avg, axis=0)

=== k: 1 ===


  


UserKNN Delta: nan


  


UserKNN + Reuse Delta: 0.343857
Popularity Delta: 0.274854
Popularity + Reuse Delta: 0.281830
Gain Delta: 0.277501
Gain + Reuse Delta: 0.287675
=== k: 3 ===


  


UserKNN Delta: nan


  


UserKNN + Reuse Delta: 0.317157
Popularity Delta: 0.291264
Popularity + Reuse Delta: 0.285968
Gain Delta: 0.285467
Gain + Reuse Delta: 0.281810
=== k: 5 ===


  


UserKNN Delta: nan


  


UserKNN + Reuse Delta: 0.320029
Popularity Delta: 0.294034
Popularity + Reuse Delta: 0.294404
Gain Delta: 0.287439
Gain + Reuse Delta: 0.283181
=== k: 7 ===


  


UserKNN Delta: nan


  


UserKNN + Reuse Delta: 0.319418
Popularity Delta: 0.300092
Popularity + Reuse Delta: 0.305616
Gain Delta: 0.292878
Gain + Reuse Delta: 0.294221
=== k: 9 ===


  


UserKNN Delta: nan


  


UserKNN + Reuse Delta: 0.319240
Popularity Delta: 0.296790
Popularity + Reuse Delta: 0.296635
Gain Delta: 0.296293
Gain + Reuse Delta: 0.301209
=== k: 11 ===


  


UserKNN Delta: nan


  


UserKNN + Reuse Delta: 0.326870
Popularity Delta: 0.294340
Popularity + Reuse Delta: 0.299051
Gain Delta: 0.295298
Gain + Reuse Delta: 0.295952
=== k: 13 ===


  


UserKNN Delta: nan


  


UserKNN + Reuse Delta: 0.327791
Popularity Delta: 0.309606
Popularity + Reuse Delta: 0.306532
Gain Delta: 0.301215
Gain + Reuse Delta: 0.299025
=== k: 15 ===


  


UserKNN Delta: nan


  


UserKNN + Reuse Delta: 0.325937
Popularity Delta: 0.319323
Popularity + Reuse Delta: 0.311486
Gain Delta: 0.308103
Gain + Reuse Delta: 0.309854
=== k: 17 ===


  


UserKNN Delta: nan


  


UserKNN + Reuse Delta: 0.323861
Popularity Delta: 0.318396
Popularity + Reuse Delta: 0.314793
Gain Delta: 0.313156
Gain + Reuse Delta: 0.313848
=== k: 19 ===


  


UserKNN Delta: nan


  


UserKNN + Reuse Delta: 0.329894
Popularity Delta: 0.320563
Popularity + Reuse Delta: 0.320178
Gain Delta: 0.315566
Gain + Reuse Delta: 0.314532
=== k: 21 ===


  


UserKNN Delta: nan


  


UserKNN + Reuse Delta: 0.325707
Popularity Delta: 0.329294
Popularity + Reuse Delta: 0.326636
Gain Delta: 0.323120
Gain + Reuse Delta: 0.323139
=== k: 23 ===


  


UserKNN Delta: nan


  


UserKNN + Reuse Delta: 0.335997
Popularity Delta: 0.331656
Popularity + Reuse Delta: 0.333988
Gain Delta: 0.326111
Gain + Reuse Delta: 0.327147
=== k: 25 ===


  


UserKNN Delta: nan


  


UserKNN + Reuse Delta: 0.336201
Popularity Delta: 0.336792
Popularity + Reuse Delta: 0.332137
Gain Delta: 0.332176
Gain + Reuse Delta: 0.332788
=== k: 27 ===


  


UserKNN Delta: nan


  


UserKNN + Reuse Delta: 0.333477
Popularity Delta: 0.336438
Popularity + Reuse Delta: 0.329663
Gain Delta: 0.328997
Gain + Reuse Delta: 0.327517
=== k: 29 ===


  


UserKNN Delta: nan


  


UserKNN + Reuse Delta: 0.336478
Popularity Delta: 0.340090
Popularity + Reuse Delta: 0.331586
Gain Delta: 0.335863
Gain + Reuse Delta: 0.326937


In [88]:
%matplotlib qt

In [101]:
# todo look at distribution +-
def gini(x):
    x = MinMaxScaler().fit_transform(np.array(x).reshape(-1, 1))
    # Mean absolute difference
    mad = np.abs(np.subtract.outer(x, x)).mean()
    # Relative mean absolute difference
    rmad = mad/np.mean(x)
    # Gini coefficient
    g = 0.5 * rmad
    return g

fig, ax = plt.subplots(1, 3)

dataset = Dataset.load_from_df(data_df, reader)
trainset, testset = train_test_split(dataset, test_size=0.2)

k = 5
userknn = UserKNN(k=k, precomputed_sim=sim)
userknn.fit(trainset)
_ = userknn.test(testset)
delta1 = delta_tradeoff(userknn, userknn, lam=0.5)

userknn_reuse = UserKNN(k=k, precomputed_sim=sim, reuse=True)
userknn_reuse.fit(trainset)
_ = userknn_reuse.test(testset)
delta2 = delta_tradeoff(userknn, userknn_reuse, lam=0.5)

gain_reuse = UserKNN(k=k, precomputed_sim=sim, precomputed_gain=gain_r, tau_4=0.5, reuse=True)
gain_reuse.fit(trainset)
_ = gain_reuse.test(testset)
delta3 = delta_tradeoff(userknn, gain_reuse, lam=0.5)

print(gini(delta1), gini(delta2), gini(delta3))

ax[0].set_title("k=5")
ax[0].hist(delta3, bins=20, label="Gain + Reuse", alpha=0.5)
ax[0].hist(delta2, bins=20, label="UserKNN + Reuse", alpha=0.5)
ax[0].set_ylabel("Nr. of users")
ax[0].set_xlabel("Rank difference")
ax[0].set_xlim([-650, 650])
#ax[0].set_xlim([0, 700])
ax[0].legend();

k = 15
userknn = UserKNN(k=k, precomputed_sim=sim)
userknn.fit(trainset)
_ = userknn.test(testset)
delta1 = delta_tradeoff(userknn, userknn, lam=0.5)

userknn_reuse = UserKNN(k=k, precomputed_sim=sim, reuse=True)
userknn_reuse.fit(trainset)
_ = userknn_reuse.test(testset)
delta2 = delta_tradeoff(userknn, userknn_reuse, lam=0.5)

gain_reuse = UserKNN(k=k, precomputed_sim=sim, precomputed_gain=gain_r, tau_4=0.5, reuse=True)
gain_reuse.fit(trainset)
_ = gain_reuse.test(testset)
delta3 = delta_tradeoff(userknn, gain_reuse, lam=0.5)

ax[1].set_title("k=15")
ax[1].hist(delta3, bins=20, label="Gain + Reuse", alpha=0.5)
ax[1].hist(delta2, bins=20, label="UserKNN + Reuse", alpha=0.5)
ax[1].set_ylabel("Nr. of users")
ax[1].set_xlabel("Rank difference")
ax[1].set_xlim([-650, 650])
#ax[1].set_xlim([0, 700])
ax[1].legend();

print(gini(delta1), gini(delta2), gini(delta3))

k = 30
userknn = UserKNN(k=k, precomputed_sim=sim)
userknn.fit(trainset)
_ = userknn.test(testset)
delta1 = delta_tradeoff(userknn, userknn, lam=0.5)

userknn_reuse = UserKNN(k=k, precomputed_sim=sim, reuse=True)
userknn_reuse.fit(trainset)
_ = userknn_reuse.test(testset)
delta2 = delta_tradeoff(userknn, userknn_reuse, lam=0.5)

gain_reuse = UserKNN(k=k, precomputed_sim=sim, precomputed_gain=gain_r, tau_4=0.5, reuse=True)
gain_reuse.fit(trainset)
_ = gain_reuse.test(testset)
delta3 = delta_tradeoff(userknn, gain_reuse, lam=0.5)

ax[2].set_title("k=30")
ax[2].hist(delta3, bins=20, label="Gain + Reuse", alpha=0.5)
ax[2].hist(delta2, bins=20, label="UserKNN + Reuse", alpha=0.5)
ax[2].set_ylabel("Nr. of users")
ax[2].set_xlabel("Rank difference")
ax[2].set_xlim([-650, 650])
#ax[2].set_xlim([0, 700])
ax[2].legend();

print(gini(delta1), gini(delta2), gini(delta3))

  import sys


nan 0.4729597557193635 0.3761189690646829


  import sys


nan 0.4492038813586473 0.39463489550445247
nan 0.4610645262938714 0.40699726379361717


  import sys


In [114]:
plt.plot(Ks, userknn_delta_avg, color="C0", alpha=0.5, linestyle="dashed", label="UserKNN")
plt.plot(Ks, popularity_delta_avg, color="C1", alpha=0.5, linestyle="dashed", label="Popularity")
plt.plot(Ks, gain_delta, color_avg="C2", alpha=0.5, linestyle="dashed", label="Gain")
plt.plot(Ks, userknn_reuse_delta_avg, color="C0", label="UserKNN + Reuse")
plt.plot(Ks, popularity_reuse_delta_avg, color="C1", label="Popularity + Reuse")
plt.plot(Ks, gain_reuse_delta_avg, color="C2", label="Gain + Reuse")
plt.ylabel("Hoover of absolute rank difference")
plt.xlabel("Nr. of neighbors")
plt.legend(ncol=2)
#plt.savefig("skew_rankdiff.png", dpi=300)

<matplotlib.legend.Legend at 0x261eb8182e8>

In [116]:
np.floor(-9.5)

-10.0