In [16]:
import pyximport
import numpy as np
pyximport.install(setup_args={"include_dirs": np.get_include()},
                  reload_support=True)
from algorithms.knn_neighborhood import UserKNN
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import train_test_split, KFold
import matplotlib.pyplot as plt
import pandas as pd
from collections import defaultdict, Counter
from datetime import datetime as dt
from scipy.stats import spearmanr
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import gaussian_kde
from sklearn.neighbors import KernelDensity
import heapq

In [2]:
data_df = pd.read_csv("data/ml-100k/u.data", sep="\t")
data_df.columns = ["user_id", "item_id", "rating", "timestamp"]
data_df.drop(columns=["timestamp"], axis=1, inplace=True)
data_df["user_id"] = data_df["user_id"].map({b: a for a, b in enumerate(data_df["user_id"].unique())})
data_df["item_id"] = data_df["item_id"].map({b: a for a, b in enumerate(data_df["item_id"].unique())})

reader = Reader(rating_scale=(1, 5))
dataset = Dataset.load_from_df(data_df, reader=reader)
trainset, testset = train_test_split(dataset, 0.2)

In [None]:
sim = UserKNN.compute_similarities(trainset, min_support=1)
pop = UserKNN.compute_popularities(trainset)
gain = UserKNN.compute_gain(trainset)

userknn_frac_predicted, userknnreuse_frac_predicted = [], []
pop_frac_predicted, popreuse_frac_predicted = [], []
gain_frac_predicted, gainreuse_frac_predicted = [], []

Ks = np.arange(1, 30, 2)
for k in Ks:
    model = UserKNN(k=k, precomputed_sim=sim)
    model.fit(trainset)
    _ = model.test(testset)

    exposure_real = np.zeros((trainset.n_users))
    for uid, exposure in model.exposure_u.items():
        exposure_real[uid] = exposure

    exposure_est = np.zeros((trainset.n_users))
    for alice, ratings in trainset.ur.items():
        ranks =  model.ranking[alice]
        N_alice = set()
        for iid, _ in ratings:
            possible_neighbors = [(bob, ranks[bob]) for bob, _ in trainset.ir[iid] if bob != alice]
            k_neighbors = heapq.nlargest(k, possible_neighbors, key=lambda t: t[1])
            N_alice = N_alice.union([bob for bob, _ in k_neighbors])
        for bob in N_alice:
            exposure_est[bob] += 1

    top_vulnerable_real = np.argsort(exposure_real)[::-1][:10]
    top_vulnerable_est = np.argsort(exposure_est)[::-1][:10]

    frac = len(set(top_vulnerable_real).intersection(top_vulnerable_est)) / 10
    print("[k=%d] UserKNN %f" % (k, frac))
    userknn_frac_predicted.append(frac)
    
    model = UserKNN(k=k, precomputed_sim=sim, reuse=True)
    model.fit(trainset)
    _ = model.test(testset)

    exposure_real = np.zeros((trainset.n_users))
    for uid, exposure in model.exposure_u.items():
        exposure_real[uid] = exposure

    exposure_est = np.zeros((trainset.n_users))
    for alice, ratings in trainset.ur.items():
        ranks =  model.ranking[alice]
        N_alice = set()
        for iid, _ in ratings:
            possible_neighbors = [(bob, ranks[bob]) for bob, _ in trainset.ir[iid] if bob != alice]
            k_neighbors = heapq.nlargest(k, possible_neighbors, key=lambda t: t[1])
            N_alice = N_alice.union([bob for bob, _ in k_neighbors])
        for bob in N_alice:
            exposure_est[bob] += 1

    top_vulnerable_real = np.argsort(exposure_real)[::-1][:10]
    top_vulnerable_est = np.argsort(exposure_est)[::-1][:10]

    frac = len(set(top_vulnerable_real).intersection(top_vulnerable_est)) / 10
    print("[k=%d] UserKNN+Reuse %f" % (k, frac))
    userknnreuse_frac_predicted.append(frac)
    
    model = UserKNN(k=k, precomputed_sim=sim, precomputed_pop=pop, tau_2=0.5)
    model.fit(trainset)
    _ = model.test(testset)

    exposure_real = np.zeros((trainset.n_users))
    for uid, exposure in model.exposure_u.items():
        exposure_real[uid] = exposure

    exposure_est = np.zeros((trainset.n_users))
    for alice, ratings in trainset.ur.items():
        ranks =  model.ranking[alice]
        N_alice = set()
        for iid, _ in ratings:
            possible_neighbors = [(bob, ranks[bob]) for bob, _ in trainset.ir[iid] if bob != alice]
            k_neighbors = heapq.nlargest(k, possible_neighbors, key=lambda t: t[1])
            N_alice = N_alice.union([bob for bob, _ in k_neighbors])
        for bob in N_alice:
            exposure_est[bob] += 1

    top_vulnerable_real = np.argsort(exposure_real)[::-1][:10]
    top_vulnerable_est = np.argsort(exposure_est)[::-1][:10]

    frac = len(set(top_vulnerable_real).intersection(top_vulnerable_est)) / 10
    print("[k=%d] Popularity %f" % (k, frac))
    pop_frac_predicted.append(frac)
    
    model = UserKNN(k=k, precomputed_sim=sim, precomputed_pop=pop, tau_2=0.5)
    model.fit(trainset)
    _ = model.test(testset)

    exposure_real = np.zeros((trainset.n_users))
    for uid, exposure in model.exposure_u.items():
        exposure_real[uid] = exposure

    exposure_est = np.zeros((trainset.n_users))
    for alice, ratings in trainset.ur.items():
        ranks =  model.ranking[alice]
        N_alice = set()
        for iid, _ in ratings:
            possible_neighbors = [(bob, ranks[bob]) for bob, _ in trainset.ir[iid] if bob != alice]
            k_neighbors = heapq.nlargest(k, possible_neighbors, key=lambda t: t[1])
            N_alice = N_alice.union([bob for bob, _ in k_neighbors])
        for bob in N_alice:
            exposure_est[bob] += 1

    top_vulnerable_real = np.argsort(exposure_real)[::-1][:10]
    top_vulnerable_est = np.argsort(exposure_est)[::-1][:10]

    frac = len(set(top_vulnerable_real).intersection(top_vulnerable_est)) / 10
    print("[k=%d] Popularity+Reuse %f" % (k, frac))
    popreuse_frac_predicted.append(frac)
    
    model = UserKNN(k=k, precomputed_sim=sim, precomputed_gain=gain, tau_4=0.5)
    model.fit(trainset)
    _ = model.test(testset)

    exposure_real = np.zeros((trainset.n_users))
    for uid, exposure in model.exposure_u.items():
        exposure_real[uid] = exposure

    exposure_est = np.zeros((trainset.n_users))
    for alice, ratings in trainset.ur.items():
        ranks =  model.ranking[alice]
        N_alice = set()
        for iid, _ in ratings:
            possible_neighbors = [(bob, ranks[bob]) for bob, _ in trainset.ir[iid] if bob != alice]
            k_neighbors = heapq.nlargest(k, possible_neighbors, key=lambda t: t[1])
            N_alice = N_alice.union([bob for bob, _ in k_neighbors])
        for bob in N_alice:
            exposure_est[bob] += 1

    top_vulnerable_real = np.argsort(exposure_real)[::-1][:10]
    top_vulnerable_est = np.argsort(exposure_est)[::-1][:10]

    frac = len(set(top_vulnerable_real).intersection(top_vulnerable_est)) / 10
    print("[k=%d] Gain %f" % (k, frac))
    frac_predicted.append(frac)
    
    model = UserKNN(k=k, precomputed_sim=sim, precomputed_gain=gain, tau_4=0.5)
    model.fit(trainset)
    _ = model.test(testset)

    exposure_real = np.zeros((trainset.n_users))
    for uid, exposure in model.exposure_u.items():
        exposure_real[uid] = exposure

    exposure_est = np.zeros((trainset.n_users))
    for alice, ratings in trainset.ur.items():
        ranks =  model.ranking[alice]
        N_alice = set()
        for iid, _ in ratings:
            possible_neighbors = [(bob, ranks[bob]) for bob, _ in trainset.ir[iid] if bob != alice]
            k_neighbors = heapq.nlargest(k, possible_neighbors, key=lambda t: t[1])
            N_alice = N_alice.union([bob for bob, _ in k_neighbors])
        for bob in N_alice:
            exposure_est[bob] += 1

    top_vulnerable_real = np.argsort(exposure_real)[::-1][:10]
    top_vulnerable_est = np.argsort(exposure_est)[::-1][:10]

    frac = len(set(top_vulnerable_real).intersection(top_vulnerable_est)) / 10
    print("[k=%d] Gain+Reuse %f" % (k, frac))
    gainreuse_frac_predicted.append(frac)

[k=1] UserKNN 0.600000
[k=1] UserKNN+Reuse 0.500000
[k=1] Popularity 0.700000
[k=1] Popularity+Reuse 0.700000
[k=1] Gain 0.900000
[k=1] Gain+Reuse 0.900000
[k=3] UserKNN 0.700000
[k=3] UserKNN+Reuse 0.100000
[k=3] Popularity 0.900000
[k=3] Popularity+Reuse 0.900000
[k=3] Gain 0.900000
[k=3] Gain+Reuse 0.900000
[k=5] UserKNN 0.500000
[k=5] UserKNN+Reuse 0.200000
[k=5] Popularity 0.900000
[k=5] Popularity+Reuse 0.900000
[k=5] Gain 1.000000
[k=5] Gain+Reuse 1.000000
[k=7] UserKNN 0.500000
[k=7] UserKNN+Reuse 0.300000
[k=7] Popularity 0.900000
[k=7] Popularity+Reuse 0.900000
[k=7] Gain 0.900000
[k=7] Gain+Reuse 0.900000
[k=9] UserKNN 0.600000
[k=9] UserKNN+Reuse 0.600000
[k=9] Popularity 0.900000
[k=9] Popularity+Reuse 0.900000
[k=9] Gain 0.700000
[k=9] Gain+Reuse 0.700000
[k=11] UserKNN 0.800000
[k=11] UserKNN+Reuse 0.800000
[k=11] Popularity 0.900000
[k=11] Popularity+Reuse 0.900000
[k=11] Gain 0.700000
[k=11] Gain+Reuse 0.700000
[k=13] UserKNN 0.800000
[k=13] UserKNN+Reuse 0.700000
[k=1

In [None]:
plt.plot(Ks, userknn_frac_predicted, color="C0", linestyle="dashed", label="UserKNN", alpha=0.5)
plt.plot(Ks, userknnreuse_frac_predicted, color="C1", linestyle="dashed", label="Popularity", alpha=0.5)
plt.plot(Ks, pop_frac_predicted, color="C2", linestyle="dashed", label="Gain", alpha=0.5)
plt.plot(Ks, popreuse_frac_predicted, color="C0", linestyle="solid", label="UserKNN + Reuse")
plt.plot(Ks, gain_frac_predicted, color="C1", linestyle="solid", label="Popularity + Reuse")
plt.plot(Ks, gainreuse_frac_predicted, color="C2", linestyle="solid", label="Gain + Reuse")