In [2]:
import pyximport
import numpy as np
pyximport.install(setup_args={"include_dirs": np.get_include()},
                  reload_support=True)
from algorithms.knn_neighborhood import UserKNN
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import train_test_split, KFold
import matplotlib.pyplot as plt
import pandas as pd
from collections import defaultdict, Counter
from datetime import datetime as dt
from scipy.stats import spearmanr
from sklearn.preprocessing import MinMaxScaler

In [3]:
def gini(x):
    """Return the Gini coefficient of the min-max normalised values in ``x``.

    The input is flattened, rescaled to the range [0, 1] and the Gini
    coefficient is computed as half of the relative mean absolute difference
    (mean of ``|x_i - x_j|`` over all ordered pairs, divided by the mean).

    Parameters
    ----------
    x : array-like of numbers
        Values whose inequality is measured (any shape; it is flattened).

    Returns
    -------
    float
        Gini coefficient of the rescaled values. ``0.0`` is returned when all
        values are identical (perfect equality) instead of the NaN that a
        0/0 division would otherwise produce.

    Raises
    ------
    ValueError
        If ``x`` contains no values.
    """
    values = np.asarray(x, dtype=float).ravel()
    if values.size == 0:
        raise ValueError("gini() requires at least one value")

    lowest = values.min()
    span = values.max() - lowest
    if span == 0:
        # Constant input: every value scales to 0, so the relative mean
        # absolute difference would be 0/0. Perfect equality -> Gini of 0.
        return 0.0

    # Min-max scaling to [0, 1]; kept 1-D so the pairwise difference matrix
    # is a plain (n, n) array.
    scaled = (values - lowest) / span
    mad = np.abs(np.subtract.outer(scaled, scaled)).mean()
    rmad = mad / scaled.mean()  # mean > 0 here because the maximum scales to 1
    return 0.5 * rmad

In [4]:
# MovieLens-100k ratings: tab-separated (user, item, rating, timestamp) with
# NO header row. header=None is required; otherwise pandas consumes the first
# rating as column names and that rating is silently lost.
data_df = pd.read_csv(
    "data/ml-100k/u.data",
    sep="\t",
    header=None,
    names=["user_id", "item_id", "rating", "timestamp"],
)
# The timestamp is not used; drop it without mutating in place so the cell
# stays idempotent on re-run.
data_df = data_df.drop(columns=["timestamp"])

# Re-index users and items to contiguous ids 0..n-1 (in order of first
# appearance) so that they can be used directly as array indices later on.
user_index = {raw_id: idx for idx, raw_id in enumerate(data_df["user_id"].unique())}
item_index = {raw_id: idx for idx, raw_id in enumerate(data_df["item_id"].unique())}
data_df["user_id"] = data_df["user_id"].map(user_index)
data_df["item_id"] = data_df["item_id"].map(item_index)

reader = Reader(rating_scale=(1, 5))
dataset = Dataset.load_from_df(data_df, reader=reader)
# Fixed seed so the five folds are the same on every "Restart & Run All".
folds = KFold(n_splits=5, random_state=42)

In [21]:
# Neighbourhood sizes to evaluate.
Ks = [5, 15, 30]

# Evaluated variants: display name -> extra keyword arguments for UserKNN
# (k is supplied per run). The insertion order is the execution order.
model_configs = {
    "UserKNN": dict(),
    "UserKNN+Reuse": dict(reuse=True),
    "Popularity": dict(tau_2=0.5),
    "Popularity+Reuse": dict(reuse=True, tau_2=0.5),
    "Gain": dict(tau_4=0.5),
    "Gain+Reuse": dict(reuse=True, tau_4=0.5),
}

n_users = data_df["user_id"].nunique()

# Per-variant result arrays of shape (n_folds, len(Ks), n_users):
#   exposure_dist[name][fold, k, u] accumulates model.exposure_u for user u
#   query_dist[name][fold, k, u]    max. number of known_secrets entries that
#                                   any single key holds about user u
exposure_dist = {name: np.zeros((folds.n_splits, len(Ks), n_users)) for name in model_configs}
query_dist = {name: np.zeros((folds.n_splits, len(Ks), n_users)) for name in model_configs}

for fold_idx, (trainset, testset) in enumerate(folds.split(dataset)):
    for k_idx, k in enumerate(Ks):
        for name, params in model_configs.items():
            print("=== %s, Fold %d, k=%d ===" % (name, fold_idx+1, k))
            model = UserKNN(k=k, **params)
            model.fit(trainset)
            # Predictions are not needed; test() is run only for the
            # exposure_u / known_secrets bookkeeping read below.
            model.test(testset)

            # exposure_u is keyed by inner user ids; convert to the raw ids
            # (0..n_users-1 after the re-indexing of data_df) for indexing.
            for iuid, exposure in model.exposure_u.items():
                ruid = model.trainset.to_raw_uid(iuid)
                exposure_dist[name][fold_idx, k_idx, ruid] += exposure

            # n_queries[alice, bob] counts the entries alice holds about bob.
            # NOTE(review): alice is used as-is (presumably an inner user id),
            # only bob is converted to a raw id. The row index merely has to
            # be unique per alice because only the column-wise maximum is
            # kept -- confirm against UserKNN.known_secrets.
            n_queries = np.zeros((n_users, n_users))
            for alice, secrets in model.known_secrets.items():
                for bob, _ in secrets:
                    ruid = model.trainset.to_raw_uid(bob)
                    n_queries[alice, ruid] += 1
            query_dist[name][fold_idx, k_idx, :] = np.max(n_queries, axis=0)

=== UserKNN, Fold 1, k=5 ===
=== UserKNN+Reuse, Fold 1, k=5 ===
=== Popularity, Fold 1, k=5 ===
=== Popularity+Reuse, Fold 1, k=5 ===
=== Gain, Fold 1, k=5 ===
=== Gain+Reuse, Fold 1, k=5 ===
=== UserKNN, Fold 1, k=15 ===
=== UserKNN+Reuse, Fold 1, k=15 ===
=== Popularity, Fold 1, k=15 ===
=== Popularity+Reuse, Fold 1, k=15 ===
=== Gain, Fold 1, k=15 ===
=== Gain+Reuse, Fold 1, k=15 ===
=== UserKNN, Fold 1, k=30 ===
=== UserKNN+Reuse, Fold 1, k=30 ===
=== Popularity, Fold 1, k=30 ===
=== Popularity+Reuse, Fold 1, k=30 ===
=== Gain, Fold 1, k=30 ===
=== Gain+Reuse, Fold 1, k=30 ===
=== UserKNN, Fold 2, k=5 ===
=== UserKNN+Reuse, Fold 2, k=5 ===
=== Popularity, Fold 2, k=5 ===
=== Popularity+Reuse, Fold 2, k=5 ===
=== Gain, Fold 2, k=5 ===
=== Gain+Reuse, Fold 2, k=5 ===
=== UserKNN, Fold 2, k=15 ===
=== UserKNN+Reuse, Fold 2, k=15 ===
=== Popularity, Fold 2, k=15 ===
=== Popularity+Reuse, Fold 2, k=15 ===
=== Gain, Fold 2, k=15 ===
=== Gain+Reuse, Fold 2, k=15 ===
=== UserKNN, Fold 2, k

In [27]:
%matplotlib qt
fig, ax = plt.subplots(len(Ks), 6, sharex=False, sharey=False)
for k_idx in range(len(Ks)):
    k = Ks[k_idx]
    exposure = np.mean(exposure_dist["UserKNN"], axis=0)[k_idx]
    n_queries = np.mean(query_dist["UserKNN"], axis=0)[k_idx]
    ax[k_idx, 0].scatter(exposure, n_queries, s=1, alpha=0.1)
    #ax[k_idx, 0].set_title(r"$k$=%d" % k)
for k_idx in range(len(Ks)):
    k = Ks[k_idx]
    exposure = np.mean(exposure_dist["UserKNN+Reuse"], axis=0)[k_idx]
    n_queries = np.mean(query_dist["UserKNN+Reuse"], axis=0)[k_idx]
    ax[k_idx, 1].scatter(exposure, n_queries, s=1, alpha=0.1)
    #ax[k_idx, 1].set_title(r"$k$=%d" % k)
for k_idx in range(len(Ks)):
    k = Ks[k_idx]
    exposure = np.mean(exposure_dist["Popularity"], axis=0)[k_idx]
    n_queries = np.mean(query_dist["Popularity"], axis=0)[k_idx]
    ax[k_idx, 2].scatter(exposure, n_queries, s=1, alpha=0.1)
    #ax[k_idx, 2].set_title(r"$k$=%d" % k)
for k_idx in range(len(Ks)):
    k = Ks[k_idx]
    exposure = np.mean(exposure_dist["Popularity+Reuse"], axis=0)[k_idx]
    n_queries = np.mean(query_dist["Popularity+Reuse"], axis=0)[k_idx]
    ax[k_idx, 3].scatter(exposure, n_queries, s=1, alpha=0.1)
    #ax[k_idx, 3].set_title(r"$k$=%d" % k)
for k_idx in range(len(Ks)):
    k = Ks[k_idx]
    exposure = np.mean(exposure_dist["Gain"], axis=0)[k_idx]
    n_queries = np.mean(query_dist["Gain"], axis=0)[k_idx]
    ax[k_idx, 4].scatter(exposure, n_queries, s=1, alpha=0.1)
    #ax[k_idx, 4].set_title(r"$k$=%d" % k)
for k_idx in range(len(Ks)):
    k = Ks[k_idx]
    exposure = np.mean(exposure_dist["Gain+Reuse"], axis=0)[k_idx]
    n_queries = np.mean(query_dist["Gain+Reuse"], axis=0)[k_idx]
    ax[k_idx, 5].scatter(exposure, n_queries, s=1, alpha=0.1)
    #ax[k_idx, 5].set_title(r"$k$=%d" % k)
    
ax[0, 0].set_title("UserKNN")
ax[0, 1].set_title("UserKNN+Reuse")
ax[0, 2].set_title("Popularity")
ax[0, 3].set_title("Popularity+Reuse")
ax[0, 4].set_title("Gain")
ax[0, 5].set_title("Gain+Reuse")
ax[0, 0].set_ylabel(r"$k=5$")
ax[1, 0].set_ylabel(r"$k=15$")
ax[2, 0].set_ylabel(r"$k=30$")

fig.text(0.06, 0.5, 'Max. Nr. of Queries', ha='center', va='center', rotation='vertical')
fig.text(0.5, 0.04, 'Exposure', ha='center', va='center')

Text(0.5, 0.04, 'Exposure')

In [83]:
# Gini coefficient of the fold-averaged exposure, one value per entry of Ks,
# for every method and its "+Reuse" counterpart.
base_methods = ["UserKNN", "Popularity", "Gain"]

ginis = defaultdict(list)
for base in base_methods:
    for method in (base, base + "+Reuse"):
        # Mean over folds gives one exposure vector per neighbourhood size.
        exposure_per_k = np.mean(exposure_dist[method], axis=0)
        ginis[method].extend(gini(row) for row in exposure_per_k)

plt.figure()
# Baselines first (dashed, semi-transparent), then the reuse variants (solid);
# a method and its reuse variant share the same colour.
line_groups = [
    ("+Reuse", " + Reuse", dict(linestyle="solid")),
]
for color_idx, base in enumerate(base_methods):
    plt.plot(Ks, ginis[base], color="C%d" % color_idx, linestyle="dashed", label=base, alpha=0.5)
for key_suffix, label_suffix, line_kwargs in line_groups:
    for color_idx, base in enumerate(base_methods):
        plt.plot(Ks, ginis[base + key_suffix], color="C%d" % color_idx,
                 label=base + label_suffix, **line_kwargs)
plt.xlabel("Nr. of neighbors")
plt.ylabel("Gini Exposure")
plt.legend(ncol=2)
plt.tight_layout()