In [238]:
import pyximport
import numpy as np
# Let Python compile Cython (.pyx) modules on import. This has to run before
# the `algorithms.knn_neighborhood` import that follows (presumably a Cython
# module — TODO confirm). NumPy's header directory is passed to the build so
# extensions that include NumPy headers can compile.
pyximport.install(setup_args={"include_dirs": np.get_include()},
                  reload_support=True)
from algorithms.knn_neighborhood import UserKNN as ReuseKNN
import pandas as pd
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import train_test_split
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from umap import UMAP
from sklearn.decomposition import PCA

In [239]:
# Load the tab-separated MovieLens-100k ratings file; it has no header row,
# so the column names are supplied explicitly.
rating_columns = ["user_id", "item_id", "rating", "timestamp"]
data_df = pd.read_csv("data/ml-100k/u.data", sep="\t", names=rating_columns)

# Ratings are declared to lie on a 1..5 scale.
reader = Reader(rating_scale=(1, 5))
dataset = Dataset.load_from_df(data_df[rating_columns[:3]], reader=reader)

# Every rating goes into the trainset (no hold-out split); the "test" set is
# then built from that same trainset.
trainset = dataset.construct_trainset(data_df.to_records(index=False))
testset = trainset.build_testset()

In [240]:
# Dense (n_users x n_items) matrix of the known ratings, indexed by the
# trainset's inner user/item ids; cells without a rating stay 0.
R_userprofiles = np.zeros(shape=(trainset.n_users, trainset.n_items))
for inner_uid in trainset.ur:
    for inner_iid, rating in trainset.ur[inner_uid]:
        R_userprofiles[inner_uid, inner_iid] = rating
R_userprofiles

array([[3., 0., 0., ..., 0., 0., 0.],
       [0., 3., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 4., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# Recommendations

## UserKNN

### No DP

In [241]:
# UserKNN, "No DP" configuration (protected=False): fit on the trainset and
# predict every (user, item) pair of the testset.
m = ReuseKNN(k=10, protected=False)
m.fit(trainset)
p = m.test(testset)
print(np.median(m.n_queries))

# Scatter the estimated ratings into a dense (n_users x n_items) matrix.
# Predictions carry raw ids, so they are mapped back to inner ids first.
R_userknn_nodp = np.zeros(shape=(trainset.n_users, trainset.n_items))
for raw_uid, raw_iid, _true_rating, estimate, _details in p:
    row, col = trainset.to_inner_uid(raw_uid), trainset.to_inner_iid(raw_iid)
    R_userknn_nodp[row, col] = estimate

638.0


### DP

In [242]:
# UserKNN, "DP" configuration (protected=True): fit on the trainset and
# predict every (user, item) pair of the testset.
m = ReuseKNN(k=10, protected=True)
m.fit(trainset)
p = m.test(testset)

# Scatter the estimated ratings into a dense (n_users x n_items) matrix.
# Predictions carry raw ids, so they are mapped back to inner ids first.
R_userknn_dp = np.zeros(shape=(trainset.n_users, trainset.n_items))
for raw_uid, raw_iid, _true_rating, estimate, _details in p:
    row, col = trainset.to_inner_uid(raw_uid), trainset.to_inner_iid(raw_iid)
    R_userknn_dp[row, col] = estimate

## ReuseKNN

### No DP

In [243]:
# ReuseKNN (tau_4=0.5), "No DP" configuration (protected=False): fit on the
# trainset and predict every (user, item) pair of the testset.
m = ReuseKNN(k=10, tau_4=0.5, protected=False)
m.fit(trainset)
p = m.test(testset)
print(np.median(m.n_queries))

# Scatter the estimated ratings into a dense (n_users x n_items) matrix.
# Predictions carry raw ids, so they are mapped back to inner ids first.
R_reuseknn_nodp = np.zeros(shape=(trainset.n_users, trainset.n_items))
for raw_uid, raw_iid, _true_rating, estimate, _details in p:
    row, col = trainset.to_inner_uid(raw_uid), trainset.to_inner_iid(raw_iid)
    R_reuseknn_nodp[row, col] = estimate

100.0


### DP

In [244]:
# ReuseKNN (tau_4=0.5), "DP" configuration (protected=True): fit on the
# trainset and predict every (user, item) pair of the testset.
m = ReuseKNN(k=10, tau_4=0.5, protected=True)
m.fit(trainset)
p = m.test(testset)

# Scatter the estimated ratings into a dense (n_users x n_items) matrix.
# Predictions carry raw ids, so they are mapped back to inner ids first.
R_reuseknn_dp = np.zeros(shape=(trainset.n_users, trainset.n_items))
for raw_uid, raw_iid, _true_rating, estimate, _details in p:
    row, col = trainset.to_inner_uid(raw_uid), trainset.to_inner_iid(raw_iid)
    R_reuseknn_dp[row, col] = estimate

# Embeddings

In [119]:
# Select matplotlib's Qt backend: figures open in separate interactive windows
# instead of being rendered inline in the notebook.
%matplotlib qt

In [280]:
# Fit a 2-component UMAP on the raw user profiles only, then project the
# profiles and all four prediction matrices through that same fitted model.
transformer = UMAP(n_components=2)
transformer.fit(R_userprofiles)

(E_userprofiles,
 E_userknnnodp,
 E_userknndp,
 E_reuseknnnodp,
 E_reuseknndp) = [
    transformer.transform(rating_matrix)
    for rating_matrix in (R_userprofiles,
                          R_userknn_nodp,
                          R_userknn_dp,
                          R_reuseknn_nodp,
                          R_reuseknn_dp)
]

In [272]:
# Alternative 2-D embedding using t-SNE instead of UMAP. Running this cell
# overwrites the E_* arrays produced by the UMAP cell.
#
# scikit-learn's TSNE cannot project new data with a fitted model, so each
# matrix gets its own fit_transform() call.
# NOTE(review): the five embeddings therefore live in independent coordinate
# systems — confirm that coordinate-wise comparisons between these E_* arrays
# are meaningful before relying on them.
#
# The fitted UMAP `transformer` is deliberately left untouched: an ndarray was
# previously bound to that name here, at the cost of one extra t-SNE run.
# A fixed seed keeps the (otherwise random) layouts reproducible across runs.
tsne_params = dict(n_components=2, init='random', random_state=0)
E_userprofiles = TSNE(**tsne_params).fit_transform(R_userprofiles)
E_userknnnodp = TSNE(**tsne_params).fit_transform(R_userknn_nodp)
E_userknndp = TSNE(**tsne_params).fit_transform(R_userknn_dp)
E_reuseknnnodp = TSNE(**tsne_params).fit_transform(R_reuseknn_nodp)
E_reuseknndp = TSNE(**tsne_params).fit_transform(R_reuseknn_dp)

In [283]:
# Embedded user profiles on their own. A fresh figure is created explicitly:
# with an interactive (non-inline) backend the implicit pyplot figure persists
# across cells, so drawing via plt.scatter would pile onto earlier plots.
fig, ax = plt.subplots()
ax.set_title("User Profiles")
ax.scatter(E_userprofiles[:, 0], E_userprofiles[:, 1], s=2, label="User Profiles")

<matplotlib.collections.PathCollection at 0x1a74211a518>

In [282]:
# UserKNN vs. ReuseKNN prediction embeddings, "No DP" variants. A fresh figure
# is created explicitly: with an interactive (non-inline) backend the implicit
# pyplot figure persists across cells, so plots would otherwise accumulate.
fig, ax = plt.subplots()
ax.scatter(E_userknnnodp[:, 0], E_userknnnodp[:, 1], s=2, label="UserKNN")
ax.scatter(E_reuseknnnodp[:, 0], E_reuseknnnodp[:, 1], s=2, label="ReuseKNN")
ax.legend()
ax.set_title("No DP")

Text(0.5, 1.0, 'No DP')

In [284]:
# User profiles overlaid with the "No DP" UserKNN / ReuseKNN prediction
# embeddings. A fresh figure is created explicitly: with an interactive
# (non-inline) backend the implicit pyplot figure persists across cells, so
# plots would otherwise accumulate on the same axes.
fig, ax = plt.subplots()
ax.scatter(E_userprofiles[:, 0], E_userprofiles[:, 1], s=2, label="User Profiles")
ax.scatter(E_userknnnodp[:, 0], E_userknnnodp[:, 1], s=2, label="UserKNN")
ax.scatter(E_reuseknnnodp[:, 0], E_reuseknnnodp[:, 1], s=2, label="ReuseKNN")
ax.legend()
ax.set_title("No DP")

Text(0.5, 1.0, 'No DP')

In [285]:
# UserKNN vs. ReuseKNN prediction embeddings, "DP" variants. A fresh figure is
# created explicitly: with an interactive (non-inline) backend the implicit
# pyplot figure persists across cells, so plots would otherwise accumulate.
fig, ax = plt.subplots()
ax.scatter(E_userknndp[:, 0], E_userknndp[:, 1], s=2, label="UserKNN")
ax.scatter(E_reuseknndp[:, 0], E_reuseknndp[:, 1], s=2, label="ReuseKNN")
ax.legend()
ax.set_title("DP")

Text(0.5, 1.0, 'DP')

In [286]:
# User profiles overlaid with the "DP" UserKNN / ReuseKNN prediction
# embeddings. A fresh figure is created explicitly: with an interactive
# (non-inline) backend the implicit pyplot figure persists across cells, so
# plots would otherwise accumulate on the same axes.
fig, ax = plt.subplots()
ax.scatter(E_userprofiles[:, 0], E_userprofiles[:, 1], s=2, label="User Profiles")
ax.scatter(E_userknndp[:, 0], E_userknndp[:, 1], s=2, label="UserKNN")
ax.scatter(E_reuseknndp[:, 0], E_reuseknndp[:, 1], s=2, label="ReuseKNN")
ax.legend()
ax.set_title("DP")

Text(0.5, 1.0, 'DP')

In [287]:
# Per-user displacement of the "No DP" prediction embeddings relative to the
# user-profile embedding (prediction coordinate minus profile coordinate).
# Dashed lines mark zero displacement on each axis.
# A fresh figure is created explicitly: with an interactive (non-inline)
# backend the implicit pyplot figure persists across cells, so plots would
# otherwise accumulate on the same axes.
fig, ax = plt.subplots()
ax.scatter(E_userknnnodp[:, 0] - E_userprofiles[:, 0], E_userknnnodp[:, 1] - E_userprofiles[:, 1], s=2, label="UserKNN")
ax.scatter(E_reuseknnnodp[:, 0] - E_userprofiles[:, 0], E_reuseknnnodp[:, 1] - E_userprofiles[:, 1], s=2, label="ReuseKNN")
ax.axhline(y=0, linestyle="dashed", c="Gray", linewidth=1)
ax.axvline(x=0, linestyle="dashed", c="Gray", linewidth=1)
ax.legend()
ax.set_title("No DP")

Text(0.5, 1.0, 'No DP')

In [288]:
from sklearn.metrics.pairwise import cosine_similarity  # no longer used in this cell; kept so the name stays importable here


def mean_rowwise_cosine(A, B):
    """Return the mean cosine similarity between corresponding rows of A and B.

    Equivalent to ``cosine_similarity(A, B).diagonal().mean()``, but only the
    N matching row pairs are computed instead of the full N x N matrix.

    Parameters
    ----------
    A, B : array-like of shape (n_rows, n_features)
        Row i of ``A`` is compared with row i of ``B``.

    Returns
    -------
    float
        Average of the per-row cosine similarities. A pair in which either
        row is all zeros contributes 0.
    """
    A = np.asarray(A)
    B = np.asarray(B)
    norms = np.linalg.norm(A, axis=1) * np.linalg.norm(B, axis=1)
    # A zero norm implies a zero dot product; dividing by 1 yields similarity 0
    # instead of NaN.
    norms[norms == 0] = 1
    return (np.einsum("ij,ij->i", A, B) / norms).mean()


# NOTE(review): the first line compares the *DP* embeddings, the second the
# *No DP* raw prediction matrices — confirm this pairing is intended.
print(mean_rowwise_cosine(E_userprofiles, E_userknndp), mean_rowwise_cosine(E_userprofiles, E_reuseknndp))
print(mean_rowwise_cosine(R_userprofiles, R_userknn_nodp), mean_rowwise_cosine(R_userprofiles, R_reuseknn_nodp))

0.9807789 0.97561544
0.9766340236215932 0.9743109978914815


In [None]:
# cosine similarity overlap