In [1]:
import pyximport
import numpy as np
pyximport.install(setup_args={"include_dirs": np.get_include()},
                  reload_support=True)
from algorithms.knn_neighborhood import UserKNN
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import train_test_split, KFold
import matplotlib.pyplot as plt
import pandas as pd
from collections import defaultdict, Counter
from datetime import datetime as dt
from scipy.stats import spearmanr

  tree = Parsing.p_module(s, pxd, full_module_name)


In [98]:
def get_distance_matrix(model):
    n_user_ratings = {uid: len(ratings) for uid, ratings in model.trainset.ur.items()}
    avg_item_ratings = defaultdict(list)
    for uid, iid, r in trainset.all_ratings():
        avg_item_ratings[iid].append(r)
    for iid, ratings in avg_item_ratings.items():
        avg_item_ratings[iid] = np.mean(ratings)
    
    distances = np.ones((model.trainset.n_users, model.trainset.n_users)) * np.inf
    for alice in model.trainset.all_users():
        d = defaultdict(list)
        for bob, iid, r_bob, r_avg in model.known_ratings[alice]:
            d[bob].append((iid, r_bob, r_avg))
        
        for bob, secrets in d.items():
            known_iids = set([iid for iid, _, _ in secrets])
            distance = 0
            for iid, r_bob, r_avg in secrets:
                distance += (r_bob - r_avg) ** 2
            #unknown_iids = set([iid for iid, _ in model.trainset.ur[bob]]).difference(known_iids)
            #for iid in unknown_iids:
            #    distance += (avg_item_ratings[iid] - r_avg) ** 2
            #distance = np.sqrt(distance)
            
            distances[alice, bob] = distance / len(secrets)
            
    return distances
    
def get_avg_distance(model):
    D = get_distance_matrix(model)
    
    min_distances = np.amin(D, axis=0)
    avg_distance = np.mean(min_distances[~np.isinf(min_distances)])
    return avg_distance

def get_avg_top_neighbor_distance(model):
    n_known_secrets_matrix = np.zeros((trainset.n_users, trainset.n_users))
    for alice in model.trainset.all_users():
        for bob, secret in model.known_secrets[alice]:
            n_known_secrets_matrix[alice, bob] += 1

    top_neighbors = []
    for alice, row in enumerate(n_known_secrets_matrix):
        top_neighbor = np.argmax(row)
        if n_known_secrets_matrix[alice, top_neighbor] > 0:
            top_neighbors.append((alice, top_neighbor))

    D = get_distance_matrix(model)
    top_neighbor_distance = []
    for alice, top_neighbor in top_neighbors:
        top_neighbor_distance.append(D[alice, top_neighbor])

    return np.mean(top_neighbor_distance)

In [90]:
data_df = pd.read_csv("data/ml-100k/u.data", sep="\t")
data_df.columns = ["user_id", "item_id", "rating", "timestamp"]
data_df.drop(columns=["timestamp"], axis=1, inplace=True)
data_df["user_id"] = data_df["user_id"].map({b: a for a, b in enumerate(data_df["user_id"].unique())})
data_df["item_id"] = data_df["item_id"].map({b: a for a, b in enumerate(data_df["item_id"].unique())})

reader = Reader(rating_scale=(1, 5))
dataset = Dataset.load_from_df(data_df, reader=reader)
#trainset, testset = train_test_split(dataset, test_size=0.2)
folds = KFold(n_splits=5)

In [99]:
userknn, userknnreuse, pop, popreuse, gain, gainreuse = [], [], [], [], [], []
userknntop, userknnreusetop, poptop, popreusetop, gaintop, gainreusetop = [], [], [], [], [], []
Ks = np.arange(1, 31, 2)
for trainset, testset in folds.split(dataset):    
    userknn_fold, userknnreuse_fold, pop_fold, popreuse_fold, gain_fold, gainreuse_fold = [], [], [], [], [], []
    userknntop_fold, userknnreusetop_fold, poptop_fold, popreusetop_fold, gaintop_fold, gainreusetop_fold = [], [], [], [], [], []
    for k in Ks:
        print(k)
        model = UserKNN(k=k)
        model.fit(trainset)
        _ = model.test(testset)
        userknn_fold.append(get_avg_distance(model))
        userknntop_fold.append(get_avg_top_neighbor_distance(model))

        model = UserKNN(k=k, reuse=True)
        model.fit(trainset)
        _ = model.test(testset)
        userknnreuse_fold.append(get_avg_distance(model))
        userknnreusetop_fold.append(get_avg_top_neighbor_distance(model))

        model = UserKNN(k=k, tau_2=0.5)
        model.fit(trainset)
        _ = model.test(testset)
        pop_fold.append(get_avg_distance(model))
        poptop_fold.append(get_avg_top_neighbor_distance(model))

        model = UserKNN(k=k, tau_2=0.5, reuse=True)
        model.fit(trainset)
        _ = model.test(testset)
        popreuse_fold.append(get_avg_distance(model))
        popreusetop_fold.append(get_avg_top_neighbor_distance(model))

        model = UserKNN(k=k, tau_4=0.5)
        model.fit(trainset)
        _ = model.test(testset)
        gain_fold.append(get_avg_distance(model))
        gaintop_fold.append(get_avg_top_neighbor_distance(model))

        model = UserKNN(k=k, tau_4=0.5, reuse=True)
        model.fit(trainset)
        _ = model.test(testset)
        gainreuse_fold.append(get_avg_distance(model))
        gainreusetop_fold.append(get_avg_top_neighbor_distance(model))
    
    userknn.append(userknn_fold)
    userknnreuse.append(userknnreuse_fold)
    pop.append(pop_fold)
    popreuse.append(popreuse_fold)
    gain.append(gain_fold)
    gainreuse.append(gainreuse_fold)
    
    userknntop.append(userknntop_fold)
    userknnreusetop.append(userknnreusetop_fold)
    poptop.append(poptop_fold)
    popreusetop.append(popreusetop_fold)
    gaintop.append(gaintop_fold)
    gainreusetop.append(gainreusetop_fold)
    
    print("fold finished")
    break
    
userknn = np.mean(userknn, axis=0)
userknnreuse = np.mean(userknnreuse, axis=0)
pop = np.mean(pop, axis=0)
popreuse = np.mean(popreuse, axis=0)
gain = np.mean(gain, axis=0)
gainreuse = np.mean(gainreuse, axis=0)

userknntop = np.mean(userknntop, axis=0)
userknnreusetop = np.mean(userknnreusetop, axis=0)
poptop = np.mean(poptop, axis=0)
popreusetop = np.mean(popreusetop, axis=0)
gaintop = np.mean(gaintop, axis=0)
gainreusetop = np.mean(gainreusetop, axis=0)

1
3
5
7
9
11
13
15
17
19
21
23
25
27


KeyboardInterrupt: 

In [None]:
plt.figure()
plt.plot(Ks, userknn, color="C0", linestyle="dashed", label="UserKNN", alpha=0.5)
plt.plot(Ks, pop, color="C1", linestyle="dashed", label="Popularity", alpha=0.5)
plt.plot(Ks, gain, color="C2", linestyle="dashed", label="Gain", alpha=0.5)
plt.plot(Ks, userknnreuse, color="C0", linestyle="solid", label="UserKNN + Reuse")
plt.plot(Ks, popreuse, color="C1", linestyle="solid", label="Popularity + Reuse")
plt.plot(Ks, gainreuse, color="C2", linestyle="solid", label="Gain + Reuse")
plt.xlabel("Nr. of neighbors")
plt.ylabel("Avg. (c, t)-isolation")
plt.legend(ncol=2)
plt.tight_layout()
plt.show()

In [None]:
plt.figure()
plt.plot(Ks, userknntop, color="C0", linestyle="dashed", label="UserKNN", alpha=0.5)
plt.plot(Ks, poptop, color="C1", linestyle="dashed", label="Popularity", alpha=0.5)
plt.plot(Ks, gaintop, color="C2", linestyle="dashed", label="Gain", alpha=0.5)
plt.plot(Ks, userknnreusetop, color="C0", linestyle="solid", label="UserKNN + Reuse")
plt.plot(Ks, popreusetop, color="C1", linestyle="solid", label="Popularity + Reuse")
plt.plot(Ks, gainreusetop, color="C2", linestyle="solid", label="Gain + Reuse")
plt.xlabel("Nr. of neighbors")
plt.ylabel("Avg. (c, t)-isolation of top neighbors")
plt.legend(ncol=2)
plt.tight_layout()
plt.show()

In [157]:
k = 10
model = UserKNN(k=k)
model.fit(trainset)
predictions = model.test(testset)

deltas = []
for u in model.trainset.all_users():
    ranks = np.zeros((model.trainset.n_users))
    simrank = {v: k for k, v in dict(enumerate(np.argsort(model.sim[u, :]))).items()}
    for u_ in model.trainset.all_users():
        ranks[u_] = simrank[u_]
    max_rank = np.sort(ranks)[-1]
    kth_rank = np.sort(ranks)[::-1][k-1]
    deltas.append(max_rank - kth_rank)
np.mean(deltas)

9.0

In [162]:
k = 10
model = UserKNN(k=k, tau_2=0.5)
model.fit(trainset)
predictions = model.test(testset)

deltas = []
poprank = {v: k+1 for k, v in dict(enumerate(np.argsort(model.pop)[::-1])).items()}
for u in model.trainset.all_users():
    ranks = np.zeros((model.trainset.n_users))
    simrank = {v: k+1 for k, v in dict(enumerate(np.argsort(model.sim[u, :])[::-1])).items()}
    for u_ in model.trainset.all_users():
        ranks[u_] = 0.5 * poprank[u_] + 0.5 * simrank[u_]
    print(np.sort(ranks))
    break
    min_rank = np.sort(ranks)[0]
    kth_rank = np.sort(ranks)[k-1]
    deltas.append(max_rank - kth_rank)
np.mean(deltas)

[ 44.   83.  107.  120.5 123.  128.  130.5 145.  146.5 152.  154.5 156.5
 156.5 156.5 158.  162.5 164.  167.  172.5 177.  177.  183.5 184.  186.
 186.  189.  189.5 191.5 192.5 192.5 195.5 196.  197.  198.  199.5 200.
 200.5 202.5 204.  207.5 211.  212.  212.  213.  213.5 216.5 217.5 217.5
 219.5 222.5 222.5 224.  226.  226.5 228.  228.5 228.5 228.5 229.  229.5
 229.5 230.  230.5 231.  231.5 232.5 234.  234.  234.  236.  238.  241.
 243.  243.  244.  245.  245.5 246.5 247.  249.5 251.5 251.5 252.  252.
 252.5 253.5 254.  256.  256.5 257.  257.  257.5 257.5 258.5 259.  260.
 260.  261.5 261.5 264.5 265.  265.  265.5 266.5 267.  268.  268.  268.5
 268.5 268.5 269.5 270.  270.  270.  270.5 271.5 271.5 271.5 271.5 272.5
 273.5 273.5 274.  276.  277.5 277.5 277.5 278.5 279.5 280.5 281.  281.5
 281.5 282.5 282.5 282.5 282.5 283.  283.  283.  286.  286.  286.5 287.
 288.  289.  289.  289.  289.5 290.  290.  290.  290.  291.  292.5 293.
 293.5 294.  294.  294.5 294.5 294.5 295.  295.  295.  296

nan

In [159]:
a = np.zeros((model.trainset.n_users))
for u_, rank in model.ranking[u].items():
    a[u_] = rank
np.argsort(a)

{0: 708.5,
 1: 603.5,
 2: 665.5,
 3: 655.0,
 4: 595.5,
 5: 304.0,
 6: 262.5,
 7: 614.0,
 8: 435.5,
 9: 691.5,
 10: 726.0,
 11: 615.5,
 12: 734.0,
 13: 463.0,
 14: 477.5,
 15: 670.5,
 16: 601.0,
 17: 724.5,
 18: 683.0,
 19: 712.0,
 20: 717.5,
 21: 598.0,
 22: 552.5,
 23: 256.0,
 24: 560.5,
 25: 591.5,
 26: 771.0,
 27: 674.0,
 28: 600.0,
 29: 651.0,
 30: 554.0,
 31: 503.5,
 32: 722.5,
 33: 523.5,
 34: 644.0,
 35: 500.0,
 36: 202.0,
 37: 649.5,
 38: 739.5,
 39: 459.0,
 40: 630.0,
 41: 639.0,
 42: 521.0,
 43: 717.5,
 44: 748.5,
 45: 540.0,
 46: 800.5,
 47: 679.0,
 48: 593.5,
 49: 690.5,
 50: 661.5,
 51: 535.0,
 52: 590.5,
 53: 731.0,
 54: 620.5,
 55: 603.5,
 56: 595.5,
 57: 558.5,
 58: 490.5,
 59: 513.0,
 60: 412.5,
 61: 515.0,
 62: 486.0,
 63: 178.5,
 64: 621.5,
 65: 597.0,
 66: 603.5,
 67: 678.5,
 68: 629.0,
 69: 138.5,
 70: 120.5,
 71: 737.5,
 72: 603.5,
 73: 744.5,
 74: 644.5,
 75: 587.5,
 76: 613.0,
 77: 631.0,
 78: 668.5,
 79: 582.5,
 80: 679.5,
 81: 773.5,
 82: 239.5,
 83: 544.0,
 8