In [25]:
import pandas as pd
import matplotlib.pyplot as plt
from surprise import Dataset, Reader, accuracy
import pyximport
import numpy as np
pyximport.install(setup_args={"include_dirs": np.get_include()},
                  reload_support=True)
from algorithms.knn_neighborhood import UserKNN

In [15]:
ml100k_df = pd.read_csv("data/ml-100k/ml-100k/u.data", sep="\t", header=None, names=["user_id", "item_id", "rating", "timestamp"])
ml100k_df.drop(columns=["timestamp"], inplace=True)
ml100k_df.head()

Unnamed: 0,user_id,item_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [16]:
ml1m_df = pd.read_csv("data/ml-1m/ratings.dat", sep="::", header=None, names=["user_id", "item_id", "rating", "timestamp"])
ml1m_df.drop(columns=["timestamp"], inplace=True)
ml1m_df.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,user_id,item_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [17]:
anime_df = pd.read_csv("data/anime_small_sample.csv", sep=";", header=None, names=["user_id", "item_id", "rating"])
anime_df.head()

Unnamed: 0,user_id,item_id,rating
0,1,8074,10
1,1,11617,10
2,1,11757,10
3,1,15451,10
4,2,11771,10


In [32]:
def run_knn(dataset_df):
    errors = []
    K = np.arange(1, 30, 1)

    reader = Reader(rating_scale=(1, 5))
    dataset = Dataset.load_from_df(dataset_df, reader=reader)
    folds = KFold(n_splits=5)
    f = 1
    for trainset, testset in folds.split(dataset):
        sim = UserKNN.compute_similarities(trainset, min_support=1)
        errors_at_fold = []
        for k in K:
            model = UserKNN(k=k, precomputed_sim=sim)
            model.fit(trainset)
            predictions = model.test(testset)
            mae = accuracy.mae(predictions, verbose=False)
            errors_at_fold.append(mae)

            print("Fold %d, k: %d, MAE: %f" % (f, k, mae))
        errors.append(errors_at_fold)
        f += 1
    return np.mean(errors, axis=0)

In [33]:
ml100k_errors = run_knn(ml100k_df)
ml1m_errors = run_knn(ml1m_df)
anime_errors = run_knn(anime_df)

Fold 1, k: 1, MAE: 1.043559
Fold 1, k: 2, MAE: 0.948891
Fold 1, k: 3, MAE: 0.901925
Fold 1, k: 4, MAE: 0.878987
Fold 1, k: 5, MAE: 0.864165
Fold 1, k: 6, MAE: 0.853963
Fold 1, k: 7, MAE: 0.844505
Fold 1, k: 8, MAE: 0.839440
Fold 1, k: 9, MAE: 0.835521
Fold 1, k: 10, MAE: 0.832539
Fold 1, k: 11, MAE: 0.829783
Fold 1, k: 12, MAE: 0.827412
Fold 1, k: 13, MAE: 0.824935
Fold 1, k: 14, MAE: 0.824176
Fold 1, k: 15, MAE: 0.822113
Fold 1, k: 16, MAE: 0.819897
Fold 1, k: 17, MAE: 0.818629
Fold 1, k: 18, MAE: 0.818003
Fold 1, k: 19, MAE: 0.817202
Fold 1, k: 20, MAE: 0.816116
Fold 1, k: 21, MAE: 0.815523
Fold 1, k: 22, MAE: 0.815347
Fold 1, k: 23, MAE: 0.814993
Fold 1, k: 24, MAE: 0.814717
Fold 1, k: 25, MAE: 0.814077
Fold 1, k: 26, MAE: 0.813034
Fold 1, k: 27, MAE: 0.812957
Fold 1, k: 28, MAE: 0.812662
Fold 1, k: 29, MAE: 0.812581
Fold 2, k: 1, MAE: 1.038523
Fold 2, k: 2, MAE: 0.933903
Fold 2, k: 3, MAE: 0.894382
Fold 2, k: 4, MAE: 0.870359
Fold 2, k: 5, MAE: 0.854976
Fold 2, k: 6, MAE: 0.845562


Fold 5, k: 26, MAE: 0.773818
Fold 5, k: 27, MAE: 0.773208
Fold 5, k: 28, MAE: 0.772645
Fold 5, k: 29, MAE: 0.772065
Fold 1, k: 1, MAE: 2.949514
Fold 1, k: 2, MAE: 2.926052
Fold 1, k: 3, MAE: 2.922731
Fold 1, k: 4, MAE: 2.921920
Fold 1, k: 5, MAE: 2.921753
Fold 1, k: 6, MAE: 2.921641
Fold 1, k: 7, MAE: 2.921539
Fold 1, k: 8, MAE: 2.921493
Fold 1, k: 9, MAE: 2.921498
Fold 1, k: 10, MAE: 2.921507
Fold 1, k: 11, MAE: 2.921508
Fold 1, k: 12, MAE: 2.921523
Fold 1, k: 13, MAE: 2.921520
Fold 1, k: 14, MAE: 2.921514
Fold 1, k: 15, MAE: 2.921511
Fold 1, k: 16, MAE: 2.921504
Fold 1, k: 17, MAE: 2.921505
Fold 1, k: 18, MAE: 2.921510
Fold 1, k: 19, MAE: 2.921506
Fold 1, k: 20, MAE: 2.921511
Fold 1, k: 21, MAE: 2.921517
Fold 1, k: 22, MAE: 2.921522
Fold 1, k: 23, MAE: 2.921518
Fold 1, k: 24, MAE: 2.921514
Fold 1, k: 25, MAE: 2.921509
Fold 1, k: 26, MAE: 2.921509
Fold 1, k: 27, MAE: 2.921506
Fold 1, k: 28, MAE: 2.921499
Fold 1, k: 29, MAE: 2.921498
Fold 2, k: 1, MAE: 2.946892
Fold 2, k: 2, MAE: 2.925

In [None]:
plt.plot(K, ml100k_errors, label="ML-100k")
plt.plot(K, ml1m_errors, label="ML-1M")
plt.plot(K, anime_errors, label="Anime")
plt.legend()