In [1]:
# TODO: directly install req here
# pip install -r requirements.txt

In [2]:
import numpy as np
import pandas as pd

from collections import defaultdict

import surprise
from surprise import accuracy
from surprise.model_selection import PredefinedKFold

In [4]:
# WARNING: ONLY RUN ONCE OR DATA WILL BE ERASED 

# Instance datasets
train_file = pd.read_csv('dataset/train.csv', sep=',', header=0)
test_file = pd.read_csv('dataset/validation.csv', sep=',', header=0)
anime_file = pd.read_csv('dataset/anime.csv', sep=',', header=0)

train_file.to_csv("dataset/train.csv", index=False, header=False)  # Remove 1st row from CSV
test_file.to_csv("dataset/validation.csv", index=False, header=False)
anime_file.to_csv("dataset/anime.csv", index=False, header=False)

train_file = pd.read_csv('dataset/train.csv', names = ['user_id','item_id','rating'] ,sep=',', header=0)
test_file = pd.read_csv('dataset/validation.csv', names = ['user_id','item_id','rating'], sep=',', header=0)
anime_file = pd.read_csv('dataset/anime.csv', names = ['anime_id','name','genre','type','episodes','rating','members'], sep=',', header=0)

train_file.head()

Unnamed: 0,user_id,item_id,rating
0,25996,1241,8
1,25409,8668,7
2,29728,6325,5
3,71886,1887,9
4,58916,258,10


In [5]:
anime_file.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12290 entries, 0 to 12289
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12290 non-null  int64  
 1   name      12290 non-null  object 
 2   genre     12228 non-null  object 
 3   type      12265 non-null  object 
 4   episodes  12290 non-null  object 
 5   rating    12060 non-null  float64
 6   members   12290 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.2+ KB


In [6]:
# Convert data 
reader = surprise.Reader(line_format='user item rating', sep=',', rating_scale=(1,10))
data = surprise.Dataset.load_from_folds([("dataset/train.csv", "dataset/validation.csv")], reader=reader)
pkf = PredefinedKFold()
trainset, testset = next(pkf.split(data))

a_testset = trainset.build_anti_testset() 

In [7]:
print("Total users used for training:", trainset.n_users)
print("Total animes used for training:", trainset.n_items)
print("Total ratings in the training set:", trainset.n_ratings)


Total users used for training: 16709
Total animes used for training: 3543
Total ratings in the training set: 22234


Lets start first using 5 neighbours w/ pearson.

In [8]:
myUserKnn = surprise.KNNBasic(k=5, sim_options={'name': 'pearson', 'user_based': True}) 
myUserKnn.fit(trainset)

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x11da93730>

We can now predict what rating user X will give item Y

In [9]:
myUserKnn.predict("68084", "6325")

Prediction(uid='68084', iid='6325', r_ui=None, est=7.829315462804714, details={'was_impossible': True, 'reason': 'Not enough neighbors.'})

This raises the question: What happened?

Basically, the recomSys can't predict what the user will rate that item. This is because not much data is available to compare them with other users.
To corroborate our answer, we can check how many users rated the item and how many items the user rated:

In [10]:
# How many users rated the item 6325
item_inner_id = trainset.to_inner_iid('6325')
neighbors = trainset.ir[item_inner_id]  # list of (user_id, rating) who rated this item
print(len(neighbors))  # how many users rated it

13


In [11]:
# How many items user 68084 has rated
user_inner_id = trainset.to_inner_uid('68084')
user_ratings = trainset.ur[user_inner_id]
print(len(user_ratings))  # how many ratings this user has made

2


This confirms that we know too few about the users taste. This will translate in userKNN not finding enough neighbours to make a prediction.
We can use the anti test set results to see the valid predictions and see which one could be made.

In [12]:
predictions = myUserKnn.test(a_testset)

In [13]:
valid_predictions = [pred for pred in predictions if not pred.details['was_impossible']]
print(valid_predictions[0:5])

[Prediction(uid='44929', iid='6', r_ui=7.829315462804714, est=7.0, details={'actual_k': 1, 'was_impossible': False}), Prediction(uid='44929', iid='8314', r_ui=7.829315462804714, est=6.0, details={'actual_k': 1, 'was_impossible': False}), Prediction(uid='44929', iid='18179', r_ui=7.829315462804714, est=8.0, details={'actual_k': 1, 'was_impossible': False}), Prediction(uid='52016', iid='8675', r_ui=7.829315462804714, est=8.0, details={'actual_k': 1, 'was_impossible': False}), Prediction(uid='39921', iid='11319', r_ui=7.829315462804714, est=1, details={'actual_k': 1, 'was_impossible': False})]


We now see everyone that every prediction is being made with one neighbour. Checking the RMSE we can see its pretty low:

In [14]:
RMSE_VALUE = accuracy.rmse(predictions)
print(RMSE_VALUE)

RMSE: 0.0012
0.001170078553044881


However, this is misleading since surprise only calculates the possible predictions (was_impossible=False). Since most were impossible to make, this becomes a misleading stat. We can further see this by dividing all predictions made and get another sigh of the evaluation:

In [15]:

print(f"Valid predictions: {len(valid_predictions)}")
print(f"All predictions: {len(predictions)}")
print(f"rate: {len(valid_predictions) / len(predictions)}")

Valid predictions: 19
All predictions: 59177753
rate: 3.2106660082210287e-07


We can try changing the number of neighbours but we see not too different results:

In [16]:
myUserKnn5 = surprise.KNNBasic(k=50, sim_options={'name': 'pearson', 'user_based': True}) 
myUserKnn5.fit(trainset)

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x11db5eb60>

In [17]:
predictions = myUserKnn5.test(a_testset)

In [18]:
valid_predictions = [pred for pred in predictions if not pred.details['was_impossible']]
print(valid_predictions[0:5])


[Prediction(uid='44929', iid='6', r_ui=7.829315462804714, est=7.0, details={'actual_k': 1, 'was_impossible': False}), Prediction(uid='44929', iid='8314', r_ui=7.829315462804714, est=6.0, details={'actual_k': 1, 'was_impossible': False}), Prediction(uid='44929', iid='18179', r_ui=7.829315462804714, est=8.0, details={'actual_k': 1, 'was_impossible': False}), Prediction(uid='52016', iid='8675', r_ui=7.829315462804714, est=8.0, details={'actual_k': 1, 'was_impossible': False}), Prediction(uid='39921', iid='11319', r_ui=7.829315462804714, est=1, details={'actual_k': 1, 'was_impossible': False})]


Lets check now with 20 neighbours w/ cosine to see if we can get predictions with more than one neighbour, or more valid predictions.

In [19]:
myUserKnn = surprise.KNNBasic(k=20, sim_options={'name': 'cosine', 'user_based': True}) 
myUserKnn.fit(trainset)
predictions = myUserKnn.test(a_testset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [20]:
valid_predictions = [pred for pred in predictions if not pred.details['was_impossible']]
print(valid_predictions[0:5])

[Prediction(uid='20881', iid='6956', r_ui=7.829315462804714, est=8.0, details={'actual_k': 1, 'was_impossible': False}), Prediction(uid='20881', iid='437', r_ui=7.829315462804714, est=10, details={'actual_k': 1, 'was_impossible': False}), Prediction(uid='20881', iid='7322', r_ui=7.829315462804714, est=5.0, details={'actual_k': 1, 'was_impossible': False}), Prediction(uid='20881', iid='834', r_ui=7.829315462804714, est=6.0, details={'actual_k': 1, 'was_impossible': False}), Prediction(uid='20881', iid='189', r_ui=7.829315462804714, est=7.0, details={'actual_k': 1, 'was_impossible': False})]


In [21]:

print(f"Valid predictions: {len(valid_predictions)}")
print(f"All predictions: {len(predictions)}")
print(f"rate: {len(valid_predictions) / len(predictions)}")

Valid predictions: 260546
All predictions: 59177753
rate: 0.004402769398831348


Information was used from these sources
https://github.com/NicolasHug/Surprise/blob/master/surprise/prediction_algorithms/knns.py
https://surprise.readthedocs.io/en/stable/model_selection.html 
https://surprise.readthedocs.io/en/stable/knn_inspired.html