In [1]:
import pandas as pd
import surprise
from surprise import accuracy
from surprise.model_selection import PredefinedKFold

In [2]:
# WARNING: Only run once or data will be lost

# Instance datasets
train_file = pd.read_csv('dataset/train.csv', sep=',', header=0)
test_file = pd.read_csv('dataset/validation.csv', sep=',', header=0)
anime_file = pd.read_csv('dataset/anime.csv', sep=',', header=0)

train_file.to_csv("dataset/train.csv", index=False, header=False)  # Remove 1st row from CSV
test_file.to_csv("dataset/validation.csv", index=False, header=False)
anime_file.to_csv("dataset/anime.csv", index=False, header=False)

train_file = pd.read_csv('dataset/train.csv', names = ['user_id','item_id','rating'] ,sep=',', header=None) 
test_file = pd.read_csv('dataset/validation.csv', names = ['user_id','item_id','rating'], sep=',', header=None)
anime_file = pd.read_csv('dataset/anime.csv', names = ['anime_id','name','genre','type','episodes','rating','members'], sep=',', header=None)

train_file.head()

Unnamed: 0,user_id,item_id,rating
0,40748,9926,-1
1,35757,79,10
2,18266,51,-1
3,31006,8795,7
4,68084,14837,8


In [3]:
anime_file.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [4]:
# Convert data 
reader = surprise.Reader(line_format='user item rating', sep=',', rating_scale=(1,10))
data = surprise.Dataset.load_from_folds([("dataset/train.csv", "dataset/validation.csv")], reader=reader)
pkf = PredefinedKFold()
trainset, testset = next(pkf.split(data))

a_testset = trainset.build_anti_testset() 

Lets start first using 5 neighbours w/ pearson.

In [5]:
myUserKnn = surprise.KNNBasic(k=5, sim_options={'name': 'pearson', 'user_based': True}) 
myUserKnn.fit(trainset)

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x11a7daa70>

We can now predict what rating user X will give item Y

In [6]:
myUserKnn.predict("68084", "6325")

Prediction(uid='68084', iid='6325', r_ui=None, est=6.165656934306569, details={'was_impossible': True, 'reason': 'Not enough neighbors.'})

This raises the question: What happened?

Basically, the recomSys can't predict what the user will rate that item. This is because not much data is available to compare them with other users.
To corroborate our answer, we can check how many users rated the item and how many items the user rated:

In [7]:
# How many users rated the item 6325
item_inner_id = trainset.to_inner_iid('6325')
neighbors = trainset.ir[item_inner_id]  # list of (user_id, rating) who rated this item
print(len(neighbors))  # how many users rated it

20


In [8]:
# How many items user 68084 has rated
user_inner_id = trainset.to_inner_uid('68084')
user_ratings = trainset.ur[user_inner_id]
print(len(user_ratings))  # how many ratings this user has made

3


This confirms that we know too few about the users taste. This will translate in userKNN not finding enough neighbours to make a prediction.
We can use the anti test set results to see the valid predictions and see which one could be made.

In [9]:
predictions = myUserKnn.test(a_testset)

In [10]:
valid_predictions = [pred for pred in predictions if not pred.details['was_impossible']]
print(valid_predictions[0:5])

[Prediction(uid='44929', iid='6', r_ui=6.165656934306569, est=7.0, details={'actual_k': 1, 'was_impossible': False}), Prediction(uid='44929', iid='8314', r_ui=6.165656934306569, est=6.0, details={'actual_k': 1, 'was_impossible': False}), Prediction(uid='44929', iid='18179', r_ui=6.165656934306569, est=8.0, details={'actual_k': 1, 'was_impossible': False}), Prediction(uid='52016', iid='8675', r_ui=6.165656934306569, est=8.0, details={'actual_k': 1, 'was_impossible': False}), Prediction(uid='39921', iid='11319', r_ui=6.165656934306569, est=1, details={'actual_k': 1, 'was_impossible': False})]


We now see everyone that every prediction is being made with one neighbour. Checking the RMSE we can see its pretty low:

In [11]:
RMSE_VALUE = accuracy.rmse(predictions)
print(RMSE_VALUE)

RMSE: 0.0009
0.0008835097156136474


However, this is misleading since surprise only calculates the possible predictions (was_impossible=False). Since most were impossible to make, this becomes a misleading stat. We can further see this by dividing all predictions made and get another sigh of the evaluation:

In [12]:

print(f"Valid predictions: {len(valid_predictions)}")
print(f"All predictions: {len(predictions)}")
print(f"rate: {len(valid_predictions) / len(predictions)}")

Valid predictions: 20
All predictions: 78455796
rate: 2.5492061797448337e-07


We can try changing the number of neighbours but we see not too different results:

In [13]:
myUserKnn5 = surprise.KNNBasic(k=50, sim_options={'name': 'pearson', 'user_based': True}) 
myUserKnn5.fit(trainset)

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x11a7d83d0>

In [15]:
predictions = myUserKnn5.test(a_testset)

In [16]:
RMSE_VALUE = accuracy.rmse(predictions)
print(RMSE_VALUE)

RMSE: 0.0009
0.0008835097156136474


In [17]:
valid_predictions = [pred for pred in predictions if not pred.details['was_impossible']]
print(valid_predictions[0:5])


[Prediction(uid='44929', iid='6', r_ui=6.165656934306569, est=7.0, details={'actual_k': 1, 'was_impossible': False}), Prediction(uid='44929', iid='8314', r_ui=6.165656934306569, est=6.0, details={'actual_k': 1, 'was_impossible': False}), Prediction(uid='44929', iid='18179', r_ui=6.165656934306569, est=8.0, details={'actual_k': 1, 'was_impossible': False}), Prediction(uid='52016', iid='8675', r_ui=6.165656934306569, est=8.0, details={'actual_k': 1, 'was_impossible': False}), Prediction(uid='39921', iid='11319', r_ui=6.165656934306569, est=1, details={'actual_k': 1, 'was_impossible': False})]


Lets check now with 20 neighbours w/ cosine to see if we can get predictions with more than one neighbour, or more valid predictions.

In [18]:
myUserKnn = surprise.KNNBasic(k=20, sim_options={'name': 'cosine', 'user_based': True}) 
myUserKnn.fit(trainset)
predictions = myUserKnn.test(a_testset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [19]:
RMSE_VALUE = accuracy.rmse(predictions)
print(RMSE_VALUE)

RMSE: 0.1572
0.15715038369397516


In [20]:
valid_predictions = [pred for pred in predictions if not pred.details['was_impossible']]
print(valid_predictions[0:5])

[Prediction(uid='40748', iid='2904', r_ui=6.165656934306569, est=1, details={'actual_k': 1, 'was_impossible': False}), Prediction(uid='35757', iid='6045', r_ui=6.165656934306569, est=10, details={'actual_k': 1, 'was_impossible': False}), Prediction(uid='35757', iid='3652', r_ui=6.165656934306569, est=10, details={'actual_k': 1, 'was_impossible': False}), Prediction(uid='35757', iid='6746', r_ui=6.165656934306569, est=9.0, details={'actual_k': 1, 'was_impossible': False}), Prediction(uid='35757', iid='8424', r_ui=6.165656934306569, est=7.0, details={'actual_k': 1, 'was_impossible': False})]


In [21]:
print(f"Valid predictions: {len(valid_predictions)}")
print(f"All predictions: {len(predictions)}")
print(f"rate: {len(valid_predictions) / len(predictions)}")

Valid predictions: 290312
All predictions: 78455796
rate: 0.0037003257222704105


Information was used from these sources
https://github.com/NicolasHug/Surprise/blob/master/surprise/prediction_algorithms/knns.py
https://surprise.readthedocs.io/en/stable/model_selection.html 
https://surprise.readthedocs.io/en/stable/knn_inspired.html