In [None]:
# TODO: directly install req here?
# pip install -r requirements.txt

In [2]:
import numpy as np
import pandas as pd

from collections import defaultdict

import surprise
from surprise import accuracy
from surprise.model_selection import PredefinedKFold

In [None]:
# TODO: Get dataset from external url

In [3]:
# WARNING: ONLY RUN ONCE OR DATA WILL BE ERASED 

# Instance datasets
train_file = pd.read_csv('dataset/train.csv', sep=',', header=0)
test_file = pd.read_csv('dataset/validation.csv', sep=',', header=0)
anime_file = pd.read_csv('dataset/anime.csv', sep=',', header=0)

train_file.to_csv("dataset/train.csv", index=False, header=False)  # Remove 1st row from CSV
test_file.to_csv("dataset/validation.csv", index=False, header=False)
anime_file.to_csv("dataset/anime.csv", index=False, header=False)

train_file = pd.read_csv('dataset/train.csv', names = ['user_id','item_id','rating'] ,sep=',', header=0)
test_file = pd.read_csv('dataset/validation.csv', names = ['user_id','item_id','rating'], sep=',', header=0)
anime_file = pd.read_csv('dataset/anime.csv', names = ['anime_id','name','genre','type','episodes','rating','members'], sep=',', header=0)

train_file.head()

Unnamed: 0,user_id,item_id,rating
0,20881,1536,9
1,25996,1241,8
2,25409,8668,7
3,29728,6325,5
4,71886,1887,9


In [4]:
# Convert data 
reader = surprise.Reader(line_format='user item rating', sep=',', rating_scale=(1,10))
data = surprise.Dataset.load_from_folds([("dataset/train.csv", "dataset/validation.csv")], reader=reader)
pkf = PredefinedKFold()
trainset, testset = next(pkf.split(data))
a_testset = trainset.build_anti_testset()

In [5]:
myItemKnn = surprise.KNNBasic(k=7, sim_options={'name': 'pearson', 'user_based': False})
myItemKnn.fit(trainset)

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x10540c760>

In [6]:
predictions = myItemKnn.test(a_testset)

In [7]:
def get_top_n(predictions, n=10):
    """Devuelve las N-mejores recomendaciones para cada usuario de un set de predicción.

    Args:
        predictions(lista de objetos Prediction): La lista de predicción obtenida del método test.
        n(int): El número de recomendaciónes por usuario

    Returns:
    Un diccionario donde las llaves son ids de usuario y los valores son listas de tuplas:
        [(item id, rating estimation), ...] de tamaño n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

# Predict top 10 ratings of user 31006
top_n = get_top_n(predictions, n=10)
print(top_n["31006"])

[('14837', 7.829323139194963), ('1536', 7.829323139194963), ('1241', 7.829323139194963), ('8668', 7.829323139194963), ('6325', 7.829323139194963), ('1887', 7.829323139194963), ('258', 7.829323139194963), ('27989', 7.829323139194963), ('1654', 7.829323139194963), ('12049', 7.829323139194963)]


In [8]:
RMSE_VALUE = accuracy.rmse(predictions)
print(RMSE_VALUE)

RMSE: 0.0064
0.006411760937818795


Lets check in depth these results

In [9]:
print(predictions[0:5])

[Prediction(uid='68084', iid='1536', r_ui=7.829323139194963, est=7.829323139194963, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}), Prediction(uid='68084', iid='1241', r_ui=7.829323139194963, est=7.829323139194963, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}), Prediction(uid='68084', iid='8668', r_ui=7.829323139194963, est=7.829323139194963, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}), Prediction(uid='68084', iid='6325', r_ui=7.829323139194963, est=7.829323139194963, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}), Prediction(uid='68084', iid='1887', r_ui=7.829323139194963, est=7.829323139194963, details={'was_impossible': True, 'reason': 'Not enough neighbors.'})]


In [10]:
valid_predictions = [pred for pred in predictions if not pred.details['was_impossible']]
print(valid_predictions[0:5])

[Prediction(uid='46414', iid='20159', r_ui=7.829323139194963, est=5.0, details={'actual_k': 1, 'was_impossible': False}), Prediction(uid='67409', iid='8675', r_ui=7.829323139194963, est=7.0, details={'actual_k': 1, 'was_impossible': False}), Prediction(uid='32545', iid='31737', r_ui=7.829323139194963, est=7.0, details={'actual_k': 1, 'was_impossible': False}), Prediction(uid='70120', iid='27899', r_ui=7.829323139194963, est=7.0, details={'actual_k': 1, 'was_impossible': False}), Prediction(uid='31874', iid='21561', r_ui=7.829323139194963, est=10, details={'actual_k': 1, 'was_impossible': False})]


In [None]:
print(f"Valid predictions: {len(valid_predictions)}")
print(f"All predictions: {len(predictions)}")
print(f"rate: {len(valid_predictions) / len(predictions)}")

Valid predictions: 1097
All predictions: 59177752
rate: 1.8537371950188306e-05


We see that the situation with UserKNN happened again. Lets try to modify the parameters to have more valid predictions.

In [14]:
myItemKnn = surprise.KNNBasic(k=5, sim_options={'name': 'cosine', 'user_based': False})
myItemKnn.fit(trainset)
predictions = myItemKnn.test(a_testset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [15]:
print(predictions[0:5])

[Prediction(uid='68084', iid='1536', r_ui=7.829323139194963, est=7.829323139194963, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}), Prediction(uid='68084', iid='1241', r_ui=7.829323139194963, est=7.829323139194963, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}), Prediction(uid='68084', iid='8668', r_ui=7.829323139194963, est=7.829323139194963, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}), Prediction(uid='68084', iid='6325', r_ui=7.829323139194963, est=7.829323139194963, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}), Prediction(uid='68084', iid='1887', r_ui=7.829323139194963, est=7.829323139194963, details={'was_impossible': True, 'reason': 'Not enough neighbors.'})]


In [16]:
valid_predictions = [pred for pred in predictions if not pred.details['was_impossible']]
print(valid_predictions[0:5])

[Prediction(uid='68084', iid='21881', r_ui=7.829323139194963, est=8.0, details={'actual_k': 1, 'was_impossible': False}), Prediction(uid='68084', iid='9731', r_ui=7.829323139194963, est=8.0, details={'actual_k': 1, 'was_impossible': False}), Prediction(uid='68084', iid='14713', r_ui=7.829323139194963, est=8.0, details={'actual_k': 1, 'was_impossible': False}), Prediction(uid='68084', iid='30503', r_ui=7.829323139194963, est=8.0, details={'actual_k': 1, 'was_impossible': False}), Prediction(uid='68084', iid='2889', r_ui=7.829323139194963, est=8.0, details={'actual_k': 1, 'was_impossible': False})]


In [17]:
print(f"Valid predictions: {len(valid_predictions)}")
print(f"All predictions: {len(predictions)}")
print(f"rate: {len(valid_predictions) / len(predictions)}")

Valid predictions: 260592
All predictions: 59177752
rate: 0.0044035467923823806
