In [1]:
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt

from surprise import BaselineOnly
from surprise import Dataset
from surprise.model_selection import cross_validate, train_test_split, LeaveOneOut
from surprise import Reader

from surprise.model_selection import GridSearchCV
from recommender_metrics import *

file_path = "./BX-CSV/BookRatings.csv"

# Each line of the file is parsed as "user,item,rating" on a 0-10 scale;
# the first line is skipped (presumably a CSV header row).
reader = Reader(
    line_format='user item rating',
    sep=',',
    rating_scale=(0, 10),
    skip_lines=1,
)
data = Dataset.load_from_file(file_path, reader=reader)

# Hold out 25% of the ratings for evaluation; the fixed random_state keeps
# the split reproducible across kernel restarts.
trainset, testset = train_test_split(data, test_size=0.25, random_state=10)


In [2]:
print('Using ALS')

# Options forwarded to Surprise's baseline estimator: ALS optimisation,
# 5 epochs, with separate regularisation strengths for users and items.
bsl_options = dict(method='als', n_epochs=5, reg_u=12, reg_i=5)

# Fit on the training split, then score every held-out rating.
algo = BaselineOnly(bsl_options=bsl_options)
algo.fit(trainset)
predictions = algo.test(testset)

Using ALS
Estimating biases using als...


In [3]:
def get_Iu(uid):
    """Count the items a user rated in the notebook-level ``trainset``.

    args:
      uid: the raw id of the user
    returns:
      the number of items rated by the user, or 0 when the user does not
      appear in the trainset
    """
    try:
        inner_uid = trainset.to_inner_uid(uid)
        n_items = len(trainset.ur[inner_uid])
    except ValueError:
        # The raw id is unknown to the trainset, so no ratings exist for it.
        n_items = 0
    return n_items
    
def get_Ui(iid):
    """Count the users who rated an item in the notebook-level ``trainset``.

    args:
      iid: the raw id of the item
    returns:
      the number of users that have rated the item, or 0 when the item does
      not appear in the trainset
    """
    try:
        inner_iid = trainset.to_inner_iid(iid)
        n_users = len(trainset.ir[inner_iid])
    except ValueError:
        # The raw id is unknown to the trainset, so nobody rated it there.
        n_users = 0
    return n_users
    
# One row per test-set prediction: raw user id, raw item id, true rating,
# estimated rating and the prediction details.
df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])

# Attach how much training data backs each prediction.
df['Iu'] = df.uid.apply(get_Iu)   # items rated by the user in the trainset
df['Ui'] = df.iid.apply(get_Ui)   # users who rated the item in the trainset

# Absolute error between the estimate and the true rating.
df['err'] = (df.est - df.rui).abs()

# Sort once and take both ends, instead of sorting the whole frame twice.
df_by_err = df.sort_values(by='err')
best_predictions = df_by_err[:10]
worst_predictions = df_by_err[-10:]

In [4]:
# Show the ten test-set predictions with the smallest absolute error.
best_predictions

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
29542,175003,1551668912,0.0,0.0,{'was_impossible': False},267,37,0.0
13259,96054,440224861,0.0,0.0,{'was_impossible': False},66,34,0.0
1358,262399,553560441,0.0,0.0,{'was_impossible': False},153,39,0.0
31315,56856,446606243,0.0,0.0,{'was_impossible': False},201,31,0.0
13254,175003,1551669498,0.0,0.0,{'was_impossible': False},267,35,0.0
26269,76352,553580930,0.0,0.0,{'was_impossible': False},512,43,0.0
7744,175003,743411331,0.0,0.0,{'was_impossible': False},267,29,0.0
13244,87746,743225406,0.0,0.0,{'was_impossible': False},165,69,0.0
18781,35857,821769340,0.0,0.0,{'was_impossible': False},192,34,0.0
13237,148744,743411250,0.0,0.0,{'was_impossible': False},238,49,0.0


In [5]:
# Show the ten test-set predictions with the largest absolute error.
worst_predictions

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
10950,201017,1401088945,10.0,0.155595,{'was_impossible': False},56,45,9.844405
18772,205735,373825013,10.0,0.057083,{'was_impossible': False},80,70,9.942917
8769,55548,553278398,10.0,0.0,{'was_impossible': False},137,26,10.0
3761,24921,440236665,10.0,0.0,{'was_impossible': False},97,31,10.0
15040,36836,345334531,10.0,0.0,{'was_impossible': False},165,40,10.0
1375,238120,385413041,10.0,0.0,{'was_impossible': False},323,25,10.0
6937,91832,440241537,10.0,0.0,{'was_impossible': False},85,81,10.0
4375,31826,439136350,0.0,10.0,{'was_impossible': False},77,88,10.0
7139,31826,439139597,0.0,10.0,{'was_impossible': False},77,81,10.0
24161,31826,439064864,0.0,10.0,{'was_impossible': False},77,85,10.0


In [6]:
# Leave-one-out split for top-N evaluation (per the Surprise docs, one rating
# per user is held out); the fixed random_state keeps it reproducible.
LOOCV = LeaveOneOut(n_splits=1, random_state=1)
# The metric helpers are called directly on this object, without instantiating it.
metrics = RecommenderMetrics

print('Using ALS')
bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 12,
               'reg_i': 5
               }
algo = BaselineOnly(bsl_options=bsl_options)

# Split-specific names are used on purpose: the notebook-level `trainset` and
# `testset` (the 75/25 split) are read as globals by get_Iu / get_Ui, so
# rebinding them here would silently change what those helpers report if the
# error-analysis cell were re-run after this one.
for loo_trainset, loo_testset in LOOCV.split(data):
    print("Computing recommendations with leave-one-out...")
    algo.fit(loo_trainset)

    print("Predict ratings for left-out set...")
    left_out_predictions = algo.test(loo_testset)

    print("Predict all missing ratings...")
    # build_anti_testset returns a list of ratings that can be used as a testset in the test() method.
    big_testset = loo_trainset.build_anti_testset()
    all_predictions = algo.test(big_testset)

    print("Compute top 10 recommendations per user...")
    topN_predicted = metrics.GetTopN(all_predictions, n=10, min_rating=4.0)

    print("\nHit Rate: ", metrics.HitRate(topN_predicted, left_out_predictions))
    print("\nRating Hit Rate: ")
    # The return value is not used here; the per-rating lines shown in the
    # cell output are emitted by the call itself.
    metrics.RatingHitRate(topN_predicted, left_out_predictions)
    print("\nCumulative Hit Rate: ", metrics.CumulativeHitRate(topN_predicted, left_out_predictions, rating_cutoff=5.0))
    print("\nAverage Reciprocal Hit Rate: ", metrics.AverageReciprocalHitRank(topN_predicted, left_out_predictions))
    print("\nUser Coverage: ", metrics.UserCoverage(topN_predicted, 5))

Using ALS
Computing recommendations with leave-one-out...
Estimating biases using als...
Predict ratings for left-out set...
Predict all missing ratings...
Compute top 10 recommendations per user...

Hit Rate:  0.0074142724745134385

Rating Hit Rate: 
0.0 0.0014570179698882952
7.0 0.010526315789473684
8.0 0.0035714285714285713
10.0 0.0625

Cumulative Hit Rate:  0.018502202643171806

Average Reciprocal Hit Rate:  0.0033872976644060983

User Coverage:  0.7624343527957986
