In [1]:
import pandas as pd
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
# Initialise plotly's notebook mode (imported from plotly.offline above) before
# any figure is drawn with iplot().
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead of
# embedding it in the notebook — confirm against the plotly docs.
init_notebook_mode(connected=True)

from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import Reader

# Read the raw ratings table into a DataFrame and preview its first rows.
ratings_csv = "./BX-CSV/BookRatings.csv"
df = pd.read_csv(ratings_csv)
df.head()

Unnamed: 0,userid,ISBN,rating
0,243,60915544,10
1,243,60977493,7
2,243,156006529,0
3,243,316096199,0
4,243,316601950,9


In [2]:
# How many ratings did each user submit?
data = df.groupby('userid')['rating'].count()

# Histogram of those per-user counts, binned in steps of 2 over the range 0-50.
trace = go.Histogram(
    x=data.values,
    name='Ratings',
    xbins={'start': 0, 'end': 50, 'size': 2},
)

# Titles, axis labels and spacing between the bars.
layout = go.Layout(
    title='Distribution Of Number of Ratings Per User',
    xaxis={'title': 'Ratings Per User'},
    yaxis={'title': 'Count'},
    bargap=0.2,
)

# Assemble the figure and render it inline.
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [3]:
# Build the Surprise dataset directly from the ratings CSV.
file_path = "./BX-CSV/BookRatings.csv"

# Each line is "user,item,rating" on a 0-10 scale; the first line of the file
# is a header row, so it is skipped.
reader = Reader(
    line_format='user item rating',
    sep=',',
    rating_scale=(0, 10),
    skip_lines=1,
)
data = Dataset.load_from_file(file_path, reader=reader)

In [16]:
from surprise import NormalPredictor, BaselineOnly, KNNBaseline, SVD, SVDpp, CoClustering, NMF, SlopeOne

# Compare several Surprise algorithms by 3-fold cross-validated RMSE.
# SlopeOne, NMF and CoClustering are imported above but are not part of this run.
benchmark = []
for algorithm in [SVD(), SVDpp(), NormalPredictor(), KNNBaseline(), BaselineOnly()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)

    # Average every reported metric (test_rmse, fit_time, test_time) over the folds.
    record = pd.DataFrame.from_dict(results).mean(axis=0).to_dict()

    # Label the row with the algorithm's class name. A plain dict record is used
    # because Series.append no longer exists in pandas >= 2.0.
    record['Algorithm'] = type(algorithm).__name__
    benchmark.append(record)

# One row per algorithm, best (lowest) RMSE first.
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BaselineOnly,3.379937,0.114256,0.220361
KNNBaseline,3.492386,0.602707,4.411165
SVD,3.54416,3.520979,0.293712
SVDpp,3.817582,84.318835,3.599812
NormalPredictor,4.711406,0.096762,0.253889


In [5]:
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise import BaselineOnly

# Options forwarded to BaselineOnly: ALS fitting for 5 epochs.
# NOTE(review): reg_u / reg_i are presumably the user / item regularisation
# strengths — confirm against the Surprise baseline-estimates docs.
bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 12,
               'reg_i': 5
               }

# Fixed seed for the shuffle behind the train/test split, so the RMSE and the
# best/worst prediction tables derived from `predictions` are reproducible
# across kernel restarts.
SPLIT_SEED = 42

# BaselineOnly had the lowest test RMSE in the benchmark, so it is the model
# evaluated on a 75/25 hold-out split here.
trainset, testset = train_test_split(data, test_size=0.25, random_state=SPLIT_SEED)
algo = BaselineOnly(bsl_options=bsl_options)
predictions = algo.fit(trainset).test(testset)

# Prints the RMSE and keeps the value in `acc`.
acc = accuracy.rmse(predictions)

Estimating biases using als...
RMSE: 3.3510


In [8]:
# Peek at the first five predictions made on the held-out test set.
predictions[:5]

[Prediction(uid='16916', iid='440213290', r_ui=0.0, est=1.2724379523831697, details={'was_impossible': False}),
 Prediction(uid='231237', iid='553579754', r_ui=0.0, est=3.531503325428009, details={'was_impossible': False}),
 Prediction(uid='144555', iid='425174271', r_ui=8.0, est=2.0187836604917866, details={'was_impossible': False}),
 Prediction(uid='99630', iid='068484477X', r_ui=0.0, est=2.520605577418237, details={'was_impossible': False}),
 Prediction(uid='21014', iid='399148027', r_ui=7.0, est=2.239091304939575, details={'was_impossible': False})]

In [23]:
def get_Iu(uid):
    """Count the items a user has rated in the training set.

    args:
      uid: the raw id of the user
    returns:
      the number of items rated by that user, or 0 when the user does not
      appear in the trainset.
    """
    n_items = 0  # default for users unknown to the trainset
    try:
        n_items = len(trainset.ur[trainset.to_inner_uid(uid)])
    except ValueError:
        # to_inner_uid rejected the raw id: the user was not part of the trainset
        pass
    return n_items
    
def get_Ui(iid):
    """Count the users who rated an item in the training set.

    args:
      iid: the raw id of the item
    returns:
      the number of users that have rated the item, or 0 when the item does
      not appear in the trainset.
    """
    n_users = 0  # default for items unknown to the trainset
    try:
        n_users = len(trainset.ir[trainset.to_inner_iid(iid)])
    except ValueError:
        # to_inner_iid rejected the raw id: the book was not part of the trainset
        pass
    return n_users
    
# Tabulate the test-set predictions, one row per prediction.
df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])

# Attach how much training data backs each prediction, plus the absolute error.
df['Iu'] = df['uid'].apply(get_Iu)   # items rated by the user in the trainset
df['Ui'] = df['iid'].apply(get_Ui)   # users who rated the item in the trainset
df['err'] = (df['est'] - df['rui']).abs()

# Rank by error once; the two ends of the ranking are the best and worst cases.
ranked_by_err = df.sort_values(by='err')
best_predictions = ranked_by_err.head(10)
worst_predictions = ranked_by_err.tail(10)
best_predictions

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
30551,212898,553575090,0.0,0.0,{'was_impossible': False},182,34,0.0
9669,191187,067101417X,0.0,0.0,{'was_impossible': False},54,41,0.0
33942,228998,553560441,0.0,0.0,{'was_impossible': False},187,39,0.0
4501,165759,446604402,0.0,0.0,{'was_impossible': False},21,46,0.0
33948,76352,553578359,0.0,0.0,{'was_impossible': False},498,25,0.0
30908,180957,515113328,0.0,0.0,{'was_impossible': False},55,42,0.0
9640,227447,440225299,0.0,0.0,{'was_impossible': False},310,45,0.0
20359,102967,449227545,0.0,0.0,{'was_impossible': False},392,23,0.0
15654,212898,446604232,0.0,0.0,{'was_impossible': False},182,46,0.0
1201,87746,044022165X,0.0,0.0,{'was_impossible': False},164,164,0.0


In [24]:
# Show the ten predictions with the largest absolute error.
worst_predictions

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
5857,115490,081297106X,10.0,0.224067,{'was_impossible': False},168,43,9.775933
1269,166596,312144075,10.0,0.026085,{'was_impossible': False},200,42,9.973915
13300,172030,425125467,10.0,0.0,{'was_impossible': False},109,36,10.0
422,81045,312995423,10.0,0.0,{'was_impossible': False},48,81,10.0
1208,245963,425130711,10.0,0.0,{'was_impossible': False},149,40,10.0
17101,238120,385413041,10.0,0.0,{'was_impossible': False},307,30,10.0
22083,266226,156711427,10.0,0.0,{'was_impossible': False},126,35,10.0
29977,245963,425170349,10.0,0.0,{'was_impossible': False},149,50,10.0
6601,166123,449221504,10.0,0.0,{'was_impossible': False},190,100,10.0
22449,81045,515120278,10.0,0.0,{'was_impossible': False},48,26,10.0
