In [1]:
import os
import pandas as pd
import numpy as np


import surprise

from surprise import SVD, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBasic, KNNBaseline, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV  #not as same as the cross_validation from scikit learn?
from surprise import Reader, Dataset, accuracy
from surprise import dump

#libraries for data visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
%matplotlib inline
import seaborn as sns
#sns.set_palette('Set2')
sns.set_color_codes("pastel")

# Load built in data from surprise library

In [2]:
data = surprise.Dataset.load_builtin('ml-100k')   #Dataset.load_builtin('ml-100k')

In [3]:
df = pd.DataFrame(data.raw_ratings, columns=["user", "item", "rate", "id"])
del df["id"]
df.head(10)

Unnamed: 0,user,item,rate
0,196,242,3.0
1,186,302,3.0
2,22,377,1.0
3,244,51,2.0
4,166,346,1.0
5,298,474,4.0
6,115,265,2.0
7,253,465,5.0
8,305,451,3.0
9,6,86,3.0


In [4]:
np.sort( df.rate.unique() )

array([1., 2., 3., 4., 5.])

In [5]:
df.isnull().sum()

user    0
item    0
rate    0
dtype: int64

In [37]:
df.groupby('item')[['rate']].mean().sort_values( by='rate', ascending=False)[:10]

Unnamed: 0_level_0,rate
item,Unnamed: 1_level_1
1500,5.0
1293,5.0
1189,5.0
1653,5.0
1467,5.0
1122,5.0
1599,5.0
1201,5.0
1536,5.0
814,5.0


In [None]:
df.groupby('itemp')

# model application

In [8]:
trainset = data.build_full_trainset()
testset = trainset.build_testset()

train, test = train_test_split(df, test_size=0.25, random_state=0)

In [9]:
algo = SVD()
algo.fit(trainset)

predictions = algo.test(testset)

In [10]:
accuracy.rmse(predictions)

RMSE: 0.6753


0.6753353885026904

In [11]:
predictions

[Prediction(uid='196', iid='242', r_ui=3.0, est=3.6759698006198342, details={'was_impossible': False}),
 Prediction(uid='196', iid='393', r_ui=4.0, est=3.7679536732123955, details={'was_impossible': False}),
 Prediction(uid='196', iid='381', r_ui=4.0, est=3.4253368479565114, details={'was_impossible': False}),
 Prediction(uid='196', iid='251', r_ui=3.0, est=3.9902666324975518, details={'was_impossible': False}),
 Prediction(uid='196', iid='655', r_ui=5.0, est=4.248156194639818, details={'was_impossible': False}),
 Prediction(uid='196', iid='67', r_ui=5.0, est=3.561784577106656, details={'was_impossible': False}),
 Prediction(uid='196', iid='306', r_ui=4.0, est=4.038074781857716, details={'was_impossible': False}),
 Prediction(uid='196', iid='238', r_ui=4.0, est=3.3959518103229827, details={'was_impossible': False}),
 Prediction(uid='196', iid='663', r_ui=5.0, est=4.233639984433278, details={'was_impossible': False}),
 Prediction(uid='196', iid='111', r_ui=4.0, est=3.5021553043145657, d

In [16]:
benchmark = []

for algo in [SVD(), KNNBaseline(), KNNBasic(), BaselineOnly()]:
    results = cross_validate(algo, data, measures=['rmse'], 
                             cv=3, verbose=False)
    
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algo).split('.')[-1]] , index=['Algorithm']))
    benchmark.append(tmp)
    

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


In [20]:
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
KNNBaseline object at 0x7fd18d153850>,0.935529,0.282634,4.695073
SVD object at 0x7fd18d153ad0>,0.946523,2.884495,0.195349
BaselineOnly object at 0x7fd18d153310>,0.946572,0.117564,0.194544
KNNBasic object at 0x7fd18d153d10>,0.988409,0.206372,4.320039


from surprise import GridSearch
    # Select your best algo with grid search.
print('Grid Search...')
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005]}
grid_search = GridSearch(SVD, param_grid, measures=['RMSE'], verbose=1)
print(grid_search.best_params)
grid_search.evaluate(data)

In [22]:
algo = SVD()  # SVD(n_factors=50, random_state=0)
results = cross_validate(algo, 
                         data,
                         measures = ['rmse'], 
                         cv=3,
                        verbose=True)

Evaluating RMSE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9429  0.9482  0.9420  0.9444  0.0027  
Fit time          3.42    3.45    3.59    3.48    0.07    
Test time         0.21    0.29    0.21    0.23    0.04    


# content-based recommender systems

In [44]:
df_table = df.pivot_table(index='user', columns='item',values='rate') #df.set_index(["user", "item"]).unstack()
df_table.shape

(943, 1682)

In [45]:
df_table

item,1,10,100,1000,1001,1002,1003,1004,1005,1006,...,990,991,992,993,994,995,996,997,998,999
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,5.0,,,,,,,,...,,,,,,,,,,
10,4.0,,5.0,,,,,,,,...,,,,,,,,,,
100,,,,,,,,,,,...,3.0,,,,,,,,,
101,3.0,,,,,,,,,,...,,,,,,,,,,
102,3.0,,,,,,,,,,...,,,,2.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,5.0,,,,,,,,,,...,,,,,,,,,,
96,5.0,,5.0,,,,,,,,...,,,,,,,,,,
97,4.0,,2.0,,,,,,,,...,,,,,,,,,,
98,,,,,,,,,,,...,,,,,,,,,,


In [46]:
df_table['1']

user
1      5.0
10     4.0
100    NaN
101    3.0
102    3.0
      ... 
95     5.0
96     5.0
97     4.0
98     NaN
99     4.0
Name: 1, Length: 943, dtype: float64

In [57]:
corr_item_1 = df_table.corr()['1']
corr_item_1.head()

item
1       1.000000
10      0.187502
100     0.105460
1000   -0.500000
1001   -0.244073
Name: 1, dtype: float64

# Baseline model

In [90]:
from surprise.model_selection import KFold

bsl_options = {
    'method': 'als', #ALS - Alternating Least Squares
    'n_epochs': 5,
    'reg_u': 12,
    'reg_i': 5
}
algo = surprise.BaselineOnly(bsl_options)

np.random.seed(0)
acc = np.zeros(3)
cv = KFold(3)
for i, (trainset, testset) in enumerate(cv.split(data)):
    algo.fit(trainset)
    predictions = algo.test(testset)
    acc[i] = surprise.accuracy.rmse(predictions, verbose=True)
acc.mean()

Estimating biases using als...
RMSE: 0.9453
Estimating biases using als...
RMSE: 0.9377
Estimating biases using als...
RMSE: 0.9500


0.9443304984013942

In [125]:
cross_validate(algo, data)

{'test_rmse': array([0.93819203, 0.9273263 , 0.93157922, 0.93909507, 0.94453112]),
 'test_mae': array([0.73759499, 0.73186477, 0.73583537, 0.74017781, 0.74563123]),
 'fit_time': (3.937307119369507,
  3.9623632431030273,
  3.8970069885253906,
  3.783194065093994,
  3.644335985183716),
 'test_time': (0.11777305603027344,
  0.11740732192993164,
  0.12404990196228027,
  0.2074587345123291,
  0.10339713096618652)}