In [9]:
import os
import pandas as pd
import numpy as np


import surprise

from surprise import SVD, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBasic, KNNBaseline, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV  #not as same as the cross_validation from scikit learn?
from surprise import Reader, Dataset, accuracy
from surprise import dump

from sklearn.metrics.pairwise import cosine_similarity

from sklearn import preprocessing

#libraries for data visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
%matplotlib inline
import seaborn as sns
#sns.set_palette('Set2')
sns.set_color_codes("pastel")

# Load built in data from surprise library

In [2]:
data = surprise.Dataset.load_builtin('ml-100k')   #Dataset.load_builtin('ml-100k')

In [3]:
df = pd.DataFrame(data.raw_ratings, columns=["user", "item", "rate", "id"])
del df["id"]
df.head(10)

Unnamed: 0,user,item,rate
0,196,242,3.0
1,186,302,3.0
2,22,377,1.0
3,244,51,2.0
4,166,346,1.0
5,298,474,4.0
6,115,265,2.0
7,253,465,5.0
8,305,451,3.0
9,6,86,3.0


# Data exploration

In [4]:
print("number of users: ", df.user.nunique())
print("number of movies: ", df.item.nunique())

number of users:  943
number of movies:  1682


In [5]:
np.sort( df.rate.unique() )

array([1., 2., 3., 4., 5.])

In [6]:
df.isnull().sum()

user    0
item    0
rate    0
dtype: int64

In [7]:
df.groupby('item')[['rate']].mean().sort_values( by='rate', ascending=False)[:10]

Unnamed: 0_level_0,rate
item,Unnamed: 1_level_1
1500,5.0
1293,5.0
1189,5.0
1653,5.0
1467,5.0
1122,5.0
1599,5.0
1201,5.0
1536,5.0
814,5.0


# Collaborative filtering

In [49]:
df_pvt = df.pivot_table(index='item', columns='user',values='rate')# <- not to use
df_pvt = df_pvt.fillna(0)

items_similarity = pd.DataFrame(cosine_similarity(df_pvt), index=df_pvt.index, columns=df_pvt.index)
items_similarity

item,1,10,100,1000,1001,1002,1003,1004,1005,1006,...,990,991,992,993,994,995,996,997,998,999
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.273935,0.630601,0.114364,0.109701,0.084998,0.100452,0.061399,0.093729,0.101029,...,0.085165,0.091251,0.070873,0.353507,0.060461,0.085890,0.098793,0.141426,0.138101,0.099471
10,0.273935,1.000000,0.336233,0.044153,0.053510,0.025705,0.046341,0.000000,0.224322,0.130158,...,0.085546,0.081927,0.067685,0.027119,0.048401,0.123714,0.007909,0.033473,0.121554,0.000000
100,0.630601,0.336233,1.000000,0.084698,0.144486,0.084889,0.061514,0.097603,0.153229,0.168087,...,0.089523,0.098840,0.075603,0.191055,0.090889,0.119886,0.053259,0.139800,0.127146,0.038349
1000,0.114364,0.044153,0.084698,1.000000,0.381626,0.198191,0.044662,0.000000,0.000000,0.042403,...,0.021277,0.000000,0.456630,0.104545,0.000000,0.000000,0.365864,0.129040,0.288370,0.521862
1001,0.109701,0.053510,0.144486,0.381626,1.000000,0.192154,0.014434,0.000000,0.000000,0.000000,...,0.025786,0.016116,0.379473,0.081088,0.000000,0.005315,0.128093,0.260643,0.183478,0.210819
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.085890,0.123714,0.119886,0.000000,0.005315,0.008511,0.046029,0.000000,0.058251,0.032776,...,0.106898,0.154184,0.033615,0.014366,0.128201,1.000000,0.000000,0.016624,0.027862,0.000000
996,0.098793,0.007909,0.053259,0.365864,0.128093,0.015778,0.142220,0.010056,0.005400,0.006751,...,0.000000,0.000000,0.000000,0.123177,0.014854,0.000000,1.000000,0.154092,0.137742,0.301202
997,0.141426,0.033473,0.139800,0.129040,0.260643,0.016695,0.150482,0.138329,0.005713,0.007144,...,0.032260,0.050408,0.000000,0.098631,0.015717,0.016624,0.154092,1.000000,0.081981,0.186824
998,0.138101,0.121554,0.127146,0.288370,0.183478,0.083943,0.176552,0.000000,0.143636,0.137690,...,0.022529,0.000000,0.276289,0.103319,0.000000,0.027862,0.137742,0.081981,1.000000,0.055258


In [50]:
df_pvt = df.pivot_table(index='user', columns='item',values='rate')# <- not to use
df_pvt = df_pvt.fillna(0)
#df_pvt = df_pvt.fillna(df_pvt.mean(axis=0))

user_similarity = pd.DataFrame(cosine_similarity(df_pvt), index=df_pvt.index, columns=df_pvt.index)
user_similarity

user,1,10,100,101,102,103,104,105,106,107,...,94,940,941,942,943,95,96,97,98,99
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.376544,0.057354,0.154457,0.397145,0.226111,0.231134,0.102414,0.288715,0.080152,...,0.480574,0.314072,0.148617,0.179508,0.398175,0.468911,0.361273,0.352280,0.135963,0.281790
10,0.376544,1.000000,0.066987,0.030877,0.288020,0.145788,0.156615,0.077478,0.284969,0.095741,...,0.418951,0.342961,0.090305,0.212330,0.221860,0.375380,0.341416,0.301478,0.140115,0.193943
100,0.057354,0.066987,1.000000,0.012998,0.171988,0.055922,0.375736,0.507451,0.041973,0.329853,...,0.090008,0.289131,0.099363,0.237968,0.000000,0.036854,0.000000,0.000000,0.006082,0.192041
101,0.154457,0.030877,0.012998,1.000000,0.153750,0.321378,0.268215,0.037979,0.035617,0.031846,...,0.160808,0.083545,0.237333,0.081874,0.232549,0.180604,0.086543,0.073269,0.000000,0.360147
102,0.397145,0.288020,0.171988,0.153750,1.000000,0.237494,0.215930,0.185148,0.123308,0.128144,...,0.439064,0.323743,0.164963,0.201437,0.383536,0.416339,0.281648,0.301406,0.169144,0.294568
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.468911,0.375380,0.036854,0.180604,0.416339,0.230296,0.144948,0.019579,0.278008,0.010945,...,0.529910,0.330689,0.110761,0.247625,0.400270,1.000000,0.345199,0.343122,0.133712,0.278597
96,0.361273,0.341416,0.000000,0.086543,0.281648,0.226449,0.099971,0.000000,0.158699,0.000000,...,0.342054,0.276660,0.110265,0.231118,0.288281,0.345199,1.000000,0.308487,0.074055,0.216023
97,0.352280,0.301478,0.000000,0.073269,0.301406,0.148177,0.060202,0.000000,0.155511,0.000000,...,0.305776,0.291914,0.150569,0.220158,0.257559,0.343122,0.308487,1.000000,0.119453,0.160602
98,0.135963,0.140115,0.006082,0.000000,0.169144,0.030079,0.024732,0.000000,0.181077,0.029806,...,0.141533,0.152999,0.000000,0.124439,0.066392,0.133712,0.074055,0.119453,1.000000,0.079174


In [51]:
df_df_pvt

item,1,10,100,1000,1001,1002,1003,1004,1005,1006,...,990,991,992,993,994,995,996,997,998,999
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
102,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,5.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,4.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


df_table = df.set_index(["user", "item"]).unstack()
#df.pivot_table(index='user', columns='item',values='rate') <- not to use
#df.set_index(["user", "item"]).unstack()
df_table.shape

### user-item matrix  (user utility matrix)

df_table

df_table.corr()['1'].sort_values(ascending=False).head()

from scipy.sparse import csr_matrix
csr_matrix(df_table)

#### calcualte sparsity

In [20]:
df_table.values

np.isnan( df_table.values  ).sum()

np.prod(df_table.values.shape)

np.prod(df_table.shape)

#sparsity of matrix = Number of Empty cells / Total Number of cells.
sparsity = 1 - np.isnan(df_table.values).sum() / np.prod(df_table.shape)
print("Sparsity: ", '{:2.2%}'.format(sparsity) )

Sparsity:  6.30%


#### Only 6.3% of cells in the user-item matrix are populated with ratings. A general rule of thumb is that your matrix sparsity should be no lower than 0.5% to generate decent results.

### Normalization (mean normalization)
    need to normalize ratings by accounting for user and item bias
    subtract item's average rating from each user's rating for given item.

In [21]:
df_table.isnull().sum()

      item
rate  1       491
      10      854
      100     435
      1000    933
      1001    926
             ... 
      995     912
      996     929
      997     927
      998     927
      999     933
Length: 1682, dtype: int64

#Users who have not rated any moveis. 
df_table[df_table.isnull()]

#users who are likely give out low ratings in general
df.groupby('user').mean().sort_values(by='rate', ascending=True)[:10]

#users who are likely give out high ratings in general
df.groupby('user').mean().sort_values(by='rate', ascending=False)[:10]

In [27]:
df_table = df_table.fillna(df_table.mean(axis=0))
df_table

Unnamed: 0_level_0,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate
item,1,10,100,1000,1001,1002,1003,1004,1005,1006,...,990,991,992,993,994,995,996,997,998,999
user,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,5.000000,3.000000,5.000000,3.0,2.0,1.875,2.25,3.111111,3.681818,2.826087,...,3.090909,2.24,3.0,3.515152,2.285714,3.16129,2.5,2.125,2.5625,2.8
10,4.000000,3.831461,5.000000,3.0,2.0,1.875,2.25,3.111111,3.681818,2.826087,...,3.090909,2.24,3.0,3.515152,2.285714,3.16129,2.5,2.125,2.5625,2.8
100,3.878319,3.831461,4.155512,3.0,2.0,1.875,2.25,3.111111,3.681818,2.826087,...,3.000000,2.24,3.0,3.515152,2.285714,3.16129,2.5,2.125,2.5625,2.8
101,3.000000,3.831461,4.155512,3.0,2.0,1.875,2.25,3.111111,3.681818,2.826087,...,3.090909,2.24,3.0,3.515152,2.285714,3.16129,2.5,2.125,2.5625,2.8
102,3.000000,3.831461,4.155512,3.0,2.0,1.875,2.25,3.111111,3.681818,2.826087,...,3.090909,2.24,3.0,2.000000,2.285714,3.16129,2.5,2.125,2.5625,2.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,5.000000,3.831461,4.155512,3.0,2.0,1.875,2.25,3.111111,3.681818,2.826087,...,3.090909,2.24,3.0,3.515152,2.285714,3.16129,2.5,2.125,2.5625,2.8
96,5.000000,3.831461,5.000000,3.0,2.0,1.875,2.25,3.111111,3.681818,2.826087,...,3.090909,2.24,3.0,3.515152,2.285714,3.16129,2.5,2.125,2.5625,2.8
97,4.000000,3.831461,2.000000,3.0,2.0,1.875,2.25,3.111111,3.681818,2.826087,...,3.090909,2.24,3.0,3.515152,2.285714,3.16129,2.5,2.125,2.5625,2.8
98,3.878319,3.831461,4.155512,3.0,2.0,1.875,2.25,3.111111,3.681818,2.826087,...,3.090909,2.24,3.0,3.515152,2.285714,3.16129,2.5,2.125,2.5625,2.8


In [28]:
# mean normalization
scaler = preprocessing.StandardScaler().fit(df_table)
df_table_norm = scaler.transform(df_table)

In [30]:
df_table_norm = pd.DataFrame(df_table_norm, 
                             index=df_table.index, 
                             columns=df_table.columns)#.fillna(df_table.mean(axis=0))
df_table_norm

Unnamed: 0_level_0,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate
item,1,10,100,1000,1001,1002,1003,1004,1005,1006,...,990,991,992,993,994,995,996,997,998,999
user,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,1.747984e+00,-2.684360e+00,1.180332,0.0,0.0,0.0,0.0,-4.574065e-15,6.937899e-15,-2.519185e-15,...,-1.749984e-15,-5.103604e-15,0.0,1.753488e-15,1.000699e-14,-2.051382e-15,0.0,0.0,0.0,0.0
10,1.896235e-01,-1.433736e-15,1.180332,0.0,0.0,0.0,0.0,-4.574065e-15,6.937899e-15,-2.519185e-15,...,-1.749984e-15,-5.103604e-15,0.0,1.753488e-15,1.000699e-14,-2.051382e-15,0.0,0.0,0.0,0.0
100,-1.384102e-15,-1.433736e-15,0.000000,0.0,0.0,0.0,0.0,-4.574065e-15,6.937899e-15,-2.519185e-15,...,-3.582377e-01,-5.103604e-15,0.0,1.753488e-15,1.000699e-14,-2.051382e-15,0.0,0.0,0.0,0.0
101,-1.368737e+00,-1.433736e-15,0.000000,0.0,0.0,0.0,0.0,-4.574065e-15,6.937899e-15,-2.519185e-15,...,-1.749984e-15,-5.103604e-15,0.0,1.753488e-15,1.000699e-14,-2.051382e-15,0.0,0.0,0.0,0.0
102,-1.368737e+00,-1.433736e-15,0.000000,0.0,0.0,0.0,0.0,-4.574065e-15,6.937899e-15,-2.519185e-15,...,-1.749984e-15,-5.103604e-15,0.0,-5.982581e+00,1.000699e-14,-2.051382e-15,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1.747984e+00,-1.433736e-15,0.000000,0.0,0.0,0.0,0.0,-4.574065e-15,6.937899e-15,-2.519185e-15,...,-1.749984e-15,-5.103604e-15,0.0,1.753488e-15,1.000699e-14,-2.051382e-15,0.0,0.0,0.0,0.0
96,1.747984e+00,-1.433736e-15,1.180332,0.0,0.0,0.0,0.0,-4.574065e-15,6.937899e-15,-2.519185e-15,...,-1.749984e-15,-5.103604e-15,0.0,1.753488e-15,1.000699e-14,-2.051382e-15,0.0,0.0,0.0,0.0
97,1.896235e-01,-1.433736e-15,-3.012736,0.0,0.0,0.0,0.0,-4.574065e-15,6.937899e-15,-2.519185e-15,...,-1.749984e-15,-5.103604e-15,0.0,1.753488e-15,1.000699e-14,-2.051382e-15,0.0,0.0,0.0,0.0
98,-1.384102e-15,-1.433736e-15,0.000000,0.0,0.0,0.0,0.0,-4.574065e-15,6.937899e-15,-2.519185e-15,...,-1.749984e-15,-5.103604e-15,0.0,1.753488e-15,1.000699e-14,-2.051382e-15,0.0,0.0,0.0,0.0


df_table_norm.shape

## similarity based on cosine metrics

In [31]:


user_based_collabor = cosine_similarity(df_table_norm)
user_based_collabor

array([[ 1.00000000e+00,  4.17495337e-03,  6.67086392e-03, ...,
         1.04324263e-01, -2.91643607e-02, -1.59275968e-02],
       [ 4.17495337e-03,  1.00000000e+00, -1.03747431e-02, ...,
        -4.13089074e-02,  1.35275418e-02, -4.44923387e-02],
       [ 6.67086392e-03, -1.03747431e-02,  1.00000000e+00, ...,
        -3.58290250e-17,  2.96887189e-03,  3.09173666e-02],
       ...,
       [ 1.04324263e-01, -4.13089074e-02, -3.58290250e-17, ...,
         1.00000000e+00,  4.15727182e-03, -8.84314363e-03],
       [-2.91643607e-02,  1.35275418e-02,  2.96887189e-03, ...,
         4.15727182e-03,  1.00000000e+00, -3.49991578e-02],
       [-1.59275968e-02, -4.44923387e-02,  3.09173666e-02, ...,
        -8.84314363e-03, -3.49991578e-02,  1.00000000e+00]])

In [32]:
user_based_collabor.shape

(943, 943)

In [33]:
user_similarity = pd.DataFrame(data=user_based_collabor, 
                                index=df_table.index, 
                                columns=df_table.index)

In [34]:
user_similarity

user,1,10,100,101,102,103,104,105,106,107,...,94,940,941,942,943,95,96,97,98,99
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.004175,6.670864e-03,-4.063929e-02,0.026185,-0.038452,-0.004261,5.943633e-02,-0.016445,1.318275e-02,...,0.061482,-0.005764,1.530273e-02,-0.072516,-4.747351e-02,-0.016778,9.438812e-03,1.043243e-01,-2.916436e-02,-0.015928
10,0.004175,1.000000,-1.037474e-02,-1.275766e-03,-0.036779,-0.008855,-0.012981,-5.902711e-03,0.048677,-6.579796e-03,...,0.010686,-0.028991,1.522382e-02,0.055093,-8.760490e-04,-0.064844,-3.865276e-02,-4.130891e-02,1.352754e-02,-0.044492
100,0.006671,-0.010375,1.000000e+00,-1.992130e-03,-0.004028,0.002281,0.031316,-4.338612e-02,-0.001555,1.390645e-02,...,-0.010342,-0.093843,6.888880e-03,-0.005399,6.905475e-17,-0.001921,-1.164526e-16,-3.582902e-17,2.968872e-03,0.030917
101,-0.040639,-0.001276,-1.992130e-03,1.000000e+00,0.009106,0.019500,-0.013154,2.423987e-03,0.004283,-1.059296e-03,...,-0.012765,0.001020,2.091435e-02,-0.019117,7.756135e-03,-0.016543,-8.369877e-03,-1.014376e-02,-1.303897e-16,0.040542
102,0.026185,-0.036779,-4.027821e-03,9.106114e-03,1.000000,0.016339,0.038233,2.390839e-03,-0.000285,1.910828e-02,...,-0.005529,0.054660,-6.920105e-02,-0.025540,-2.897801e-02,0.001185,-4.796444e-02,-5.279640e-03,3.756581e-02,-0.022244
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.016778,-0.064844,-1.920687e-03,-1.654288e-02,0.001185,0.008067,-0.002855,1.291757e-03,0.000482,-3.450411e-03,...,-0.025853,0.012363,1.018271e-02,-0.020189,-5.980051e-04,1.000000,8.652163e-03,-1.041474e-02,-1.840873e-02,0.014374
96,0.009439,-0.038653,-1.164526e-16,-8.369877e-03,-0.047964,-0.006588,-0.001733,-3.014374e-16,-0.011713,8.427297e-17,...,0.006803,-0.017019,2.683076e-02,-0.009881,-2.650100e-02,0.008652,1.000000e+00,6.370871e-02,-1.187362e-02,0.007785
97,0.104324,-0.041309,-3.582902e-17,-1.014376e-02,-0.005280,-0.024987,-0.003299,-4.149017e-17,0.012584,2.545743e-16,...,0.012701,0.009063,4.057270e-02,-0.033380,-4.609338e-02,-0.010415,6.370871e-02,1.000000e+00,4.157272e-03,-0.008843
98,-0.029164,0.013528,2.968872e-03,-1.303897e-16,0.037566,-0.002617,-0.002880,-4.394455e-17,0.015183,5.689219e-03,...,0.012236,-0.020072,-3.764850e-16,0.041741,1.178150e-02,-0.018409,-1.187362e-02,4.157272e-03,1.000000e+00,-0.034999


In [None]:
def get_similar_user(user_id):
    return user_similarity[user_id].sort_values(ascending=False)[:5]

In [None]:
get_similar_user("100")

### Pick a model

Matric factorization. factorize the user-itm matrix to get 2 latent factor matrics:
    - user-factor matrix
    - item -factor matric

Algorithms for matric factorization:
    - Alternating Least Squares (ALS)
    - Stochastic Gradient Descent (SGD)
    - Singular Value Decompostion (SVD)

cross_validate(SVD(), np.array(df_table_norm), 
               measures=['rmse'], cv=3, verbose=False)

In [None]:
cross_validate(SVD(), data, measures=['rmse'], cv=3, verbose=False)

In [None]:
benchmark = []

for algo in [SVD(), KNNBaseline(), KNNBasic(), BaselineOnly()]:
    results = cross_validate(algo, data, measures=['rmse'], 
                             cv=3, verbose=False)
    
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algo).split('.')[-1]] , index=['Algorithm']))
    benchmark.append(tmp)
    

In [None]:
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

### Pick an evaluation metric


### Hyperparameter Tuning
    - try to tune k (# of factors)
    - try to tune lambda regularization parameter
  
methods: 
- Grid Search 
    sklearn.model_selection.GridSearchCV
- Random Search
    sklearn.model_selection.RandomizedSearchCV
- Sequential Model-Based Optimization

### Model training
train model with optimal hyperparameters

### Post-processing
sort predicted ratings and get top N
Filter out items that a user has already has seen

### Evaluation
- A/B testing from users - most optimal method
- Traditional ML. 
- Recommendation systems. 

Precision and Recall

# model application

train, test = train_test_split(df, test_size=0.25, random_state=0)

In [None]:
trainset = data.build_full_trainset()
testset = trainset.build_testset()

In [None]:
algo = SVD()
algo.fit(trainset)

predictions = algo.test(testset)

In [None]:
accuracy.rmse(predictions)

In [None]:
predictions

from surprise import GridSearch
    # Select your best algo with grid search.
print('Grid Search...')
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005]}
grid_search = GridSearch(SVD, param_grid, measures=['RMSE'], verbose=1)
print(grid_search.best_params)
grid_search.evaluate(data)

In [None]:
algo = SVD()  # SVD(n_factors=50, random_state=0)
results = cross_validate(algo, 
                         data,
                         measures = ['rmse'], 
                         cv=3,
                        verbose=True)

# Baseline model

In [None]:
from surprise.model_selection import KFold

bsl_options = {
    'method': 'als', #ALS - Alternating Least Squares
    'n_epochs': 5,
    'reg_u': 12,
    'reg_i': 5
}
algo = surprise.BaselineOnly(bsl_options)

np.random.seed(0)
acc = np.zeros(3)
cv = KFold(3)
for i, (trainset, testset) in enumerate(cv.split(data)):
    algo.fit(trainset)
    predictions = algo.test(testset)
    acc[i] = surprise.accuracy.rmse(predictions, verbose=True)
acc.mean()

In [None]:
cross_validate(algo, data)