In [1]:
import os
import pandas as pd
import numpy as np


import surprise

from surprise import SVD, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBasic, KNNBaseline, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV  #not as same as the cross_validation from scikit learn?
from surprise import Reader, Dataset, accuracy
from surprise import dump

from sklearn import preprocessing

#libraries for data visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
%matplotlib inline
import seaborn as sns
#sns.set_palette('Set2')
sns.set_color_codes("pastel")

# Load built in data from surprise library

In [2]:
data = surprise.Dataset.load_builtin('ml-100k')   #Dataset.load_builtin('ml-100k')

In [3]:
df = pd.DataFrame(data.raw_ratings, columns=["user", "item", "rate", "id"])
del df["id"]
df.head(10)

Unnamed: 0,user,item,rate
0,196,242,3.0
1,186,302,3.0
2,22,377,1.0
3,244,51,2.0
4,166,346,1.0
5,298,474,4.0
6,115,265,2.0
7,253,465,5.0
8,305,451,3.0
9,6,86,3.0


# Data exploration

In [4]:
np.sort( df.rate.unique() )

array([1., 2., 3., 4., 5.])

In [5]:
df.isnull().sum()

user    0
item    0
rate    0
dtype: int64

In [6]:
df.groupby('item')[['rate']].mean().sort_values( by='rate', ascending=False)[:10]

Unnamed: 0_level_0,rate
item,Unnamed: 1_level_1
1500,5.0
1293,5.0
1189,5.0
1653,5.0
1467,5.0
1122,5.0
1599,5.0
1201,5.0
1536,5.0
814,5.0


# Collaborative filtering

In [7]:
df_table = df.set_index(["user", "item"]).unstack()
#df.pivot_table(index='user', columns='item',values='rate') <- not to use
#df.set_index(["user", "item"]).unstack()
df_table.shape

(943, 1682)

### user-item matrix  (user utility matrix)

In [8]:
df_table

Unnamed: 0_level_0,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate
item,1,10,100,1000,1001,1002,1003,1004,1005,1006,...,990,991,992,993,994,995,996,997,998,999
user,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,5.0,3.0,5.0,,,,,,,,...,,,,,,,,,,
10,4.0,,5.0,,,,,,,,...,,,,,,,,,,
100,,,,,,,,,,,...,3.0,,,,,,,,,
101,3.0,,,,,,,,,,...,,,,,,,,,,
102,3.0,,,,,,,,,,...,,,,2.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,5.0,,,,,,,,,,...,,,,,,,,,,
96,5.0,,5.0,,,,,,,,...,,,,,,,,,,
97,4.0,,2.0,,,,,,,,...,,,,,,,,,,
98,,,,,,,,,,,...,,,,,,,,,,


df_table.corr()['1'].sort_values(ascending=False).head()

from scipy.sparse import csr_matrix
csr_matrix(df_table)

#### calcualte sparsity

In [9]:
df_table.values

array([[ 5.,  3.,  5., ..., nan, nan, nan],
       [ 4., nan,  5., ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [ 4., nan,  2., ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [ 4., nan,  5., ..., nan, nan, nan]])

In [10]:
np.isnan( df_table.values  ).sum()

1486126

In [11]:
np.prod(df_table.values.shape)

1586126

In [12]:
np.prod(df_table.shape)

1586126

In [13]:
#sparsity of matrix = Number of Empty cells / Total Number of cells.
sparsity = 1 - np.isnan(df_table.values).sum() / np.prod(df_table.shape)
print("Sparsity: ", '{:2.2%}'.format(sparsity) )

Sparsity:  6.30%


#### Only 6.3% of cells in the user-item matrix are populated with ratings. A general rule of thumb is that your matrix sparsity should be no lower than 0.5% to generate decent results.

### Normalization (mean normalization)
    need to normalize ratings by accounting for user and item bias
    subtract item's average rating from each user's rating for given item.

In [14]:
df_table.isnull().sum()

      item
rate  1       491
      10      854
      100     435
      1000    933
      1001    926
             ... 
      995     912
      996     929
      997     927
      998     927
      999     933
Length: 1682, dtype: int64

In [15]:
#Users who have not rated any moveis. 
df_table[df_table.isnull()]

Unnamed: 0_level_0,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate
item,1,10,100,1000,1001,1002,1003,1004,1005,1006,...,990,991,992,993,994,995,996,997,998,999
user,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,
100,,,,,,,,,,,...,,,,,,,,,,
101,,,,,,,,,,,...,,,,,,,,,,
102,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,,,,,,,,,,,...,,,,,,,,,,
96,,,,,,,,,,,...,,,,,,,,,,
97,,,,,,,,,,,...,,,,,,,,,,
98,,,,,,,,,,,...,,,,,,,,,,


In [16]:
# users who are likely give out low ratings in general
df.groupby('user').mean().sort_values(by='rate', ascending=True)[:10]

Unnamed: 0_level_0,rate
user,Unnamed: 1_level_1
181,1.491954
405,1.834464
445,1.985185
685,2.05
774,2.058036
724,2.164706
206,2.171875
865,2.287879
626,2.34375
609,2.392857


In [17]:
# users who are likely give out high ratings in general
df.groupby('user').mean().sort_values(by='rate', ascending=False)[:10]

Unnamed: 0_level_0,rate
user,Unnamed: 1_level_1
849,4.869565
688,4.833333
507,4.724138
628,4.703704
928,4.6875
118,4.661972
907,4.571429
686,4.56338
427,4.548387
565,4.542857


In [18]:
# mean normalization
scaler = preprocessing.StandardScaler().fit(df_table)
df_table_norm = scaler.transform(df_table)

In [19]:
df_table_norm = pd.DataFrame(df_table_norm)
df_table_norm

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,1.210182,-0.824669,0.866324,,,,,,,,...,,,,,,,,,,
1,0.131282,,0.866324,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,-0.067015,,,,,,,,,
3,-0.947618,,,,,,,,,,...,,,,,,,,,,
4,-0.947618,,,,,,,,,,...,,,,-1.582722,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,1.210182,,,,,,,,,,...,,,,,,,,,,
939,1.210182,,0.866324,,,,,,,,...,,,,,,,,,,
940,0.131282,,-2.211246,,,,,,,,...,,,,,,,,,,
941,,,,,,,,,,,...,,,,,,,,,,


In [20]:
df_table_norm.T.shape

(1682, 943)

### Pick a model

Matric factorization. factorize the user-itm matrix to get 2 latent factor matrics:
    - user-factor matrix
    - item -factor matric

Algorithms for matric factorization:
    - Alternating Least Squares (ALS)
    - Stochastic Gradient Descent (SGD)
    - Singular Value Decompostion (SVD)

cross_validate(SVD(), np.array(df_table_norm), 
               measures=['rmse'], cv=3, verbose=False)

In [21]:
cross_validate(SVD(), data, measures=['rmse'], cv=3, verbose=False)

{'test_rmse': array([0.94255916, 0.94599802, 0.94829916]),
 'fit_time': (2.8308491706848145, 2.8646750450134277, 2.814517021179199),
 'test_time': (0.21713519096374512, 0.2125546932220459, 0.2510530948638916)}

In [22]:
benchmark = []

for algo in [SVD(), KNNBaseline(), KNNBasic(), BaselineOnly()]:
    results = cross_validate(algo, data, measures=['rmse'], 
                             cv=3, verbose=False)
    
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algo).split('.')[-1]] , index=['Algorithm']))
    benchmark.append(tmp)
    

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


In [23]:
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
KNNBaseline object at 0x7fbf16feafd0>,0.935926,0.283278,4.704242
SVD object at 0x7fbf16feaf90>,0.946664,2.829528,0.19349
BaselineOnly object at 0x7fbf16fea050>,0.946926,0.11882,0.183958
KNNBasic object at 0x7fbf16fea950>,0.989162,0.23414,4.405996


### Pick an evaluation metric


### Hyperparameter Tuning
    - try to tune k (# of factors)
    - try to tune lambda regularization parameter
  
methods: 
- Grid Search 
    sklearn.model_selection.GridSearchCV
- Random Search
    sklearn.model_selection.RandomizedSearchCV
- Sequential Model-Based Optimization

### Model training
train model with optimal hyperparameters

### Post-processing
sort predicted ratings and get top N
Filter out items that a user has already has seen

### Evaluation
- A/B testing from users - most optimal method
- Traditional ML. 
- Recommendation systems. 

Precision and Recall

# model application

train, test = train_test_split(df, test_size=0.25, random_state=0)

In [25]:
trainset = data.build_full_trainset()
testset = trainset.build_testset()

In [26]:
algo = SVD()
algo.fit(trainset)

predictions = algo.test(testset)

In [27]:
accuracy.rmse(predictions)

RMSE: 0.6768


0.6767747970677724

In [28]:
predictions

[Prediction(uid='196', iid='242', r_ui=3.0, est=3.773395695576679, details={'was_impossible': False}),
 Prediction(uid='196', iid='393', r_ui=4.0, est=3.590598370205465, details={'was_impossible': False}),
 Prediction(uid='196', iid='381', r_ui=4.0, est=3.4174500527342104, details={'was_impossible': False}),
 Prediction(uid='196', iid='251', r_ui=3.0, est=4.057035777760627, details={'was_impossible': False}),
 Prediction(uid='196', iid='655', r_ui=5.0, est=3.8395722026270063, details={'was_impossible': False}),
 Prediction(uid='196', iid='67', r_ui=5.0, est=3.2500831015989595, details={'was_impossible': False}),
 Prediction(uid='196', iid='306', r_ui=4.0, est=3.9788787732425552, details={'was_impossible': False}),
 Prediction(uid='196', iid='238', r_ui=4.0, est=3.8690771857036266, details={'was_impossible': False}),
 Prediction(uid='196', iid='663', r_ui=5.0, est=4.285245222642207, details={'was_impossible': False}),
 Prediction(uid='196', iid='111', r_ui=4.0, est=3.698911381858543, de

from surprise import GridSearch
    # Select your best algo with grid search.
print('Grid Search...')
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005]}
grid_search = GridSearch(SVD, param_grid, measures=['RMSE'], verbose=1)
print(grid_search.best_params)
grid_search.evaluate(data)

In [29]:
algo = SVD()  # SVD(n_factors=50, random_state=0)
results = cross_validate(algo, 
                         data,
                         measures = ['rmse'], 
                         cv=3,
                        verbose=True)

Evaluating RMSE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9487  0.9448  0.9480  0.9472  0.0017  
Fit time          2.91    3.05    3.28    3.08    0.15    
Test time         0.18    0.26    0.21    0.22    0.03    


# Baseline model

In [30]:
from surprise.model_selection import KFold

bsl_options = {
    'method': 'als', #ALS - Alternating Least Squares
    'n_epochs': 5,
    'reg_u': 12,
    'reg_i': 5
}
algo = surprise.BaselineOnly(bsl_options)

np.random.seed(0)
acc = np.zeros(3)
cv = KFold(3)
for i, (trainset, testset) in enumerate(cv.split(data)):
    algo.fit(trainset)
    predictions = algo.test(testset)
    acc[i] = surprise.accuracy.rmse(predictions, verbose=True)
acc.mean()

Estimating biases using als...
RMSE: 0.9453
Estimating biases using als...
RMSE: 0.9377
Estimating biases using als...
RMSE: 0.9500


0.9443304984013942

In [31]:
cross_validate(algo, data)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


{'test_rmse': array([0.9384446 , 0.94651657, 0.93612815, 0.94221861, 0.94428787]),
 'test_mae': array([0.74477853, 0.75124267, 0.73975393, 0.745764  , 0.74659098]),
 'fit_time': (0.07780003547668457,
  0.08590102195739746,
  0.09300398826599121,
  0.09913206100463867,
  0.09116578102111816),
 'test_time': (0.08355998992919922,
  0.07891011238098145,
  0.07885479927062988,
  0.09049201011657715,
  0.07709598541259766)}