In [1]:
import os
import pandas as pd
import numpy as np


import surprise

from surprise import SVD, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBasic, KNNBaseline, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV  #not as same as the cross_validation from scikit learn?
from surprise import Reader, Dataset, accuracy
from surprise import dump

from sklearn.metrics.pairwise import cosine_similarity

from sklearn import preprocessing

#libraries for data visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
%matplotlib inline
import seaborn as sns
#sns.set_palette('Set2')
sns.set_color_codes("pastel")

# Load built in data from surprise library

In [2]:
data = surprise.Dataset.load_builtin('ml-100k')   #Dataset.load_builtin('ml-100k')

In [3]:
df = pd.DataFrame(data.raw_ratings, columns=["user", "item", "rate", "id"])
del df["id"]
df.head(10)

Unnamed: 0,user,item,rate
0,196,242,3.0
1,186,302,3.0
2,22,377,1.0
3,244,51,2.0
4,166,346,1.0
5,298,474,4.0
6,115,265,2.0
7,253,465,5.0
8,305,451,3.0
9,6,86,3.0


# Data exploration

In [4]:
print("number of users: ", df.user.nunique())
print("number of movies: ", df.item.nunique())

number of users:  943
number of movies:  1682


In [5]:
np.sort( df.rate.unique() )

array([1., 2., 3., 4., 5.])

In [6]:
df.isnull().sum()

user    0
item    0
rate    0
dtype: int64

In [7]:
df.groupby('item')[['rate']].mean().sort_values( by='rate', ascending=False)[:10]

Unnamed: 0_level_0,rate
item,Unnamed: 1_level_1
1500,5.0
1293,5.0
1189,5.0
1653,5.0
1467,5.0
1122,5.0
1599,5.0
1201,5.0
1536,5.0
814,5.0


# Collaborative filtering

## user-item matrix  (user utility matrix)

In [20]:
df_table = df.pivot_table(index='user', columns='item',values='rate')
df_table

item,1,10,100,1000,1001,1002,1003,1004,1005,1006,...,990,991,992,993,994,995,996,997,998,999
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,5.0,,,,,,,,...,,,,,,,,,,
10,4.0,,5.0,,,,,,,,...,,,,,,,,,,
100,,,,,,,,,,,...,3.0,,,,,,,,,
101,3.0,,,,,,,,,,...,,,,,,,,,,
102,3.0,,,,,,,,,,...,,,,2.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,5.0,,,,,,,,,,...,,,,,,,,,,
96,5.0,,5.0,,,,,,,,...,,,,,,,,,,
97,4.0,,2.0,,,,,,,,...,,,,,,,,,,
98,,,,,,,,,,,...,,,,,,,,,,


In [21]:
df_pvt = df_table.fillna(0)
df_pvt

item,1,10,100,1000,1001,1002,1003,1004,1005,1006,...,990,991,992,993,994,995,996,997,998,999
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
102,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,5.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,4.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## sparsity calculation

In [22]:
df_table.values

np.isnan( df_table.values  ).sum()

np.prod(df_table.values.shape)

np.prod(df_table.shape)

#sparsity of matrix = Number of Empty cells / Total Number of cells.
sparsity = 1 - np.isnan(df_table.values).sum() / np.prod(df_table.shape)
print("Sparsity: ", '{:2.2%}'.format(sparsity) )

Sparsity:  6.30%


#### Only 6.3% of cells in the user-item matrix are populated with ratings. A general rule of thumb is that your matrix sparsity should be no lower than 0.5% to generate decent results.

## similarity based on cosine metrics

In [46]:
df_item = df.pivot_table(index='item', columns='user',values='rate').fillna(0)

item_similarity = pd.DataFrame(cosine_similarity(df_item), index=df_item.index, columns=df_item.index)
item_similarity

item,1,10,100,1000,1001,1002,1003,1004,1005,1006,...,990,991,992,993,994,995,996,997,998,999
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.273935,0.630601,0.114364,0.109701,0.084998,0.100452,0.061399,0.093729,0.101029,...,0.085165,0.091251,0.070873,0.353507,0.060461,0.085890,0.098793,0.141426,0.138101,0.099471
10,0.273935,1.000000,0.336233,0.044153,0.053510,0.025705,0.046341,0.000000,0.224322,0.130158,...,0.085546,0.081927,0.067685,0.027119,0.048401,0.123714,0.007909,0.033473,0.121554,0.000000
100,0.630601,0.336233,1.000000,0.084698,0.144486,0.084889,0.061514,0.097603,0.153229,0.168087,...,0.089523,0.098840,0.075603,0.191055,0.090889,0.119886,0.053259,0.139800,0.127146,0.038349
1000,0.114364,0.044153,0.084698,1.000000,0.381626,0.198191,0.044662,0.000000,0.000000,0.042403,...,0.021277,0.000000,0.456630,0.104545,0.000000,0.000000,0.365864,0.129040,0.288370,0.521862
1001,0.109701,0.053510,0.144486,0.381626,1.000000,0.192154,0.014434,0.000000,0.000000,0.000000,...,0.025786,0.016116,0.379473,0.081088,0.000000,0.005315,0.128093,0.260643,0.183478,0.210819
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.085890,0.123714,0.119886,0.000000,0.005315,0.008511,0.046029,0.000000,0.058251,0.032776,...,0.106898,0.154184,0.033615,0.014366,0.128201,1.000000,0.000000,0.016624,0.027862,0.000000
996,0.098793,0.007909,0.053259,0.365864,0.128093,0.015778,0.142220,0.010056,0.005400,0.006751,...,0.000000,0.000000,0.000000,0.123177,0.014854,0.000000,1.000000,0.154092,0.137742,0.301202
997,0.141426,0.033473,0.139800,0.129040,0.260643,0.016695,0.150482,0.138329,0.005713,0.007144,...,0.032260,0.050408,0.000000,0.098631,0.015717,0.016624,0.154092,1.000000,0.081981,0.186824
998,0.138101,0.121554,0.127146,0.288370,0.183478,0.083943,0.176552,0.000000,0.143636,0.137690,...,0.022529,0.000000,0.276289,0.103319,0.000000,0.027862,0.137742,0.081981,1.000000,0.055258


In [41]:
df_table = df.pivot_table(index='user', columns='item',values='rate')# <- not to use
df_pvt = df_table.fillna(0)
#df_pvt = df_pvt.fillna(df_pvt.mean(axis=0))

user_similarity = pd.DataFrame(cosine_similarity(df_pvt), index=df_pvt.index, columns=df_pvt.index)
user_similarity

user,1,10,100,101,102,103,104,105,106,107,...,94,940,941,942,943,95,96,97,98,99
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.376544,0.057354,0.154457,0.397145,0.226111,0.231134,0.102414,0.288715,0.080152,...,0.480574,0.314072,0.148617,0.179508,0.398175,0.468911,0.361273,0.352280,0.135963,0.281790
10,0.376544,1.000000,0.066987,0.030877,0.288020,0.145788,0.156615,0.077478,0.284969,0.095741,...,0.418951,0.342961,0.090305,0.212330,0.221860,0.375380,0.341416,0.301478,0.140115,0.193943
100,0.057354,0.066987,1.000000,0.012998,0.171988,0.055922,0.375736,0.507451,0.041973,0.329853,...,0.090008,0.289131,0.099363,0.237968,0.000000,0.036854,0.000000,0.000000,0.006082,0.192041
101,0.154457,0.030877,0.012998,1.000000,0.153750,0.321378,0.268215,0.037979,0.035617,0.031846,...,0.160808,0.083545,0.237333,0.081874,0.232549,0.180604,0.086543,0.073269,0.000000,0.360147
102,0.397145,0.288020,0.171988,0.153750,1.000000,0.237494,0.215930,0.185148,0.123308,0.128144,...,0.439064,0.323743,0.164963,0.201437,0.383536,0.416339,0.281648,0.301406,0.169144,0.294568
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.468911,0.375380,0.036854,0.180604,0.416339,0.230296,0.144948,0.019579,0.278008,0.010945,...,0.529910,0.330689,0.110761,0.247625,0.400270,1.000000,0.345199,0.343122,0.133712,0.278597
96,0.361273,0.341416,0.000000,0.086543,0.281648,0.226449,0.099971,0.000000,0.158699,0.000000,...,0.342054,0.276660,0.110265,0.231118,0.288281,0.345199,1.000000,0.308487,0.074055,0.216023
97,0.352280,0.301478,0.000000,0.073269,0.301406,0.148177,0.060202,0.000000,0.155511,0.000000,...,0.305776,0.291914,0.150569,0.220158,0.257559,0.343122,0.308487,1.000000,0.119453,0.160602
98,0.135963,0.140115,0.006082,0.000000,0.169144,0.030079,0.024732,0.000000,0.181077,0.029806,...,0.141533,0.152999,0.000000,0.124439,0.066392,0.133712,0.074055,0.119453,1.000000,0.079174


In [42]:
def get_similar_user(user_id):
    return user_similarity[user_id].sort_values(ascending=False)[:5]

In [43]:
get_similar_user("100")

user
100    1.000000
863    0.621852
784    0.600292
616    0.589742
856    0.580211
Name: 100, dtype: float64

In [47]:
def get_similar_item(item_id):
    return item_similarity[item_id].sort_values(ascending=False)[:5]

In [51]:
get_similar_item('10')

item
10     1.000000
190    0.400624
20     0.397113
582    0.385038
52     0.377012
Name: 10, dtype: float64

df_table = df.set_index(["user", "item"]).unstack()
#df.pivot_table(index='user', columns='item',values='rate') <- not to use
#df.set_index(["user", "item"]).unstack()
df_table.shape

df_table

df_table.corr()['1'].sort_values(ascending=False).head()

from scipy.sparse import csr_matrix
csr_matrix(df_table)

#Users who have not rated any moveis. 
df_table[df_table.isnull()]

#users who are likely give out low ratings in general
df.groupby('user').mean().sort_values(by='rate', ascending=True)[:10]

#users who are likely give out high ratings in general
df.groupby('user').mean().sort_values(by='rate', ascending=False)[:10]

df_table_norm.shape

### Pick a model

Matric factorization. factorize the user-itm matrix to get 2 latent factor matrics:
    - user-factor matrix
    - item -factor matric

Algorithms for matric factorization:
    - Alternating Least Squares (ALS)
    - Stochastic Gradient Descent (SGD)
    - Singular Value Decompostion (SVD)

cross_validate(SVD(), np.array(df_table_norm), 
               measures=['rmse'], cv=3, verbose=False)

In [52]:
cross_validate(SVD(), data, measures=['rmse'], cv=3, verbose=False)

{'test_rmse': array([0.94130819, 0.95329265, 0.94160619]),
 'fit_time': (2.8095040321350098, 2.834678888320923, 2.832736015319824),
 'test_time': (0.18181920051574707, 0.18414521217346191, 0.1765148639678955)}

In [53]:
benchmark = []

for algo in [SVD(), KNNBaseline(), KNNBasic(), BaselineOnly()]:
    results = cross_validate(algo, data, measures=['rmse'], 
                             cv=3, verbose=False)
    
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algo).split('.')[-1]] , index=['Algorithm']))
    benchmark.append(tmp)
    

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


In [54]:
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
KNNBaseline object at 0x7fb29e5b61d0>,0.935525,0.287413,4.490506
SVD object at 0x7fb29e5b6210>,0.945326,3.176913,0.211414
BaselineOnly object at 0x7fb29e5b6750>,0.947085,0.102608,0.107026
KNNBasic object at 0x7fb29e5b6d90>,0.989532,0.178528,3.542592


### Pick an evaluation metric


### Hyperparameter Tuning
    - try to tune k (# of factors)
    - try to tune lambda regularization parameter
  
methods: 
- Grid Search 
    sklearn.model_selection.GridSearchCV
- Random Search
    sklearn.model_selection.RandomizedSearchCV
- Sequential Model-Based Optimization

### Model training
train model with optimal hyperparameters

### Post-processing
sort predicted ratings and get top N
Filter out items that a user has already has seen

### Evaluation
- A/B testing from users - most optimal method
- Traditional ML. 
- Recommendation systems. 

Precision and Recall

# model application

train, test = train_test_split(df, test_size=0.25, random_state=0)

In [55]:
trainset = data.build_full_trainset()
testset = trainset.build_testset()

In [56]:
algo = SVD()
algo.fit(trainset)

predictions = algo.test(testset)

In [57]:
accuracy.rmse(predictions)

RMSE: 0.6758


0.6758075538362123

In [58]:
predictions

[Prediction(uid='196', iid='242', r_ui=3.0, est=3.8418389579300842, details={'was_impossible': False}),
 Prediction(uid='196', iid='393', r_ui=4.0, est=3.6026998236428733, details={'was_impossible': False}),
 Prediction(uid='196', iid='381', r_ui=4.0, est=3.497090204995882, details={'was_impossible': False}),
 Prediction(uid='196', iid='251', r_ui=3.0, est=3.9385076972726742, details={'was_impossible': False}),
 Prediction(uid='196', iid='655', r_ui=5.0, est=4.132127365850747, details={'was_impossible': False}),
 Prediction(uid='196', iid='67', r_ui=5.0, est=3.7423463168462057, details={'was_impossible': False}),
 Prediction(uid='196', iid='306', r_ui=4.0, est=3.8031478689143587, details={'was_impossible': False}),
 Prediction(uid='196', iid='238', r_ui=4.0, est=3.9035004537482703, details={'was_impossible': False}),
 Prediction(uid='196', iid='663', r_ui=5.0, est=4.114636948962045, details={'was_impossible': False}),
 Prediction(uid='196', iid='111', r_ui=4.0, est=3.500267628710387, d

from surprise import GridSearch
    # Select your best algo with grid search.
print('Grid Search...')
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005]}
grid_search = GridSearch(SVD, param_grid, measures=['RMSE'], verbose=1)
print(grid_search.best_params)
grid_search.evaluate(data)

In [59]:
algo = SVD()  # SVD(n_factors=50, random_state=0)
results = cross_validate(algo, 
                         data,
                         measures = ['rmse'], 
                         cv=3,
                        verbose=True)

Evaluating RMSE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9468  0.9398  0.9523  0.9463  0.0051  
Fit time          2.87    2.88    2.92    2.89    0.02    
Test time         0.18    0.18    0.17    0.18    0.00    


# Baseline model

In [60]:
from surprise.model_selection import KFold

bsl_options = {
    'method': 'als', #ALS - Alternating Least Squares
    'n_epochs': 5,
    'reg_u': 12,
    'reg_i': 5
}
algo = surprise.BaselineOnly(bsl_options)

np.random.seed(0)
acc = np.zeros(3)
cv = KFold(3)
for i, (trainset, testset) in enumerate(cv.split(data)):
    algo.fit(trainset)
    predictions = algo.test(testset)
    acc[i] = surprise.accuracy.rmse(predictions, verbose=True)
acc.mean()

Estimating biases using als...
RMSE: 0.9453
Estimating biases using als...
RMSE: 0.9377
Estimating biases using als...
RMSE: 0.9500


0.9443304984013942

In [61]:
cross_validate(algo, data)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


{'test_rmse': array([0.9384446 , 0.94651657, 0.93612815, 0.94221861, 0.94428787]),
 'test_mae': array([0.74477853, 0.75124267, 0.73975393, 0.745764  , 0.74659098]),
 'fit_time': (0.07156205177307129,
  0.08118724822998047,
  0.08850908279418945,
  0.08518505096435547,
  0.09156608581542969),
 'test_time': (0.0676579475402832,
  0.0625150203704834,
  0.06650018692016602,
  0.06548905372619629,
  0.07133007049560547)}