In [1]:
import os
import pandas as pd
import numpy as np


import surprise

from surprise import SVD, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBasic, KNNBaseline, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV  #not as same as the cross_validation from scikit learn?
from surprise import Reader, Dataset, accuracy
from surprise import dump

from sklearn import preprocessing

#libraries for data visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
%matplotlib inline
import seaborn as sns
#sns.set_palette('Set2')
sns.set_color_codes("pastel")

# Load built in data from surprise library

In [2]:
data = surprise.Dataset.load_builtin('ml-100k')   #Dataset.load_builtin('ml-100k')

In [3]:
df = pd.DataFrame(data.raw_ratings, columns=["user", "item", "rate", "id"])
del df["id"]
df.head(10)

Unnamed: 0,user,item,rate
0,196,242,3.0
1,186,302,3.0
2,22,377,1.0
3,244,51,2.0
4,166,346,1.0
5,298,474,4.0
6,115,265,2.0
7,253,465,5.0
8,305,451,3.0
9,6,86,3.0


# Data exploration

In [37]:
print("number of users: ", df.user.nunique())
print("number of movies: ", df.item.nunique())

number of users:  943
number of movies:  1682


In [4]:
np.sort( df.rate.unique() )

array([1., 2., 3., 4., 5.])

In [5]:
df.isnull().sum()

user    0
item    0
rate    0
dtype: int64

In [6]:
df.groupby('item')[['rate']].mean().sort_values( by='rate', ascending=False)[:10]

Unnamed: 0_level_0,rate
item,Unnamed: 1_level_1
1500,5.0
1293,5.0
1189,5.0
1653,5.0
1467,5.0
1122,5.0
1599,5.0
1201,5.0
1536,5.0
814,5.0


# Collaborative filtering

In [7]:
df_table = df.set_index(["user", "item"]).unstack()
#df.pivot_table(index='user', columns='item',values='rate') <- not to use
#df.set_index(["user", "item"]).unstack()
df_table.shape

(943, 1682)

### user-item matrix  (user utility matrix)

In [8]:
df_table

Unnamed: 0_level_0,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate
item,1,10,100,1000,1001,1002,1003,1004,1005,1006,...,990,991,992,993,994,995,996,997,998,999
user,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,5.0,3.0,5.0,,,,,,,,...,,,,,,,,,,
10,4.0,,5.0,,,,,,,,...,,,,,,,,,,
100,,,,,,,,,,,...,3.0,,,,,,,,,
101,3.0,,,,,,,,,,...,,,,,,,,,,
102,3.0,,,,,,,,,,...,,,,2.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,5.0,,,,,,,,,,...,,,,,,,,,,
96,5.0,,5.0,,,,,,,,...,,,,,,,,,,
97,4.0,,2.0,,,,,,,,...,,,,,,,,,,
98,,,,,,,,,,,...,,,,,,,,,,


df_table.corr()['1'].sort_values(ascending=False).head()

from scipy.sparse import csr_matrix
csr_matrix(df_table)

#### calcualte sparsity

In [9]:
df_table.values

array([[ 5.,  3.,  5., ..., nan, nan, nan],
       [ 4., nan,  5., ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [ 4., nan,  2., ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [ 4., nan,  5., ..., nan, nan, nan]])

In [10]:
np.isnan( df_table.values  ).sum()

1486126

In [11]:
np.prod(df_table.values.shape)

1586126

In [12]:
np.prod(df_table.shape)

1586126

In [13]:
#sparsity of matrix = Number of Empty cells / Total Number of cells.
sparsity = 1 - np.isnan(df_table.values).sum() / np.prod(df_table.shape)
print("Sparsity: ", '{:2.2%}'.format(sparsity) )

Sparsity:  6.30%


#### Only 6.3% of cells in the user-item matrix are populated with ratings. A general rule of thumb is that your matrix sparsity should be no lower than 0.5% to generate decent results.

### Normalization (mean normalization)
    need to normalize ratings by accounting for user and item bias
    subtract item's average rating from each user's rating for given item.

In [14]:
df_table.isnull().sum()

      item
rate  1       491
      10      854
      100     435
      1000    933
      1001    926
             ... 
      995     912
      996     929
      997     927
      998     927
      999     933
Length: 1682, dtype: int64

In [15]:
#Users who have not rated any moveis. 
df_table[df_table.isnull()]

Unnamed: 0_level_0,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate
item,1,10,100,1000,1001,1002,1003,1004,1005,1006,...,990,991,992,993,994,995,996,997,998,999
user,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,
100,,,,,,,,,,,...,,,,,,,,,,
101,,,,,,,,,,,...,,,,,,,,,,
102,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,,,,,,,,,,,...,,,,,,,,,,
96,,,,,,,,,,,...,,,,,,,,,,
97,,,,,,,,,,,...,,,,,,,,,,
98,,,,,,,,,,,...,,,,,,,,,,


In [None]:
# users who are likely give out low ratings in general
df.groupby('user').mean().sort_values(by='rate', ascending=True)[:10]

In [None]:
# users who are likely give out high ratings in general
df.groupby('user').mean().sort_values(by='rate', ascending=False)[:10]

In [60]:
# mean normalization
scaler = preprocessing.StandardScaler().fit(df_table)
df_table_norm = scaler.transform(df_table)

In [102]:
df_table_norm = pd.DataFrame(df_table_norm, 
                             index=df_table.index, 
                             columns=df_table.columns).fillna(df_table.mean(axis=0))
df_table_norm

Unnamed: 0_level_0,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate
item,1,10,100,1000,1001,1002,1003,1004,1005,1006,...,990,991,992,993,994,995,996,997,998,999
user,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,1.210182,-0.824669,0.866324,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
10,0.131282,0.000000,0.866324,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
100,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.067015,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
101,-0.947618,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
102,-0.947618,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,-1.582722,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1.210182,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
96,1.210182,0.000000,0.866324,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
97,0.131282,0.000000,-2.211246,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
98,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


df_table_norm.shape

## similarity based on cosine metrics

In [68]:
from sklearn.metrics.pairwise import cosine_similarity

user_based_collabor = cosine_similarity(df_table_norm)
user_based_collabor

array([[ 1.        , -0.00649771, -0.00123956, ...,  0.10725584,
        -0.05586412, -0.03736155],
       [-0.00649771,  1.        , -0.03888102, ..., -0.05207321,
         0.01450152, -0.02427356],
       [-0.00123956, -0.03888102,  1.        , ...,  0.        ,
         0.00641509,  0.06245626],
       ...,
       [ 0.10725584, -0.05207321,  0.        , ...,  1.        ,
         0.02069663, -0.02542329],
       [-0.05586412,  0.01450152,  0.00641509, ...,  0.02069663,
         1.        , -0.05166711],
       [-0.03736155, -0.02427356,  0.06245626, ..., -0.02542329,
        -0.05166711,  1.        ]])

In [69]:
user_based_collabor.shape

(943, 943)

In [74]:
user_similarity = pd.DataFrame(data=user_based_collabor, 
                                index=df_table.index, 
                                columns=df_table.index)

In [75]:
user_similarity

user,1,10,100,101,102,103,104,105,106,107,...,94,940,941,942,943,95,96,97,98,99
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,-0.006498,-0.001240,-0.122673,0.013203,-0.043085,0.000339,0.098428,-0.026226,0.039656,...,0.065033,-0.005099,0.041892,-0.098571,-0.074395,0.011422,0.043737,0.107256,-0.055864,-0.037362
10,-0.006498,1.000000,-0.038881,-0.009718,-0.111744,-0.002386,-0.043616,-0.012689,-0.025986,-0.018546,...,0.106657,-0.062667,0.025397,0.096885,-0.003569,-0.129954,-0.009042,-0.052073,0.014502,-0.024274
100,-0.001240,-0.038881,1.000000,-0.016725,-0.014880,0.013686,0.087030,-0.065769,-0.011035,0.015145,...,-0.002398,-0.063409,0.032341,-0.015794,0.000000,-0.007560,0.000000,0.000000,0.006415,0.062456
101,-0.122673,-0.009718,-0.016725,1.000000,0.018626,0.019996,-0.008582,0.010057,0.002158,-0.005683,...,-0.040357,0.004292,0.033372,-0.041457,-0.013976,-0.075847,-0.049853,-0.039741,0.000000,0.108012
102,0.013203,-0.111744,-0.014880,0.018626,1.000000,0.029949,0.075517,-0.025341,0.010708,0.065706,...,-0.010038,0.092170,-0.107603,-0.046035,-0.058489,-0.001342,-0.074848,-0.028072,0.059045,-0.085469
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.011422,-0.129954,-0.007560,-0.075847,-0.001342,0.007137,-0.014104,0.005229,0.030385,-0.018062,...,-0.031195,-0.002839,0.028465,-0.012772,0.023304,1.000000,-0.004016,0.024054,-0.043625,0.026657
96,0.043737,-0.009042,0.000000,-0.049853,-0.074848,-0.015746,-0.005036,0.000000,-0.052316,0.000000,...,0.049613,-0.062605,0.095848,0.030863,-0.062755,-0.004016,1.000000,0.148264,-0.007089,0.048661
97,0.107256,-0.052073,0.000000,-0.039741,-0.028072,-0.048289,-0.009090,0.000000,0.063718,0.000000,...,0.008447,0.002452,-0.019167,-0.036852,-0.098524,0.024054,0.148264,1.000000,0.020697,-0.025423
98,-0.055864,0.014502,0.006415,0.000000,0.059045,-0.003521,-0.007852,0.000000,0.045672,0.009038,...,-0.009552,0.021519,0.000000,0.052470,0.029954,-0.043625,-0.007089,0.020697,1.000000,-0.051667


In [82]:
def get_similar_user(user_id):
    return user_similarity[user_id].sort_values(ascending=False)[:5]

In [83]:
get_similar_user("100")

user
100    1.000000
755    0.184117
570    0.175303
531    0.173218
824    0.169562
Name: 100, dtype: float64

### Pick a model

Matric factorization. factorize the user-itm matrix to get 2 latent factor matrics:
    - user-factor matrix
    - item -factor matric

Algorithms for matric factorization:
    - Alternating Least Squares (ALS)
    - Stochastic Gradient Descent (SGD)
    - Singular Value Decompostion (SVD)

cross_validate(SVD(), np.array(df_table_norm), 
               measures=['rmse'], cv=3, verbose=False)

In [21]:
cross_validate(SVD(), data, measures=['rmse'], cv=3, verbose=False)

{'test_rmse': array([0.95377498, 0.93889949, 0.94302621]),
 'fit_time': (2.8528590202331543, 2.7782609462738037, 3.072308301925659),
 'test_time': (0.2098989486694336, 0.21741700172424316, 0.23355865478515625)}

In [22]:
benchmark = []

for algo in [SVD(), KNNBaseline(), KNNBasic(), BaselineOnly()]:
    results = cross_validate(algo, data, measures=['rmse'], 
                             cv=3, verbose=False)
    
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algo).split('.')[-1]] , index=['Algorithm']))
    benchmark.append(tmp)
    

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


In [23]:
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
KNNBaseline object at 0x7fee7890ca10>,0.934832,0.291599,4.511411
SVD object at 0x7fee7890ca90>,0.945683,3.138411,0.234724
BaselineOnly object at 0x7fee7890cd90>,0.947763,0.112169,0.165995
KNNBasic object at 0x7fee7890c2d0>,0.988887,0.201084,3.62649


### Pick an evaluation metric


### Hyperparameter Tuning
    - try to tune k (# of factors)
    - try to tune lambda regularization parameter
  
methods: 
- Grid Search 
    sklearn.model_selection.GridSearchCV
- Random Search
    sklearn.model_selection.RandomizedSearchCV
- Sequential Model-Based Optimization

### Model training
train model with optimal hyperparameters

### Post-processing
sort predicted ratings and get top N
Filter out items that a user has already has seen

### Evaluation
- A/B testing from users - most optimal method
- Traditional ML. 
- Recommendation systems. 

Precision and Recall

# model application

train, test = train_test_split(df, test_size=0.25, random_state=0)

In [24]:
trainset = data.build_full_trainset()
testset = trainset.build_testset()

In [25]:
algo = SVD()
algo.fit(trainset)

predictions = algo.test(testset)

In [26]:
accuracy.rmse(predictions)

RMSE: 0.6776


0.6775680479541911

In [27]:
predictions

[Prediction(uid='196', iid='242', r_ui=3.0, est=3.658314900644186, details={'was_impossible': False}),
 Prediction(uid='196', iid='393', r_ui=4.0, est=3.468804029424991, details={'was_impossible': False}),
 Prediction(uid='196', iid='381', r_ui=4.0, est=3.5133965313231026, details={'was_impossible': False}),
 Prediction(uid='196', iid='251', r_ui=3.0, est=4.092522190157864, details={'was_impossible': False}),
 Prediction(uid='196', iid='655', r_ui=5.0, est=4.096837700239498, details={'was_impossible': False}),
 Prediction(uid='196', iid='67', r_ui=5.0, est=3.3938039115700045, details={'was_impossible': False}),
 Prediction(uid='196', iid='306', r_ui=4.0, est=4.04382495052769, details={'was_impossible': False}),
 Prediction(uid='196', iid='238', r_ui=4.0, est=3.8102271530417777, details={'was_impossible': False}),
 Prediction(uid='196', iid='663', r_ui=5.0, est=4.148157829644522, details={'was_impossible': False}),
 Prediction(uid='196', iid='111', r_ui=4.0, est=3.5904327625048253, deta

from surprise import GridSearch
    # Select your best algo with grid search.
print('Grid Search...')
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005]}
grid_search = GridSearch(SVD, param_grid, measures=['RMSE'], verbose=1)
print(grid_search.best_params)
grid_search.evaluate(data)

In [28]:
algo = SVD()  # SVD(n_factors=50, random_state=0)
results = cross_validate(algo, 
                         data,
                         measures = ['rmse'], 
                         cv=3,
                        verbose=True)

Evaluating RMSE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9462  0.9420  0.9463  0.9448  0.0020  
Fit time          3.40    3.47    3.36    3.41    0.05    
Test time         0.25    0.26    0.23    0.25    0.01    


# Baseline model

In [29]:
from surprise.model_selection import KFold

bsl_options = {
    'method': 'als', #ALS - Alternating Least Squares
    'n_epochs': 5,
    'reg_u': 12,
    'reg_i': 5
}
algo = surprise.BaselineOnly(bsl_options)

np.random.seed(0)
acc = np.zeros(3)
cv = KFold(3)
for i, (trainset, testset) in enumerate(cv.split(data)):
    algo.fit(trainset)
    predictions = algo.test(testset)
    acc[i] = surprise.accuracy.rmse(predictions, verbose=True)
acc.mean()

Estimating biases using als...
RMSE: 0.9453
Estimating biases using als...
RMSE: 0.9377
Estimating biases using als...
RMSE: 0.9500


0.9443304984013942

In [30]:
cross_validate(algo, data)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


{'test_rmse': array([0.9384446 , 0.94651657, 0.93612815, 0.94221861, 0.94428787]),
 'test_mae': array([0.74477853, 0.75124267, 0.73975393, 0.745764  , 0.74659098]),
 'fit_time': (0.07180070877075195,
  0.08360910415649414,
  0.0868370532989502,
  0.0923759937286377,
  0.09219098091125488),
 'test_time': (0.07082009315490723,
  0.14700913429260254,
  0.07483601570129395,
  0.07335686683654785,
  0.06868505477905273)}