In [1]:
# used to measure how long it takes to run this entire notebook
from datetime import datetime

In [2]:
import pandas as pd
import numpy as np

In [3]:
import matplotlib
# Choose the 'nbagg' backend here, before pyplot is imported below.
matplotlib.use('nbagg')

import matplotlib.pyplot as plt
# Setting the limit to 0 is meant to silence the "too many open figures" warning.
plt.rcParams.update({'figure.max_open_warning': 0})

import seaborn as sns
# Use seaborn's 'whitegrid' style for every plot in this notebook.
sns.set_style('whitegrid')

In [4]:
import pickle
import os
from surprise import dump

# 1. Creating sparse matrix for Train data

In [5]:
import os
from scipy import sparse
# Time the cell so the printed duration shows how long the load took.
start = datetime.now()
if not os.path.isfile('sample/large/train_sparse_matrix.npz'):
    # No cached matrix on disk: only report it (train_sparse_matrix is left undefined).
    print("There is no Train_Sparse Matrix. Get it first..")
else:
    print("It is present in your pwd, getting it from disk....")
    # read the stored matrix rather than recomputing it
    train_sparse_matrix = sparse.load_npz('sample/large/train_sparse_matrix.npz')
    print("DONE..")

print(datetime.now() - start)

It is present in your pwd, getting it from disk....
DONE..
0:00:04.600554


### Sparsity

In [6]:
# Row indices, column indices and values of the non-zero entries of the train matrix.
u_trn, m_trn, r_trn = sparse.find(train_sparse_matrix)

print('\nTrain data')
print('-'*15)

# Same three summary lines, emitted from one table of (label, value) pairs.
for label, value in (('No of elements :', len(r_trn)),
                     ('No of users    :', len(np.unique(u_trn))),
                     ('No of Movies   :', len(np.unique(m_trn)))):
    print(label, value)


Train data
---------------
No of elements : 80384405
No of users    : 405041
No of Movies   : 17424


In [13]:
# Sparsity = percentage of cells in the train matrix that hold no rating.
r, c = train_sparse_matrix.shape
elem = train_sparse_matrix.count_nonzero()
filled_fraction = elem/(r*c)
print("Sparsity Of matrix : {} % ".format((1-filled_fraction) * 100))

Sparsity Of matrix : 99.8292709259195 % 


# 2. Creating sparse matrix for Test data

In [7]:
import os
from scipy import sparse
# Load the cached test sparse matrix from disk and report how long it took.
start = datetime.now()
if os.path.isfile('sample/large/test_sparse_matrix.npz'):
    print("It is present in your pwd, getting it from disk....")
    # just get it from the disk instead of computing it
    test_sparse_matrix = sparse.load_npz('sample/large/test_sparse_matrix.npz')
    print("DONE..")
else:
    # This cell is about the *test* matrix; the message used to name the train matrix.
    print("There is no Test_Sparse Matrix. Get it first..")

print(datetime.now() - start)

It is present in your pwd, getting it from disk....
DONE..
0:00:01.052938


### Sparsity

In [8]:
# Row indices, column indices and values of the non-zero entries of the test matrix.
u_tst, m_tst, r_tst = sparse.find(test_sparse_matrix)

print('\nTest data')
print('-'*15)

# Same three summary lines, emitted from one table of (label, value) pairs.
for label, value in (('No of elements :', len(r_tst)),
                     ('No of users    :', len(np.unique(u_tst))),
                     ('No of Movies   :', len(np.unique(m_tst)))):
    print(label, value)


Test data
---------------
No of elements : 20096102
No of users    : 349312
No of Movies   : 17757


In [10]:
# Sparsity = percentage of cells in the test matrix that hold no rating.
r, c = test_sparse_matrix.shape
elem = test_sparse_matrix.count_nonzero()

# Divide by the matrix dimensions unpacked above; the earlier `u*m` relied on
# names that are never defined in this notebook.
print("Sparsity Of matrix : {} % ".format((1 - (elem / (r * c))) * 100))

Sparsity Of matrix : 99.95731855608713 % 


# 3. Preparing Trainset and testset for Surprise based algorithms

### TrainSet

In [22]:
from surprise import Reader, Dataset

In [23]:
# Load the pickled Surprise trainset when it is cached on disk; otherwise build
# it from train_sparse_matrix and cache it for the next run.
if os.path.isfile('sample/large/trainset.pickle'):
    print('loading it from the disk')
    # Use a context manager so the file handle is closed after loading.
    # NOTE: pickle.load can execute arbitrary code - only load files this notebook wrote.
    with open('sample/large/trainset.pickle', 'rb') as f:
        trainset = pickle.load(f)
    print('done')
else:
    print('creating it from sparse_matrix ( if it is loaded)')

    # (row, column, value) triplets of the non-zero entries
    train_users, train_movies, train_ratings = sparse.find(train_sparse_matrix)

    print('preparing train dataframe with users, movies and ratings of the trainset..')
    surp_train = pd.DataFrame({'user': train_users,
                               'movie': train_movies,
                               'rating': train_ratings})
    # fix the column order explicitly: user, movie, rating
    surp_train = surp_train[['user', 'movie', 'rating']]
    print(surp_train.head(2))

    print('Creating trainset from the dataframe...')
    trainset = Dataset.load_from_df(surp_train, Reader(rating_scale=(1, 5))).build_full_trainset()

    print('No of unique users, unique movies and ratings in train data', end=' : ')
    print('(users, movies, ratings) : ({}, {}, {})'.format(trainset.n_users,
                                                           trainset.n_items,
                                                           trainset.n_ratings))

    # saving trainset to disk
    print('saving it to disk..')
    start = datetime.now()
    with open('sample/large/trainset.pickle', 'wb') as f:
        pickle.dump(trainset, f)
    print(datetime.now() - start)
    print('Done')

loading it from the disk
done


In [25]:
# Size of the trainset: (users, items, ratings)
(trainset.n_users, trainset.n_items, trainset.n_ratings)

(405024, 17423, 80382095)

### TestSet

In [17]:
# Load the pickled testset when it is cached on disk; otherwise build it as a
# list of (user, movie, rating) tuples from test_sparse_matrix and cache it.
if os.path.isfile('sample/large/testset.pickle'):
    print('loading testset from the disk..')
    # Use a context manager so the file handle is closed after loading.
    # NOTE: pickle.load can execute arbitrary code - only load files this notebook wrote.
    with open('sample/large/testset.pickle', 'rb') as f:
        testset = pickle.load(f)
    print('Done')
else:
    print('creating tesetset from sparse matrix.( if it is loaded )')

    # (row, column, value) triplets of the non-zero entries
    test_users, test_movies, test_ratings = sparse.find(test_sparse_matrix)

    print('No of unique users, unique movies..', end=' : ')
    print(len(np.unique(test_users)),  len(np.unique(test_movies)))

    print("No of ratings in test set :",len(test_ratings))

    testset = list(zip(test_users, test_movies, test_ratings))

    # saving testset to disk
    print('saving it to disk..')
    start = datetime.now()
    with open('sample/large/testset.pickle', 'wb') as f:
        pickle.dump(testset, f)
    print(datetime.now() - start)
    print('Done')

loading testset from the disk..
Done


## Generic function to run any surprise based algorithm 

    - given perfectly initialized ALGO, TRAINSET and TESTSET
    

In [9]:
##########################################################
# Extract rating arrays from a list of predictions:
# estimated ratings, and optionally the actual ones too.
##########################################################
def get_ratings(predictions, return_actual=False):
    '''
        Pull the ratings out of a sequence of prediction objects.

        Each prediction must expose ``est`` (estimated rating) and, when
        ``return_actual`` is True, ``r_ui`` (actual rating).

        Returns the array of estimated ratings, or the tuple
        (actual, estimated) when ``return_actual`` is True.
    '''
    if not return_actual:
        return np.array([p.est for p in predictions])

    observed = np.array([p.r_ui for p in predictions])
    estimated = np.array([p.est for p in predictions])
    return observed, estimated

################################################################
# get ''rmse'' and ''mape'' , given list of prediction classes 
################################################################
def get_errors(predictions, print_them=False):
    '''
        Compute the error metrics for a list of predictions.

        predictions : sequence of prediction objects, as accepted by get_ratings
        print_them  : when True, also print the two metrics
                      (this flag was previously accepted but ignored)

        Returns (rmse, mape), where mape is expressed as a percentage.
    '''
    actual, pred = get_ratings(predictions, return_actual=True)
    rmse = np.sqrt(np.mean((pred - actual)**2))
    # mean absolute percentage error, scaled to percent
    mape = np.mean(np.abs(pred - actual)/actual) * 100

    if print_them:
        print("RMSE : {}\n\nMAPE : {}\n".format(rmse, mape))

    return rmse, mape

##################################################################################
# It will return predicted ratings, rmse and mape of both train and test data   #
##################################################################################
def run_surprise(algo, trainset, testset,evaluate_train=False, verbose=True):
    '''
        Fit ``algo`` on ``trainset``, evaluate it on ``testset`` and return
        the results as ``(train, test)``.

        algo           : object providing ``fit(trainset)`` and ``test(testset)``
        trainset       : passed to ``algo.fit``; its ``build_testset()`` is used
                         when ``evaluate_train`` is True
        testset        : passed to ``algo.test``
        evaluate_train : when True, the fitted model is also evaluated on the
                         train data
        verbose        : when True, the metric values and bookkeeping messages
                         are printed as well (progress and timing lines are
                         always printed)

        ``test`` is a dictionary with the keys "rmse", "mape" and "predictions"
        (the predicted ratings). ``train`` has the same keys when
        ``evaluate_train`` is True, otherwise it is None.
    '''
    start = datetime.now()
    # dictionaries that stores metrics for train and test..
    train = dict()
    test = dict()

    # train the algorithm with the trainset
    st = datetime.now()
    print('Training the model...')
    algo.fit(trainset)
    print('Done. time taken : {} \n'.format(datetime.now()-st))

    # ---------------- Evaluating train data--------------------#
    if evaluate_train:
        st = datetime.now()
        print('Evaluating the model with train data..')
        # get the train predictions (list of prediction class inside Surprise)
        train_preds = algo.test(trainset.build_testset())
        # get predicted ratings from the train predictions..
        # (get_ratings returns a single array here; unpacking it into two
        #  names, as this line used to do, fails)
        train_pred_ratings = get_ratings(train_preds, return_actual=False)
        # get ''rmse'' and ''mape'' from the train predictions.
        train_rmse, train_mape = get_errors(train_preds)
        print('time taken : {}'.format(datetime.now()-st))
        if verbose:
            print('-'*15)
            print('Train Data')
            print('-'*15)
            print("RMSE : {}\n\nMAPE : {}\n".format(train_rmse, train_mape))

        #store them in the train dictionary
        if verbose:
            print('adding train results in the dictionary..')
        train['rmse'] = train_rmse
        train['mape'] = train_mape
        train['predictions'] = train_pred_ratings
    else:
        print('\n we are skipping model evaluation with train data..\n')
        train = None

    #------------ Evaluating Test data---------------#
    st = datetime.now()
    print('\nEvaluating for test data...')
    # get the predictions( list of prediction classes) of test data
    test_preds = algo.test(testset)
    # get the predicted ratings from the list of predictions
    test_pred_ratings = get_ratings(test_preds, return_actual=False)
    # get error metrics from the predicted and actual ratings
    test_rmse, test_mape = get_errors(test_preds)
    print('time taken : {}'.format(datetime.now()-st))

    if verbose:
        print('-'*15)
        print('Test Data')
        print('-'*15)
        print("RMSE : {}\n\nMAPE : {}\n".format(test_rmse, test_mape))
    # store them in test dictionary
    if verbose:
        print('storing the test results in test dictionary...')
    test['rmse'] = test_rmse
    test['mape'] = test_mape
    test['predictions'] = test_pred_ratings

    print('\n'+'-'*45)
    print('Total time taken to run this algorithm :', datetime.now() - start)

    # return two dictionaries train and test
    return train, test

### Global dictionaries that store rmse and mape for all the models....

- It stores the metrics in a dictionary of dictionaries

> __keys__ : model names(string)

> __value__: dict(__key__ : metric, __value__ : value )

In [5]:
# Per-model metric stores keyed by model name; both start out empty.
models_evaluation_train = {}
models_evaluation_test = {}

(models_evaluation_train, models_evaluation_test)

({}, {})

## 1. Baseline Model  ( with User and Item biases )

In [6]:
from surprise import BaselineOnly

- #### Predicted_rating : ( baseline prediction )

>$   \large {\hat{r}_{ui} = b_{ui} =\mu + b_u + b_i} $

- ####  Optimization function ( Least Squares Problem )

> $ \large \sum_{r_{ui} \in R_{train}} \left(r_{ui} - (\mu + b_u + b_i)\right)^2 +
\lambda \left(b_u^2 + b_i^2 \right).\text {        [minimize over } {b_u, b_i]}$

In [7]:
from surprise import dump

# Reuse the baseline model cached on disk, together with its stored test metrics.
if os.path.isfile('sample/large/bsl_algo'):

    print('This model is pretrained and saved in the directory spedified....')

    # the dump is unpacked as (predicted ratings, fitted algorithm)
    print('Getting it from the disk...(both predicted_ratings and algorithm...)')
    test_pred_ratings, bsl_algo = dump.load('sample/large/bsl_algo')
    print('Done...')

    # the rmse / mape values were pickled to their own file
    print('Getting rmse and mape values from disk..')
    with open('sample/large/metrics_bsl_algo', 'rb') as f:
        models_evaluation_test.update({'bsl_algo': pickle.load(f)})

models_evaluation_test

This model is pretrained and saved in the directory spedified....
Getting it from the disk...(both predicted_ratings and algorithm...)
Done...
Getting rmse and mape values from disk..


{'bsl_algo': {'mape': 30.90283402482485, 'rmse': 1.0052001851833883}}

In [None]:
#####################################################################################################
#####################################################################################################
# print('We have to train the model from test dataset. It may take a while..')
    
# # specifying which method to use (ALS or SGD) to calculate biases
# bsl_options = {'method': 'sgd',
#            'learning_rate': .001
#            }
# bsl = BaselineOnly(bsl_options=bsl_options)


# bsl_train_results, bsl_test_results = run_surprise(bsl, trainset, testset,
#                                                evaluate_train=False ,verbose=True)

# # store them in models dictionary..
# models_evaluation_train['bsl'] = bsl_train_results
# models_evaluation_test['bsl'] = bsl_test_results

# # store the trained model in disk...
# dump.dump('sample/large/bsl_algo', predictions=models_evaluation_test['bsl']['predictions'],
#           algo=bsl_algo, verbose=1)

# # saving test results in dictionary in python
# models_evaluation_test['bsl_algo'] = {'rmse':bsl_test_results['rmse'], 'mape':bsl_test_results['mape']}

# with open('sample/large/metrics_bsl_algo', mode='wb') as f:
#     pickle.dump(models_evaluation_test['bsl_algo'], f)
#####################################################################################################
#####################################################################################################

# 2. KNN with Baseline_model 

In [8]:
from surprise import KNNBaseline

- __Predicted Rating__ : ( ___based on User-User similarity___ )

\begin{align} \hat{r}_{ui} = b_{ui} + \frac{ \sum\limits_{v \in N^k_i(u)}
\text{sim}(u, v) \cdot (r_{vi} - b_{vi})} {\sum\limits_{v \in
N^k_i(u)} \text{sim}(u, v)} \end{align}

- $\pmb{b_{ui}}$ -  _Baseline prediction_ of (user,movie) rating


- $ \pmb {N_i^k (u)}$ - Set of __K similar__ users (neighbours) of __user (u)__ who rated __movie(i)__  


- _sim (u, v)_ - __Similarity__ between users __u and v__  
    - Generally, it will be cosine similarity or Pearson correlation coefficient. 
    - But we use __shrunk Pearson-baseline correlation coefficient__, which is based on the pearsonBaseline similarity ( we take base line predictions instead of mean rating of user/item)
        - Computation of the correlation coefficient is based only on the common user support.
        - Similarities based on greater user support are more reliable, i.e., users who have more commonly rated movies are considered more similar than users who have only a few rated movies in common.
        - \begin{align}\begin{aligned}\text{pearson_baseline_shrunk_sim}(u, v) &= \frac{|I_{uv}| - 1}{|I_{uv}| - 1 + \text{shrinkage}} \cdot \hat{\rho}_{uv}\end{aligned}\end{align}
            - $\pmb{|I_{uv}|}$ - No of common movies between users(u and v)
            - __shrinkage__ - a hyperparameter. The suggested default value is ___100___
                - __0__ : There is no shrinkage at all ( It is normal pearson correlation coefficient ) 
            - $ \pmb {\hat{\rho}_{uv}}$ - Pearson Correlation Coefficient ( between users )
                - \begin{align} \text{pearson_baseline_sim}(u, v) = \hat{\rho}_{uv} = \frac{
    \sum\limits_{i \in I_{uv}} (r_{ui} -  b_{ui}) \cdot (r_{vi} -
    b_{vi})} {\sqrt{\sum\limits_{i \in I_{uv}} (r_{ui} -  b_{ui})^2}
    \cdot \sqrt{\sum\limits_{i \in I_{uv}} (r_{vi} -  b_{vi})^2}} \end{align}
 

 ------------ or 
 --

- __Predicted rating__ ( based on Item Item similarity ):
 \begin{align} \hat{r}_{ui} = b_{ui} + \frac{ \sum\limits_{j \in N^k_u(i)}\text{sim}(i, j) \cdot (r_{uj} - b_{uj})} {\sum\limits_{j \in N^k_u(i)} \text{sim}(i, j)} \end{align}

    -  ___Notation is the same as above (user-user based predicted rating)___

- We can do both and blend them ( see if we get better results when combined ) 

-----------------------------------------------------------------------------------------
-----------

- ## We are not doing Knn_baseline with USER_USER similarities, because it would take up too much RAM and time.
- ## Instead, we will just perform KNN based on the ITEM_ITEM similarities.

-----------------------------------------------------------------------------------------
-----------

## 2.1 KNN with User User similarities 

 - We are not doing KNN with user-user similarities, because we have around 480k users and we would run out of memory while computing the similarities. Even if we could somehow make enough RAM available, it would take far too long.

In [None]:
# # we specify , how to compute similarities and what to consider with sim_options to our algorithm
# sim_options = {'user_based' : True,
#                'name': 'pearson_baseline',
#                'shrinkage': 100,
#                'min_support': 2
#               } 

# bsl_options = {'method': 'sgd'} # we keep other parameters like regularization parameter and learning_rate as default values.

# knn_bsl_u = KNNBaseline(k=40, sim_options = sim_options, bsl_options = bsl_options)

# knn_bsl_u_train_results, knn_bsl_u_test_results = run_surprise(knn_bsl_u, trainset, testset, 
#                                                                evaluate_train=False, verbose=True)

# models_evaluation_train['knn_bsl_u'] = knn_bsl_u_train_results
# models_evaluation_test['knn_bsl_u']  = knn_bsl_u_test_results

# models_evaluation_test['knn_bsl_u'] 

## 2.2 KNN with Item Item similarities 

In [9]:
# Reuse the item-item KNN model cached on disk, together with its stored test metrics.
if os.path.isfile('sample/large/knn_bsl_m'):

    print('This model is pretrained and saved in the directory spedified....')

    # the dump is unpacked as (predicted ratings, fitted algorithm)
    print('Getting it from the disk...(both predicted_ratings and algorithm...)')
    knn_bsl_m_test_pred_ratings, knn_bsl_m = dump.load('sample/large/knn_bsl_m')
    print('Done...')

    # the rmse / mape values were pickled to their own file
    print('loading rmse and mape values from disk...')
    with open('sample/large/metrics_knn_bsl_m', 'rb') as f:
        models_evaluation_test.update({'knn_bsl_m': pickle.load(f)})

models_evaluation_test

This model is pretrained and saved in the directory spedified....
Getting it from the disk...(both predicted_ratings and algorithm...)
Done...
loading rmse and mape values from disk...


{'bsl_algo': {'mape': 30.90283402482485, 'rmse': 1.0052001851833883},
 'knn_bsl_m': {'mape': 30.14567877982246, 'rmse': 0.9984568872852136}}

In [19]:
# # we specify , how to compute similarities and what to consider with sim_options to our algorithm
# sim_options = {'user_based' : False,
#                'name': 'pearson_baseline',
#                'shrinkage': 100,
#                'min_support': 2
#               } 
# # we keep other parameters like regularization parameter and learning_rate as default values.
# bsl_options = {'method': 'sgd'} 

# knn_bsl_m = KNNBaseline(k=40, sim_options = sim_options, bsl_options = bsl_options)

# knn_bsl_m_train_results, knn_bsl_m_test_results = run_surprise(knn_bsl_m, trainset, testset, verbose=True,
#                                                                evaluate_train=False)

# models_evaluation_train['knn_bsl_m'] = knn_bsl_m_train_results
# models_evaluation_test['knn_bsl_m']  = knn_bsl_m_test_results

# # store the trained model in disk...
# print('Storing the trained model (with predicted ratings) in the disk..')
# dump.dump('sample/large/knn_bsl_m', predictions=models_evaluation_test['knn_bsl_m']['predictions'],
#           algo=knn_bsl_m, verbose=1)
# print('Done..')
# # saving test results in dictionary in python
# models_evaluation_test['knn_bsl_m'] = {'rmse':knn_bsl_m_test_results['rmse'],
#                                        'mape':knn_bsl_m_test_results['mape']}

# with open('sample/large/metrics_knn_bsl_m', mode='wb') as f:
#     pickle.dump(models_evaluation_test['knn_bsl_m'], f)


Training the model...
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Done. time taken : 1:15:13.788940 


 we are skipping model evaluation with train data..


Evaluating for test data...
time taken : 1:08:19.145859
---------------
Test Data
---------------
RMSE : 0.9984568872852136

MAPE : 30.14567877982246

storing the test results in test dictionary...

---------------------------------------------
Total time taken to run this algorithm : 2:23:32.935913
Storing the trained model (with predicted ratings) in the disk..
The dump has been saved as file sample/large/knn_bsl_m
Done..


# 3. Matrix Factorization Techniques

## 3.1 SVD -  MF algorithm with user item interactions 

In [5]:
from surprise import SVD

- __Predicted Rating :__
    - $ \large \hat r_{ui} = \mu + b_u + b_i + q_i^Tp_u $ 

- __Optimization problem with user item interactions and regularization (to avoid overfitting)__


- $\Large \sum_{r_{ui} \in R_{train}} \left(r_{ui} - \hat{r}_{ui} \right)^2 +
\lambda\left(b_i^2 + b_u^2 + ||q_i||^2 + ||p_u||^2\right) $

In [10]:
from surprise import dump

# Reuse the SVD model cached on disk, together with its stored test metrics.
if os.path.isfile('sample/large/svd'):

    print('This model is pretrained and saved in the directory spedified....')

    # the dump is unpacked as (predicted ratings, fitted algorithm)
    print('Getting svd from the disk...(both predicted_ratings and algorithm...)')
    svd_test_pred_ratings, svd = dump.load('sample/large/svd')
    print('Done...\n')

    # the rmse / mape values were pickled to their own file
    print('Getting rmse and mape fom disk...')
    with open('sample/large/metrics_svd', 'rb') as f:
        models_evaluation_test.update({'svd': pickle.load(f)})
    print('Done')

models_evaluation_test

This model is pretrained and saved in the directory spedified....
Getting svd from the disk...(both predicted_ratings and algorithm...)
Done...

Getting rmse and mape fom disk...
Done


{'bsl_algo': {'mape': 30.90283402482485, 'rmse': 1.0052001851833883},
 'knn_bsl_m': {'mape': 30.14567877982246, 'rmse': 0.9984568872852136},
 'svd': {'mape': 29.491243711503127, 'rmse': 0.9868320443529861}}

In [24]:
####################################################################################################
####################################################################################################
# print('We have to train the model from test dataset. It may take a while..(90 min)')

# svd = SVD(n_factors=100, n_epochs=20, random_state=15, verbose=True)


# svd_train_results, svd_test_results = run_surprise(svd, trainset, testset,
#                                                    evaluate_train=False, verbose=True)

# # store them in models dictionary..
# models_evaluation_train['svd'] = svd_train_results
# models_evaluation_test['svd'] = svd_test_results

# store the trained svd model in disk...(along with the predicted ratings...)
# print('storing the trained svd along with the predictions..')
# dump.dump('sample/large/svd', predictions=models_evaluation_test['svd']['predictions'],
#           algo=svd, verbose=1)

# # saving test results in dictionary in python
# models_evaluation_test['svd'] = {'rmse':svd_test_results['rmse'],
#                                  'mape':svd_test_results['mape']}

# with open('sample/large/metrics_svd', mode='wb') as f:
#     pickle.dump(models_evaluation_test['svd'], f)
####################################################################################################
####################################################################################################

storing the trained svd along with the predictions..
The dump has been saved as file sample/large/svd


- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - .

 ### 3.2 SVD -  with Implicit feedback of Items(movies)

In [27]:
# from surprise import SVDpp

- __Predicted Rating :__ 
    - $ \large \hat{r}_{ui} = \mu + b_u + b_i + q_i^T\left(p_u +
    |I_u|^{-\frac{1}{2}} \sum_{j \in I_u}y_j\right) $ 
 


 - $ \pmb{I_u}$ --- the set of all items rated by user u

- $\pmb{y_j}$ --- Our new set of item factors that capture implicit ratings.  

- __Optimization problem with user item interactions and regularization (to avoid overfitting)__ 


   
- $ \Large \sum_{r_{ui} \in R_{train}} \left(r_{ui} - \hat{r}_{ui} \right)^2 +
\lambda\left(b_i^2 + b_u^2 + ||q_i||^2 + ||p_u||^2 + ||y_j||^2\right) $ 

In [None]:
# svdpp = SVDpp(n_factors=20, random_state=15, verbose=True)
# svdpp_train_results, svdpp_test_results = run_surprise(svdpp, trainset, testset)

# models_evaluation_train['svdpp'] = svdpp_train_results
# models_evaluation_test['svdpp'] = svdpp_test_results

# models_evaluation_test['svdpp']

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - .

## Test data metrics of models that we have trained so far 

In [11]:
# One column per model, one row per metric.
pd.DataFrame(data=models_evaluation_test)

Unnamed: 0,bsl_algo,knn_bsl_m,svd
mape,30.902834,30.145679,29.491244
rmse,1.0052,0.998457,0.986832


- - - - - - - - - - - - - - - - - - - - - - - - - - - - .
 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - .
 

### Final thoughts on the problem :

- > We didn't do KNN with User_User similarity and SVD++, because of memory or/and time constraints.


- > But we already proved ( with small and medium samples ) that if we blend all the models (including svd++ and Knn_UU_sim) with XGBoost, we can __further reduce the rmse__. 

- > If we do that, I am very much confident that we can get __rmse of our final model__ less than __0.9514 (rmse of Cinematch)__.