#### Cold Start Analysis:

This notebook analyses how the different recommendation approaches perform for a new user, or a user with only a few interactions with the system, i.e. the cold start problem. \\
We compute the RMSE and MAE for users who have rated fewer than 18 movies. \\
We also observe the performance of the approaches for users who have rated more than 1000 movies. 

In [None]:
!pip install surprise

In [None]:
import pickle
import os

import pandas as pd

from surprise import SVD, SVDpp
from surprise import KNNBasic, KNNBaseline, BaselineOnly
from surprise import Dataset                                                     
from surprise import Reader                                                      
from surprise import dump
from surprise.accuracy import rmse

In [None]:
def convert_traintest_dataframe_forsurprise(training_dataframe, testing_dataframe, rating_scale=(0, 5)):
    """Convert train/test rating DataFrames into Surprise train/test sets.

    Args:
        training_dataframe: DataFrame with 'userId', 'movieId' and 'rating' columns.
        testing_dataframe: DataFrame with the same three columns.
        rating_scale: (min, max) tuple passed to ``surprise.Reader``. Defaults
            to (0, 5), the value that used to be hard-coded here.

    Returns:
        A ``(trainset, testset)`` tuple: the objects returned by
        ``Dataset.construct_trainset`` and ``Dataset.construct_testset`` for
        the respective frames.
    """
    rating_columns = ['userId', 'movieId', 'rating']
    reader = Reader(rating_scale=rating_scale)

    # Keep the intermediate Dataset objects under their own names instead of
    # rebinding `trainset`/`testset` from a Dataset to the final object.
    train_data = Dataset.load_from_df(training_dataframe[rating_columns], reader)
    test_data = Dataset.load_from_df(testing_dataframe[rating_columns], reader)

    trainset = train_data.construct_trainset(train_data.raw_ratings)
    testset = test_data.construct_testset(test_data.raw_ratings)
    return trainset, testset

In [None]:
# Locations of the pre-split ratings files.
file_path_train, file_path_test = 'training_data.csv', 'testing_data.csv'

# Read both splits, then build the Surprise trainset/testset from them.
traindf, testdf = (pd.read_csv(csv_path) for csv_path in (file_path_train, file_path_test))
trainset, testset = convert_traintest_dataframe_forsurprise(traindf, testdf)

In [None]:
# Peek at the first rows of the training ratings to check what was loaded.
traindf.head()

Unnamed: 0,userId,movieId,rating,timestamp,genres,tag
0,1,1,4.0,964982703,"['Adventure', 'Animation', 'Children', 'Comedy...",[]
1,1,6,4.0,964982224,"['Action', 'Crime', 'Thriller']",[]
2,1,47,5.0,964983815,"['Mystery', 'Thriller']",[]
3,1,50,5.0,964982931,"['Crime', 'Mystery', 'Thriller']",[]
4,1,70,3.0,964982400,"['Action', 'Comedy', 'Horror', 'Thriller']",[]


In [None]:
# SVD and SVD++ are initialised randomly; fix the seed so the predictions (and
# every error table computed from them below) are the same on each run.
RANDOM_STATE = 42

algo_svd = SVD(random_state=RANDOM_STATE)
algo_svdpp = SVDpp(random_state=RANDOM_STATE)
algo_knn = KNNBasic()

# Fit each model on the training split and predict the held-out test ratings.
algo_svd.fit(trainset)
predictions_svd = algo_svd.test(testset)

algo_svdpp.fit(trainset)
predictions_svdpp = algo_svdpp.test(testset)

algo_knn.fit(trainset)
predictions_knn = algo_knn.test(testset)

# Persist each model together with its predictions so they can be reloaded
# later without refitting.
dump.dump('./dump_SVD', predictions_svd, algo_svd)
dump.dump('./dump_SVDpp', predictions_svdpp, algo_svdpp)
dump.dump('./dump_KNN', predictions_knn, algo_knn)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [None]:
# One DataFrame per model: user id, item id, true rating (rui), estimated
# rating (est) and the prediction's details dict.
df_svd, df_svdpp, df_knn = (
    pd.DataFrame(model_predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
    for model_predictions in (predictions_svd, predictions_svdpp, predictions_knn)
)

In [None]:
# Item-item neighbourhood model: user_based=False makes the similarity
# matrix compare items rather than users.
sim_options = dict(name='pearson_baseline', user_based=False)

algo_knnbaseline = KNNBaseline(sim_options=sim_options)
algo_knnbaseline.fit(trainset)
predictions_knnbaseline = algo_knnbaseline.test(testset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [None]:
# Prediction frame for the item-item KNNBaseline model, extended with the
# per-row absolute error and squared error.
df_knnbaseline = pd.DataFrame(
    predictions_knnbaseline, columns=['uid', 'iid', 'rui', 'est', 'details']
).assign(
    err=lambda preds: (preds['est'] - preds['rui']).abs(),
    sqr_err=lambda preds: (preds['est'] - preds['rui']) ** 2,
)

In [None]:
# Absolute error |est - rui| per prediction; its mean is the MAE.
df_svd['err'] = df_svd['est'].sub(df_svd['rui']).abs()
df_svdpp['err'] = df_svdpp['est'].sub(df_svdpp['rui']).abs()
df_knn['err'] = df_knn['est'].sub(df_knn['rui']).abs()

In [None]:
# Squared error (est - rui)^2 per prediction; the square root of its mean is the RMSE.
df_svd['sqr_err'] = df_svd['est'].sub(df_svd['rui']).pow(2)
df_svdpp['sqr_err'] = df_svdpp['est'].sub(df_svdpp['rui']).pow(2)
df_knn['sqr_err'] = df_knn['est'].sub(df_knn['rui']).pow(2)

In [None]:
# Baseline-only model: fit on the training split, predict the test split.
algo_baselineonly = BaselineOnly()
algo_baselineonly.fit(trainset)
predictions_baselineonly = algo_baselineonly.test(testset)

df_baselineonly = pd.DataFrame(predictions_baselineonly, columns=['uid', 'iid', 'rui', 'est', 'details'])
df_baselineonly['err'] = abs(df_baselineonly.est - df_baselineonly.rui)
df_baselineonly['sqr_err'] = (df_baselineonly.est - df_baselineonly.rui)**2

# Iu = number of training ratings made by each user (0 if the user has none).
# Computed directly from traindf instead of via get_Iu, which is only defined
# in a later cell and raises NameError here on a fresh top-to-bottom run.
df_baselineonly['Iu'] = (
    df_baselineonly['uid']
    .map(traindf['userId'].value_counts())
    .fillna(0)
    .astype(int)
)

Estimating biases using als...


In [None]:
sim_options = {'name': 'pearson_baseline',
               'user_based': True  # compute similarities between users
               }
algo_knnbaseline_user = KNNBaseline(sim_options=sim_options)
algo_knnbaseline_user.fit(trainset)
predictions_knnbaseline_user = algo_knnbaseline_user.test(testset)

df_knn_user = pd.DataFrame(predictions_knnbaseline_user, columns=['uid', 'iid', 'rui', 'est', 'details'])
df_knn_user['err'] = abs(df_knn_user.est - df_knn_user.rui)
df_knn_user['sqr_err'] = (df_knn_user.est - df_knn_user.rui)**2

# Iu = number of training ratings made by each user (0 if the user has none).
# Computed directly from traindf instead of via get_Iu, which is only defined
# in a later cell and raises NameError here on a fresh top-to-bottom run.
df_knn_user['Iu'] = (
    df_knn_user['uid']
    .map(traindf['userId'].value_counts())
    .fillna(0)
    .astype(int)
)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [None]:
# Spot-check the SVD prediction frame.
# NOTE(review): the saved output below shows an `Iu` column and no `sqr_err`
# column, although `sqr_err` is assigned in an earlier cell and `Iu` only in a
# later one — presumably left over from an out-of-order run; re-run the
# notebook top to bottom to refresh it.
df_svd.head()

Unnamed: 0,uid,iid,rui,est,details,err,Iu
0,1,3,4.0,4.200548,{'was_impossible': False},0.200548,186
1,1,163,5.0,4.261322,{'was_impossible': False},0.738678,186
2,1,316,3.0,4.024986,{'was_impossible': False},1.024986,186
3,1,349,4.0,4.443186,{'was_impossible': False},0.443186,186
4,1,441,4.0,4.758104,{'was_impossible': False},0.758104,186


In [None]:
# Load the ratings file used for the "Content based" rows further down
# (presumably predictions of a content-based genre model produced elsewhere —
# confirm). Later cells read its `userId`, `pred_rating` and `og_rating` columns.
content = pd.read_csv('content_based_genre_ratings.csv')

In [None]:
def get_Iu(uid, ratings=None):
    """Return the number of items rated by the given user.

    Args:
        uid: The raw id of the user.
        ratings: Optional DataFrame with a 'userId' column in which to count
            the user's rows. Defaults to the notebook-level ``traindf``.
    Returns:
        The number of rows whose 'userId' equals ``uid``, as a plain int;
        0 when the user does not appear at all.
    """
    if ratings is None:
        ratings = traindf

    # For a scalar id, the mask of an unknown user is simply all-False, so the
    # count is 0 without exception handling; the previous `except ValueError`
    # fallback could never trigger for such ids.
    return int((ratings['userId'] == uid).sum())

In [None]:
# Attach to every content-based prediction the number of training ratings of its user.
content['Iu'] = content['userId'].map(get_Iu)

In [None]:
# Per-row absolute and squared error of the content-based predictions.
# MAE is the mean of `err`; RMSE is the square root of the mean of `sqr_err`.
content['err'] = content['pred_rating'].sub(content['og_rating']).abs()
content['sqr_err'] = content['pred_rating'].sub(content['og_rating']).pow(2)


In [None]:
# Cold-start slice: users with fewer than 18 ratings in the training data.
# The two metrics used to be printed under the same "Content based" label, so
# each line now says which metric it shows.
content_cold_start = content[content.Iu < 18]
print("Content based MAE  ", content_cold_start.err.mean())
print("Content based RMSE ", content_cold_start.sqr_err.mean() ** .5)

Content based                  0.7942792057878261
Content based                 1.0584107905057996


In [None]:
# Iu = number of training ratings made by each user (0 if the user has none).
# Count once per user with value_counts and look the result up, instead of
# re-scanning the whole training frame for every prediction row of every model.
ratings_per_user = traindf['userId'].value_counts()
for prediction_frame in (df_knn, df_svd, df_svdpp, df_knnbaseline):
    prediction_frame['Iu'] = (
        prediction_frame['uid'].map(ratings_per_user).fillna(0).astype(int)
    )

In [None]:
# MAE per model, restricted to users with fewer than 18 training ratings.
print("--------------------------MAE-----------------------")
for model_label, model_df in [
    ("KNN Basic                 ", df_knn),
    ("SVD                       ", df_svd),
    ("SVDpp                     ", df_svdpp),
    ("KNN Baseline (item-item)  ", df_knnbaseline),
    ("BaselineOnly              ", df_baselineonly),
    ("KNN Baseline (user-user)  ", df_knn_user),
]:
    print(model_label, model_df.loc[model_df['Iu'] < 18, 'err'].mean())

--------------------------MAE-----------------------
KNN Basic                  0.9356541418761788
SVD                        0.8174986369636367
SVDpp                      0.7853538665933238
KNN Baseline (item-item)   0.7549100058171629
BaselineOnly               0.828373767989461
KNN Baseline (user-user)   0.8527037143570998


In [None]:
# RMSE per model, restricted to users with fewer than 18 training ratings.
print("--------------------------RMSE-----------------------")
for model_label, model_df in [
    ("KNN Basic                ", df_knn),
    ("SVD                      ", df_svd),
    ("SVDpp                    ", df_svdpp),
    ("KNN Baseline (item-item) ", df_knnbaseline),
    ("BaselineOnly             ", df_baselineonly),
    ("KNN Baseline (user-user) ", df_knn_user),
]:
    print(model_label, model_df.loc[model_df['Iu'] < 18, 'sqr_err'].mean() ** .5)

--------------------------RMSE-----------------------
KNN Basic                 1.1998253947989697
SVD                       1.0549483774463828
SVDpp                     1.0083634724152428
KNN Baseline (item-item)  0.9896562169806813
BaselineOnly              1.0612306019619604
KNN Baseline (user-user)  1.1082756354422056


In [None]:
# MAE per model, restricted to users with more than 1000 training ratings.
print("--------------------------MAE-----------------------")
for model_label, model_df in [
    ("KNN Basic                 ", df_knn),
    ("SVD                       ", df_svd),
    ("SVDpp                     ", df_svdpp),
    ("KNN Baseline (item-item)  ", df_knnbaseline),
    ("BaselineOnly              ", df_baselineonly),
    ("KNN Baseline (user-user)  ", df_knn_user),
]:
    print(model_label, model_df.loc[model_df['Iu'] > 1000, 'err'].mean())

--------------------------MAE-----------------------
KNN Basic                  0.7118277630004157
SVD                        0.6349197611192368
SVDpp                      0.626063757313411
KNN Baseline (item-item)   0.6120430789383057
BaselineOnly               0.6306031032475772
KNN Baseline (user-user)   0.6330297364319998


In [None]:
# RMSE per model, restricted to users with more than 1000 training ratings.
print("--------------------------RMSE-----------------------")
for model_label, model_df in [
    ("KNN Basic                ", df_knn),
    ("SVD                      ", df_svd),
    ("SVDpp                    ", df_svdpp),
    ("KNN Baseline (item-item) ", df_knnbaseline),
    ("BaselineOnly             ", df_baselineonly),
    ("KNN Baseline (user-user) ", df_knn_user),
]:
    print(model_label, model_df.loc[model_df['Iu'] > 1000, 'sqr_err'].mean() ** .5)

--------------------------RMSE-----------------------
KNN Basic                 0.9174613388905646
SVD                       0.8207944406250214
SVDpp                     0.8136491891525117
KNN Baseline (item-item)  0.789275629286978
BaselineOnly              0.799990922710614
KNN Baseline (user-user)  0.8198697577732832


In [None]:
# Count the training ratings of every user, then show the largest count.
iid_df = traindf.groupby('userId', as_index=False)['movieId'].count()
iid_df['movieId'].max()

2158