# Group: Sang Yoon (Andy) Hwang, Anthony, Santosh Cheruku

The recommmender system we are building recommends movies to the users. It applies raw mean as well as baseline predictors. It also compares both and shows their overall accuracy.


In [14]:


import pandas as pd
import math
import numpy as np
from sklearn.model_selection import train_test_split

# Just create dataset with this format --> user, movie_title, rating and then use pivot/melt to create matrix
data = {'user':['Tom','Sally','Vincio','Andy','Mike'], 'Titanic':[5,4,2,5, None],'Batman':[5,5,2,4,2.5], 'Superman':[1,1.5,2,None,2], 'Spiderman':[None, 4,2,3,2.5], 'Avengers':[4,5,3, None, None] }
df = pd.DataFrame(data)
df



Unnamed: 0,user,Titanic,Batman,Superman,Spiderman,Avengers
0,Tom,5.0,5.0,1.0,,4.0
1,Sally,4.0,5.0,1.5,4.0,5.0
2,Vincio,2.0,2.0,2.0,2.0,3.0
3,Andy,5.0,4.0,,3.0,
4,Mike,,2.5,2.0,2.5,


In [15]:

# long-form dataframe
df2 = pd.melt(df, 
               id_vars='user', 
               var_name='movie', 
               value_name='rating')
df2

Unnamed: 0,user,movie,rating
0,Tom,Titanic,5.0
1,Sally,Titanic,4.0
2,Vincio,Titanic,2.0
3,Andy,Titanic,5.0
4,Mike,Titanic,
5,Tom,Batman,5.0
6,Sally,Batman,5.0
7,Vincio,Batman,2.0
8,Andy,Batman,4.0
9,Mike,Batman,2.5


In [16]:


train2, test2 = train_test_split(df2, test_size = 0.2, random_state = 42)

train2 = pd.pivot_table(train2, index = 'user', columns = 'movie', values = 'rating' ).reset_index()
train2

movie,user,Avengers,Batman,Spiderman,Superman,Titanic
0,Andy,,,3.0,,5.0
1,Mike,,2.5,2.5,2.0,
2,Sally,5.0,5.0,,,4.0
3,Tom,4.0,5.0,,1.0,
4,Vincio,3.0,2.0,2.0,2.0,2.0


In [17]:


test2 = pd.pivot_table(test2, index = 'user', columns = 'movie', values = 'rating' ).reset_index()
test2


movie,user,Batman,Spiderman,Superman,Titanic
0,Andy,4.0,,,
1,Sally,,4.0,1.5,
2,Tom,,,,5.0


In [18]:

raw_avg2 = train2.sum()[1:].sum() / train2.iloc[:,1:].notnull().sum().sum()
raw_avg2

3.125

In [19]:



error_test2 = test2.iloc[:,1:] - raw_avg2
sq_error_test2 = error_test2 ** 2 
MSE_test2 = sq_error_test2.sum().sum() / test2.iloc[:,1:].notnull().sum().sum()
RMSE_test2 = np.sqrt(MSE_test2)
RMSE_test2


1.3863170633011772

In [20]:

error_train2 = train2.iloc[:,1:] - raw_avg2
sq_error_train2 = error_train2 ** 2 
MSE_train2 = sq_error_train2.sum().sum() / train2.iloc[:,1:].notnull().sum().sum()
RMSE_train2 = np.sqrt(MSE_train2)
RMSE_train2


1.3050383136138188

In [21]:

#bias_user using df2
sum_item_user2 = train2.sum(axis=1)
n_item_user2 = train2.iloc[:,1:].notnull().sum(axis=1)

bias_user2 = (sum_item_user2 / n_item_user2) - raw_avg2
bias_user2.index = list(train2.user)
bias_user2


Andy      0.875000
Mike     -0.791667
Sally     1.541667
Tom       0.208333
Vincio   -0.925000
dtype: float64

In [22]:

#bias_movie using df2
sum_item_movie2 = train2.iloc[:,1:].sum(axis=0)
n_item_movie2 = train2.iloc[:,1:].notnull().sum(axis=0)

bias_movie2 = (sum_item_movie2 / n_item_movie2) - raw_avg2
bias_movie2


movie
Avengers     0.875000
Batman       0.500000
Spiderman   -0.625000
Superman    -1.458333
Titanic      0.541667
dtype: float64

In [23]:

#using df2 - calculate baseline RMSE for testset
bias_movie_df2 = pd.DataFrame(bias_movie2).T
base_line_train2 = pd.DataFrame(columns = bias_movie2.index)
for i in train2['user']:
    base_line_train2 = pd.concat([bias_movie_df2 + bias_user2[i] + raw_avg2, base_line_train2])
    
base_line_train2.index = train2.index
base_line_train2

movie,Avengers,Batman,Spiderman,Superman,Titanic
0,3.075,2.7,1.575,0.741667,2.741667
1,4.208333,3.833333,2.708333,1.875,3.875
2,5.541667,5.166667,4.041667,3.208333,5.208333
3,3.208333,2.833333,1.708333,0.875,2.875
4,4.875,4.5,3.375,2.541667,4.541667


In [24]:



error_test2_baseline = test2 - base_line_train2
sq_error_test2_baseline = error_test2_baseline ** 2
MSE_test2_baseline = sq_error_test2_baseline.sum().sum() / test2.iloc[:,1:].notnull().sum().sum()
RMSE_test2_baseline = np.sqrt(MSE_test2_baseline)
RMSE_test2_baseline

0.9410672871207929

In [25]:


error_train2_baseline = train2 - base_line_train2
sq_error_train2_baseline = error_train2_baseline ** 2
MSE_train2_baseline = sq_error_train2_baseline.sum().sum() / train2.iloc[:,1:].notnull().sum().sum()
RMSE_train2_baseline = np.sqrt(MSE_train2_baseline)
RMSE_train2_baseline

1.468317016557089

In [26]:
print('RMSE - raw average: training', RMSE_train2)
print('RMSE - baseline predictor: training', RMSE_train2_baseline)
print('RMSE - raw average: test', RMSE_test2)
print('RMSE - baseline predictor: test', RMSE_test2_baseline)

RMSE - raw average: training 1.3050383136138188
RMSE - baseline predictor: training 1.468317016557089
RMSE - raw average: test 1.3863170633011772
RMSE - baseline predictor: test 0.9410672871207929
