In [0]:
# import packages
import numpy as np 
import pandas as pd 
import os 
from matplotlib import pyplot as plt
import warnings
import time
from scipy.sparse import coo_matrix
warnings.filterwarnings('ignore')

from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [0]:
# data_directory = r'/Users/rover/Desktop/2020 Winter/ECE 219/Project 3/ml-latest-small'
# os.chdir(data_directory)

# links = pd.read_csv('links.csv')
# movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv') # only movie rating is used in this project
tags = pd.read_csv('tags.csv') # used for movie genre

num_movie = len(ratings['movieId'].unique())
num_user = len(ratings['userId'].unique())

# create rating matrix R
matrix = coo_matrix((ratings.rating.to_list(), (ratings.userId.to_list(), ratings.movieId.to_list()))).toarray()
R = pd.DataFrame(index = range(611), columns= ratings['movieId'].unique(), data = matrix[:, ratings.movieId.unique()])
R = R.drop(R.index[0])
R.replace(0, np.nan, inplace = True)

### Naive collaborative filtering

In [0]:
# prepare data
data = ratings.drop(columns = ['timestamp'])

# count the number of ratings for each movie
movie_rate_frequency = pd.DataFrame(index = ratings.movieId.unique(), data = num_user - R.isna().sum(axis = 0))
movie_rate_frequency = movie_rate_frequency.sort_values(by = 0, ascending=False)

# compute the rating variance of each movie
movie_rate_var = pd.DataFrame(index = ratings.movieId.unique(), data = R.var(axis = 0))
movie_rate_var.replace(np.nan, 0, inplace = True)

# prepare trimmed movieId
movie_frevar = pd.DataFrame(index = ratings.movieId.unique())
movie_frevar['frequency'] = num_user - R.isna().sum(axis = 0)
movie_frevar['variance'] = R.var(axis = 0)
popular_movieId = movie_rate_frequency[movie_frevar['frequency'] > 2].index.tolist()
popular_movieId = list(map(str, popular_movieId))

unpopular_movieId = movie_rate_frequency[movie_frevar['frequency'] <= 2].index.tolist()
unpopular_movieId = list(map(str, unpopular_movieId))

highvar_movieId = movie_rate_frequency[(movie_frevar['frequency'] >= 5) & (movie_frevar['variance'] >= 2)].index.tolist()
highvar_movieId = list(map(str, highvar_movieId))

In [0]:
class naive_kfold():
    def __init__(self, data):
        n_user = data.shape[0]
        n_movie = data.shape[1]
        
        self.model = None
        self.data = data

    def pred(self, train, test):
        preds = np.empty([test.shape[0],1])
        avai_user = train['userId'].unique()
        for i in range(test.shape[0]):
            if i in avai_user:
                preds[i,0] = train.loc[train['userId'] == test.iloc[i,0],'rating'].mean()
            else: preds[i, 0] = 0  
        return preds

    def cross_validation(self, n_folds, trim_flag = False, trim_list = None):
        # define k-fold cross validation
        kf = KFold(n_splits = n_folds, shuffle=True)

        # define results dict and list
        results_cache = {}
        rmse_score, mse_score = [], []

        # perform k-fold and store the results
        for train_idx, test_idx in kf.split(self.data):
            train = self.data.iloc[train_idx,:]
            test = self.data.iloc[test_idx, :]
            if trim_flag:
                test = self.trim(test, trim_list)

            # if the trimmed test set is empty, jump to next k-fold
            if len(test) == 0:
                continue 

            y_true = test.iloc[:, 2].values
            preds = self.pred(train, test)
            rmse_score.append(np.sqrt(mean_squared_error(y_true, preds)))
            mse_score.append(mean_absolute_error(y_true, preds))

        results_cache['mean_rmse'], results_cache['mean_mse'], results_cache['rmse_hist'], results_cache['mse_hist'] = np.mean(rmse_score[rmse_score!=0]), np.mean(mse_score[mse_score!=0]), rmse_score, mse_score
        return results_cache

    def trim(self, testset, movieId_list):
        '''
        testset: list, testing data that needed to be trimmed
        trim_list: list, movieId that satisfy the trim requeirement 
        '''
        temp_df = pd.DataFrame(columns = ['userId', 'movieId', 'rate'], data = testset.values)
        temp = temp_df[temp_df['movieId'].isin(movieId_list)]
        
        return temp

$\textbf{Question 30}$: Design a naive collaborative filter to predict the ratings of the movies in the MovieLens dataset and evaluate it’s performance using 10-fold cross validation. Compute the average RMSE by averaging the RMSE across all 10 folds. Report the average RMSE.
Note that in this case, when performing the cross-validation, there is no need to calculate μi’s for the training folds each time. You are only asked to use a sin- gle set of μi’s calculated on the entire dataset and validate on 10 validation folds.

In [0]:
model_30 = naive_kfold(data)
model_30.cross_validation(n_folds = 10)

{'mean_mse': 3.3405297086383636,
 'mean_rmse': 3.5522342023797266,
 'mse_hist': [3.352426243377296,
  3.3405297086383636,
  3.3355216505541017,
  3.3366561018334555,
  3.3472948468994743,
  3.3462838014627314,
  3.350853945675127,
  3.3351887348678324,
  3.349276236330121,
  3.3211768011702056],
 'rmse_hist': [3.561840506612431,
  3.5522342023797266,
  3.549620006740572,
  3.5461106184598044,
  3.561606815312605,
  3.5598752417702397,
  3.5575782130465714,
  3.54517016417337,
  3.563271542594135,
  3.5347993652047878]}

$\textbf{Question 31}$: Design a naive collaborative filter to predict the ratings of the movies in the popular movie trimmed test set and evaluate it’s performance using 10-fold cross validation. Compute the average RMSE by averaging the RMSE across all 10 folds. Report the average RMSE.

In [0]:
model_31 = naive_kfold(data)
model_31.cross_validation(n_folds = 10, trim_flag = True, trim_list = popular_movieId)

{'mean_mse': 3.350986388205029,
 'mean_rmse': 3.5624200630861975,
 'mse_hist': [3.3501967106283956,
  3.350986388205029,
  3.3374566079184564,
  3.357701782042459,
  3.340107398359884,
  3.3619533597100206,
  3.3418790953360467,
  3.3621873497745742,
  3.3360158561061786,
  3.3430847741190974],
 'rmse_hist': [3.5612780825721013,
  3.5624200630861975,
  3.5511545163887406,
  3.568866842743934,
  3.5549658591417326,
  3.572796844641089,
  3.561648445070146,
  3.575292585934002,
  3.547347867752401,
  3.5565346796258477]}


$\textbf{Question 32}$: Design a naive collaborative filter to predict the ratings of the movies in the unpopular movie trimmed test set and evaluate it’s performance using 10-fold cross validation. Compute the average RMSE by averaging the RMSE across all 10 folds. Report the average RMSE.

In [0]:
model_32 = naive_kfold(data)
model_32.cross_validation(n_folds = 10, trim_flag = True, trim_list = unpopular_movieId)

{'mean_mse': 0.765975315556907,
 'mean_rmse': 0.9997054257294251,
 'mse_hist': [0.8148933287336415,
  0.765975315556907,
  0.7345275653988816,
  0.77086412175423,
  0.7962002261416907,
  0.7703281263139748,
  0.7692652907567019,
  0.7999632873402649,
  0.7510400783620659,
  0.7951742721967167],
 'rmse_hist': [1.0875860112436508,
  0.9997054257294251,
  0.9360661205569328,
  1.0054746190137538,
  1.034585900871471,
  1.0495400925322207,
  1.0096012492307467,
  1.0520261398461157,
  1.0059606299242305,
  1.1163527768990111]}

$\textbf{Question 33}$: Design a naive collaborative filter to predict the ratings of the movies in the high variance movie trimmed test set and evaluate it’s perfor- mance using 10-fold cross validation. Compute the average RMSE by averaging the RMSE across all 10 folds. Report the average RMSE.

In [0]:
model_33 = naive_kfold(data)
model_33.cross_validation(n_folds = 10, trim_flag = True, trim_list = highvar_movieId)

{'mean_mse': 1.2073732916984454,
 'mean_rmse': 1.6150685925287664,
 'mse_hist': [1.1172064608612293,
  1.2073732916984454,
  0.9719613637791761,
  1.1747884069718746,
  1.0882521530959879,
  1.1299496440678933,
  1.178022400764197,
  0.9892896688228854,
  1.3384616510672305,
  1.1853921492175394],
 'rmse_hist': [1.4329787452156835,
  1.6150685925287664,
  1.2261890522136918,
  1.4868804342378517,
  1.3610538943435169,
  1.3686406739755181,
  1.4803595813969181,
  1.3197322906245583,
  1.6333547300003497,
  1.4453067056534157]}