# Recommendation Systems Assignment

### MIE451/1513 UofT

### Getting MovieLens data

* Download the movielens 100k dataset from this link: [ml-100k.zip](http://files.grouplens.org/datasets/movielens/ml-100k.zip)

* Upload ml-100k.zip

* Extract using the following cell:

### Imports

In [2]:
# import required libraries
import os
import os.path
import numpy as np
import pandas as pd
from math import sqrt
from heapq import nlargest
from tqdm import trange
from tqdm import tqdm
from scipy import stats
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error

## Support functions and variables

In [None]:
#!unzip ml-100k.zip -d .

In [3]:
MOVIELENS_DIR = "ml-100k"

In [4]:
!ls {MOVIELENS_DIR}

README
allbut.pl
mku.sh
u.data
u.genre
u.info
u.item
u.occupation
u.user
u1.base
u1.test
u2.base
u2.test
u3.base
u3.test
u4.base
u4.test
u5.base
u5.test
ua.base
ua.test
ub.base
ub.test


In [5]:
def getData(folder_path, file_name):
    fields = ['userID', 'itemID', 'rating', 'timestamp']
    data = pd.read_csv(os.path.join(folder_path, file_name), sep='\t', names=fields)
    return data 

In [148]:
rating_df = getData(MOVIELENS_DIR, 'u.data')

In [7]:
num_users = len(rating_df.userID.unique())
num_items = len(rating_df.itemID.unique())
print("Number of users:", num_users)
print("Number of items:", num_items)

Number of users: 943
Number of items: 1682


## Q1

### (a) 

In [8]:
def dataPreprocessor(rating_df, num_users, num_items):
    """
        INPUT: 
            data: pandas DataFrame. columns=['userID', 'itemID', 'rating' ...]
            num_row: int. number of users
            num_col: int. number of items
            
        OUTPUT:
            matrix: 2D numpy array. 
            
        NOTE 1: see where something very similar is done in the lab in function 'buildUserItemMatrix'    
            
        NOTE 2: data can have more columns, but your function should ignore 
              additional columns.
    """
    ########### your code goes here ###########
    ### no change from lab ???? confirm
    # Initialize a of size (numUsers, numItems) to zeros
    matrix = np.zeros((num_users, num_items))
    
    # Populate the matrix based on the dataset
    for (index, userID, itemID, rating, timestamp) in rating_df.itertuples():
        matrix[userID-1, itemID-1] = rating
    
    
    ###########         end         ###########
    return matrix

In [9]:
dataPreprocessor(rating_df, num_users, num_items)

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

### (b)

In [172]:
class BaseLineRecSys(object):
    def __init__(self, method, processor=dataPreprocessor):
        """
            method: string. From ['popularity','useraverage']
            processor: function name. dataPreprocessor by default
        """
        self.method_name = method
        self.method = self._getMethod(self.method_name)
        self.processor = processor
        self.pred_column_name = self.method_name
        
    def _getMethod(self, method_name):
        """
            Don't change this
        """
        switcher = {
            'popularity': self.popularity,
            'useraverage': self.useraverage,
        }
        
        return switcher[method_name]
    
    @staticmethod
    def useraverage(train_matrix, num_users, num_items):
        """
            INPUT:
                train_matrix: 2D numpy array.
                num_users: int. Number of Users.
                num_items: int. Number of Items.
            OUTPUT:
                predictionMatrix: 2D numpy array.
                
            NOTE: see where something very similar is done in the lab in function 'predictByUserAverage'    
        """
        
        predictionMatrix = np.zeros((num_users, num_items))
        ########### your code goes here ###########
        # Initialize the predicted rating matrix with zeros
        for (user,item), rating in np.ndenumerate(train_matrix):
            # Predict rating for every item that wasn't ranked by the user (rating == 0)
          # if rating == 0:
            # Extract the items the user already rated
            userVector = train_matrix[user, :]
            ratedItems = userVector[userVector.nonzero()]

            # If not empty, calculate average and set as rating for the current item
            if ratedItems.size == 0:
                itemAvg = 0
            else:
                itemAvg = ratedItems.mean()
            predictionMatrix[user, item] = itemAvg
            
            #necessary???? REMOVE MAYBE?
            # report progress every 100 users 
           # if (user % 100 == 0 and item == 1):
            #    print ("calculated %d users" % (user,))


        ###########         end         ###########
        return predictionMatrix
    
    @staticmethod
    def popularity(train_matrix, num_users, num_items):
        """
            INPUT:
                train_matrix: 2D numpy array.
                num_users: int. Number of Users.
                num_items: int. Number of Items.
            OUTPUT:
                predictionMatrix: 2D numpy array.
                
            NOTE: see where something very similar is done in the lab in function 'predictByPopularity'    
        """
        
        predictionMatrix = np.zeros((num_users, num_items))
        ########### your code goes here ###########
        # Initialize the predicted rating matrix with zeros
        # Define function for converting 1-5 rating to 0/1 (like / don't like)
        vf = np.vectorize(lambda x: 1 if x >= 4 else 0)
        # For every item calculate the number of people liked (4-5) divided by the number of people that 
        itemPopularity = np.zeros((num_items))
        for item in range(num_items):
            numOfUsersRated = len(train_matrix[:, item].nonzero()[0])
            numOfUsersLiked = len(vf(train_matrix[:, item]).nonzero()[0])
            if numOfUsersRated == 0:
                itemPopularity[item] = 0
            else:
                itemPopularity[item] = numOfUsersLiked/numOfUsersRated

        for (user,item), rating in np.ndenumerate(train_matrix):
            # Predict rating for every item that wasn't ranked by the user (rating == 0)
           # if rating == 0:
            predictionMatrix[user, item] = itemPopularity[item]

            ##REMOVE???
            # report progress every 100 users
           # if (user % 100 == 0 and item == 1):
            #    print ("calculated %d users" % (user,))

                
        ###########         end         ###########
        #print(predictionMatrix)
        return predictionMatrix    
    
    def predict_all(self, train_df, num_users, num_items):
        
        train_matrix = self.processor(train_df, num_users, num_items)
        self.__model = self.method(train_matrix, num_users, num_items)
        
    def evaluate_test(self, test_df, copy=False):
        
        if copy:
            prediction = test_df.copy()
        else:
            prediction = test_df
            
        prediction[self.pred_column_name] = np.nan
        
        for (index, 
             userID, 
             itemID) in tqdm(prediction[['userID','itemID']].itertuples()):
            prediction.loc[index, self.pred_column_name] = self.__model[userID-1, itemID-1]

        return prediction
        
    def getModel(self):
        """
            return predicted user-item matrix
        """
        return self.__model
    
    def getPredColName(self):
        """
            return prediction column name
        """
        return self.pred_column_name
    
    def reset(self):
        """
            reuse the instance of the class by removing model
        """
        try:
            self.model = None
        except:
            print("You don not have model..")
            

In [13]:
popularity_recsys = BaseLineRecSys('popularity')

In [173]:
popularity_recsys.predict_all(rating_df, num_users, num_items)

[[0.71017699 0.38931298 0.37777778 ... 0.         0.         0.        ]
 [0.71017699 0.38931298 0.37777778 ... 0.         0.         0.        ]
 [0.71017699 0.38931298 0.37777778 ... 0.         0.         0.        ]
 ...
 [0.71017699 0.38931298 0.37777778 ... 0.         0.         0.        ]
 [0.71017699 0.38931298 0.37777778 ... 0.         0.         0.        ]
 [0.71017699 0.38931298 0.37777778 ... 0.         0.         0.        ]]


In [15]:
x = popularity_recsys.getModel()

In [16]:
np.all(x<=1)

True

In [17]:
rating_df.head()

Unnamed: 0,userID,itemID,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [18]:
popularity_recsys.evaluate_test(rating_df,copy=True).head()

100000it [01:30, 1107.22it/s]


Unnamed: 0,userID,itemID,rating,timestamp,popularity
0,196,242,3,881250949,0.760684
1,186,302,3,891717742,0.804714
2,22,377,1,878887116,0.076923
3,244,51,2,880606923,0.555556
4,166,346,1,886397596,0.611111


In [20]:
average_user_rating_recsys = BaseLineRecSys('useraverage')

In [21]:
average_user_rating_recsys.predict_all(rating_df, num_users, num_items)

In [22]:
average_user_rating_recsys.getModel()

array([[3.61029412, 3.61029412, 3.61029412, ..., 3.61029412, 3.61029412,
        3.61029412],
       [3.70967742, 3.70967742, 3.70967742, ..., 3.70967742, 3.70967742,
        3.70967742],
       [2.7962963 , 2.7962963 , 2.7962963 , ..., 2.7962963 , 2.7962963 ,
        2.7962963 ],
       ...,
       [4.04545455, 4.04545455, 4.04545455, ..., 4.04545455, 4.04545455,
        4.04545455],
       [4.26582278, 4.26582278, 4.26582278, ..., 4.26582278, 4.26582278,
        4.26582278],
       [3.41071429, 3.41071429, 3.41071429, ..., 3.41071429, 3.41071429,
        3.41071429]])

In [23]:
average_user_rating_recsys.evaluate_test(rating_df,copy=True).head()

100000it [01:25, 1171.04it/s]


Unnamed: 0,userID,itemID,rating,timestamp,useraverage
0,196,242,3,881250949,3.615385
1,186,302,3,891717742,3.413043
2,22,377,1,878887116,3.351562
3,244,51,2,880606923,3.651261
4,166,346,1,886397596,3.55


## Q2

### (a)

In [149]:
class SimBasedRecSys(object):

    def __init__(self, base, method, processor=dataPreprocessor):
        """
            base: string. From ['user', 'item']. User-based Similarity or Item-based
            method: string. From ['cosine', 'euclidean', 'somethingelse']
            processor: function name. dataPreprocessor by default
        """
        self.base = base
        self.method_name = method
        self.method = self._getMethod(self.method_name)
        self.processor = processor
        self.pred_column_name = self.base+'-'+self.method_name
    
    def _getMethod(self, method_name):
        """
            Don't change this
        """
        switcher = {
            'cosine': self.cosine,
            'euclidean': self.euclidean,
            'somethingelse': self.somethingelse,
        }
        
        return switcher[method_name]
    
    @staticmethod
    def cosine(matrix):
        """
            cosine similarity
        """
        similarity_matrix = 1 - pairwise_distances(matrix, metric='cosine')
        return similarity_matrix
    
    @staticmethod
    def euclidean(matrix):
        """
            euclidean similarity
        """
        ########### your code goes here ###########

        similarity_matrix = 1 /(1 + pairwise_distances(matrix, metric='euclidean'))
    
        ###########         end         ###########    
        
        return similarity_matrix
    
    @staticmethod
    def somethingelse(matrix): ##gotta fix this puppy up
        """
            manhattan? or super-natural intuition similarity
        """
        ########### your code goes here ###########
        
        similarity_matrix = 1/ (1 +pairwise_distances(matrix, metric='manhattan'))
    

    
    
        ###########         end         ###########        
        return similarity_matrix
        
    def predict_all(self, train_df, num_users, num_items):
        """
            INPUT: 
                data: pandas DataFrame. columns=['userID', 'itemID', 'rating'...]
                num_row: scalar. number of users
                num_col: scalar. number of items
            OUTPUT:
                no return... this method assigns the result to self.model
            
            NOTES:
                self.__model should contain predictions for *all* user and items
                (don't worry about predicting for observed (user,item) pairs,
                 since we won't be using these predictions in the evaluation)
                (see code in for an efficient vectorized example)
        """
        train_matrix = self.processor(train_df, num_users, num_items)
        
        if self.base == 'user':
            ########### your code goes here ###########
            # Initialize the predicted rating matrix with zeros
            temp_matrix = np.zeros(train_matrix.shape)
            temp_matrix[train_matrix.nonzero()] = 1
            temp_matrix[temp_matrix == 0] = 1e-5
            uu_similarity = self.method(train_matrix)
            #if k is not None:
            #    uu_similarity = kNearestNeighbor(uu_similarity, k)
            normalizer = np.matmul(uu_similarity, temp_matrix)
            normalizer[normalizer == 0] = 1e-5
            predictionMatrix = np.matmul(uu_similarity, train_matrix)/normalizer #matches key line in func above but vectorized bb
            #predictionMatrix[temp_matrix.nonzero()] = 0
            #Cold start

            useraverage = np.sum(train_matrix, axis=1)/np.sum(temp_matrix, axis=1)
            columns = np.sum(predictionMatrix, axis=0)
            predictionMatrix[:, columns==0] = predictionMatrix[:, columns==0] + np.expand_dims(useraverage, axis=1)
            self.__model = predictionMatrix

            ###########         end         ###########
            
        elif self.base == 'item':
            ########### your code goes here ###########
            train_matrixT = train_matrix.transpose()
            temp_matrix = np.zeros(train_matrixT.shape)
            temp_matrix[train_matrixT.nonzero()] = 1
            temp_matrix[temp_matrix == 0] = 1e-5
            
            ii_similarity = self.method(train_matrixT)
            #if k is not None: 
            #    ii_similarity = kNearestNeighbor(ii_similarity, k)
            normalizer = np.matmul(ii_similarity, temp_matrix)
            normalizer[normalizer == 0] = 1e-5
            predictionMatrix = np.matmul(ii_similarity, train_matrixT)/normalizer #matches key line in func above but vectorized bb
            #predictionMatrix[temp_matrix.nonzero()] = 0
            #Cold start

            itemaverage = np.sum(train_matrixT, axis=1)/np.sum(temp_matrix, axis=1)
            columns = np.sum(predictionMatrix, axis=0)
            predictionMatrix[:, columns==0] = predictionMatrix[:, columns==0] + np.expand_dims(itemaverage, axis=1)
            self.__model = predictionMatrix.transpose()
            
            
            
            
            

            ###########         end         ###########
        else:
            print('No other option available')
        
    def evaluate_test(self, test_df, copy=False):
        """
            INPUT:
                data: pandas DataFrame. columns=['userID', 'itemID', 'rating'...]
            OUTPUT:
                predictions:  pandas DataFrame. 
                              columns=['userID', 'itemID', 'rating', 'base-method'...]
                              
            NOTE: 1. data can have more columns, but your function should ignore 
                  additional columns.
                  2. 'base-method' depends on your 'base' and 'method'. For example,
                  if base == 'user' and method == 'cosine', 
                  then base-method == 'user-cosine'
                  3. your predictions go to 'base-method' column
        """
        if copy:
            prediction = test_df.copy()
        else:
            prediction = test_df
        prediction[self.pred_column_name] = np.nan
        
        for (index, 
             userID, 
             itemID) in tqdm(prediction[['userID','itemID']].itertuples()):
            prediction.loc[index, self.pred_column_name] = self.__model[userID-1, itemID-1]
    
        return prediction
    
    def getModel(self):
        """
            return predicted user-item matrix
        """
        return self.__model
    
    def getPredColName(self):
        """
            return prediction column name
        """
        return self.pred_column_name
    
    def reset(self):
        """
            reuse the instance of the class by removing model
        """
        try:
            self.model = None
        except:
            print("You do not have model..")

In [96]:
# Examples of how to call similarity functions.
I = np.eye(3)
SimBasedRecSys.cosine(I)

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [26]:
SimBasedRecSys.euclidean(I)

array([[1.        , 0.41421356, 0.41421356],
       [0.41421356, 1.        , 0.41421356],
       [0.41421356, 0.41421356, 1.        ]])

In [95]:
SimBasedRecSys.somethingelse(I)

array([[1.        , 0.33333333, 0.33333333],
       [0.33333333, 1.        , 0.33333333],
       [0.33333333, 0.33333333, 1.        ]])

cosine similarity works best, this is because it cares about the angles between the vectors, not the length of the vectors(normalized for length)

### (b)

I implemented manhattan (after trying jaccard to relative failure), I think this could be a good similarity metric because ( since it is same as city block or l1) because it handles sparseness, which in general exists in user ratings.


## Q3

### (a)

In [41]:
user_cosine_recsys = SimBasedRecSys('user','cosine')

In [29]:
user_cosine_recsys.predict_all(rating_df, num_users, num_items)

In [30]:
user_cosine_recsys.getModel()

array([[3.89911175, 3.19022667, 3.0261129 , ..., 2.        , 3.        ,
        3.        ],
       [3.84034456, 3.17139889, 2.92626717, ..., 2.        , 3.        ,
        3.        ],
       [3.87104065, 3.12823798, 3.03250708, ..., 2.        , 3.        ,
        3.        ],
       ...,
       [3.90754645, 3.20227238, 3.05776201, ..., 2.        , 3.        ,
        3.        ],
       [3.91100649, 3.21591021, 2.98854017, ..., 2.        , 3.        ,
        3.        ],
       [3.91593122, 3.24268207, 3.08255897, ..., 0.        , 3.        ,
        3.        ]])

In [31]:
rating_df.head()

Unnamed: 0,userID,itemID,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [32]:
user_cosine_recsys.evaluate_test(rating_df,copy=True).head()

100000it [01:26, 1153.40it/s]


Unnamed: 0,userID,itemID,rating,timestamp,user-cosine
0,196,242,3,881250949,4.025213
1,186,302,3,891717742,4.142828
2,22,377,1,878887116,1.92208
3,244,51,2,880606923,3.431884
4,166,346,1,886397596,3.424963


### (b)

In [69]:
class CrossValidation(object):
    def __init__(self, metric, data_path=MOVIELENS_DIR):
        """
            INPUT:
                metric: string. from['RMSE','P@K','R@K']
        """
        self.folds = self._getData(MOVIELENS_DIR)
        self.metric_name = metric
        self.metric = self._getMetric(self.metric_name)
        
    def _getMetric(self, metric_name):
        """
            Don't change this
        """
        switcher = {
            'RMSE': self.rmse,
            'P@K': self.patk,
            'R@K': self.ratk,
        }
        
        return switcher[metric_name]
    
    @staticmethod
    def rmse(data, k, num_users, num_items, pred, true='rating'):
        """
            data: pandas DataFrame. 
            pred: string. Column name that corresponding to the prediction
            true: string. Column name that corresponding to the true rating
        """
        return sqrt(mean_squared_error(data[pred], data[true]))
    
    # Precision at k
    def patk(self, data, k, num_users, num_items, pred, true='rating'):
        """
            data: pandas DataFrame. 
            k: top-k items retrived
            pred: string. Column name that corresponding to the prediction
            true: string. Column name that corresponding to the true rating
        """
        prediction = self.getMatrix(data, num_users, num_items, pred)
        testSet =  self.getMatrix(data, num_users, num_items, true)
    
        # Initialize sum and count vars for average calculation
        sumPrecisions = 0
        countPrecisions = 0

        # Define function for converting 1-5 rating to 0/1 (like / don't like)
        vf = np.vectorize(lambda x: 1 if x >= 4 else 0)

        for userID in range(num_users):
            # Pick top K based on predicted rating
            userVector = prediction[userID,:]
            topK = nlargest(k, range(len(userVector)), userVector.take)

            # Convert test set ratings to like / don't like
            userTestVector = vf(testSet[userID,:]).nonzero()[0]

            # Calculate precision
            precision = float(len([item for item in topK if item in userTestVector]))/len(topK)

            # Update sum and count
            sumPrecisions += precision
            countPrecisions += 1

        # Return average P@k
        return float(sumPrecisions)/countPrecisions
    
    # Recall at k
    def ratk(self, data, k, num_users, num_items, pred, true='rating'):
        """
            data: pandas DataFrame. 
            k: top-k items relevant
            pred: string. Column name that corresponding to the prediction
            true: string. Column name that corresponding to the true rating
        """
        prediction = self.getMatrix(data, num_users, num_items, pred)
        testSet =  self.getMatrix(data, num_users, num_items, true)
        # Initialize sum and count vars for average calculation
        sumRecalls = 0
        countRecalls = 0

        # Define function for converting 1-5 rating to 0/1 (like / don't like)
        vf = np.vectorize(lambda x: 1 if x >= 4 else 0)

        for userID in range(num_users):
            # Pick top K based on predicted rating
            userVector = prediction[userID,:]
            topK = nlargest(k, range(len(userVector)), userVector.take)

            # Convert test set ratings to like / don't like
            userTestVector = vf(testSet[userID,:]).nonzero()[0]

            # Ignore user if has no ratings in the test set
            if (len(userTestVector) == 0):
                continue

            # Calculate recall
            recall = float(len([item for item in topK if item in userTestVector]))/len(userTestVector)

            # Update sum and count
            sumRecalls += recall
            countRecalls += 1

        # Return average R@k
        return float(sumRecalls)/countRecalls
    
    @staticmethod
    def getMatrix(rating_df, num_users, num_items, column_name):
        matrix = np.zeros((num_users, num_items))
    
        for (index, userID, itemID, value) in rating_df[['userID','itemID', column_name]].itertuples():
            matrix[userID-1, itemID-1] = value
            
        return matrix
    
    @staticmethod
    def _getData(data_path):
        """
            Don't change this function
        """
        folds = []
        data_types = ['u{0}.base','u{0}.test']
        for i in range(1,6):
            train_set = getData(data_path, data_types[0].format(i))
            test_set = getData(data_path, data_types[1].format(i))
            folds.append([train_set, test_set])
        return folds
    
    def run(self, algorithms, num_users, num_items, k=1):
        """
            5-fold cross-validation
            algorithms: list. a list of algorithms. 
                        eg: [user_cosine_recsys, item_euclidean_recsys]
        """
        
        scores = {}
        for algorithm in algorithms:
            print('Processing algorithm {0}'.format(algorithm.getPredColName()))
            fold_scores = []
            for fold in self.folds:
                algorithm.reset()
                algorithm.predict_all(fold[0], num_users, num_items)
                prediction = algorithm.evaluate_test(fold[1])
                pred_col = algorithm.getPredColName()
                fold_scores.append(self.metric(prediction, k, num_users, num_items, pred_col))
                
            mean = np.mean(fold_scores)
            ci_low, ci_high = stats.t.interval(0.95, len(fold_scores)-1, loc=mean, scale=stats.sem(fold_scores))
            scores[algorithm.getPredColName()] = [fold_scores, mean, ci_low, ci_high]
            
        results = scores    
    
        return results
            

In [31]:
# How to use CrossValidation Class?

In [34]:
# 1. gather your algorithms in previous steps.
#algorithm_instances = [popularity_recsys, 
#                       average_user_rating_recsys, 
#                       user_cosine_recsys]

In [38]:
# 2. Instantiate a CrossValidation instance and assign the measurement that you want to use
# RMSE, P@K, R@K
# Precision at K in this example
cv_patk = CrossValidation('P@K')

In [170]:
# 3. Run CV by giving:
#    1> algorithms just gathered
#    2> number of users in the full dataset
#    3> number of items in the full dataset
#    4> precision or recall at K need a K value, so k=5 means precision at 5 in this example
# Results include independent results from 5 folds, their mean, and confidence interval.
#cv_patk.run(algorithm_instances, num_users, num_items,k=5)

In [52]:
item_cosine_recsys = SimBasedRecSys('item','cosine')

In [53]:
item_cosine_recsys.predict_all(rating_df, num_users, num_items)

In [54]:
item_cosine_recsys.getModel()

array([[3.75429099, 3.66419957, 3.73222997, ..., 3.60248287, 3.79662696,
        3.90232044],
       [3.83658867, 3.80424519, 3.77473905, ..., 3.72798332, 3.9109779 ,
        3.79775927],
       [2.84492718, 2.89389328, 2.84327324, ..., 2.99504451, 3.16444153,
        2.9858119 ],
       ...,
       [4.11427954, 4.0558267 , 4.00963139, ..., 4.        , 3.87872799,
        4.14814803],
       [4.37096823, 4.39679254, 4.33543016, ..., 3.955358  , 4.41891089,
        4.57995134],
       [3.52030345, 3.46948821, 3.52393064, ..., 0.        , 3.6110641 ,
        3.59656861]])

In [55]:
rating_df.head()

Unnamed: 0,userID,itemID,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [56]:
item_cosine_recsys.evaluate_test(rating_df,copy=True).head()

100000it [01:32, 1079.25it/s]


Unnamed: 0,userID,itemID,rating,timestamp,item-cosine
0,196,242,3,881250949,3.591314
1,186,302,3,891717742,3.344077
2,22,377,1,878887116,2.965365
3,244,51,2,880606923,3.637332
4,166,346,1,886397596,3.333013


In [59]:
algorithm_instances = [item_cosine_recsys,  
                       user_cosine_recsys]

In [60]:
cv_rmse = CrossValidation('RMSE')

In [61]:
cv_rmse.run(algorithm_instances, num_users, num_items)

Processing algorithm item-cosine


20000it [00:14, 1405.28it/s]
20000it [00:13, 1429.90it/s]
20000it [00:14, 1427.45it/s]
20000it [00:14, 1403.90it/s]
20000it [00:14, 1396.94it/s]


Processing algorithm user-cosine


20000it [00:13, 1434.82it/s]
20000it [00:14, 1426.33it/s]
20000it [00:14, 1404.10it/s]
20000it [00:14, 1382.93it/s]
20000it [00:13, 1446.37it/s]


{'item-cosine': [[1.0377631264364244,
   1.0207280585350078,
   1.0101820660011798,
   1.0136832839209695,
   1.0180579656376574],
  1.020082900106248,
  1.0068242686250732,
  1.0333415315874226],
 'user-cosine': [[1.026449013124381,
   1.0214387664779507,
   1.0132940326457187,
   1.0094003999022947,
   1.0161883961525586],
  1.0173541216605808,
  1.009013080226148,
  1.0256951630950135]}

3b) There is no statistically significant better performer, however user-user similarity does perform slightly better.  For this case, if we to run more (k>5) cross validations, we would expect it to be statistically better, because in this data set we have on average 106 ratings per user and only 59 average ratings per item.  Therefore, we have more information about the users than the items.  

## Q4

### (a)

In [63]:
algorithm_instances = [popularity_recsys, 
                       average_user_rating_recsys, 
                       user_cosine_recsys,
                       item_cosine_recsys]

In [65]:
cv_ratk = CrossValidation('R@K')

In [66]:
cv_patk.run(algorithm_instances, num_users, num_items,k=5)

Processing algorithm popularity
[[0.71018277 0.38095238 0.37333333 ... 0.         0.         0.        ]
 [0.71018277 0.38095238 0.37333333 ... 0.         0.         0.        ]
 [0.71018277 0.38095238 0.37333333 ... 0.         0.         0.        ]
 ...
 [0.71018277 0.38095238 0.37333333 ... 0.         0.         0.        ]
 [0.71018277 0.38095238 0.37333333 ... 0.         0.         0.        ]
 [0.71018277 0.38095238 0.37333333 ... 0.         0.         0.        ]]


20000it [00:14, 1408.65it/s]


[[0.71787709 0.40186916 0.36923077 ... 0.         0.         0.        ]
 [0.71787709 0.40186916 0.36923077 ... 0.         0.         0.        ]
 [0.71787709 0.40186916 0.36923077 ... 0.         0.         0.        ]
 ...
 [0.71787709 0.40186916 0.36923077 ... 0.         0.         0.        ]
 [0.71787709 0.40186916 0.36923077 ... 0.         0.         0.        ]
 [0.71787709 0.40186916 0.36923077 ... 0.         0.         0.        ]]


20000it [00:14, 1419.75it/s]


[[0.70655271 0.39449541 0.42666667 ... 0.         0.         0.        ]
 [0.70655271 0.39449541 0.42666667 ... 0.         0.         0.        ]
 [0.70655271 0.39449541 0.42666667 ... 0.         0.         0.        ]
 ...
 [0.70655271 0.39449541 0.42666667 ... 0.         0.         0.        ]
 [0.70655271 0.39449541 0.42666667 ... 0.         0.         0.        ]
 [0.70655271 0.39449541 0.42666667 ... 0.         0.         0.        ]]


20000it [00:13, 1504.21it/s]


[[0.71745152 0.38613861 0.375      ... 0.         0.         0.        ]
 [0.71745152 0.38613861 0.375      ... 0.         0.         0.        ]
 [0.71745152 0.38613861 0.375      ... 0.         0.         0.        ]
 ...
 [0.71745152 0.38613861 0.375      ... 0.         0.         0.        ]
 [0.71745152 0.38613861 0.375      ... 0.         0.         0.        ]
 [0.71745152 0.38613861 0.375      ... 0.         0.         0.        ]]


20000it [00:13, 1505.34it/s]


[[0.69859155 0.38235294 0.34246575 ... 0.         0.         0.        ]
 [0.69859155 0.38235294 0.34246575 ... 0.         0.         0.        ]
 [0.69859155 0.38235294 0.34246575 ... 0.         0.         0.        ]
 ...
 [0.69859155 0.38235294 0.34246575 ... 0.         0.         0.        ]
 [0.69859155 0.38235294 0.34246575 ... 0.         0.         0.        ]
 [0.69859155 0.38235294 0.34246575 ... 0.         0.         0.        ]]


20000it [00:13, 1518.37it/s]


Processing algorithm useraverage


20000it [00:13, 1435.03it/s]
20000it [00:14, 1399.19it/s]
20000it [00:14, 1420.96it/s]
20000it [00:14, 1397.92it/s]
20000it [00:16, 1222.65it/s]


Processing algorithm user-cosine


20000it [00:16, 1214.04it/s]
20000it [00:16, 1196.61it/s]
20000it [00:17, 1125.82it/s]
20000it [00:17, 1175.16it/s]
20000it [00:13, 1471.68it/s]


Processing algorithm item-cosine


20000it [00:13, 1481.16it/s]
20000it [00:14, 1402.63it/s]
20000it [00:15, 1285.02it/s]
20000it [00:14, 1352.72it/s]
20000it [00:15, 1311.91it/s]


{'item-cosine': [[0.34316012725344736,
   0.483563096500532,
   0.6021208907741271,
   0.6248144220572649,
   0.6074231177094392],
  0.5322163308589621,
  0.3837005215009889,
  0.6807321402169354],
 'popularity': [[0.36924708377518656,
   0.4965005302226948,
   0.6152704135737019,
   0.6426299045599162,
   0.6292682926829279],
  0.5505832449628855,
  0.40544114481568705,
  0.6957253451100839],
 'user-cosine': [[0.37179215270413657,
   0.503923647932133,
   0.621633085896077,
   0.6483563096500541,
   0.6335100742311777],
  0.5558430540827157,
  0.40959849499983714,
  0.7020876131655943],
 'useraverage': [[0.30604453870625714,
   0.4305408271474029,
   0.5321314952279973,
   0.5520678685047737,
   0.5474019088016986],
  0.4736373276776259,
  0.3419993013451059,
  0.6052753540101459]}

In [67]:
cv_ratk.run(algorithm_instances, num_users, num_items,k=5)

Processing algorithm popularity
[[0.71018277 0.38095238 0.37333333 ... 0.         0.         0.        ]
 [0.71018277 0.38095238 0.37333333 ... 0.         0.         0.        ]
 [0.71018277 0.38095238 0.37333333 ... 0.         0.         0.        ]
 ...
 [0.71018277 0.38095238 0.37333333 ... 0.         0.         0.        ]
 [0.71018277 0.38095238 0.37333333 ... 0.         0.         0.        ]
 [0.71018277 0.38095238 0.37333333 ... 0.         0.         0.        ]]


20000it [00:14, 1404.69it/s]


[[0.71787709 0.40186916 0.36923077 ... 0.         0.         0.        ]
 [0.71787709 0.40186916 0.36923077 ... 0.         0.         0.        ]
 [0.71787709 0.40186916 0.36923077 ... 0.         0.         0.        ]
 ...
 [0.71787709 0.40186916 0.36923077 ... 0.         0.         0.        ]
 [0.71787709 0.40186916 0.36923077 ... 0.         0.         0.        ]
 [0.71787709 0.40186916 0.36923077 ... 0.         0.         0.        ]]


20000it [00:14, 1424.40it/s]


[[0.70655271 0.39449541 0.42666667 ... 0.         0.         0.        ]
 [0.70655271 0.39449541 0.42666667 ... 0.         0.         0.        ]
 [0.70655271 0.39449541 0.42666667 ... 0.         0.         0.        ]
 ...
 [0.70655271 0.39449541 0.42666667 ... 0.         0.         0.        ]
 [0.70655271 0.39449541 0.42666667 ... 0.         0.         0.        ]
 [0.70655271 0.39449541 0.42666667 ... 0.         0.         0.        ]]


20000it [00:14, 1412.93it/s]


[[0.71745152 0.38613861 0.375      ... 0.         0.         0.        ]
 [0.71745152 0.38613861 0.375      ... 0.         0.         0.        ]
 [0.71745152 0.38613861 0.375      ... 0.         0.         0.        ]
 ...
 [0.71745152 0.38613861 0.375      ... 0.         0.         0.        ]
 [0.71745152 0.38613861 0.375      ... 0.         0.         0.        ]
 [0.71745152 0.38613861 0.375      ... 0.         0.         0.        ]]


20000it [00:14, 1400.95it/s]


[[0.69859155 0.38235294 0.34246575 ... 0.         0.         0.        ]
 [0.69859155 0.38235294 0.34246575 ... 0.         0.         0.        ]
 [0.69859155 0.38235294 0.34246575 ... 0.         0.         0.        ]
 ...
 [0.69859155 0.38235294 0.34246575 ... 0.         0.         0.        ]
 [0.69859155 0.38235294 0.34246575 ... 0.         0.         0.        ]
 [0.69859155 0.38235294 0.34246575 ... 0.         0.         0.        ]]


20000it [00:13, 1460.09it/s]


Processing algorithm useraverage


20000it [00:14, 1419.14it/s]
20000it [00:13, 1448.86it/s]
20000it [00:13, 1514.00it/s]
20000it [00:14, 1376.94it/s]
20000it [00:14, 1378.26it/s]


Processing algorithm user-cosine


20000it [00:14, 1405.19it/s]
20000it [00:14, 1401.54it/s]
20000it [00:16, 1229.26it/s]
20000it [00:16, 1239.08it/s]
20000it [00:15, 1255.73it/s]


Processing algorithm item-cosine


20000it [00:15, 1318.57it/s]
20000it [00:15, 1325.38it/s]
20000it [00:15, 1305.40it/s]
20000it [00:15, 1332.62it/s]
20000it [00:15, 1270.08it/s]


{'item-cosine': [[0.3277711938444533,
   0.4237782250680911,
   0.5191391022223312,
   0.5448659224612776,
   0.5593011306991799],
  0.4749711148590666,
  0.35357317503649865,
  0.5963690546816346],
 'popularity': [[0.3466588624187514,
   0.4274468698270901,
   0.5269205125667804,
   0.5518738761026849,
   0.5674793185065369],
  0.4840758878843688,
  0.3671373629798323,
  0.6010144127889052],
 'user-cosine': [[0.34778041993806913,
   0.4314035774468209,
   0.5293633772333985,
   0.5553818201403046,
   0.5674144230096255],
  0.4862687235536437,
  0.3694473610987218,
  0.6030900860085656],
 'useraverage': [[0.30505841002027845,
   0.39554692074366876,
   0.48030412192442223,
   0.5045885853815734,
   0.5211179870422066],
  0.44132320502242983,
  0.32931026359142457,
  0.5533361464534351]}

In [68]:
cv_rmse.run(algorithm_instances, num_users, num_items)

Processing algorithm popularity
[[0.71018277 0.38095238 0.37333333 ... 0.         0.         0.        ]
 [0.71018277 0.38095238 0.37333333 ... 0.         0.         0.        ]
 [0.71018277 0.38095238 0.37333333 ... 0.         0.         0.        ]
 ...
 [0.71018277 0.38095238 0.37333333 ... 0.         0.         0.        ]
 [0.71018277 0.38095238 0.37333333 ... 0.         0.         0.        ]
 [0.71018277 0.38095238 0.37333333 ... 0.         0.         0.        ]]


20000it [00:17, 1165.91it/s]


[[0.71787709 0.40186916 0.36923077 ... 0.         0.         0.        ]
 [0.71787709 0.40186916 0.36923077 ... 0.         0.         0.        ]
 [0.71787709 0.40186916 0.36923077 ... 0.         0.         0.        ]
 ...
 [0.71787709 0.40186916 0.36923077 ... 0.         0.         0.        ]
 [0.71787709 0.40186916 0.36923077 ... 0.         0.         0.        ]
 [0.71787709 0.40186916 0.36923077 ... 0.         0.         0.        ]]


20000it [00:16, 1244.48it/s]


[[0.70655271 0.39449541 0.42666667 ... 0.         0.         0.        ]
 [0.70655271 0.39449541 0.42666667 ... 0.         0.         0.        ]
 [0.70655271 0.39449541 0.42666667 ... 0.         0.         0.        ]
 ...
 [0.70655271 0.39449541 0.42666667 ... 0.         0.         0.        ]
 [0.70655271 0.39449541 0.42666667 ... 0.         0.         0.        ]
 [0.70655271 0.39449541 0.42666667 ... 0.         0.         0.        ]]


20000it [00:16, 1200.05it/s]


[[0.71745152 0.38613861 0.375      ... 0.         0.         0.        ]
 [0.71745152 0.38613861 0.375      ... 0.         0.         0.        ]
 [0.71745152 0.38613861 0.375      ... 0.         0.         0.        ]
 ...
 [0.71745152 0.38613861 0.375      ... 0.         0.         0.        ]
 [0.71745152 0.38613861 0.375      ... 0.         0.         0.        ]
 [0.71745152 0.38613861 0.375      ... 0.         0.         0.        ]]


20000it [00:14, 1374.38it/s]


[[0.69859155 0.38235294 0.34246575 ... 0.         0.         0.        ]
 [0.69859155 0.38235294 0.34246575 ... 0.         0.         0.        ]
 [0.69859155 0.38235294 0.34246575 ... 0.         0.         0.        ]
 ...
 [0.69859155 0.38235294 0.34246575 ... 0.         0.         0.        ]
 [0.69859155 0.38235294 0.34246575 ... 0.         0.         0.        ]
 [0.69859155 0.38235294 0.34246575 ... 0.         0.         0.        ]]


20000it [00:14, 1374.85it/s]


Processing algorithm useraverage


20000it [00:15, 1267.19it/s]
20000it [00:14, 1334.58it/s]
20000it [00:14, 1384.94it/s]
20000it [00:14, 1366.21it/s]
20000it [00:14, 1391.50it/s]


Processing algorithm user-cosine


20000it [00:14, 1406.07it/s]
20000it [00:14, 1399.97it/s]
20000it [00:14, 1394.51it/s]
20000it [00:14, 1384.66it/s]
20000it [00:14, 1371.18it/s]


Processing algorithm item-cosine


20000it [00:14, 1384.85it/s]
20000it [00:14, 1395.58it/s]
20000it [00:14, 1387.35it/s]
20000it [00:14, 1391.98it/s]
20000it [00:14, 1386.58it/s]


{'item-cosine': [[1.0377631264364244,
   1.0207280585350078,
   1.0101820660011798,
   1.0136832839209695,
   1.0180579656376574],
  1.020082900106248,
  1.0068242686250732,
  1.0333415315874226],
 'popularity': [[3.177941281084362,
   3.1750480150769977,
   3.147474655005899,
   3.146164503024159,
   3.1488360007536382],
  3.1590928909890112,
  3.139292746995387,
  3.1788930349826354],
 'user-cosine': [[1.026449013124381,
   1.0214387664779507,
   1.0132940326457187,
   1.0094003999022947,
   1.0161883961525586],
  1.0173541216605808,
  1.009013080226148,
  1.0256951630950135],
 'useraverage': [[1.0629951276561334,
   1.0467467492319966,
   1.0328964562995389,
   1.0366575971298078,
   1.0392923504800367],
  1.0437176561595025,
  1.0289303496379316,
  1.0585049626810734]}

| P@K |  Mean   |  CI   |
|------|------|-------|
|  ii-sim  | 0.53 | [0.38,0.68] |
|  uu-sim  | 0.56 | [0.41,0.70] |
|  user avg  | 0.47 | [0.34,0.60] |
|  pop  | 0.55 | [0.41,0.70] |

| R@K |  Mean   |  CI   |
|------|------|-------|
|  ii-sim  | 0.47 | [0.35,0.59] |
|  uu-sim  | 0.49 | [0.37,0.60] |
|  user avg  | 0.44 | [0.33,0.55] |
|  pop  | 0.48 | [0.37,0.60] |

| RMSE |  Mean   |  CI   |
|------|------|-------|
|  ii-sim  | 1.020 | [1.007,1.033] |
|  uu-sim  | 1.017 | [1.009,1.026] |
|  user avg  | 1.044 | [1.029,1.059] |
|  pop  | 3.159 | [3.139,3.179] |

  

### (b)

popularity can not necessarily be evaluated based on RMSE.  This  is because popularity cares about the ranking of a movie (do more users like it than dislike it), whereas the other metrics create a rating that is based around the ratings of other users (either by average, or similarity).  This makes RMSE which is evaluating the distance from the actual rating to the predicted rating not very useful, as the popularity is calculated using a different scale.  

### (c)

RMSE: uu-sim - this is bc it is mean centric (probability based on average of users around you unlike pop), and has most descriptive & personalized information (more rankings users than items, more personalized than user average). 
R@K: uu-sim - same as above
P@K: uu-sim - same as above
 

### (d)

good performance on RMSE implies good performance on ranking metrics bc, if RMSE is low, that means the predictions have low deviation from the true ratings, and therefore you are likely to have a similar ranking order (since you rank by rating).  Hi ranking results however do not imply good RSME, because you may be able to get the top 5 results correct (have correct ranking), but your ratings could be far off from the true ratings (ex if true top 3 have these IDs and ratings: 1:5, 2:4.9, 3:4.5, and predicted top 3 have: 1:.9, 2:.85, 3:.8, the ranking is correct, but the ratings are hugely different from the true ratings).

## Q5

### (a)

In [107]:
#def userTopK(prediction, moviesDataset, userID, k):
#    # Pick top K based on predicted rating
#    userVector = prediction[userID+1,:]
#    topK = nlargest(k, range(len(userVector)), userVector.take)
#    namesTopK = list(map(lambda x: moviesDataset[moviesDataset.movieID == x+1]["movieTitle"].values[0], topK))
 #   return namesTopK

In [169]:
fieldsMovies = ['movieID', 'movieTitle', 'releaseDate', 'videoReleaseDate', 'IMDbURL', 'unknown', 'action', 'adventure',
          'animation', 'childrens', 'comedy', 'crime', 'documentary', 'drama', 'fantasy', 'filmNoir', 'horror',
          'musical', 'mystery', 'romance','sciFi', 'thriller', 'war', 'western']
moviesDF = pd.read_csv(os.path.join(MOVIELENS_DIR, 'u.item'), sep='|', names=fieldsMovies, encoding='latin-1')


#moviesDF.head()

In [125]:
#userTopK(user_cosine_recsys.getModel(), moviesDF, 144, 70)


In [None]:
#'Princess Bride, The (1987)'
#'Wizard of Oz, The (1939)'
#'Rear Window (1954)'

In [162]:
def itemTopK(prediction, moviesDataset, movieTitle, k):
    # Pick top K based on predicted rating
    index = moviesDF.index[moviesDF['movieTitle'] == movieTitle].values
    itemVector = prediction[:, index]
    topK = nlargest(k+1, range(len(itemVector)), itemVector.take)
    topK = topK[1:k+1]
    namesTopK = list(map(lambda x: moviesDataset[moviesDataset.movieID == x+1]["movieTitle"].values[0], topK))
    return namesTopK

In [136]:
item_cosine_recsys2 = SimBasedRecSys('item','cosine')

In [138]:
item_cosine_recsys2.cosine(rating_df.transpose())

array([[1.        , 0.68717082, 0.82397582, 0.86639358],
       [0.68717082, 1.        , 0.71691882, 0.78964707],
       [0.82397582, 0.71691882, 1.        , 0.95268874],
       [0.86639358, 0.78964707, 0.95268874, 1.        ]])

In [167]:
#itemTopK(item_cosine_recsys.getModel(), moviesDF, 'Princess Bride, The (1987)', 5)

In [168]:
train_matrix2 = item_cosine_recsys2.processor(rating_df, num_users, num_items)

In [164]:
itemTopK(item_cosine_recsys2.cosine(train_matrix2.transpose()), moviesDF, 'Princess Bride, The (1987)', 5)

1682


['Raiders of the Lost Ark (1981)',
 'Empire Strikes Back, The (1980)',
 'Monty Python and the Holy Grail (1974)',
 'Back to the Future (1985)',
 'Indiana Jones and the Last Crusade (1989)']

In [165]:
itemTopK(item_cosine_recsys2.cosine(train_matrix.transpose()), moviesDF, 'Wizard of Oz, The (1939)', 5)

1682


['E.T. the Extra-Terrestrial (1982)',
 'Forrest Gump (1994)',
 'Gone with the Wind (1939)',
 "It's a Wonderful Life (1946)",
 'Raiders of the Lost Ark (1981)']

In [166]:
itemTopK(item_cosine_recsys2.cosine(train_matrix.transpose()), moviesDF, 'Rear Window (1954)', 5)

1682


['Vertigo (1958)',
 'Psycho (1960)',
 'Casablanca (1942)',
 'Sting, The (1973)',
 'Graduate, The (1967)']

Incase the autograder destroys me:
movie: princess bride |
similar movies: ['Raiders of the Lost Ark (1981)',
 'Empire Strikes Back, The (1980)',
 'Monty Python and the Holy Grail (1974)',
 'Back to the Future (1985)',
 'Indiana Jones and the Last Crusade (1989)']
    movie: wizard of oz |
similar movies: ['E.T. the Extra-Terrestrial (1982)',
 'Forrest Gump (1994)',
 'Gone with the Wind (1939)',
 "It's a Wonderful Life (1946)",
 'Raiders of the Lost Ark (1981)']
     movie: rear window |
 similar movies:['Vertigo (1958)',
 'Psycho (1960)',
 'Casablanca (1942)',
 'Sting, The (1973)',
 'Graduate, The (1967)']


### (b)

HELL YES I CAN JUSTIFY THE SIMILARITIES, just look at them ^^^^^^^^^
-for each movie, the related titles follow either the era, genre, director, etc.  it makes sense that these titles appear bc it shows that users tend to watch and rate similar movies similarly BOOM this question took me WAY longer than it shouldve you sneaky cosine similarity you

## Q6 [GRAD ONLY]

### (a)

# Validation

In [174]:
# Constants for validation only
ROW_NUM = 943
COL_NUM = 1682
RATING_COL = 'rating'

### dataPreprocessor

In [175]:
def validateDataPreprocessor(path=MOVIELENS_DIR, getData=getData, getMatrix=CrossValidation.getMatrix):
    validation_df = getData(MOVIELENS_DIR, 'u1.test')
    try:
        matrix = getMatrix(validation_df, ROW_NUM, COL_NUM, RATING_COL)
    except:
        print('dataPreprocessor function has error')
        return
    try:
        assert(matrix.shape == (ROW_NUM,COL_NUM)),\
        "Shape of matrix{0} doesn't match predefined shape (943,1682)".format(matrix.shape)
    except Exception as e:
        print(e)
    return validation_df

In [176]:
validation_df = validateDataPreprocessor()

## Baseline Recommendation Systems

### Popularity Based Recommendation

In [73]:
def validatePopularityRecSys(validation_df=validation_df, BaseLineRecSys = BaseLineRecSys):
    popularity_recsys = BaseLineRecSys('popularity')
    try:
        popularity_recsys.predict_all(validation_df, ROW_NUM, COL_NUM)
    except Exception as e:        
        print('popularity function has error')
        print(e)
        return
    try:
        predictionMatrix = popularity_recsys.getModel()
        assert(predictionMatrix.shape == (ROW_NUM, COL_NUM)),\
        "Shape of matrix{0} doesn't match predefined shape ({1},{2})"\
        .format(predictionMatrix.shape,ROW_NUM, COL_NUM)
    except Exception as e:
        print(e)

In [177]:
validatePopularityRecSys()

[[0.71014493 0.42307692 0.4        ... 0.         0.         0.        ]
 [0.71014493 0.42307692 0.4        ... 0.         0.         0.        ]
 [0.71014493 0.42307692 0.4        ... 0.         0.         0.        ]
 ...
 [0.71014493 0.42307692 0.4        ... 0.         0.         0.        ]
 [0.71014493 0.42307692 0.4        ... 0.         0.         0.        ]
 [0.71014493 0.42307692 0.4        ... 0.         0.         0.        ]]


### User Average Based Recommendation

In [75]:
def validateUserAverRecSys(validation_df=validation_df, BaseLineRecSys = BaseLineRecSys):
    useraverage_recsys = BaseLineRecSys('average_user_rating')
    try:
        useraverage_recsys.predict_all(validation_df, ROW_NUM, COL_NUM)
    except:
        print('useraverage function has error')
        return
    try:
        predictionMatrix = useraverage_recsys.getModel()
        assert(predictionMatrix.shape == (ROW_NUM, COL_NUM)),\
        "Shape of matrix{0} doesn't match predefined shape ({1},{2})"\
        .format(predictionMatrix.shape,ROW_NUM, COL_NUM)
    except Exception as e:
        print(e)

In [178]:
validatePopularityRecSys()

[[0.71014493 0.42307692 0.4        ... 0.         0.         0.        ]
 [0.71014493 0.42307692 0.4        ... 0.         0.         0.        ]
 [0.71014493 0.42307692 0.4        ... 0.         0.         0.        ]
 ...
 [0.71014493 0.42307692 0.4        ... 0.         0.         0.        ]
 [0.71014493 0.42307692 0.4        ... 0.         0.         0.        ]
 [0.71014493 0.42307692 0.4        ... 0.         0.         0.        ]]


## Similary Based Recommendation Systems

### Euclidean Similarity Function

In [78]:
def validateEuclidean(validation_df=validation_df, getMatrix=CrossValidation.getMatrix):
    matrix = getMatrix(validation_df, ROW_NUM, COL_NUM, RATING_COL)
    try:
        sim_matrix = SimBasedRecSys.euclidean(matrix)
        assert(sim_matrix.shape == (ROW_NUM, ROW_NUM)),\
        "Shape of matrix{0} doesn't match predefined shape ({1},{2})"\
        .format(sim_matrix.shape,ROW_NUM,ROW_NUM)
        assert(np.any(sim_matrix <= 1)),\
               "Exist similarity value that is not less or equal to 1."
    except Exception as e:
        print(e)        

In [179]:
validateEuclidean()

### Customized Similarity Function (test somethingelse function)

In [80]:
def validateCustomizedSim(validation_df=validation_df, getMatrix=CrossValidation.getMatrix):
    matrix = getMatrix(validation_df, ROW_NUM, COL_NUM, RATING_COL)
    try:
        sim_matrix = SimBasedRecSys.somethingelse(matrix)
        assert(sim_matrix.shape == (ROW_NUM, ROW_NUM)),\
        "Shape of matrix{0} doesn't match predefined shape ({1},{2})"\
        .format(sim_matrix.shape,ROW_NUM,ROW_NUM)
        assert(np.any(sim_matrix <= 1)),\
               "Exist similarity value that is not less or equal to 1."
    except Exception as e:
        print(e) 

In [180]:
validateCustomizedSim()

### User-User Similarity Based Recommendation System

In [82]:
def validateUUSimBasedRecSys(validation_df=validation_df, dataPreprocessor=dataPreprocessor):
    try:
        user_cosine_recsys = SimBasedRecSys('user','cosine', dataPreprocessor)
    except:
        print("Framework error, please make sure you are using given yml file.")
        return
    
    try:
        user_cosine_recsys.predict_all(validation_df, ROW_NUM, COL_NUM)
        predictionMatrix = user_cosine_recsys.getModel()
        assert(predictionMatrix.shape == (ROW_NUM, COL_NUM)),\
        "Shape of matrix{0} doesn't match predefined shape ({1},{2})"\
        .format(predictionMatrix.shape,ROW_NUM, COL_NUM)
    except Exception as e:
        print(e)

In [181]:
validateUUSimBasedRecSys()


### Item-Item Similarity Based Recommendation System

In [84]:
def validateIISimBasedRecSys(validation_df=validation_df, dataPreprocessor=dataPreprocessor):
    try:
        user_cosine_recsys = SimBasedRecSys('item','cosine', dataPreprocessor)
    except:
        print("Framework error, please make sure you are using given yml file.")
        return
    
    try:
        user_cosine_recsys.predict_all(validation_df, ROW_NUM, COL_NUM)
        predictionMatrix = user_cosine_recsys.getModel()
        assert(predictionMatrix.shape == (ROW_NUM, COL_NUM)),\
        "Shape of matrix{0} doesn't match predefined shape ({1},{2})"\
        .format(predictionMatrix.shape,ROW_NUM, COL_NUM)
    except Exception as e:
        print(e)

In [182]:
validateIISimBasedRecSys()