# Lab 8: Recommender System

In this assignment, we will study how to do user-based collaborative filtering and item-based collaborative filtering. 

## 1. Dataset

In this assignment, we will use MovieLens-100K dataset. It includes about 100,000 ratings from 1000 users on 1700 movies.  

In [5]:
from math import sqrt
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.metrics.pairwise import linear_kernel
from sklearn.neighbors import NearestNeighbors


# 1. load data
user_ratings_train = pd.read_csv('./ml-100k/u1.base',
                            sep='\t',names=['user_id','movie_id','rating'], usecols=[0,1,2])

user_ratings_test = pd.read_csv('./ml-100k/u1.test',
                            sep='\t',names=['user_id','movie_id','rating'], usecols=[0,1,2])

movie_info =  pd.read_csv('./ml-100k/u.item', 
                          sep='|', names=['movie_id','title'], usecols=[0,1],
                          encoding="ISO-8859-1")

user_ratings_train = pd.merge(movie_info, user_ratings_train)
user_ratings_test = pd.merge(movie_info, user_ratings_test)

# 2. get the rating matrix. Each row is a user, and each column is a movie.
user_ratings_train = user_ratings_train.pivot_table(index=['user_id'],
                                        columns=['title'],
                                        values='rating')

user_ratings_test = user_ratings_test.pivot_table(index=['user_id'],
                                        columns=['title'],
                                        values='rating')




user_ratings_train = user_ratings_train.reindex(
                            index=user_ratings_train.index.union(user_ratings_test.index), 
                            columns=user_ratings_train.columns.union(user_ratings_test.columns) )

user_ratings_test = user_ratings_test.reindex(
                            index=user_ratings_train.index.union(user_ratings_test.index), 
                            columns=user_ratings_train.columns.union(user_ratings_test.columns) )

print(user_ratings_train.shape)
print(user_ratings_test.shape)

(943, 1664)
(943, 1664)


## Task 1. User-based CF

* Use pearson correlation to get the similarity between different users.
* Based on the obtained similarity score, predict the ratings. You can use 5 nearest neighbors or 10 nearest neighbors.
* Compute MAE for the testing set.

In [10]:
# your code
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.metrics import mean_absolute_error
from sklearn.neighbors import NearestNeighbors

def compute_mae_using_pearson(user_ratings_train, user_ratings_test):
    # 1. Deal with NaN values in training data
    user_ratings_train_noNan = user_ratings_train.fillna(0)

    # Compute Pearson correlation between all pairs of users in the training set
    pearson_sim_train = 1 - pairwise_distances(user_ratings_train_noNan, metric="correlation")

    # Train a k-nearest neighbors model on the Pearson similarity matrix
    train_model = NearestNeighbors(n_neighbors=5)
    train_model.fit(pearson_sim_train)

    # Get distances and indices of the 5 nearest neighbors for each user
    neighbors_distance, neighbors_ind = train_model.kneighbors()
    neighbors_ind += 1

    predictions = []
    actual = []

    # 3. For each entry in the testing data with values, find neighbors in training with Pearson correlation
    for user_id, row in user_ratings_test.iterrows():
        for movie, rating in row.iteritems():
            # 2. Only need to predict ratings for entries in testing data with values (NOT NaN)
            if not pd.isnull(rating):
                predicted_rating = 0
                sum_of_sim = 0

                # 4. Use the formula from slides to make predictions
                for x in range(0, 5):
                    neigh_id = neighbors_ind[user_id - 1][x]
                    neigh_rating = user_ratings_train.loc[neigh_id, movie]
                    if not pd.isnull(neigh_rating):
                        neigh_distance = neighbors_distance[user_id - 1][x]
                        sum_of_sim += neigh_distance
                        predicted_rating += neigh_distance * neigh_rating

                # Normalize the predicted rating
                if sum_of_sim != 0:
                    predicted_rating = predicted_rating / sum_of_sim

                # 5. Save the original ground truth ratings in the testing data
                actual.append(rating)
                # 6. Save the predictions
                predictions.append(predicted_rating)

    # 7. Apply MAE function on these two lists
    mae = mean_absolute_error(predictions, actual)
    return mae

# Test the function with your user_ratings_train and user_ratings_test dataframes
mae = compute_mae_using_pearson(user_ratings_train, user_ratings_test)
print('MAE: ' + str(mae))

MAE: 1.7196495860528949


## Task 2. Item-based CF
* Use cosine similarity to get the similarity between different items.
* Based on the obtained similarity score, predict the ratings. You can use 5 nearest neighbors or 10 nearest neighbors.
* Compute MAE for the testing set.

In [11]:
# your code
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.metrics import mean_absolute_error
from sklearn.neighbors import NearestNeighbors

def compute_mae_using_cosine(user_ratings_train, user_ratings_test):
    # 1. Deal with NaN values in training data
    user_ratings_train_noNan = user_ratings_train.fillna(0)

    # Compute cosine similarity between all pairs of items in the training set
    cosine_sim_train = 1 - pairwise_distances(user_ratings_train_noNan.T, metric="cosine")

    # Train a k-nearest neighbors model on the cosine similarity matrix
    train_model = NearestNeighbors(n_neighbors=5)
    train_model.fit(cosine_sim_train)

    # Get distances and indices of the 5 nearest neighbors for each item
    neighbors_distance, neighbors_ind = train_model.kneighbors()

    predictions = []
    actual = []

    # 3. For each entry in the testing data with values, find items in training with cosine correlation
    for user_id, row in user_ratings_test.iterrows():
        for movie, rating in row.iteritems():
            # 2. Only need to predict ratings for entries in testing data with values (NOT NaN)
            if not pd.isnull(rating):
                predicted_rating = 0
                sum_of_sim = 0
                
                # Get the index of the movie in the training data
                movie_index = user_ratings_train.columns.get_loc(movie)

                # 4. Use the formula from slides to make predictions
                for x in range(0, 5):
                    neigh_id = neighbors_ind[movie_index][x]
                    neigh_rating = user_ratings_train.iloc[user_id - 1, neigh_id]
                    if not pd.isnull(neigh_rating):
                        neigh_distance = neighbors_distance[movie_index][x]
                        sum_of_sim += neigh_distance
                        predicted_rating += neigh_distance * neigh_rating

                # Normalize the predicted rating
                if sum_of_sim != 0:
                    predicted_rating = predicted_rating / sum_of_sim

                # 5. Save the original ground truth ratings in the testing data
                actual.append(rating)
                # 6. Save the predictions
                predictions.append(predicted_rating)

    # 7. Apply MAE function on these two lists
    mae = mean_absolute_error(predictions, actual)
    return mae

# Test the function with your user_ratings_train and user_ratings_test dataframes
mae = compute_mae_using_cosine(user_ratings_train, user_ratings_test)
print('MAE: ' + str(mae))

MAE: 1.595714302471743
