# Collaborative Filtering

In [1]:
# Import libraries
%matplotlib inline
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

small_data = pd.read_csv('../ml-20m/ratings.csv', sep=',', encoding='latin-1', nrows=15000, 
                      usecols=['userId', 'movieId','rating'])

# Fill NaN values in user_id and movie_id column with 0
small_data['userId'] = small_data['userId'].fillna(0)
small_data['movieId'] = small_data['movieId'].fillna(0)

# Replace NaN values in rating column with average of all values
small_data['rating'] = small_data['rating'].fillna(small_data['rating'].mean())

In [2]:
from sklearn import model_selection
train_data, test_data = model_selection.train_test_split(small_data, test_size=0.5, shuffle=False)

In [3]:
# Create two user-item matrices, one for training and another for testing
train_data_matrix = train_data.to_numpy(dtype=np.float, copy=True)
test_data_matrix = test_data.to_numpy(dtype=np.float,copy=True)

# Check their shape
print(train_data_matrix.shape)
print(test_data_matrix.shape)

(7500, 3)
(7500, 3)


In [4]:
from sklearn.metrics.pairwise import pairwise_distances

# User Similarity Matrix
parwise_user = pairwise_distances(train_data, metric='correlation')
user_correlation = 1 - parwise_user
user_correlation[np.isnan(user_correlation)] = 0

# Item Similarity Matrix
# item_correlation = 1 - pairwise_distances(train_data_matrix.T, metric='correlation')
# item_correlation[np.isnan(item_correlation)] = 0

train_data.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [5]:
def predict(ratings, similarity, type='user'):
    
    if type == 'user':
        print(ratings)
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [6]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def rmse(pred, actual):
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return sqrt(mean_squared_error(pred, actual))

user_prediction = predict(train_data_matrix, user_correlation, type='user')
# item_prediction = predict(train_data_matrix, item_correlation, type='item')

print("prediction:", user_prediction)

[[1.000e+00 2.000e+00 3.500e+00]
 [1.000e+00 2.900e+01 3.500e+00]
 [1.000e+00 3.200e+01 3.500e+00]
 ...
 [6.600e+01 7.450e+02 5.000e+00]
 [6.600e+01 1.148e+03 5.000e+00]
 [6.600e+01 1.175e+03 5.000e+00]]
prediction: [[ 1348.03511016 -2721.99344455  1380.4583344 ]
 [-2019.74241051  4102.49967923 -2049.25726872]
 [-2018.18310743  4102.39163936 -2047.70853193]
 ...
 [-1747.61021736  4340.92495043 -1777.31473306]
 [-1615.35559439  4479.38482345 -1645.02922906]
 [-1606.44153836  4488.55540886 -1636.1138705 ]]


In [7]:
# print('User-based CF RMSE: ' + str(rmse(user_prediction, train_data_matrix)))
# print('Item-based CF RMSE: ' + str(rmse(item_prediction, train_data_matrix)))

print('User-based CF RMSE - Test Data: ' + str(rmse(user_prediction, test_data_matrix)))
# print('Item-based CF RMSE - Test Data: ' + str(rmse(item_prediction, test_data_matrix)))

User-based CF RMSE - Test Data: 11676.570087134485


User-based CF RMSE - Test Data: 15596.083426390725
Item-based CF RMSE - Test Data: 18101.866866058295

User-based CF RMSE - Test Data: 11849.808978496218
Item-based CF RMSE - Test Data: 13381.23167982257