# Collaborative Filtering

In [1]:
# Import libraries
%matplotlib inline
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

small_data = pd.read_csv('ml-latest-small/ratings.csv', sep=',', encoding='latin-1', nrows=2000, 
                      usecols=['userId', 'movieId','rating', 'timestamp'])


In [2]:
from sklearn import model_selection
train_data, test_data = model_selection.train_test_split(small_data, test_size=0.2)

train_data = train_data.reset_index()
test_data = test_data.reset_index()

In [15]:
# Create two user-item matrices, one for training and another for testing
train_data_matrix = train_data.to_numpy(dtype=np.float, copy=True)
test_data_matrix = test_data.to_numpy(dtype=np.float,copy=True)

from sklearn.metrics.pairwise import pairwise_distances

# User Similarity Matrix
parwise_user = pairwise_distances(train_data, metric='correlation')
user_correlation = 1 - parwise_user
user_correlation[np.isnan(user_correlation)] = 0

# Item Similarity Matrix
item_correlation = 1 - pairwise_distances(train_data_matrix.T, metric='correlation')
item_correlation[np.isnan(item_correlation)] = 0

# print(train_data_matrix)
# print("----")
# print(train_data)
print(user_correlation)
print(pairwise_distances(train_data, metric='correlation'))

train_data.head()

print(train_data_matrix)

[[1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]
 ...
 [1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]]
[[0.00000000e+00 1.67177383e-12 2.42707110e-09 ... 5.77460302e-12
  1.37179157e-12 4.49951187e-12]
 [1.67177383e-12 0.00000000e+00 2.55543298e-09 ... 1.33437705e-12
  5.96334093e-12 6.98219260e-13]
 [2.42707110e-09 2.55543298e-09 0.00000000e+00 ... 2.65998723e-09
  2.31458241e-09 2.63752364e-09]
 ...
 [5.77460302e-12 1.33437705e-12 2.65998723e-09 ... 0.00000000e+00
  1.22154509e-11 1.46993528e-13]
 [1.37179157e-12 5.96334093e-12 2.31458241e-09 ... 1.22154509e-11
  0.00000000e+00 1.05643272e-11]
 [4.49951187e-12 6.98219260e-13 2.63752364e-09 ... 1.46993528e-13
  1.05643272e-11 0.00000000e+00]]
[[1.90000000e+02 1.00000000e+00 2.94800000e+03 5.00000000e+00
  9.64982191e+08]
 [3.77000000e+02 4.00000000e+00 1.19700000e+03 5.00000000e+00
  9.64538763e+08]
 [1.24900000e+03 1.00000000e+01 1.09487000e+05 5.00000000e-01
  1.45539809e+09]
 ...

In [11]:
# Function to predict ratings
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        
        print("mean user rating:",mean_user_rating)
        
        # Use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [12]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Function to calculate RMSE
def rmse(pred, actual):
    # Ignore nonzero terms.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return sqrt(mean_squared_error(pred, actual))

# Predict ratings on the training data with both similarity score
user_prediction = predict(train_data_matrix, user_correlation, type='user')
item_prediction = predict(train_data_matrix, item_correlation, type='item')

print(item_prediction)

# RMSE on the train data
print('User-based CF RMSE: ' + str(rmse(user_prediction, train_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, train_data_matrix)))

mean user rating: [1.92997067e+08 1.92908069e+08 2.91101768e+08 ... 2.91123873e+08
 2.01514969e+08 1.69110778e+08]
[[ 2.26381932e+08  2.36519376e+08  2.30970217e+08 -1.77002359e+07
   3.48221730e+08]
 [ 2.26277886e+08  2.36410691e+08  2.30863085e+08 -1.76919942e+07
   3.48061499e+08]
 [ 3.41437559e+08  3.56725812e+08  3.48412959e+08 -2.67024487e+07
   5.25208246e+08]
 ...
 [ 3.41483834e+08  3.56775603e+08  3.48402233e+08 -2.66994921e+07
   5.25271212e+08]
 [ 2.36372976e+08  2.46957792e+08  2.41164620e+08 -1.84815181e+07
   3.63590034e+08]
 [ 1.98364270e+08  2.07247100e+08  2.02383282e+08 -1.55094556e+07
   3.05124513e+08]]
User-based CF RMSE: 99863221.1024596
Item-based CF RMSE: 393072439.25011086


In [6]:
# Usando test_data
print('User-based CF RMSE - Test Data: ' + str(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE - Test Data: ' + str(rmse(item_prediction, test_data_matrix)))

User-based CF RMSE - Test Data: 121647191.42959222
Item-based CF RMSE - Test Data: 414701244.5631586
