# Collaborative Filtering

In [23]:
# Import libraries
%matplotlib inline
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

small_data = pd.read_csv('ml-latest-small/ratings.csv', sep=',', encoding='latin-1', nrows=2000, 
                      usecols=['userId', 'movieId','rating'])

# Fill NaN values in user_id and movie_id column with 0
small_data['userId'] = small_data['userId'].fillna(0)
small_data['movieId'] = small_data['movieId'].fillna(0)

# Replace NaN values in rating column with average of all values
small_data['rating'] = small_data['rating'].fillna(small_data['rating'].mean())

In [24]:
from sklearn import model_selection
train_data, test_data = model_selection.train_test_split(small_data, test_size=0.2)

train_data = train_data.reset_index()
test_data = test_data.reset_index()

In [29]:
# Create two user-item matrices, one for training and another for testing
train_data_matrix = train_data.to_numpy(dtype=np.float, copy=True)
test_data_matrix = test_data.to_numpy(dtype=np.float,copy=True)

# Check their shape
print(train_data_matrix.shape)
print(test_data_matrix.shape)


(1600, 4)
(400, 4)


In [None]:
from sklearn.metrics.pairwise import pairwise_distances

# User Similarity Matrix
parwise_user = pairwise_distances(train_data, metric='correlation')
user_correlation = 1 - parwise_user
user_correlation[np.isnan(user_correlation)] = 0

# Item Similarity Matrix
item_correlation = 1 - pairwise_distances(train_data_matrix.T, metric='correlation')
item_correlation[np.isnan(item_correlation)] = 0

# print(train_data_matrix)
# print("----")
# print(train_data)
print(user_correlation)
print(pairwise_distances(train_data, metric='correlation'))

print(train_data_matrix)
train_data.head()

###### 

In [26]:
# Function to predict ratings
def predict(ratings, similarity, type='user'):
    
    if type == 'user':
        print(ratings)
        mean_user_rating = ratings.mean(axis=1)
        
        print("mean user rating:",mean_user_rating)
        
        # Use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [27]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Function to calculate RMSE
def rmse(pred, actual):
    # Ignore nonzero terms.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return sqrt(mean_squared_error(pred, actual))

# Predict ratings on the training data with both similarity score
user_prediction = predict(train_data_matrix, user_correlation, type='user')
item_prediction = predict(train_data_matrix, item_correlation, type='item')

print(item_prediction)

# RMSE on the train data
print('User-based CF RMSE: ' + str(rmse(user_prediction, train_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, train_data_matrix)))

[[1.1490e+03 1.0000e+01 6.2660e+03 3.0000e+00]
 [1.4380e+03 1.5000e+01 1.7200e+02 1.0000e+00]
 [1.3040e+03 1.1000e+01 1.5840e+03 5.0000e+00]
 ...
 [1.2160e+03 1.0000e+01 7.9091e+04 5.0000e+00]
 [1.3000e+01 1.0000e+00 2.3100e+02 5.0000e+00]
 [1.1730e+03 1.0000e+01 3.1685e+04 4.5000e+00]]
mean user rating: [ 1857.      406.5     726.    ... 20080.5      62.5    8218.125]
[[ 9.26770591e+02  8.52210757e+02  4.85643739e+03 -5.11689103e+02]
 [ 6.86879029e+02  6.92231249e+02  2.75700805e+02 -5.16426609e+01]
 [ 7.09903866e+02  6.96387464e+02  1.32988533e+03 -1.53245411e+02]
 ...
 [ 5.46008578e+03  4.44923905e+03  5.99573122e+04 -6.12515075e+03]
 [ 2.07194271e+01  1.78911498e+01  1.75814654e+02 -1.37119572e+01]
 [ 2.50936348e+03  2.10801235e+03  2.40889785e+04 -2.47032678e+03]]
User-based CF RMSE: 10787.615685251136
Item-based CF RMSE: 3818.002688110833


In [28]:
# Usando test_data
print('User-based CF RMSE - Test Data: ' + str(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE - Test Data: ' + str(rmse(item_prediction, test_data_matrix)))

User-based CF RMSE - Test Data: 14563.623577758292
Item-based CF RMSE - Test Data: 16885.761356949275
