# Collaborative Filtering

# Import packages
import os
import pandas as pd

# Define file directories
MOVIELENS_DIR = ''
USER_DATA_FILE = './ml-1m/users.dat'
MOVIE_DATA_FILE = './ml-1m/movies.dat'
RATING_DATA_FILE = './ml-1m/ratings.dat'

# Specify User's Age and Occupation Column
AGES = { 1: "Under 18", 18: "18-24", 25: "25-34", 35: "35-44", 45: "45-49", 50: "50-55", 56: "56+" }
OCCUPATIONS = { 0: "other or not specified", 1: "academic/educator", 2: "artist", 3: "clerical/admin",
                4: "college/grad student", 5: "customer service", 6: "doctor/health care",
                7: "executive/managerial", 8: "farmer", 9: "homemaker", 10: "K-12 student", 11: "lawyer",
                12: "programmer", 13: "retired", 14: "sales/marketing", 15: "scientist", 16: "self-employed",
                17: "technician/engineer", 18: "tradesman/craftsman", 19: "unemployed", 20: "writer" }

# Define csv files to be saved into
USERS_CSV_FILE = 'users.csv'
MOVIES_CSV_FILE = 'movies.csv'
RATINGS_CSV_FILE = 'ratings.csv'

# Read the Ratings File
ratings = pd.read_csv(os.path.join(MOVIELENS_DIR, RATING_DATA_FILE), 
                    sep='::', 
                    engine='python', 
                    encoding='latin-1',
                    names=['user_id', 'movie_id', 'rating', 'timestamp'])

# Set max_userid to the maximum user_id in the ratings
max_userid = ratings['user_id'].drop_duplicates().max()
# Set max_movieid to the maximum movie_id in the ratings
max_movieid = ratings['movie_id'].drop_duplicates().max()

# Process ratings dataframe for Keras Deep Learning model
# Add user_emb_id column whose values == user_id - 1
ratings['user_emb_id'] = ratings['user_id'] - 1
# Add movie_emb_id column whose values == movie_id - 1
ratings['movie_emb_id'] = ratings['movie_id'] - 1

print(len(ratings), 'ratings loaded')


# Save into ratings.csv
ratings.to_csv(RATINGS_CSV_FILE, 
               sep='\t', 
               header=True, 
               encoding='latin-1', 
               columns=['user_id', 'movie_id', 'rating', 'timestamp', 'user_emb_id', 'movie_emb_id'])
print ('Saved to', RATINGS_CSV_FILE)

# Read the Users File
users = pd.read_csv(os.path.join(MOVIELENS_DIR, USER_DATA_FILE), 
                    sep='::', 
                    engine='python', 
                    encoding='latin-1',
                    names=['user_id', 'gender', 'age', 'occupation', 'zipcode'])
users['age_desc'] = users['age'].apply(lambda x: AGES[x])
users['occ_desc'] = users['occupation'].apply(lambda x: OCCUPATIONS[x])
print (len(users), 'descriptions of', max_userid, 'users loaded.')

# Save into users.csv
users.to_csv(USERS_CSV_FILE, 
             sep='\t', 
             header=True, 
             encoding='latin-1',
             columns=['user_id', 'gender', 'age', 'occupation', 'zipcode', 'age_desc', 'occ_desc'])
print ('Saved to', USERS_CSV_FILE)

movies = pd.read_csv(os.path.join(MOVIELENS_DIR, MOVIE_DATA_FILE), 
                    sep='::', 
                    engine='python', 
                    encoding='latin-1',
                    names=['movie_id', 'title', 'genres'])
print (len(movies), 'descriptions of', max_movieid, 'movies loaded.')

# Save into movies.csv
movies.to_csv(MOVIES_CSV_FILE, 
              sep='\t', 
              header=True, 
              columns=['movie_id', 'title', 'genres'])
print ('Saved to', MOVIES_CSV_FILE)

In [226]:
# Import libraries
%matplotlib inline
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

small_data = pd.read_csv('.ratings.csv', sep=',', encoding='latin-1', nrows=20000, 
                      usecols=['user_id', 'movie_id','rating'])

# Fill NaN values in user_id and movie_id column with 0
small_data['user_id'] = small_data['user_id'].fillna(0)
small_data['movie_id'] = small_data['movie_id'].fillna(0)

# Replace NaN values in rating column with average of all values
small_data['rating'] = small_data['rating'].fillna(small_data['rating'].mean())

ValueError: Usecols do not match columns, columns expected but not found: ['user_id', 'rating', 'movie_id']

In [None]:
from sklearn import model_selection
train_data, test_data = model_selection.train_test_split(small_data, test_size=0.3, shuffle=False)

In [None]:
# Create two user-item matrices, one for training and another for testing
train_data_matrix = train_data.to_numpy(dtype=np.float, copy=True)
test_data_matrix = test_data.to_numpy(dtype=np.float,copy=True)

# Check their shape
print(train_data_matrix.shape)
print(test_data_matrix.shape)

In [None]:
from sklearn.metrics.pairwise import pairwise_distances

# User Similarity Matrix
parwise_user = pairwise_distances(train_data, metric='correlation')
user_correlation = 1 - parwise_user
user_correlation[np.isnan(user_correlation)] = 0

# Item Similarity Matrix
item_correlation = 1 - pairwise_distances(train_data_matrix.T, metric='correlation')
item_correlation[np.isnan(item_correlation)] = 0

# print(train_data_matrix)
# print("----")
# print(train_data)
print(user_correlation)
print(pairwise_distances(train_data, metric='correlation'))

print(train_data_matrix)
train_data.head()

In [None]:
# Function to predict ratings
def predict(ratings, similarity, type='user'):
    
    if type == 'user':
        print(ratings)
        mean_user_rating = ratings.mean(axis=1)
        
        print("mean user rating:",mean_user_rating)
        
        # Use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Function to calculate RMSE
def rmse(pred, actual):
    # Ignore nonzero terms.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return sqrt(mean_squared_error(pred, actual))

# Predict ratings on the training data with both similarity score
user_prediction = predict(train_data_matrix, user_correlation, type='user')
item_prediction = predict(train_data_matrix, item_correlation, type='item')

print(item_prediction)

# RMSE on the train data
print('User-based CF RMSE: ' + str(rmse(user_prediction, train_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, train_data_matrix)))

In [None]:
# Usando test_data
print('User-based CF RMSE - Test Data: ' + str(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE - Test Data: ' + str(rmse(item_prediction, test_data_matrix)))

User-based CF RMSE - Test Data: 15596.083426390725
Item-based CF RMSE - Test Data: 18101.866866058295

User-based CF RMSE - Test Data: 11849.808978496218
Item-based CF RMSE - Test Data: 13381.23167982257