In [None]:
#recommender systems rely on relatively heavier datasets

In [None]:
#Recommender system using Collaborative Filtering

In [None]:
import numpy as np
import pandas as pd

In [None]:
#labeling the data set
column_names = ['user_id','item_id','rating','timestamp']
df = pd.read_csv('u.data',sep='\t',names=column_names)

In [None]:
df.head()

In [None]:
movie_titles = pd.read_csv("Movie_Id_Titles")
movie_titles.head()

In [None]:
df = pd.merge(df,movie_titles,on='item_id')
df.head()

In [None]:
#find out the number of unique movies and users
n_users = df.user_id.nunique()
n_items = df.item_id.nunique()

print('Num. of Users: '+str(n_users))
print('Num of Movies:'+ str(n_items))

In [9]:
#Train test split

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
train_data, test_data = train_test_split(df, test_size=0.25)

# Memory-Based Collaborative Filtering

In [29]:
# item-item collaborative filtering
# user-tem collaborative filtering

In [30]:
#create two user-item matrices for train and test
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]  

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

In [31]:
#calculate pairwise distance from sklearn
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [32]:
# Prediction

In [33]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])     
    return pred

In [34]:
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

# Evaluation

In [36]:
#Root Mean Squared Metric (RMSE)

In [37]:
#we are only concerned with predcited ratings in the test dataset, we can filter out all other elements in the predcition matrix
#prediction[ground_truth.nonzero()]

In [38]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction,ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction,ground_truth))

In [39]:
print('User based CF RMSE: ' + str(rmse(user_prediction,test_data_matrix)))
print('Item based CF RMSE: ' +str(rmse(item_prediction,test_data_matrix)))

User based CF RMSE: 3.12392669711323
Item based CF RMSE: 3.453308664011944


# Model-Based Collaborating Filtering
Memory based algorithms are easy to implement and produce reasonable quality of prediction. However, they don't reall align with the real world scenarios as they don't perform well in situations where a new user is added to the data base. Model-based CF is better under such circumstances but suffer in case a user has no ratings. The goal of MF is to learn the latent preferences of users and the latent attributes of items from known ratings (learn features that describe the characteristics of ratings) to then predict the unknown ratings through the dot product of the latent features of users and items. When you have a very sparse matrix, with a lot of dimensions, by doing matrix factorization you can restructure the user-item matrix into low-rank structure, and you can represent the matrix by the multiplication of two low-rank matrices, where the rows contain the latent vector. You fit this matrix to approximate your original matrix, as closely as possible, by multiplying the low-rank matrices together, which fills in the entries missing in the original matrix.

In [41]:
# Model Based Collab Filtering

In [44]:
#It is based on Matrix Factorization and works as an unsupervised learning method. Matrix Factorization deals better with scalability and sparsity conditions and as result is used widely for recommendation systems.

In [45]:
#calculate the sparsity of movie lens data set
sparsity = round(1.0-len(df)/float(n_users*n_items),3)
print('The sparsity level of MovieLens100K is ' + str(sparsity*100)+ '%')

The sparsity level of MovieLens100K is 93.7%


To give an example of the learned latent preferences of the users and items: let's say for the MovieLens dataset you have the following information: (user id, age, location, gender, movie id, director, actor, language, year, rating). By applying matrix factorization the model learns that important user features are age group (under 10, 10-18, 18-30, 30-90), location and gender, and for movie features it learns that decade, director and actor are most important. Now if you look into the information you have stored, there is no such feature as the decade, but the model can learn on its own. The important aspect is that the CF model only uses data (user_id, movie_id, rating) to learn the latent features. If there is little data available model-based CF model will predict poorly, since it will be more difficult to learn the latent features.

Models that use both ratings and content features are called Hybrid Recommender Systems where both Collaborative Filtering and Content-based Models are combined. Hybrid recommender systems usually show higher accuracy than Collaborative Filtering or Content-based Models on their own: they are capable to address the cold-start problem better since if you don't have any ratings for a user or an item you could use the metadata from the user or item to make a prediction.

# SVD
Singular Value Decomposition

In [48]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds

#get SVD components from train matrix. Choose k.
u, s, vt = svds(train_data_matrix, k = 20)
s_diag_matrix = np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
print('User based CF MSE:' + str(rmse(X_pred,test_data_matrix)))

User based CF MSE:2.7155442132464973


In [None]:
#the standard collaborative method performs poorly in case of sparse data