In [1]:
import numpy as np
import pandas as pd
from math import sqrt

import scipy.sparse as sp
from scipy.sparse.linalg import svds

from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import pairwise_distances

from operator import itemgetter

In [2]:
SHOW_DATA = True

## Load data

In [3]:
# Users file
users_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=users_cols, encoding='latin-1')

# Ratings file
ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=ratings_cols, encoding='latin-1')

# Train + Test ratings
ratings_train = pd.read_csv('ml-100k/ua.base', sep='\t', names=ratings_cols, encoding='latin-1')
ratings_test = pd.read_csv('ml-100k/ua.test', sep='\t', names=ratings_cols, encoding='latin-1')

# Items file
items_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
              'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
              'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('ml-100k/u.item', sep='|', names=items_cols, encoding='latin-1')

In [4]:
if SHOW_DATA:
    print "Users:", users.shape, "\n", users.head(), "\n\n"
    print "Ratings:", ratings.shape, "\n", ratings.head(), "\n\n"
    print "Items:", items.shape, "\n", items.head()

Users: (943, 5) 
   user_id  age sex  occupation zip_code
0        1   24   M  technician    85711
1        2   53   F       other    94043
2        3   23   M      writer    32067
3        4   24   M  technician    43537
4        5   33   F       other    15213 


Ratings: (100000, 4) 
   user_id  movie_id  rating  unix_timestamp
0      196       242       3       881250949
1      186       302       3       891717742
2       22       377       1       878887116
3      244        51       2       880606923
4      166       346       1       886397596 


Items: (1682, 24) 
   movie id        movie title release date  video release date  \
0         1   Toy Story (1995)  01-Jan-1995                 NaN   
1         2   GoldenEye (1995)  01-Jan-1995                 NaN   
2         3  Four Rooms (1995)  01-Jan-1995                 NaN   
3         4  Get Shorty (1995)  01-Jan-1995                 NaN   
4         5     Copycat (1995)  01-Jan-1995                 NaN   

                 

In [5]:
# Train data
user_item_train = np.zeros((users.shape[0], items.shape[0]))

for _, i in ratings_train.iterrows():
    user = i["user_id"] - 1  # ids start at 1!
    item = i["movie_id"] - 1  # ids start at 1!
    rating = i["rating"]
    
    user_item_train[user, item] = rating

# Test Data
user_item_test = np.zeros((users.shape[0], items.shape[0]))

for _, i in ratings_test.iterrows():
    user = i["user_id"] - 1  # ids start at 1!
    item = i["movie_id"] - 1  # ids start at 1!
    rating = i["rating"]
    
    user_item_test[user, item] = rating

## Memory-based collaborative filtering

In [6]:
# Compute the item-item similarity based on the Cosine-Similarity distance
item_similarity = pairwise_distances(user_item_train.T, metric='cosine')

In [7]:
# Compute the item recomendations per user
recommendations_memory = user_item_train.dot(item_similarity) / np.array([np.abs(item_similarity).sum(axis=1)])

## Model-based collaborative filtering

In [8]:
# Taken from the Netflix competition paper
# http://buzzard.ups.edu/courses/2014spring/420projects/math420-UPS-spring-2014-gower-netflix-SVD.pdf

# Using SVD, the User-Item matrix is decomposed into two different matrices:
#   U(i, j) - the importance that user i attributes to features j
#   D(i, j) - the features i of item j

# Compute the SVD of the train data matrix
u, s, vt = svds(user_item_train, k=20)

# Produce a diagonal matrix from S's entries
s_diag_matrix = np.diag(s)

# Compute the prediction
recommendations_model = np.dot(np.dot(u, s_diag_matrix), vt)

## Comparing the methods

In [9]:
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [10]:
# Some examples
def get_info_by_user_id(user_id, amount):
    """ Print user info and _amount_ favorite movies titles """
    
    user = users[users["user_id"] == user_id]
    print "User", user_id, "Info:\n\t", user, "\n\n"

    user_1_movies = []

    user_1_ratings = ratings[ratings["user_id"] == user_id]
    for _, rating in user_1_ratings.iterrows():
        movie_id = rating["movie_id"]
        movie = items[items["movie id"] == movie_id]

        user_1_movies.append({
            "movie_name": movie["movie title"].values[0],
            "rating": rating["rating"]
        })

    # Sort movies array by rating
    user_1_movies = sorted(user_1_movies, key=itemgetter('rating'), reverse=True)
    print "User", user_id, "Favorite Movies:"
    for movie in user_1_movies[:amount]:
        print "\t", movie
        
    print "\n\n"

        
def get_top_recomendations(recommendations, user_id, amount):
    """ Print user _amount_ recommended movies titles """
    
    user_recommendations = recommendations[user_id]
    
    recommended_movies = np.argsort(user_recommendations)
    recommended_movies = recommended_movies[::-1]  # revert to get most recommended movie first
    
    print "User", user_id, "Recommended Movies:"
    for movie_id in recommended_movies[:amount]:
        print "\t", items[items["movie id"] == movie_id]["movie title"].values[0]

        
def run_for_user(user_id, prediction, amount=5):
    get_info_by_user_id(user_id, amount)
    get_top_recomendations(prediction, user_id, amount)
    print "\n----------------------------------------------\n"

In [11]:
methods = [{"name": "memory", "prediction": recommendations_memory},
           {"name": "model", "prediction": recommendations_model}]

for method in methods:
    print "Method:", method["name"], "\n\n"
    print "RMSE:", rmse(method["prediction"], user_item_test), "\n"
    run_for_user(1, method["prediction"])

Method: memory 


RMSE: 3.60394279098 

User 1 Info:
	   user_id  age sex  occupation zip_code
0        1   24   M  technician    85711 


User 1 Favorite Movies:
	{'rating': 5, 'movie_name': u'Groundhog Day (1993)'}
	{'rating': 5, 'movie_name': u'Delicatessen (1991)'}
	{'rating': 5, 'movie_name': u'Pillow Book, The (1995)'}
	{'rating': 5, 'movie_name': u'Horseman on the Roof, The (Hussard sur le toit, Le) (1995)'}
	{'rating': 5, 'movie_name': u'Shawshank Redemption, The (1994)'}



User 1 Recommended Movies:
	Vie est belle, La (Life is Rosey) (1987)
	Careful (1992)
	Frankie Starlight (1995)
	Show, The (1995)
	Condition Red (1995)

----------------------------------------------

Method: model 


RMSE: 2.82580756945 

User 1 Info:
	   user_id  age sex  occupation zip_code
0        1   24   M  technician    85711 


User 1 Favorite Movies:
	{'rating': 5, 'movie_name': u'Groundhog Day (1993)'}
	{'rating': 5, 'movie_name': u'Delicatessen (1991)'}
	{'rating': 5, 'movie_name': u'Pillow Book,