### Read the data

In [16]:
# Item-based collaborative filtering
# 
# References
#
# [1] M.Deshpande and G. Karypis. Item-based top-n recommendation algorithms. ACM Trans. Inf. Syst., 22(1):143-177, 2004.
# [2] B.M. Sarwar, G. Karypis, J.A. Konstan, and J. Reidl. Item-based collaborative filtering recommendation algorithms. In Proceedings of the 10th International World Wide Web Conference, pages 285-295, 2001.
# [3] http://www.awesomestats.in/python-recommending-movies/

In [17]:
import numpy
import pandas as pd

# read data
rating_df = pd.read_csv('data/ml-100k/u.data', sep='\t', names=['user', 'item', 'rate', 'time'])
numpy.random.seed(42)
msk = numpy.random.rand(len(rating_df)) < 0.7
rating_df_train = rating_df[msk]
rating_df_test = rating_df[~msk]

### Build movies similarity matrix

In [18]:
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

# TASK 1: compute movies similarities
# HINT: use the pairwise_distance method (and the correlation metric)
#
def compute_movies_similarities (method='cosine'):
    """ compute the similarity between movies.  output a dataframe.
    """
    rating_mat = rating_df.pivot( index='item', columns='user', values = "rate" ).reset_index(drop=True)
    if method == 'adjusted_cosine':
        rating_mat_avg = rating_mat.mean(axis=1)
        rating_mat -= rating_mat_avg
    if method == 'pearson':
        rating_mat_avg = rating_mat.mean(axis=0)
        rating_mat -= rating_mat_avg
    rating_mat.fillna( 0, inplace = True )
    movie_sim = 1 - pairwise_distances( rating_mat.as_matrix(), metric="correlation" )
    # bonus point: disregard similarities when less than N users are in common
    return pd.DataFrame( movie_sim )

In [19]:
# TASK 2: find top-k similar movies to a given movie
# HINT:   use the sort_values method
#
def get_similar_movies( sim_df, movieid, topN = 5 ):
    """ get top-N similar movies given an input movie (movieid) and a similarity matrix (sim_df)
        sim_df is the output of compute_movies_similarities()
    """
    movies_df = pd.read_csv( "data/ml-100k/u.item", delimiter = '\|', header = None, engine='python' )
    movies_df = movies_df.iloc[:,:2]
    movies_df.columns = ['movieid', 'title']
    movies_df['similarity'] = sim_df.iloc[movieid -1]
    movies_df.columns = ['movieid', 'title', 'similarity']
    top_n = movies_df.sort_values( ["similarity"], ascending = False )[0:topN]
    return top_n

In [20]:
movie_sim_df = compute_movies_similarities()

In [21]:
get_similar_movies (movie_sim_df, 7, 10)

Unnamed: 0,movieid,title,similarity
6,7,Twelve Monkeys (1995),1.0
99,100,Fargo (1996),0.448059
116,117,"Rock, The (1996)",0.42351
55,56,Pulp Fiction (1994),0.392567
10,11,Seven (Se7en) (1995),0.391409
150,151,Willy Wonka and the Chocolate Factory (1971),0.389149
239,240,Beavis and Butt-head Do America (1996),0.380095
272,273,Heat (1995),0.379296
404,405,Mission: Impossible (1996),0.369299
23,24,Rumble in the Bronx (1995),0.36552


### Predicting ratings

In [22]:
def pivot_data (df):
    """ pivot a rating dataset to index data by item x user
    """
    return df.pivot( index='item', columns='user', values = "rate" ).reset_index(drop=True)

In [23]:
def get_rating (df_pivoted, user, item):
    """ fetch a user x item rating.  return NaN if it does not exist
        df_pivoted is the output of pivot_data()
     """
    try:
        df_pivoted.iloc [item-1][user]
    except:
        return numpy.nan
    return df_pivoted.iloc [item-1][user]

In [24]:
def rmse(predictions, targets):
    """ compute the RMSE between predictions and targets 
    """
    return numpy.sqrt(((numpy.asarray(predictions) - numpy.asarray(targets)) ** 2).mean())

In [25]:
# TASK 3: predict rating for user x item
# HINT:   iterate over similar movies, get their rating and compute the predicted rating
#         use the following methods: get_similar_movies, get_rating
#
def predict_rating (rating_df_pivoted, movie_sim_df, user_id, item_id):
    """ predict rating for a user (user_id) and a movie (item_id) given:
        - movie_sim_df: movie similarities, as output by compute_movies_similarities()
        - rating_df_pivoted: a pivoted version of the rating data, as output by pivot_data()
    """
    similar_movies = get_similar_movies (movie_sim_df, item_id, 10)
    sim_ratings = []
    sim_scores = []
    for row2 in similar_movies[1:].itertuples(): # use 1: to skip obvious similar movie (itself)
        _, item_id_2, _, similarity = row2
        sim_rating = get_rating (rating_df_pivoted, user_id, item_id_2) # random.randint(1,5)
        if not numpy.isnan (sim_rating):
            sim_ratings.append (sim_rating)
            sim_scores.append (similarity)
    if len(sim_ratings) > 0:
            return numpy.dot(sim_ratings, sim_scores) / numpy.sum (sim_scores)
    return numpy.nan

In [26]:
# TASK 4: evaluate the rating prediction on the testing set
# HINT:   use the predict_rating and rmse methods
#
def evaluate (rating_df_pivoted, movie_sim_df, num_ratings):
    """ predict ratings for the testing set and compute RMSE
    """
    predicted_ratings = []
    true_ratings = []
    for row in rating_df_test[:num_ratings].itertuples():
        _, user_id, item_id, rating, _ = row
        predicted_rating = predict_rating (rating_df_pivoted, movie_sim_df, user_id, item_id)
        if not numpy.isnan (predicted_rating):
            predicted_ratings.append (predicted_rating)
            true_ratings.append (rating)

    rmse_val = rmse (predicted_ratings, true_ratings)
    return (rmse_val, len(predicted_ratings))


In [27]:
rating_df_train_pivoted = pivot_data (rating_df_train)
rating_df_pivoted = pivot_data(rating_df)

movie_sim_df_cosine = compute_movies_similarities()
movie_sim_df_pearson = compute_movies_similarities('pearson')
movie_sim_df_adjusted_cosine = compute_movies_similarities('adjusted_cosine')

num_ratings = 1000
(rmse_val, num_ratings) = evaluate (rating_df_train_pivoted, movie_sim_df_cosine, num_ratings)
print (rmse_val, num_ratings)
(rmse_val, num_ratings) = evaluate (rating_df_train_pivoted, movie_sim_df_pearson, num_ratings)
print (rmse_val, num_ratings)
(rmse_val, num_ratings) = evaluate (rating_df_train_pivoted, movie_sim_df_adjusted_cosine, num_ratings)
print (rmse_val, num_ratings)


1.0098831809090643 903
1.0238954576222492 612
0.9547451974524848 480
