In [2]:
import numpy as np
import scipy
from scipy import spatial
from sklearn.preprocessing import scale
import matplotlib.pyplot as plt
import time
from lmafit import lmafit_mc_adp

In [3]:
# Sentinel stored in the ratings matrix for "no rating". Entries equal to it are
# excluded when sampling the test set and when averaging neighbours' ratings.
INVALID = 0.0

In [4]:
def RMSE(x, y):
    '''
    Root-mean-square error between two arrays of identical shape.

    :param x: Array-like of values (any number of dimensions).
    :param y: Array-like with the same shape as x.
    :return: sqrt(mean((x - y) ** 2)) taken over every element. Empty input
    yields nan (0 / 0), matching NumPy's usual behaviour.
    '''
    # Work in floating point so unsigned/integer inputs cannot wrap on subtraction.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    assert(x.shape == y.shape)
    # Normalise by the total element count, not just the first axis; for 1-D
    # input the two are the same, for 2-D+ only x.size gives a true RMSE.
    return np.linalg.norm(x - y) / np.sqrt(x.size)

def similarity_matrix_cosine(X):
    '''
    :param X: Ratings matrix. X[i, j] represents the rating of user i for item j.
    :return: n x n cosine similarity matrix. The diagonal is 1. A user whose
    row is entirely zero has similarity 0 to every other user (the cosine is
    undefined there and would otherwise come out as NaN).
    '''
    X = np.asarray(X, dtype=float)
    dist = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(X, 'cosine'))
    sim = 1 - dist

    # An all-zero row has zero norm, so its cosine distance is 0/0. Left alone,
    # the resulting NaNs would poison every weighted average that touches them.
    empty_rows = np.flatnonzero(~X.any(axis=1))
    if empty_rows.size:
        sim[empty_rows, :] = 0.0
        sim[:, empty_rows] = 0.0
        # Keep the same diagonal convention as all other users.
        sim[empty_rows, empty_rows] = 1.0
    return sim

def similarity_matrix_knn(X, k, dim=2):
    '''
    :param X: Ratings matrix. X[i, j] represents the rating of user i for item j.
    :param k: Number of nearest neighbours per user (1 <= k <= n). A user is
    at distance 0 from itself, so it normally counts as one of its own neighbours.
    :param dim: Order p of the Minkowski norm passed to distance_matrix.
    :return: n x n KNN similarity matrix; row i holds 1/k at the k points
    closest to point i and 0 elsewhere.
    :raises IndexError: if k exceeds the number of points.
    :raises ValueError: if k is smaller than 1.
    '''
    n = X.shape[0]
    if k > n:
        raise IndexError('Only {} points, cannot have {} neighbors'
                         .format(n, k))
    if k < 1:
        raise ValueError('k must be at least 1, got {}'.format(k))
    sim = np.zeros((n, n))
    dist = spatial.distance_matrix(X, X, dim)
    # Partitioning around position k - 1 guarantees that the first k slots of
    # every row hold the k smallest distances (in arbitrary order). A pivot of
    # -k would only guarantee that the *last* k slots hold the largest ones.
    nearest = np.argpartition(dist, k - 1, axis=1)[:, :k]
    sim[np.arange(n)[:, None], nearest] = 1.0 / k
    return sim

In [5]:
def get_rating(user_id, movie_id, ratings, sim):
    '''
    Predict one rating as a similarity-weighted average over the users who
    actually rated the movie.

    :param user_id: Row index of the user to predict for.
    :param movie_id: Column index of the movie to predict.
    :param ratings: Ratings matrix; entries equal to INVALID are ignored.
    :param sim: User-by-user similarity matrix.
    :return: Weighted sum of the known ratings divided by the total weight,
    or the raw weighted sum when the total weight is not positive.
    '''
    weights_all = sim[user_id, :]
    column = ratings[:, movie_id]

    # Keep only the users that have a real rating for this movie
    rated = column != INVALID
    weights = weights_all[rated]
    observed = column[rated]

    weighted_sum = weights.dot(observed)
    weight_total = weights.sum()
    if weight_total > 0:
        return weighted_sum / weight_total
    return weighted_sum

In [143]:
def test_collaborative_filtering(ratings, p, sim_measure='cosine', k=5, dim=2):
    '''
    Hold out a random share of the known ratings, predict them with user-based
    collaborative filtering and report the error.

    :param ratings: Ratings matrix, where ratings[i][j] represents
    user i's rating for movie j.
    :param p: Fraction of data to use as test-set
    :param sim_measure: Similarity measure to use ('cosine' or 'knn',
    case-insensitive)
    :param k: In case of kNN, value of k to use (ignored for 'cosine')
    :param dim: In case of kNN, value of dim to use (ignored for 'cosine')

    :return: RMSE error of predictions
    :raises ValueError: if sim_measure is not a known measure
    '''
    # Pick the held-out entries among the observed ratings
    rows, cols = np.where(ratings != INVALID)
    num_observed = rows.shape[0]
    held_out = np.random.choice(np.arange(num_observed), int(p * num_observed), replace=False)
    test_ind = rows[held_out], cols[held_out]

    # Training copy with the held-out entries blanked
    train_ratings = np.array(ratings)
    train_ratings[test_ind] = INVALID

    # Build the user-user similarity from the training data only
    measure = sim_measure.lower()
    if measure == 'cosine':
        sim = similarity_matrix_cosine(train_ratings)
    elif measure == 'knn':
        sim = similarity_matrix_knn(train_ratings, k=k, dim=dim)
    else:
        raise ValueError('Unknown similarity measure {}'
                         .format(sim_measure))

    # Predict every held-out (user, movie) pair, then clamp to the 1..5 range
    pred_ratings = np.array(
        [get_rating(u, m, train_ratings, sim) for u, m in zip(*test_ind)],
        dtype=float,
    )
    return RMSE(ratings[test_ind], pred_ratings.clip(1, 5))


def test_matrix_completion(ratings, p, k):
    '''
    Hold out a random share of the known ratings, fit LMaFit on the rest and
    report the error on the held-out entries.

    :param ratings: Ratings matrix, where ratings[i][j] represents
    user i's rating for movie j.
    :param p: Fraction of data to use as test-set
    :param k: Estimated rank of matrix to use for LMaFit

    :return: RMSE error of predictions
    '''
    # Randomly split the observed entries into test and train index sets
    valid_ind = np.where(ratings != INVALID)
    N = valid_ind[0].shape[0]
    indices = np.arange(N)
    np.random.shuffle(indices)
    num_test = int(p * N)
    test_sel, train_sel = indices[:num_test], indices[num_test:]
    test_ind = valid_ind[0][test_sel], valid_ind[1][test_sel]
    train_ind = valid_ind[0][train_sel], valid_ind[1][train_sel]

    # LMaFit is handed only the training coordinates and their values, so no
    # masked copy of the (large) ratings matrix is needed here.
    a, b, _ = lmafit_mc_adp(ratings.shape[0], ratings.shape[1], k, train_ind, ratings[train_ind], None)
    completed = a.dot(b)

    # Compute error on the held-out entries, clamped to the 1..5 range
    completed = completed.clip(1, 5)
    return RMSE(ratings[test_ind], completed[test_ind])

In [7]:
# Ratings matrix used by every experiment below (rows are users, columns are
# movies, per the helper docstrings).
# NOTE(review): presumably derived from MovieLens 1M - confirm how the .npy was built.
true_ratings = np.load('1M_ratings_np.npy')

In [145]:
# Cosine-similarity collaborative filtering on the full matrix, holding out 20%
# of the observed ratings. k=40 has no effect here: k is only read on the 'knn' path.
%time test_collaborative_filtering(ratings=true_ratings[:, :], p=0.2, sim_measure='cosine', k=40)

CPU times: user 1min 53s, sys: 1.88 s, total: 1min 55s
Wall time: 2min


0.97467607518076849

In [149]:
# LMaFit matrix completion with estimated rank k=2, holding out 50% of the
# observed ratings (note: a different split size than the 20% used for CF above).
%time test_matrix_completion(ratings=true_ratings[:, :], p=0.5, k=2)

CPU times: user 54.6 s, sys: 11.5 s, total: 1min 6s
Wall time: 39.2 s


0.90755631147466764

## With Genres

In [8]:
import pandas as pd
import numpy as np
import csv

# MovieLens movie table: one "MovieID::Title::Genres" record per line.
movie_file = "ml-1m/movies.dat"
m_names = ["bad_index", "Title", "Genre"]
movies = pd.read_csv(
    movie_file,
    sep="::",
    engine='python',  # multi-character separators need the python parser
    header=None,
    names=m_names,
    nrows=1000000,
)

# Raw MovieID ("bad_index") -> 0-based row position in movies.dat
movie_to_index = dict(zip(movies["bad_index"], movies.index))
# 0-based row position -> '|'-separated genre string
# NOTE(review): later code looks genres up by ratings-matrix column index, which
# assumes the matrix columns follow this row order - confirm against the .npy.
movie_to_genre = dict(zip(movies.index, movies["Genre"]))

In [50]:
# Genre name -> slot in the per-user genre preference vector. The slot is simply
# the position of the name in this (alphabetical) listing.
genre_to_int = {
    genre_name: slot
    for slot, genre_name in enumerate((
        'Action',
        'Adventure',
        'Animation',
        "Children's",
        'Comedy',
        'Crime',
        'Documentary',
        'Drama',
        'Fantasy',
        'Film-Noir',
        'Horror',
        'Musical',
        'Mystery',
        'Romance',
        'Sci-Fi',
        'Thriller',
        'War',
        'Western',
    ))
}
# Derived from the table so the two can never disagree.
NUM_GENRES = len(genre_to_int)

def get_genres_for_movie(movie, curr):
    '''
    Add one count to curr for every genre the given movie is tagged with.

    :param movie: Key into movie_to_genre identifying the movie.
    :param curr: Per-genre counter indexed by genre_to_int; updated in place.
    :return: The same curr object, after the update.
    '''
    genre_names = movie_to_genre[movie].split('|')
    # Resolve every name first so an unknown genre fails before curr is touched
    slots = [genre_to_int[name] for name in genre_names]
    for slot in slots:
        curr[slot] += 1
    return curr

def get_top_k_genres_for_user(mat, user, k):
    '''
    Rank genres by how often they appear among the movies a user rated above
    their own average.

    :param mat: Ratings matrix; zero entries are treated as unrated.
    :param user: Row index of the user.
    :param k: Number of genres to return.
    :return: Array of up to k genre indices, most frequent first.
    '''
    user_ratings = mat[user]
    genre_prefs = np.zeros(len(genre_to_int))
    # The average is taken over the rated (non-zero) entries only
    mean_rating = user_ratings[np.nonzero(user_ratings)].mean()
    liked = np.flatnonzero(user_ratings > mean_rating).tolist()
    for movie_idx in liked:
        genre_prefs = get_genres_for_movie(movie_idx, genre_prefs)
    # Ascending argsort, reversed, gives the most frequent genres first
    return np.argsort(genre_prefs)[::-1][:k]

In [42]:
def genre_similarity_matrix_by_overlap(X, top_k):
    '''
    :param X: Ratings matrix. X[i, j] represents the rating of user i for item j.
    :param top_k: Top-k genres will be considered
    :return: n x n integer genre similarity matrix; entry [u, v] is the number
    of genres that appear in the top-k lists of both user u and user v.
    '''
    n = X.shape[0]  # Number of users

    # in_top_k[u, g] == 1 iff genre g is among user u's top-k genres.
    # (Plain int dtype: the np.bool alias no longer exists in current NumPy.)
    in_top_k = np.zeros((n, NUM_GENRES), dtype=int)
    for u in range(n):
        best_genres = get_top_k_genres_for_user(X, u, top_k)
        in_top_k[u, best_genres] = 1

    # The dot product of two 0/1 indicator rows counts the genres they share.
    # This needs only an n x NUM_GENRES table instead of a NUM_GENRES x n x n
    # boolean tensor plus its AND-ed copy.
    overlap = in_top_k.dot(in_top_k.T)
    assert(overlap.shape == (n, n))

    return overlap

In [140]:
def test_collaborative_filtering_with_genres(ratings, p, sim_measure='cosine', k=5, dim=2, top_k=3,
                                             genre_weight=0.1):
    '''
    Same evaluation as plain collaborative filtering, but the user-user
    similarity is boosted by how many favourite genres two users share.

    :param ratings: Ratings matrix, where ratings[i][j] represents
    user i's rating for movie j.
    :param p: Fraction of data to use as test-set
    :param sim_measure: Similarity measure to use ('cosine' or 'knn',
    case-insensitive)
    :param k: In case of kNN, value of k to use (ignored for 'cosine')
    :param dim: In case of kNN, value of dim to use (ignored for 'cosine')
    :param top_k: Top-k genres will be considered
    :param genre_weight: Factor applied to the genre-overlap counts before
    they are added to the base similarity. Defaults to 0.1.

    :return: RMSE error of predictions
    :raises ValueError: if sim_measure is not a known measure
    '''
    # Get test indices
    valid_ind = np.where(ratings != INVALID)
    N = valid_ind[0].shape[0]
    test_subset = np.random.choice(np.arange(N), int(p * N), replace=False)
    test_ind = valid_ind[0][test_subset], valid_ind[1][test_subset]
    num_test = test_subset.shape[0]

    # Make the test indices invalid
    train_ratings = np.array(ratings)
    train_ratings[test_ind] = INVALID

    # Compute the genre similarity matrix (from training data only)
    genre_sim = genre_similarity_matrix_by_overlap(train_ratings, top_k=top_k)

    # Compute similarity matrix
    measure = sim_measure.lower()
    if measure == 'cosine':
        sim = similarity_matrix_cosine(train_ratings)
    elif measure == 'knn':
        sim = similarity_matrix_knn(train_ratings, k=k, dim=dim)
    else:
        raise ValueError('Unknown similarity measure {}'
                         .format(sim_measure))

    # Combine the regular similarity and the genre similarity
    sim += genre_weight * genre_sim

    pred_ratings = np.zeros((num_test,))

    # Predict every held-out (user, movie) pair and compute the error
    true_ratings = ratings[test_ind]
    for i, (u, m) in enumerate(zip(*test_ind)):
        pred_ratings[i] = get_rating(u, m, train_ratings, sim)

    pred_ratings = pred_ratings.clip(1, 5)
    return RMSE(true_ratings, pred_ratings)

In [141]:
# Evaluate on the full ratings matrix. For a quicker run, use a slice instead,
# e.g. R = true_ratings[:1500, :3000]
R = true_ratings

In [142]:
# Genre-augmented collaborative filtering: cosine similarity plus overlap of each
# user's top-7 genres, 20% of observed ratings held out. k=40 is ignored for 'cosine'.
%time test_collaborative_filtering_with_genres(ratings=R, p=0.2, sim_measure='cosine', k=40, top_k=7)

CPU times: user 2min 11s, sys: 3.71 s, total: 2min 15s
Wall time: 2min 21s


0.97799486945741843

In [144]:
# Plain cosine collaborative filtering on the same R, presumably as the baseline for
# the genre-augmented run above. Each call draws its own random split, so the two
# RMSE values are not measured on identical test sets.
%time test_collaborative_filtering(ratings=R, p=0.2, sim_measure='cosine', k=40)

CPU times: user 1min 59s, sys: 1.76 s, total: 2min 1s
Wall time: 2min 6s


0.97446782978933677