In [1]:
import numpy as np
import scipy
from scipy import spatial
from sklearn.preprocessing import scale
import matplotlib.pyplot as plt
import time
from lmafit import lmafit_mc_adp

In [2]:
# Sentinel stored in the ratings matrix for a missing (unobserved) rating.
# The functions below treat every entry equal to this value as "not rated"
# and also write it over held-out test entries to hide them from training.
INVALID = 0.0

In [3]:
def RMSE(x, y):
    '''
    Root-mean-square error between two equally shaped arrays.

    :param x: Array (or array-like) of values.
    :param y: Array (or array-like) with the same shape as ``x``.
    :return: sqrt(mean((x - y) ** 2)) taken over every element.
    :raises AssertionError: if the shapes of ``x`` and ``y`` differ.
    '''
    # Cast to float first: subtracting unsigned integer arrays would wrap
    # around instead of producing negative differences.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    assert x.shape == y.shape, \
        'shape mismatch: {} vs {}'.format(x.shape, y.shape)
    diff = x - y
    # Divide by the total element count (not just the first axis) so the
    # result is a true RMSE for inputs of any dimensionality.
    return np.linalg.norm(diff) / np.sqrt(diff.size)

def similarity_matrix_cosine(X):
    '''
    Pairwise cosine similarity between the rows of a ratings matrix.

    :param X: Ratings matrix. X[i, j] represents the rating of user i for item j.
    :return: n x n matrix whose (i, j) entry is one minus the cosine
             distance between rows i and j of X
    '''
    distance = scipy.spatial.distance
    # pdist yields the condensed upper triangle; squareform expands it to a
    # full symmetric matrix with zeros on the diagonal.
    condensed = distance.pdist(X, 'cosine')
    full = distance.squareform(condensed)
    return 1 - full

def similarity_matrix_knn(X, k, dim=2):
    '''
    Build a k-nearest-neighbour similarity matrix over the rows of X.

    :param X: Ratings matrix. X[i, j] represents the rating of user i for item j.
    :param k: Number of nearest neighbours kept per row; 1 <= k <= n.
    :param dim: Order p of the Minkowski norm handed to
                ``spatial.distance_matrix`` (2 gives Euclidean distance).
    :return: n x n KNN similarity matrix. sim[i, j] is 1/k when row j is one
             of the k rows closest to row i and 0 otherwise, so every row
             sums to 1. Row i is at distance 0 from itself, so it normally
             counts as one of its own k neighbours.
    :raises ValueError: if k < 1
    :raises IndexError: if k exceeds the number of rows
    '''
    n = X.shape[0]
    if k < 1:
        raise ValueError('k must be at least 1, got {}'.format(k))
    if k > n:
        raise IndexError('Only {} points, cannot have {} neighbors'
                         .format(n, k))
    sim = np.zeros((n, n))
    dist = spatial.distance_matrix(X, X, dim)
    # Partition each row around position k - 1: afterwards the first k
    # indices are exactly the k smallest distances (in arbitrary order).
    # Partitioning around -k would only guarantee the first n - k entries
    # are the smallest ones, so slicing [:k] from it picks arbitrary points.
    nearest = np.argpartition(dist, k - 1, axis=1)[:, :k]
    sim[np.arange(n)[:, None], nearest] = 1.0 / k
    return sim

In [4]:
def get_rating(user_id, movie_id, ratings, sim):
    '''
    Predict one user's rating for one movie as a similarity-weighted
    average of the ratings that movie received.

    :param user_id: Row index of the user in ``ratings`` and ``sim``.
    :param movie_id: Column index of the movie in ``ratings``.
    :param ratings: Ratings matrix; entries equal to INVALID are ignored.
    :param sim: Similarity matrix; row ``user_id`` supplies the weights.
    :return: Weighted sum of the valid ratings divided by the total weight
             when that total is positive; otherwise the unnormalised
             weighted sum (0 when nobody has rated the movie).
    '''
    movie_ratings = ratings[:, movie_id]

    # Use only valid ratings
    rated = movie_ratings != INVALID
    weights = sim[user_id, :][rated]
    observed = movie_ratings[rated]

    weighted_sum = weights.dot(observed)
    total = weights.sum()
    # Normalise only when it is safe to do so; a zero total would divide
    # by zero.
    return weighted_sum / total if total > 0 else weighted_sum

In [5]:
def test_collaborative_filtering(ratings, p, sim_measure='cosine', k=5, dim=2):
    '''
    Hold out a random fraction of the observed ratings, predict them with
    similarity-weighted collaborative filtering and report the error.

    :param ratings: Ratings matrix, where ratings[i][j] represents
    user i's rating for movie j.
    :param p: Fraction of data to use as test-set
    :param sim_measure: Similarity measure to use ('cosine' or 'knn',
    case-insensitive)
    :param k: In case of kNN, value of k to use
    :param dim: In case of kNN, value of dim to use

    :return: RMSE error of predictions
    '''
    # Pick the held-out entries among the observed (non-INVALID) ones
    rows, cols = np.where(ratings != INVALID)
    n_observed = rows.shape[0]
    held_out = np.random.choice(np.arange(n_observed), int(p * n_observed), replace=False)
    test_ind = rows[held_out], cols[held_out]

    # Training copy with the held-out entries blanked out
    train_ratings = np.array(ratings)
    train_ratings[test_ind] = INVALID

    # Similarities are computed from the training data only
    measure = sim_measure.lower()
    if measure == 'cosine':
        sim = similarity_matrix_cosine(train_ratings)
    elif measure == 'knn':
        sim = similarity_matrix_knn(train_ratings, k=k, dim=dim)
    else:
        raise ValueError('Unknown similarity measure {}'
                         .format(sim_measure))

    # One prediction per held-out (user, movie) pair
    predicted = np.array(
        [get_rating(u, m, train_ratings, sim) for u, m in zip(*test_ind)],
        dtype=float,
    )

    # Snap predictions onto the 1..5 integer scale before scoring
    predicted = predicted.clip(1, 5).round()
    return RMSE(ratings[test_ind], predicted)
        

def test_matrix_completion(ratings, p, k):
    '''
    Hold out a random fraction of the observed ratings, complete the matrix
    with LMaFit from the remaining ones and report the error on the
    held-out entries.

    :param ratings: Ratings matrix, where ratings[i][j] represents 
    user i's rating for movie j.
    :param p: Fraction of data to use as test-set
    :param k: Estimated rank of matrix to use for LMaFit
    
    :return: RMSE error of predictions
    '''
    # Shuffle the observed (non-INVALID) entries and split them
    rows, cols = np.where(ratings != INVALID)
    N = rows.shape[0]
    indices = np.arange(N)
    np.random.shuffle(indices)
    num_test = int(p * N)
    test_sel, train_sel = indices[:num_test], indices[num_test:]
    test_ind = rows[test_sel], cols[test_sel]
    train_ind = rows[train_sel], cols[train_sel]

    # Run LMaFit. Only the training coordinates and their values are passed
    # in, so no masked copy of the full matrix is needed.
    # NOTE(review): the two returned factors are assumed to multiply to the
    # completed matrix, and the trailing None to select default options -
    # confirm against the lmafit module.
    a, b, _ = lmafit_mc_adp(ratings.shape[0], ratings.shape[1], k, train_ind, ratings[train_ind], None)
    completed = a.dot(b)

    # Snap predictions onto the 1..5 integer scale, then score the
    # held-out entries only
    completed = completed.clip(1, 5).round()
    return RMSE(ratings[test_ind], completed[test_ind])

In [6]:
# Load the ratings matrix from a .npy file located relative to the
# notebook's working directory.
# NOTE(review): presumably rows are users, columns are movies and 0.0
# (INVALID) marks a missing rating - confirm against how the file was built.
true_ratings = np.load('1M_ratings_np.npy')

In [8]:
# Time collaborative filtering with cosine similarity on a 20% hold-out.
# k is only read by the 'knn' branch, so k=40 has no effect on this run.
# No random seed is set in the cells shown, so the reported RMSE will vary
# slightly between runs.
%time test_collaborative_filtering(ratings=true_ratings[:, :], p=0.2, sim_measure='cosine', k=40)

CPU times: user 1min 49s, sys: 1.32 s, total: 1min 50s
Wall time: 1min 52s


1.0144727694719067

In [7]:
# Time LMaFit matrix completion on a 20% hold-out; k=20 is the rank estimate.
# No random seed is set in the cells shown, so the reported RMSE will vary
# slightly between runs.
%time test_matrix_completion(ratings=true_ratings[:, :], p=0.2, k=20)

CPU times: user 2min 9s, sys: 22.3 s, total: 2min 32s
Wall time: 1min 23s


0.99656660590248558