In [13]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
import scipy.sparse as sparse

In [14]:
# Read the ratings csv file into a pandas Dataframe
filename = '../../data/ml-20m/ratings.csv'
# filename = '../../data/ml-latest-small/ratings.csv'
df = pd.read_csv(filename)

n_users = df['userId'].unique().shape[0]
n_items = df['movieId'].unique().shape[0]
print("Number of unique users: %d" % n_users)
print("Number of unique movies: %d" % n_items)


Number of unique users: 138493
Number of unique movies: 26744


In [15]:
def split_data(df):
    """ Split the data into training, validation and test partitions by random sampling.

        80% of the data is randomly sampled to be the training partition.
        10% is held out as a validation dataset to tune the hyperparameters.
        10% is held out as a test partition to test the final performance of the model.

        Args
            df: pandas dataframe object containing the dataset

        Returns
            df_train: Dataframe corresponding to training partition
            df_valid: Dataframe corresponding to validation partition
            df_test: Dataframe corresponding to test partition
    """
    random_seed = 1
    df_train = df.sample(frac=0.8, random_state=random_seed)
    df_rem = df.loc[~df.index.isin(df_train.index)]
    df_valid = df_rem.sample(frac=0.5, random_state=random_seed)
    df_test = df_rem.loc[~df_rem.index.isin(df_valid.index)]
#     logger.info("Shape of training dataframe: " + str(df_train.shape))
#     logger.info("Shape of validation dataframe: " + str(df_valid.shape))
#     logger.info("Sahpe of test dataframe: " + str(df_test.shape))

    return df_train, df_valid, df_test


def create_sparse_coo_matrix(df, n_users, n_items, movie_dict):
    """ Create a scipy sparse coo matrix from the given dataframe 

        Args
            df: Dataframe object to be converted to sparse matrix
            n_users: Number of rows in the sparse matrix
            n_items: Number of columns in the sparse matrix
            movie_dict: Dictionary mapping the movies in the dataset to a movie id

        Returns
            sparse_matrix_coo (scipy.sparse.coo_matrix): Sparse matrix in COO form  
    """

    # Map the movie_ids in the data to the new movie_ids given by the dictionary movie_dict
    movie_id_list = list(map(lambda x: movie_dict[x], df['movieId'].tolist()))
    # Map the user_id in the dataframe to userid - 1 [to account for zero based indexing]
    user_id_list = list(map(lambda x: x - 1, df['userId'].tolist()))
    sparse_matrix_coo = sparse.coo_matrix((df['rating'].tolist(),(user_id_list, movie_id_list)),shape=(n_users,n_items))
#     logger.debug("Shape of created sparse matrix: " + str(sparse_matrix_coo.shape))
#     logger.debug("Number of non_zero elements in the sparse matrix: " + str(sparse_matrix_coo.nnz))
#     logger.debug("Number of entries in the input dataframe:[should match the number of non zero entries in sparse matrix] " + str(df.shape[0]))
    return sparse_matrix_coo

In [16]:
df_train, df_valid, df_test = split_data(df)

In [17]:
# README file for the dataset: http://files.grouplens.org/datasets/movielens/ml-20m-README.html
# User-ids are in the range (1, 138493). We just subract 1 from each userId to convert the range to (0,138492)
# Total number of movies are 27278 but the the range of movieIds is bigger than (1,27278)
# We need to map the movieIds to the range (0,27277)
# Only movies with at least one rating or tag are included in the dataset. As we see above, the number of unique movies
# for which we have atleast one rating is 26744 
ind = 0
movie_list = [] # List which is reverse of movie_dict, contains original movieId at index 'new id'
movie_dict = {}   # Dictionary from original movieId to new id
for movieId in df['movieId'].unique():
    movie_list.append(movieId)
    movie_dict[movieId] = ind
    ind += 1


In [18]:
# Create sparse matrix for the training, validation and test data
sparse_train_coo = create_sparse_coo_matrix(df_train, n_users, n_items, movie_dict)
sparse_valid_coo = create_sparse_coo_matrix(df_valid, n_users, n_items, movie_dict)
sparse_test_coo = create_sparse_coo_matrix(df_test, n_users, n_items, movie_dict)

In [19]:
# Convert sparse matrices to CSR form
sparse_train_csr = sparse_train_coo.tocsr()
sparse_valid_csr = sparse_valid_coo.tocsr()
sparse_test_csr = sparse_test_coo.tocsr()

In [20]:
# Ignore
valid_data = sparse_valid_csr.data
print(type(valid_data))
print(valid_data.shape)

<class 'numpy.ndarray'>
(2000026,)


In [21]:
# Global Average
global_avg = sparse_train_csr.sum()/sparse_train_csr.nnz

actual_valid = sparse_valid_csr.data
pred_valid = np.ones(actual_valid.shape)
pred_valid *= global_avg

valid_mse = mean_squared_error(pred_valid, actual_valid)

actual_test = sparse_test_csr.data
pred_test = np.ones(actual_test.shape)
pred_test *= global_avg

test_mse = mean_squared_error(pred_test, actual_test)

print("Global Average Baseline: Validation MSE: " + str(valid_mse))
print("Global Average Baseline: Test MSE: " + str(test_mse))

Global Average Baseline: Validation MSE: 1.10685057249
Global Average Baseline: Test MSE: 1.10811437239


In [22]:
# User Average Baseline
data = sparse_train_csr.data
indices = sparse_train_csr.indices
indptr = sparse_train_csr.indptr

user_sum = sparse_train_csr.sum(axis=1)

data_useravg = np.empty(shape=(indptr.shape[0] - 1,),dtype=np.float64)
for user_num in range(indptr.shape[0] - 1):
    data_useravg[user_num] = user_sum[user_num,0] / (indptr[user_num + 1] - indptr[user_num] + 1e-9)
    
print(data_useravg.shape)
print(data_useravg[:10])

indptr_valid = sparse_valid_csr.indptr
pred_valid_ua = np.empty(shape=actual_valid.shape,dtype=np.float64)

for user_num in range(indptr_valid.shape[0] - 1):
    pred_valid_ua[indptr_valid[user_num]: indptr_valid[user_num + 1]] = data_useravg[user_num]
    
indptr_test = sparse_test_csr.indptr
pred_test_ua = np.empty(shape=actual_test.shape,dtype=np.float64)

for user_num in range(indptr_test.shape[0] - 1):
    pred_test_ua[indptr_test[user_num]: indptr_test[user_num + 1]] = data_useravg[user_num]


ua_valid_mse = mean_squared_error(pred_valid_ua, actual_valid)
ua_test_mse = mean_squared_error(pred_test_ua, actual_test)

print("User Average Baseline: Validation MSE: " + str(ua_valid_mse))
print("User Average Baseline: Test MSE: " + str(ua_test_mse))

(138493,)
[ 3.70143885  3.96        4.13548387  3.65217391  4.29411765  3.63157895
  3.31390135  3.76271186  3.11111111  3.93333333]
User Average Baseline: Validation MSE: 0.930433018601
User Average Baseline: Test MSE: 0.930988017135


In [23]:
# Movie Average Baseline
sparse_train_transpose = sparse_train_coo.transpose().tocsr()
data = sparse_train_transpose.data
indices = sparse_train_transpose.indices
indptr = sparse_train_transpose.indptr

print(indptr.shape)

movie_sum = sparse_train_transpose.sum(axis=1)

movieavg = np.empty(shape=(indptr.shape[0] - 1,),dtype=np.float64)
for movie in range(indptr.shape[0] - 1):
    movieavg[movie] = movie_sum[movie,0] / (indptr[movie + 1] - indptr[movie] + 1e-9)
    
print(movieavg.shape)
print(movieavg[:10])

pred_valid_ma = []
for j in sparse_valid_coo.col:
    pred_valid_ma.append(movieavg[j])
    
pred_test_ma = []
for j in sparse_test_coo.col:
    pred_test_ma.append(movieavg[j])
    
actual_valid_ma = sparse_valid_coo.data
actual_test_ma = sparse_test_coo.data
    
ma_valid_mse = mean_squared_error(pred_valid_ma, actual_valid_ma)
ma_test_mse = mean_squared_error(pred_test_ma, actual_test_ma)

print("Movie Average Baseline: Validation MSE: " + str(ma_valid_mse))
print("Movie Average Baseline: Test MSE: " + str(ma_test_mse))

(26745,)
(26744,)
[ 3.20721102  3.94608632  3.89419012  4.05198206  4.33258537  3.4085321
  3.53753338  3.86516616  3.49412456  4.18969115]
Movie Average Baseline: Validation MSE: 0.888472832341
Movie Average Baseline: Test MSE: 0.889690036491


In [24]:
# Adjusted Average Method
ubias = data_useravg - global_avg
mbias = movieavg - global_avg

print(ubias[:5])
print(mbias[:5])

pred_valid_aa = []
for (i,j) in zip(sparse_valid_coo.row, sparse_valid_coo.col):
    pred_valid_aa.append(global_avg + ubias[i] + mbias[j])
    
pred_test_aa = []
for (i,j) in zip(sparse_test_coo.row, sparse_test_coo.col):
    pred_test_aa.append(global_avg + ubias[i] + mbias[j])
    
# print(pred_valid_aa[:10])
# print(actual_valid[:10])
# print(pred_test_aa[:10])
    
aa_valid_mse = mean_squared_error(pred_valid_aa, actual_valid_ma)
aa_test_mse = mean_squared_error(pred_test_aa, actual_test_ma)

print("Adjusted Average Baseline: Validation MSE: " + str(aa_valid_mse))
print("Adjusted Average Baseline: Test MSE: " + str(aa_test_mse))

[ 0.175892    0.43445315  0.60993702  0.12662706  0.76857079]
[-0.31833583  0.42053947  0.36864327  0.52643521  0.80703852]
Adjusted Average Baseline: Validation MSE: 0.775389573398
Adjusted Average Baseline: Test MSE: 0.775445696486
