#### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split, KFold
import time
from scipy.stats import spearmanr

#### Import Data

In [2]:
ratings = pd.read_csv('../../ratings.csv')

#### Make Subsets of Data for Part I

3 subsets: 500 users / 20 books, 2000 users / 50 books, 10000 users / 100 books. Each has a train and test set.

In [3]:
def pick_users_books(df, num_users, num_books):
    user_counts = pd.DataFrame(df.user_id.value_counts()).sort_values('user_id', ascending=False)
    top_10K_users = list(user_counts[0:num_users].index)
    user_filtered_df = df[df.user_id.isin(top_10K_users)]
    filtered_book_counts = pd.DataFrame(user_filtered_df.book_id.value_counts()).sort_values('book_id', 
                                                                                             ascending = False)
    top_100_filtered_books = list(filtered_book_counts[0:num_books].index)
    filtered_df = user_filtered_df[user_filtered_df.book_id.isin(top_100_filtered_books)]
    train, test = train_test_split(filtered_df, test_size = 0.2, random_state=42)
    return train, test
    
def get_all_subsets(df):
    train_500_20, test_500_20 = pick_users_books(df, 500, 20)
    train_2000_50, test_2000_50 = pick_users_books(df, 2000, 50)
    train_10000_100, test_10000_100 = pick_users_books(df, 10000, 100)
    return train_500_20, test_500_20, train_2000_50, test_2000_50, train_10000_100, test_10000_100

In [4]:
train_500_20, test_500_20, train_2000_50, test_2000_50, train_10000_100, test_10000_100 = get_all_subsets(ratings)

#### Implement Matrix Factorization

In [5]:
def preprocess(X_train, X_test):
    
    # create user and book indices starting from 0
    mappings_user = pd.DataFrame({'user_id': sorted(X_train.user_id.unique()), 
                              'user_idx': range(len(X_train.user_id.unique()))})
    mappings_book = pd.DataFrame({'book_id': sorted(X_train.book_id.unique()), 
                              'book_idx': range(len(X_train.book_id.unique()))})
    X_train = pd.merge(X_train, mappings_user, on='user_id')
    X_train = pd.merge(X_train, mappings_book, on='book_id')
    X_test = pd.merge(X_test, mappings_user, on='user_id')
    X_test = pd.merge(X_test, mappings_book, on='book_id')
    
    # create user-item matrix for training data and find non-zero values
    M = np.array(X_train.pivot_table(index = 'user_idx', columns='book_idx', values='rating', fill_value=0))
    mask = M != 0
    
    # use fake data to make test matrix that matches the size of the train matrix
    fake_book = pd.DataFrame({'user_id': sorted(X_train.user_id.unique()), 'rating': 0,
                          'user_idx': range(len(X_train.user_id.unique())), 'book_id': 'XXX', 'book_idx': 100})
    fake_user = pd.DataFrame({'book_id': sorted(X_train.book_id.unique()), 'rating': 0,
                          'book_idx': range(len(X_train.book_id.unique())), 'user_id': 'XXX', 'user_idx': 10000000000})
    X_test = pd.concat([X_test, fake_book, fake_user])
    M_test = X_test.pivot_table(index = 'user_idx', columns='book_idx', values='rating', fill_value=0)
    M_test.drop(100, axis=1, inplace=True)
    M_test.drop(10000000000, axis=0, inplace=True)
    
    # subtract off user means
    means_list = []
    for row in range(M.shape[0]):
        n_ratings = len(np.where(M[row,:] != 0)[0])
        means_list.append(np.sum(M[row,:]) / n_ratings)
    means = np.array(means_list).reshape(-1,1)
    M_norm = (M - means) * mask
    
    return (M_norm, means, np.array(M_test))

In [6]:
def update(R, mask, U, V, alpha, E): # performs one iteration of updating U and V matrices
    U_new = U + (alpha * np.dot(E, V))
    V_new = V + (alpha * np.dot(E.transpose(), U))
    return (U_new, V_new)

In [7]:
def calc_error(R, mask, U, V): 
    # calculates error matrix - difference between prediction and true rating where true rating exists, 0 otherwise
    E = (R - np.dot(U, V.transpose())) * mask
    return E

In [8]:
def calc_loss(E): # calculate value of loss function based on error matrix
    J = 0.5 * np.sum(np.square(E))
    return J

In [9]:
def mat_fact(R, d, alpha):
    # initialize U and V
    U = np.random.randn(R.shape[0], d)
    V = np.random.randn(R.shape[1], d)
    
    # calculate error and loss
    mask = R != 0
    E = calc_error(R, mask, U, V)
    J_prev = calc_loss(E)
    J_ratio = 1
    
    # while not converged, update U and V and recalculate error and loss
    while np.abs(J_ratio) > .00001:
        U, V = update(R, mask, U, V, alpha, E)
        E = calc_error(R, mask, U, V)
        J = calc_loss(E)
        J_ratio = (J_prev - J) / J_prev
        J_prev = J
            
    return (U, V)

In [10]:
def calc_RMSE(M_test, preds): # calculate RMSE using only observed entries
    mask_test = M_test != 0
    preds_masked = mask_test * preds
    rmse = np.sqrt(np.sum(np.sum(np.square(preds_masked - M_test))) / np.sum(np.sum(mask_test)))
    return rmse

#### Grid-Search Parameters
Use only biggest subset

In [11]:
# d_values = [6, 8, 10, 12, 14, 16, 18, 20]
# alpha_values = [0.00005, 0.0001, 0.00015, 0.0002, 0.00025, 0.0003]

In [12]:
# def overall(data, d, alpha):
#     k_fold = KFold(n_splits=3)
#     rmses = []
#     for train_indices, test_indices in k_fold.split(data):
#         X_train = data.iloc[train_indices]
#         X_test = data.iloc[test_indices]
#         M_norm, means, X_test = preprocess(X_train, X_test)
#         U, V = mat_fact(M_norm, d, alpha)
#         preds = np.dot(U, V.transpose()) + means
#         rmse = calc_RMSE(X_test, preds)
#         rmses.append(rmse)
#     return np.mean(rmses)

In [13]:
# grid_results = {}
# for d in d_values:
#     print('Current d value: ', d)
#     for alpha in alpha_values:
#         print('Current alpha value: ', alpha)
#         rmse = overall(train_10000_100, d, alpha)
#         grid_results[(d, alpha)] = rmse

In [14]:
# print('The best parameters were {}, with average RMSE {}.'.format(min(pd.DataFrame(grid_results, index=[0])), 
#                                         grid_results[min(pd.DataFrame(grid_results, index=[0]))]))

# RESULT: The best parameters were (6, 5e-05), with average RMSE 0.9587896929775073.

#### Train 3 Subsets to Observe Scalability

In [15]:
def calc_MAE(M_test, preds): # calculate MAE using only observed entries
    mask_test = M_test != 0
    preds_masked = mask_test * preds
    mae = np.sum(np.sum(preds_masked - M_test)) / np.sum(np.sum(mask_test))
    return mae

In [16]:
def calc_spearman(M_test, preds): # calculate spearman coefficient using only observed entries
    spearmans = []
    for i in range(len(M_test)):
        mask = M_test[i,:] != 0
        if sum(mask) > 1: # can't calculate if there's only one item rated
            M_test_mask = M_test[i, mask]
            preds_mask = preds[i, mask]
            spearman = spearmanr(M_test_mask, preds_mask)[0]
            if np.isnan(spearman) == False: # spearman is NaN if all true ratings are the same! exclude these
                spearmans.append(spearman)
    return np.mean(spearmans)

In [17]:
def train_test(X_train, X_test):
    M_norm, means, X_test = preprocess(X_train, X_test)
    start_time = time.time()
    U, V = mat_fact(M_norm, 6, 5e-05)
    end_time = time.time()
    preds = np.dot(U, V.transpose()) + means
    rmse = calc_RMSE(X_test, preds)
    mae = calc_MAE(X_test, preds)
    spearman = calc_spearman(X_test, preds)
    return (end_time-start_time, rmse, mae, spearman)

In [18]:
subsets = [(train_500_20, test_500_20, 500, 20), (train_2000_50, test_2000_50, 2000, 50), 
           (train_10000_100, test_10000_100, 10000, 100)]

In [19]:
for (X_train, X_test, n_users, n_items) in subsets:
    elapsed_time, rmse, mae, spearman = train_test(X_train, X_test)
    print('With a subset of {} users and {} items, training took place in {} seconds and gave an RMSE of {}, an MAE of {}, and an average Spearman correlation of {}.'.format(n_users, n_items, elapsed_time, rmse, mae, spearman))

With a subset of 500 users and 20 items, training took place in 0.15121197700500488 seconds and gave an RMSE of 0.9439499598101843, an MAE of -0.05204821267857118, and an average Spearman correlation of -0.01074360085755853.
With a subset of 2000 users and 50 items, training took place in 0.3641390800476074 seconds and gave an RMSE of 0.9591612401078207, an MAE of 0.009276199057852564, and an average Spearman correlation of -0.013477897587679134.
With a subset of 10000 users and 100 items, training took place in 1.823033094406128 seconds and gave an RMSE of 0.9491814761445283, an MAE of 0.0029730844555269187, and an average Spearman correlation of 0.004043909768605445.
