In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.sparse import csc_matrix, csr_matrix
import Utilities as u

In [3]:
df_sparse = u.load_movielens_sparse()

# Latent Factors, Taken from Chapter 3

In [4]:
def construct_factors(df_sparse,num_factors):
    '''Constructs the num_factors user factors and item factors for the sparse matrix'''
    m = df_sparse.shape[0]
    n = df_sparse.shape[1]
    U = np.random.rand(m,num_factors)
    V = np.random.rand(n,num_factors)
    return U,V

def error(df_sparse,U,V):
    '''Calculates the matrix portion of the error term'''
    d = df_sparse.nonzero()
    rows = d[0].reshape(1,-1).flatten()
    cols = d[1].reshape(1,-1).flatten()
    e = df_sparse[d] - np.matmul(U,V.T)[d]
    e = np.asarray(e).flatten()
    e = csc_matrix((e,(rows,cols)))
    return e

def frobenius_norm(df_sparse,U,V):
    e = error(df_sparse,U,V)
    e = e.power(2)
    return e.sum()

def cost_function(df_sparse,U,V,l):
    '''Calculates the cost function to minimize, l is the regularization term'''
    e = frobenius_norm(df_sparse,U,V)
    usum = np.square(U).sum()
    vsum = np.square(V).sum()
    J = 0.5 * e + (l/2)*usum + (l/2)*vsum
    return J

def update_factors(df_sparse,U,V,l,alpha):
    '''alpha is learning rate, l is regularization parameter'''
    nz = df_sparse.nonzero()
    E = error(df_sparse,U,V)
    U_temp = U*(1-alpha * l) + alpha * (E * V)
    V_temp = V*(1-alpha * l) + alpha * (E.T * U)
    return U_temp,V_temp

def fit(df_sparse,num_factors,learning_rate,regularization_rate):
    U,V = construct_factors(df_sparse,num_factors)
    J = cost_function(df_sparse,U,V,regularization_rate)
    prev = J
    iter_ctr = 0
    while True:
        iter_ctr += 1
        U,V = update_factors(df_sparse,U,V,regularization_rate,learning_rate)
        J = cost_function(df_sparse,U,V,regularization_rate)
        pct_change = J/prev-1
        if abs(pct_change) < 0.001 or iter_ctr > 1000:
            break
        prev = J
    return U,V
    

In [None]:
U,V = fit(df_sparse,10,0.0001,0.8)
pred_rat = np.matmul(U,V.T)

In [None]:
e = error(df_sparse,U,V)

In [5]:
train,test = u.train_test_split(df_sparse,10)

U,V = fit(train,10,0.0001,0.8)
pred_rat = np.matmul(U,V.T)

# Calculating Error Rates

In [18]:
def mse(err):
    '''Calculates mean squared error of a sparse matrix'''
    squared_error = err.power(2)
    s = squared_error.sum()
    n = err.nnz
    return s/n

def rmse(err):
    '''Calculates root mean squared error of a sparse matrix'''
    m = mse(err)
    return np.sqrt(m)

def mae(err):
    abs_err = err_test.multiply(err_test.sign())
    n = err.nnz
    return abs_err.sum()/n
    

In [19]:
err_train = error(train,U,V)
err_test = error(test,U,V)
print(rmse(err_train))
print(rmse(err_test))
print(mae(err_train))
print(mae(err_test))

0.9139774177
0.955594162026
0.0826120435754
0.743541436996
