In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
import numpy as np
from scipy.optimize import fmin_cg


def normalize_ratings(ratings):
    """
    Given an array of user ratings, subtract the mean of each product's ratings
    :param ratings: 2d array of user ratings
    :return: (normalized ratings array, the calculated means)
    """
    mean_ratings = np.nanmean(ratings, axis=0)
    return ratings - mean_ratings, mean_ratings


def cost(X, *args):
    """
    Cost function for low rank matrix factorization
    :param X: The matrices being factored (P and Q) rolled up as a contiguous array
    :param args: Array containing (num_users, num_products, num_features, ratings, mask, regularization_amount)
    :return: The cost with the current P and Q matrices
    """
    num_users, num_products, num_features, ratings, mask, regularization_amount = args

    # Unroll P and Q
    P = X[0:(num_users * num_features)].reshape(num_users, num_features)
    Q = X[(num_users * num_features):].reshape(num_products, num_features)
    Q = Q.T

    # Calculate current cost
    return (np.sum(np.square(mask * (np.dot(P, Q) - ratings))) / 2) + ((regularization_amount / 2.0) * np.sum(np.square(Q.T))) + ((regularization_amount / 2.0) * np.sum(np.square(P)))


def gradient(X, *args):
    """
    Calculate the cost gradients with the current P and Q.
    :param X: The matrices being factored (P and Q) rolled up as a contiguous array
    :param args: Array containing (num_users, num_products, num_features, ratings, mask, regularization_amount)
    :return: The gradient with the current X
    """
    num_users, num_products, num_features, ratings, mask, regularization_amount = args

    # Unroll P and Q
    P = X[0:(num_users * num_features)].reshape(num_users, num_features)
    Q = X[(num_users * num_features):].reshape(num_products, num_features)
    Q = Q.T

    # Calculate the current gradients for both P and Q
    P_grad = np.dot((mask * (np.dot(P, Q) - ratings)), Q.T) + (regularization_amount * P)
    Q_grad = np.dot((mask * (np.dot(P, Q) - ratings)).T, P) + (regularization_amount * Q.T)

    # Return the gradients as one rolled-up array as expected by fmin_cg
    return np.append(P_grad.ravel(), Q_grad.ravel())


def low_rank_matrix_factorization(ratings, mask=None, num_features=15, regularization_amount=0.01):
    """
    Factor a ratings array into two latent feature arrays (user features and product features)

    :param ratings: Matrix with user ratings to factor
    :param mask: A binary mask of which ratings are present in the ratings array to factor
    :param num_features: Number of latent features to generate for users and products
    :param regularization_amount: How much regularization to apply
    :return: (P, Q) - the factored latent feature arrays
    """
    num_users, num_products = ratings.shape

    # If no mask is provided, consider all 'NaN' elements as missing and create a mask.
    if mask is None:
        mask = np.invert(np.isnan(ratings))

    # Replace NaN values with zero
    ratings = np.nan_to_num(ratings)

    # Create P and Q and fill with random numbers to start
    np.random.seed(0)
    P = np.random.randn(num_users, num_features)
    Q = np.random.randn(num_products, num_features)

    # Roll up P and Q into a contiguous array as fmin_cg expects
    initial = np.append(P.ravel(), Q.ravel())

    # Create an args array as fmin_cg expects
    args = (num_users, num_products, num_features, ratings, mask, regularization_amount)

    # Call fmin_cg to minimize the cost function and this find the best values for P and Q
    X = fmin_cg(cost, initial, fprime=gradient, args=args, maxiter=3000)

    # Unroll the new P and new Q arrays out of the contiguous array returned by fmin_cg
    nP = X[0:(num_users * num_features)].reshape(num_users, num_features)
    nQ = X[(num_users * num_features):].reshape(num_products, num_features)

    return nP, nQ.T


def RMSE(real, predicted):
    """
    Calculate the root mean squared error between a matrix of real ratings and predicted ratings
    :param real: A matrix containing the real ratings (with 'NaN' for any missing elements)
    :param predicted: A matrix of predictions
    :return: The RMSE as a float
    """

    return np.sqrt(np.nanmean(np.square(real - predicted)))


In [8]:
raw_training_dataset_df = pd.read_csv('datasets/new_df.csv', sep=",", error_bad_lines=False, encoding="latin-1")

In [6]:
raw_testing_dataset_df = pd.read_csv('datasets/new_df_test.csv', sep=",", error_bad_lines=False, encoding="latin-1")

In [9]:
raw_training_dataset_df, raw_testing_dataset_df = train_test_split(raw_training_dataset_df, test_size=0.3)

In [10]:
ratings_training_df = pd.pivot_table(raw_training_dataset_df, index='user_id', columns='recipes_id', aggfunc=np.max).fillna(0)
ratings_testing_df = pd.pivot_table(raw_testing_dataset_df, index='user_id', columns='recipes_id', aggfunc=np.max).fillna(0)

In [12]:
U, M = low_rank_matrix_factorization(ratings_training_df.as_matrix(),
                                                                    num_features=10,
                                                                    regularization_amount=1.1)


Optimization terminated successfully.
         Current function value: 241.037496
         Iterations: 76
         Function evaluations: 128
         Gradient evaluations: 128


In [21]:
predicted_ratings = np.matmul(U, M)
print(predicted_ratings);

[[  1.34168813e-01   4.78652267e+00  -3.17905098e-02   1.12040046e+01
    2.19046540e-02  -4.69371396e-03   1.54779400e+01  -6.87777185e-02
    3.43434793e-02   1.05258809e-02   1.47988614e-01   3.79376158e+00
   -4.19282159e-02   4.66833512e+00   5.67816441e-02  -6.26108597e-04
    4.77105272e+00  -7.02655755e-03   6.60485399e-03  -2.84693745e-02]
 [ -1.29310268e-02   7.21510821e+00  -2.71602738e-03   1.77281589e-01
    8.57683943e+00  -4.47758140e-02   4.20563381e-02   2.06329364e-02
    7.85295306e-03   3.42230608e-02  -5.80101357e-02   3.62971439e+00
   -5.15471442e-02   7.38672144e-02   4.54680508e+00  -5.96991741e-03
    3.68538202e-02   1.26981366e-03   1.50992427e-03   1.49892678e-01]
 [ -1.11417807e-02  -1.27222380e-02   1.83586064e-02  -1.87716552e-03
    4.72393982e-02   2.90441621e+01   6.87114341e-03  -2.99544920e-03
   -1.94121871e-02   2.64864025e+01   3.60479360e-03  -6.59600417e-03
    6.86002987e-02  -7.82191877e-04   1.87161179e-03   3.87255492e+00
    1.00336510e-03

In [22]:
rmse_training = RMSE(ratings_training_df.as_matrix(),
                                                    predicted_ratings)
print(rmse_training);

0.245967470984


In [24]:
rmse_testing = RMSE(ratings_testing_df.as_matrix(),
                                                   predicted_ratings.reshape(9,16))

ValueError: cannot reshape array of size 180 into shape (9,16)