In [1]:
import numpy as np
import pandas as pd
np.random.seed(42)
from sklearn.metrics import mean_squared_error
import time


In [2]:
names = ['user_id', 'movie_id', 'rating', 'timestamp']
df = pd.read_csv('./ml-100k/u.data', sep='\t', names=names)
print(df.head())
print(df.shape)


   user_id  movie_id  rating  timestamp
0      196       242       3  881250949
1      186       302       3  891717742
2       22       377       1  878887116
3      244        51       2  880606923
4      166       346       1  886397596
(100000, 4)


In [3]:
n_users = df["user_id"].unique().shape[0]
n_movies = df["movie_id"].unique().shape[0]
ratings = np.zeros((n_users, n_movies))
for row in df.itertuples():
    ratings[row[1] - 1, row[2] - 1] = row[3]
print(f"ratings = {ratings}")
W = ratings.copy()
W[W > 0] = 1
print(f"W = {W}")
print(W.shape)


ratings = [[5. 3. 4. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 5. 0. ... 0. 0. 0.]]
W = [[1. 1. 1. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]
(943, 1682)


In [4]:
def train_test_split(ratings, s=100, r=1000):
    test = np.zeros(ratings.shape)
    train = ratings.copy()
    found = False

    l = []
    for movie in range(ratings.shape[1]):
        l.append(np.sum(ratings[:, movie] != 0))
    l = np.array(l)
    top_r = np.argsort(-l)[:r]
    while not found:
        for user in range(ratings.shape[0]):
            test_ratings = np.random.choice(top_r.nonzero()[0], size=s, replace=False)
            train[user, test_ratings] = 0.
            test[user, test_ratings] = ratings[user, test_ratings]
        found = True
        for movie in range(ratings.shape[1]):
            if np.all(train[:, movie] == 0):
                found = False
                break

    mean_imputed_ratings = train.copy()
    
    for col in range(train.shape[1]):
        non_zero_cols = train[train[:, col] != 0, col]
        mean_imputed_ratings[:, col][test[:, col] != 0] = non_zero_cols.mean()

    # Check if train and test sets are disjoint
    assert(np.all((train * test) == 0)) 
    return train, test, mean_imputed_ratings


In [5]:
def als_step(ratings, W, latent, fixed, lmd, cat="user", basic=False):
    
    n, k, d = latent.shape[0], latent.shape[1], fixed.shape[0]
    lambdaI = lmd * np.eye(k)
    if not basic:
        for i in range(latent.shape[0]):
            if cat == "user":
                W_i, x_i = W[i, :].reshape(1, W[i, :].shape[0]), ratings[i, :]                
            elif cat == "movie":
                W_i, x_i = W[:, i].reshape(1, W[:, i].shape[0]), ratings[:, i]
            latent[i, :] = np.linalg.solve(((fixed.T * W_i).dot(fixed) + lambdaI), (fixed.T * W_i).dot(x_i))
    else:
        fTf = fixed.T.dot(fixed)
        for i in range(latent.shape[0]):
            if cat == "user":
                x_i = ratings[i, :]                
            elif cat == "movie":
                x_i = ratings[:, i]
            latent[i, :] = np.linalg.solve((fTf + lambdaI), fixed.T.dot(x_i))


In [6]:
def update_u_v(ratings, W, users, movies, epochs=10, n_factors=5, lmd=10, basic=False, debug=False):
    epoch = 0
    print(f"Incremental epochs = {epochs}")
    while epoch < epochs:
        als_step(ratings, W, users, movies, lmd, "user", basic)
        als_step(ratings, W, movies, users, lmd, "movie", basic)
        epoch += 1


In [7]:
def get_predictions(users, movies):
    predictions = np.zeros((users.shape[0], movies.shape[0]))
    for i in range(users.shape[0]):
        for j in range(movies.shape[0]):
            predictions[i, j] = users[i, :].dot(movies[j, :])
    return predictions


In [8]:
def get_mse(predictions, truths, tests=None):
    if tests is not None:
        non_zero_predictions = predictions[tests.nonzero()].flatten()
        non_zero_truths = truths[tests.nonzero()].flatten()
    else:
        non_zero_predictions = predictions[truths.nonzero()].flatten()
        non_zero_truths = truths[truths.nonzero()].flatten()
    return mean_squared_error(non_zero_predictions, non_zero_truths)


In [9]:
def get_best_hyperparameters(train, test, mean_imputed_ratings, epochs_list, k_list=[40], lmd_list=[0.1], basic=False, debug=False):
    start_time = time.time()
    
    epochs_list.sort()

    n, d = train.shape

    best_hyper_and_error = {}
    best_hyper_and_error["k"] = k_list[0]
    best_hyper_and_error["lambda"] = lmd_list[0]
    best_hyper_and_error["epochs"] = 0
    best_hyper_and_error["train_error"] = np.inf
    best_hyper_and_error["test_error"] = np.inf
    best_hyper_and_error["mean_error"] = np.inf
    
    W = train.copy()
    W[W > 0] = 1
    
    for k in k_list:
        for lmd in lmd_list:
            print(f"k = {k}    lambda = {lmd}")
            train_error = []
            test_error = []
            mean_error = []

            users = np.random.random((n, k))
            movies = np.random.random((d, k))
            prev = 0
            for (i, epochs) in enumerate(epochs_list):
                if debug:
                    print(f"Total epochs = {epochs}")

                update_u_v(train, W, users, movies, epochs - prev, k, lmd, basic, debug)

                predictions = get_predictions(users, movies)

                train_error.append(get_mse(predictions, train))
                test_error.append(get_mse(predictions, test))
                mean_error.append(get_mse(predictions, mean_imputed_ratings, test))
                if debug:
                    print(f"Train error = {train_error[-1]}")
                    print(f"Test error = {test_error[-1]}")
                    print(f"Mean error = {mean_error[-1]}")
                prev = epochs
            min_test_error_index = np.argmin(test_error)
            if test_error[min_test_error_index] < best_hyper_and_error["test_error"]:
                best_hyper_and_error["k"] = k
                best_hyper_and_error["lambda"] = lmd
                best_hyper_and_error["epochs"] = epochs_list[min_test_error_index]
                best_hyper_and_error["train_error"] = train_error[min_test_error_index]
                best_hyper_and_error["test_error"] = test_error[min_test_error_index]
                best_hyper_and_error["mean_error"] = mean_error[min_test_error_index]
                if debug:
                    print("Current optimal hyperparameters are")
                    print(pd.Series(best_hyper_and_error))
            if debug:
                print(f"Time elapsed = {time.strftime('%Hh %Mm %Ss', time.gmtime(time.time() - start_time))}")
                print()
    return best_hyper_and_error


In [None]:
epochs_list = [1, 2, 5, 10]
k_list = [5, 10, 20, 40, 80]
lmd_list = [0.1, 2, 5, 10, 25, 50, 100]
train, test, mean_imputed_ratings = train_test_split(ratings, 100, 1000)

best_hyper_and_error = get_best_hyperparameters(train, test, mean_imputed_ratings, epochs_list, k_list, lmd_list, False, True)
print(best_hyper_and_error)


k = 5    lambda = 0.1
Total epochs = 1
Incremental epochs = 1
Train error = 0.7866717134258924
Test error = 0.9777389549871205
Mean error = 0.29828981025650775
Total epochs = 2
Incremental epochs = 1
Train error = 0.694584944798435
Test error = 0.961878999205918
Mean error = 0.3291409454061807
Total epochs = 5
Incremental epochs = 3
Train error = 0.6292339358546601
Test error = 0.9137805501109458
Mean error = 0.39100401025976644
Total epochs = 10
Incremental epochs = 5
Train error = 0.6113806263230467
Test error = 0.9026566105892969
Mean error = 0.42920858386947197
Current optimal hyperparameters are
k               5.000000
lambda          0.100000
epochs         10.000000
train_error     0.611381
test_error      0.902657
mean_error      0.429209
dtype: float64
Time elapsed = 00h 00m 03s

k = 5    lambda = 2
Total epochs = 1
Incremental epochs = 1
Train error = 0.8085990706358384
Test error = 0.9364964370884464
Mean error = 0.2458962175755709
Total epochs = 2
Incremental epochs = 1
Tr

Train error = 1.4640639342812618
Test error = 1.3614794964974006
Mean error = 0.6050634893280934
Time elapsed = 00h 00m 56s

k = 10    lambda = 100
Total epochs = 1
Incremental epochs = 1
Train error = 2.4090028220165935
Test error = 2.251952068792267
Mean error = 1.4604436838911699
Total epochs = 2
Incremental epochs = 1
Train error = 2.418732728078368
Test error = 2.265693065216821
Mean error = 1.4598358122107293
Total epochs = 5
Incremental epochs = 3
Train error = 2.4185585510771723
Test error = 2.2659634010967156
Mean error = 1.459826306634325
Total epochs = 10
Incremental epochs = 5
Train error = 2.41844509166565
Test error = 2.26587618448622
Mean error = 1.4597332695673604
Time elapsed = 00h 01m 01s

k = 20    lambda = 0.1
Total epochs = 1
Incremental epochs = 1
Train error = 0.5352089904973121
Test error = 1.4502504028803362
Mean error = 0.8297309508335018
Total epochs = 2
Incremental epochs = 1
Train error = 0.40542950509482184
Test error = 1.4252044645648354
Mean error = 0.89