In [1]:
import sys
import numpy as np
sys.path.append('../')
from tensor_factorization import initialize_factorization, D, fine_tune, Lambda
from tensor_factorization import evaluate
from FactorizationRatingsAprox import FactorizationRatingsAprox
from split_data import split_data
from sparse_array import NDSparseArray
from datetime import datetime
# sys.path.append('../') above adds the parent directory to the import path for the project-local modules

In [2]:
# File the trained model is written to (obj.to_file) and read back from
# (FactorizationRatingsAprox.from_file) further down; relative to this notebook.
model_path = "../factorization_movies_model.pkl"

In [3]:
# Full ratings CSV, plus the train/test CSVs that are loaded below.
# presumably split_data() produces the *_train / *_test files from dataset_path — TODO confirm
dataset_path = "../datasets/movies/ratings_small.csv"
train_path = "../datasets/movies/ratings_small_train.csv"
test_path = "../datasets/movies/ratings_small_test.csv"

In [4]:
# Split the ratings file; 0.99 is presumably the fraction kept for training — TODO confirm in split_data.
# NOTE(review): if the split is random and unseeded, every re-run of this cell changes the
# train/test files and therefore all metrics printed below.
split_data(dataset_path, 0.99)

In [4]:
# Load the CSV files as structured arrays of (user id, movie id, rating, timestamp).
# The train/test files are read from their first row; the full dataset skips one
# leading row (presumably a header — confirm against the file).
train_data, test_data = (
    np.loadtxt(csv_path, delimiter=",", skiprows=0, dtype="i4,i4,f,i4")
    for csv_path in (train_path, test_path)
)
all_data = np.loadtxt(dataset_path, delimiter=",", skiprows=1, dtype="i4,i4,f,i4")
train_data[:6]

array([(472, 2470, 4. ,  944321548), (150, 1198, 4. , 1114306013),
       (624, 2369, 3. , 1019128415), ( 19,  379, 3. ,  855193845),
       (450, 2542, 4.5, 1475737111), (362, 1208, 3.5, 1218567130)],
      dtype=[('f0', '<i4'), ('f1', '<i4'), ('f2', '<f4'), ('f3', '<i4')])

In [5]:
# Prepare the data
def transform_data(data):
    """Split rating records into parallel lists and bin timestamps by month.

    Each record is read positionally as
    ``(user id, movie id, rating, unix timestamp)``. The timestamp is replaced
    by a month index ``(year - 1995) * 12 + month`` (January 1995 is bin 1).
    The conversion is done in UTC so the bins are the same on every machine,
    whatever its local time zone is.

    Args:
        data: iterable of indexable records with at least four fields.

    Returns:
        Tuple ``(user_ids, movie_ids, ratings, timestamps)`` of equally long
        lists in input order; ``timestamps`` holds the month bins.
    """
    # Local import keeps this function self-contained; the notebook's import
    # cell only brings in `datetime`.
    from datetime import timezone

    _user_ids, _movie_ids, _ratings, _timestamps = [], [], [], []
    for line in data:
        _user_ids.append(line[0])
        _movie_ids.append(line[1])
        _ratings.append(line[2])
        # Put the timestamp into a "bin" of year and month only.
        moment = datetime.fromtimestamp(line[3], tz=timezone.utc)
        _timestamps.append((moment.year - 1995) * 12 + moment.month)

    return _user_ids, _movie_ids, _ratings, _timestamps

def make_sparse(_SP, _Y_shape):
    """Build an ``NDSparseArray`` of shape ``_Y_shape`` from parallel rating lists.

    Args:
        _SP: 4-tuple ``(user ids, movie ids, ratings, time bins)`` in the order
            returned by ``transform_data``.
        _Y_shape: shape passed straight to the ``NDSparseArray`` constructor.

    Returns:
        The array with ``array[user, movie, time_bin] = rating`` assigned for
        every position of the input lists.
    """
    users, movies, scores, bins = _SP
    sparse = NDSparseArray(_Y_shape)
    for pos, user in enumerate(users):
        sparse[user, movies[pos], bins[pos]] = scores[pos]
    return sparse

# Size the tensor from the full dataset so the train and test tensors share one shape.
user_ids, movie_ids, ratings, timestamps = transform_data(all_data)
Y_shape = [
    max(user_ids) + 10,   # user axis with spare slots; presumably room for new users — TODO confirm
    max(movie_ids) + 1,   # movie axis: ids are used directly as indices
    max(timestamps) + 1,  # time axis: month bins are used directly as indices
]
Y, Y_test = (
    make_sparse(transform_data(subset), Y_shape)
    for subset in (train_data, test_data)
)
Y[564, 1831, 71]  # 71 = (2000-1995) * 12 + 11

1.0

In [6]:
# Create new matrices for factorization
# NOTE(review): D(20, 50, 5) presumably sets the latent sizes of the three factors and
# Lambda(...) the per-factor regularisation weights — confirm in tensor_factorization.
# NOTE(review): if the initial values are random, seed the generator first so runs are repeatable.
U, M, C, S = initialize_factorization(Y, D(20, 50, 5), Lambda(0.1, 0.1, 0.1, 0.1))

In [7]:
# Define the mean absolute error (MAE) function
def get_mae(_U, _M, _C, _S, _Y_test: NDSparseArray):
    """Mean absolute error of the factorization on the entries of ``_Y_test``.

    For every index triple yielded by ``_Y_test.indexes()`` the stored value is
    compared with ``evaluate(_U, _M, _C, _S, i, j, k)``; the absolute
    differences are summed and divided by ``len(_Y_test.elements)``.

    Args:
        _U, _M, _C, _S: factorization components, forwarded to ``evaluate``.
        _Y_test: sparse array holding the reference ratings.

    Returns:
        The summed absolute error divided by the number of stored elements.

    Raises:
        ZeroDivisionError: if ``_Y_test.elements`` is empty.
    """
    count = len(_Y_test.elements)
    total_abs_error = 0
    for user, movie, period in _Y_test.indexes():
        actual = _Y_test[user, movie, period]
        predicted = evaluate(_U, _M, _C, _S, user, movie, period)
        total_abs_error += abs(actual - predicted)
    return total_abs_error / count

In [11]:
# Train on the training tensor for three passes, reporting MAE after each one.
la = Lambda(1e-6, 1e-7, 1e-6, 1e-6)  # Learning rate (per the original note; confirm Lambda's field meanings)


# Defined once at cell level: the value does not depend on the pass or on `s`,
# and the next cell reuses this same function.
def coef(s):
    return 0.01 * 1 / (30 ** 0.5)


for t in range(3):
    U, M, C, S = fine_tune(U, M, C, S, Y, coef, la)
    print("Train: ", get_mae(U, M, C, S, Y))
    print("Test: ", get_mae(U, M, C, S, Y_test))

0.7136140268383545 99003/99003     
Train:  0.6763020457318929
Test:  0.7255363793853257
0.6927259372758419 99003/99003     
Train:  0.664436973969987
Test:  0.7222579349242101
0.684305667138462 99003/99003      
Train:  0.6605550124565056
Test:  0.7312647203412331


In [12]:
# One more training pass on top of the loop in the previous cell, then re-report MAE.
# Relies on `coef` and `la` still being defined by that training cell, so it
# cannot run on a fresh kernel without it.
U, M, C, S = fine_tune(U, M, C, S, Y, coef, la)
print("Train: ", get_mae(U, M, C, S, Y))
print("Test: ", get_mae(U, M, C, S, Y_test))

0.6792340186429755 99003/99003     
Train:  0.6580793004566786
Test:  0.7190968772501566


In [13]:
# Save the trained factors as a pickled FactorizationRatingsAprox object
# The last index on Y's first (user) axis is passed as new_user_id;
# presumably it is the slot used for a user not present in the data — TODO confirm.
new_user_id = Y.shape[0] - 1
obj = FactorizationRatingsAprox(U, M, C, S, Y, new_user_id)
obj.to_file(model_path)

In [16]:
# Test that the model was saved correctly: the reloaded factors should give the
# same test MAE as the in-memory ones printed by the last training cell.
# NOTE(review): from_file presumably unpickles the file — only load model files you trust.
obj_load = FactorizationRatingsAprox.from_file(model_path)
get_mae(obj_load.U, obj_load.M, obj_load.C, obj_load.S, Y_test)

0.7190968772501566

In [15]:
# Get top 10 recommendations if these movies were watched
# Each entry is presumably [movie id, rating] — confirm against FactorizationRatingsAprox.evaluate.
# NOTE(review): this rebinds `ratings`, which an earlier cell set to the ratings of the full dataset.
# NOTE(review): the output saved with this cell is `KeyError: 2470`, i.e. the call below
# currently fails; the cause is not visible from this notebook and needs investigation.
ratings = [
    [1029, 3.0],
    [1061, 3.0],
    [1129, 2.0],
    [1172, 4.0],
    [1263, 2.0],
    [1287, 2.0],
    [1293, 2.0],
    [1339, 3.5]
]
# The list has 8 entries, so the [:10] slice passes all of them.
obj_load.evaluate(ratings[:10])

KeyError: 2470