In [1]:
import sys
import numpy as np
sys.path.append('../')
from tensor_factorization import initialize_factorization, D, fine_tune, Lambda
from tensor_factorization import evaluate
from FactorizationRatingsAprox import FactorizationRatingsAprox
from split_data import split_data
from sparse_array import NDSparseArray
from datetime import datetime
# setting path

In [2]:
model_path = "../factorization_movies_model.pkl"

In [3]:
dataset_path = "../datasets/movies/ratings_small.csv"
train_path = "../datasets/movies/ratings_small_train.csv"
test_path = "../datasets/movies/ratings_small_test.csv"

In [4]:
split_data(dataset_path, 0.99)

In [5]:
# Load csv
train_data = np.loadtxt(train_path, delimiter=",", skiprows=0, dtype="i4,i4,f,i4")
test_data = np.loadtxt(test_path, delimiter=",", skiprows=0, dtype="i4,i4,f,i4")
all_data = np.loadtxt(dataset_path, delimiter=",", skiprows=1, dtype="i4,i4,f,i4")
train_data[:6]

array([(472, 2470, 4. ,  944321548), (150, 1198, 4. , 1114306013),
       (624, 2369, 3. , 1019128415), ( 19,  379, 3. ,  855193845),
       (450, 2542, 4.5, 1475737111), (362, 1208, 3.5, 1218567130)],
      dtype=[('f0', '<i4'), ('f1', '<i4'), ('f2', '<f4'), ('f3', '<i4')])

In [6]:
# Prepare the data
def transform_data(data):
    # Put timestamps into "bins" of year and month only
    _timestamps = [line[3] for line in data]
    _timestamps = [datetime.fromtimestamp(timestamp) for timestamp in _timestamps]
    _timestamps = [(timestamp.year-1995)*12+timestamp.month for timestamp in _timestamps]
    # _timestamps = [timestamp.hour for timestamp in _timestamps]

    # Load user Ids and movieIds
    _user_ids = [line[0] for line in data]
    _movie_ids = [line[1] for line in data]
    _ratings = [line[2] for line in data]

    return _user_ids, _movie_ids, _ratings, _timestamps

def make_sparse(_SP, _Y_shape):
    _user_ids, _movie_ids, _ratings, _timestamps = _SP
    # Create sparse array
    Y = NDSparseArray(_Y_shape)
    for i in range(len(_user_ids)):
        Y[_user_ids[i], _movie_ids[i], _timestamps[i]] = _ratings[i]

    return Y

user_ids, movie_ids, ratings, timestamps = transform_data(all_data)
Y_shape = [max(user_ids) + 10, max(movie_ids) + 1, max(timestamps) + 1]
Y = make_sparse(transform_data(train_data), Y_shape)
Y_test = make_sparse(transform_data(test_data), Y_shape)
Y[564, 1831, 71]  # 71 = (2000-1995) * 12 + 11

1.0

In [7]:
# Create new matrices for factorization
U, M, C, S = initialize_factorization(Y, D(20, 50, 5), Lambda(0.1, 0.1, 0.1, 0.1))

In [8]:
# Define average error functon
def get_mae(_U, _M, _C, _S, _Y_test: NDSparseArray):
    error_sum = 0
    n = len(_Y_test.elements)
    for i, j, k in _Y_test.indexes():
        rating = _Y_test[i, j, k]
        evalRating = evaluate(_U, _M, _C, _S, i, j, k)
        error = abs(rating - evalRating)
        error_sum += error
    MAE = error_sum / n
    return MAE

In [9]:
# Train on train dataset
la = Lambda(0.000001, 0.0000001, 0.000001, 0.000001)  # Learning rate
for t in range(5):
    def coef(s):
        return 0.01 * 1 / (30 ** 0.5)
    U, M, C, S = fine_tune(U, M, C, S, Y, coef, la)
    print("Train: ", get_mae(U, M, C, S, Y))
    print("Test: ", get_mae(U, M, C, S, Y_test))

1.4602379992500596 99003/99003     
Train:  0.7191937449046043
Test:  0.7591732833686596
0.7145027645596232 99003/99003     
Train:  0.6790961686967548
Test:  0.7337014786642824
0.6926415033769125 99003/99003     
Train:  0.666480945035268
Test:  0.7286124303009864
0.6845453127506254 99003/99003     
Train:  0.661790543048865
Test:  0.7163108624411547
0.6803914643841268 99003/99003     
Train:  0.6575257803503232
Test:  0.7246143872464532


In [10]:
U, M, C, S = fine_tune(U, M, C, S, Y, coef, la)
print("Train: ", get_mae(U, M, C, S, Y))
print("Test: ", get_mae(U, M, C, S, Y_test))

0.6762252992947649 99003/99003     
Train:  0.6553885860702672
Test:  0.7211334913312598


In [15]:
# Save this object as FactorizationRatingsAprox pickle
new_user_id = Y.shape[0] - 1
obj = FactorizationRatingsAprox(U, M, C, S, Y.shape, new_user_id)
obj.to_file(model_path)

In [13]:
# Test if it is saving corectly
obj_load = FactorizationRatingsAprox.from_file(model_path)
get_mae(obj_load.U, obj_load.M, obj_load.C, obj_load.S, Y_test)

0.7211334913312598

In [14]:
# Get Top 10 recomendations if these movies were wathched
ratings = [
    [1029, 3.0],
    [1061, 3.0],
    [1129, 2.0],
    [1172, 4.0],
    [1263, 2.0],
    [1287, 2.0],
    [1293, 2.0],
    [1339, 3.5]
]
obj_load.evaluate(ratings[:10])

1.2157990290740082 8/8     
0.8086621205235007 8/8     
0.7270643535671233 8/8     
0.6747731806280051 8/8      
0.6340716046036107 8/8     


[[1172, 3.373065065209053],
 [73290, 3.219979234769365],
 [309, 3.143388901644209],
 [5114, 3.137096152025177],
 [80, 3.1260975623384106],
 [5238, 3.1241447844502197],
 [1859, 3.101746917336289],
 [9010, 3.093870321181367],
 [7116, 3.085164787628055],
 [116897, 3.07780226192594],
 [759, 3.075926812111441],
 [8264, 3.0604781422649583],
 [93040, 3.0591057072233774],
 [1939, 3.0412915413808],
 [5475, 3.03902152664593],
 [6273, 3.0382809415465353],
 [3030, 3.0338756591833964],
 [926, 3.0329117012955047],
 [6918, 3.032551745801409],
 [7087, 3.025326881384684],
 [2920, 3.0196887678086295],
 [59995, 3.0181667431557964],
 [2318, 3.0175317991378523],
 [26587, 3.0088268247326218],
 [2064, 3.0070234435844467],
 [2066, 3.0042843354293547],
 [2938, 3.00369299117819],
 [1860, 2.997340102779173],
 [296, 2.988019860132126],
 [858, 2.9824025154627796],
 [7075, 2.9802465551282014],
 [97957, 2.9788841384643474],
 [27803, 2.9786091994746213],
 [3310, 2.9784899263514983],
 [2563, 2.9774317097500704],
 [125