In [69]:
import sys
import numpy as np
sys.path.append('../')
from tensor_factorization import initialize_factorization, D, fine_tune, Lambda
from tensor_factorization import evaluate
from FactorizationRatingsAprox import FactorizationRatingsAprox
from split_data import split_data
from sparse_array import NDSparseArray
from datetime import datetime
# NOTE: sys.path.append('../') above extends the module search path before the project-local imports

In [70]:
# File the trained model is written to by obj.to_file() and read back from by
# FactorizationRatingsAprox.from_file() in the later save/reload cells.
model_path = "../factorization_movies_model.pkl"

In [71]:
# Full ratings CSV: passed to split_data() and loaded as `all_data` below.
dataset_path = "../datasets/movies/ratings_small.csv"
# Train / test CSVs loaded below as `train_data` and `test_data`.
# NOTE(review): presumably these files are produced by split_data() — confirm.
train_path = "../datasets/movies/ratings_small_train.csv"
test_path = "../datasets/movies/ratings_small_test.csv"

In [72]:
# NOTE(review): 0.99 is presumably the fraction of rows assigned to the train
# split — confirm against split_data's signature.
split_data(dataset_path, 0.99)

In [73]:
# Load csv
# Every row is parsed into a structured record (int32, int32, float32, int32);
# transform_data() reads these fields as user id, movie id, rating, timestamp.
# The train/test files are read from their first line (skiprows=0), while the
# full dataset skips one line (presumably a header row — confirm).
train_data = np.loadtxt(train_path, delimiter=",", skiprows=0, dtype="i4,i4,f,i4")
test_data = np.loadtxt(test_path, delimiter=",", skiprows=0, dtype="i4,i4,f,i4")
all_data = np.loadtxt(dataset_path, delimiter=",", skiprows=1, dtype="i4,i4,f,i4")
# Show the first six training records as a quick sanity check.
train_data[:6]

array([(306, 1258, 4. ,  948834931), (105, 3006, 4. , 1085640181),
       (423, 5291, 4.5, 1353690457), (263, 4406, 3. , 1117842764),
       (564, 2313, 5. ,  974713583), (430, 1923, 4.5, 1111488767)],
      dtype=[('f0', '<i4'), ('f1', '<i4'), ('f2', '<f4'), ('f3', '<i4')])

In [103]:
# Prepare the data
def transform_data(data, tz=None):
    """Split rating records into parallel column lists, binning time by hour.

    Args:
        data: Iterable of records indexable as
            (user id, movie id, rating, Unix timestamp in seconds).
        tz: Optional ``tzinfo`` used when converting the timestamps. The
            default ``None`` keeps the historical behaviour of using the
            machine's local time, which makes the hour bins depend on where
            the notebook runs; pass ``datetime.timezone.utc`` for bins that
            are reproducible across machines.

    Returns:
        A tuple ``(user_ids, movie_ids, ratings, hours)`` of four lists with
        one entry per input record. ``hours`` holds the hour of day (0-23) of
        each timestamp.
    """
    _user_ids, _movie_ids, _ratings, _timestamps = [], [], [], []

    # Single pass over the records instead of one pass per column.
    for line in data:
        _user_ids.append(line[0])
        _movie_ids.append(line[1])
        _ratings.append(line[2])
        # Timestamps are binned by hour of day. An earlier variant binned by
        # year and month instead: (year - 1995) * 12 + month.
        _timestamps.append(datetime.fromtimestamp(line[3], tz).hour)

    return _user_ids, _movie_ids, _ratings, _timestamps

def make_sparse(_SP, _Y_shape):
    """Build an NDSparseArray of ratings indexed by (user, movie, time bin).

    Args:
        _SP: Four parallel sequences in the order
            (user ids, movie ids, ratings, time bins), as returned by
            transform_data().
        _Y_shape: Shape passed to the NDSparseArray constructor.

    Returns:
        The populated NDSparseArray.
    """
    _user_ids, _movie_ids, _ratings, _timestamps = _SP
    sparse = NDSparseArray(_Y_shape)
    # The user list drives the loop; the other columns are read by position.
    for pos, user in enumerate(_user_ids):
        sparse[user, _movie_ids[pos], _timestamps[pos]] = _ratings[pos]
    return sparse

# The shape is derived from the full dataset so that the train and test arrays
# share the same dimensions.
user_ids, movie_ids, ratings, timestamps = transform_data(all_data)
# "+ 1" turns the largest index into an axis size. The user axis gets 10 extra
# slots instead; the save cell later uses the last one (Y.shape[0] - 1) as
# new_user_id.
Y_shape = [max(user_ids) + 10, max(movie_ids) + 1, max(timestamps) + 1]
Y = make_sparse(transform_data(train_data), Y_shape)
Y_test = make_sparse(transform_data(test_data), Y_shape)
# NOTE(review): this probe looks stale. transform_data() now bins timestamps by
# hour of day (0-23), so 71 is presumably outside the time axis — confirm and
# probe an entry that actually exists in the training data.
Y[564, 1831, 71]  # 71 = (2000-1995) * 12 + 11 under the old year-month binning

0

In [104]:
# Create new matrices for factorization
# NOTE(review): D(20, 30, 10) presumably sets the latent dimensions of the
# factors and Lambda(...) the per-factor weights used at initialization —
# confirm against tensor_factorization.
U, M, C, S = initialize_factorization(Y, D(20, 30, 10), Lambda(0.1, 0.1, 0.1, 0.1))

In [105]:
# Define the mean absolute error (MAE) function
def get_mae(_U, _M, _C, _S, _Y_test: NDSparseArray):
    """Return the mean absolute error of the factorization on a sparse array.

    For every index yielded by ``_Y_test.indexes()`` the stored value is
    compared with ``evaluate(_U, _M, _C, _S, i, j, k)``; the absolute
    differences are summed and divided by ``len(_Y_test.elements)``.

    Raises:
        ZeroDivisionError: if ``_Y_test.elements`` is empty.
    """
    count = len(_Y_test.elements)
    total = 0
    for index in _Y_test.indexes():
        i, j, k = index
        # Stored rating first, then the model's prediction for the same cell.
        total += abs(_Y_test[i, j, k] - evaluate(_U, _M, _C, _S, i, j, k))
    return total / count

In [106]:
# Train on train dataset
# `la` is the Lambda handed to fine_tune as its last argument (the original
# author labelled it "learning rate").
la = Lambda(1e-6, 1e-7, 1e-6, 1e-6)


def coef(s):
    """Return the constant coefficient passed to fine_tune; `s` is ignored."""
    return 0.01 * 1 / (30 ** 0.5)


# Three fine-tuning passes; report the mean absolute error on the training and
# test arrays after each pass.
for epoch in range(3):
    U, M, C, S = fine_tune(U, M, C, S, Y, coef, la)
    print("Train: ", get_mae(U, M, C, S, Y))
    print("Test: ", get_mae(U, M, C, S, Y_test))

1.0878697506967212 99003/99003     
Train:  0.7052804353558156
Test:  0.7557024810908054
0.7206559922971517 99003/99003     
Train:  0.6812353428795236
Test:  0.7350498764883253
0.7073887840825074 99003/99003     
Train:  0.679642895338741
Test:  0.7270044524033182


In [46]:
# One additional fine-tuning pass; reuses `coef` and `la` defined in the
# training cell, so that cell must have been run first.
U, M, C, S = fine_tune(U, M, C, S, Y, coef, la)

0.6820264547936019 99003/99003     


In [49]:
# Mean absolute error of the current factors on the test array.
print(get_mae(U, M, C, S, Y_test))

0.6777108801116616


In [8]:
# Save this object as a FactorizationRatingsAprox pickle
# Last index on the user axis. Y_shape was built with max(user_ids) + 10 slots,
# so (assuming Y.shape mirrors Y_shape) no rating in the dataset uses this id.
new_user_id = Y.shape[0] - 1
obj = FactorizationRatingsAprox(U, M, C, S, Y.shape, new_user_id)
obj.to_file(model_path)

In [10]:
# Test that the model is saved correctly: reload it from model_path and
# recompute the test MAE from the reloaded factors.
obj = FactorizationRatingsAprox.from_file(model_path)
get_mae(obj.U, obj.M, obj.C, obj.S, Y_test)

0.6902198204655863

In [8]:
# Get the top 10 recommendations if these movies were watched
# Each entry is presumably [movie id, rating] — confirm against
# FactorizationRatingsAprox.evaluate.
# NOTE(review): this rebinds `ratings`, which earlier held the rating column
# returned by transform_data(all_data).
ratings = [
    [1029, 3.0],
    [1061, 3.0],
    [1129, 2.0],
    [1172, 4.0],
    [1263, 2.0],
    [1287, 2.0],
    [1293, 2.0],
    [1339, 3.5]
]
obj.evaluate(ratings)

0.9190950148667214 8/8     
0.7750084121344327 8/8     
0.6774916519001883 8/8      
0.6919980781271202 8/8      
0.6631147275017487 8/8     


[[1172, 3.2281718563862523],
 [899, 3.158024038643328],
 [1221, 3.1322298275288367],
 [8132, 3.115458083868882],
 [318, 3.1137198995269992],
 [994, 3.102026754494994],
 [1228, 3.069376775729906],
 [7502, 3.0594296838507526],
 [1305, 3.0592467515543493],
 [908, 3.056134261026032]]