In [1]:
import json
import math
import os

import numpy as np

import mr.dataset
import mr.io

In [2]:
# Fetch the small MovieLens dataset. The returned value is handed to the reader
# below; presumably it is a local file path -- TODO confirm against mr.dataset.
filename = mr.dataset.download_ml_small()
# Alternative input: swap the line above for this one to use the 25M dataset.
# filename = mr.dataset.download_ml_25m()

# Base name used when loading the saved '<dataset_name>.npy' matrix.
dataset_name = 'ml_small'

In [3]:
# Read the ratings into M. Later cells treat each row of M as a
# (row id, column id, rating) triple.
M = mr.io.read_movielens_input(filename)

In [5]:
# Sanity check: display the dimensions of the loaded matrix.
M.shape

(100836, 3)

In [4]:
# MAPPING
# Re-index the raw row/column ids to dense 0-based ids, handed out in order of
# first appearance. The first two columns of M are overwritten in place.

row2new_row = {}
col2new_col = {}

for idx, (old_row, old_col, _rating) in enumerate(M):
    old_row = int(old_row)
    old_col = int(old_col)

    # setdefault assigns the next free id (the current dict size) only the
    # first time an id is seen, and returns the stored id afterwards.
    M[idx, 0] = row2new_row.setdefault(old_row, len(row2new_row))
    M[idx, 1] = col2new_col.setdefault(old_col, len(col2new_col))

# Number of distinct rows / columns encountered.
row_counter = len(row2new_row)
col_counter = len(col2new_col)

print(row_counter)
print(col_counter)

610
9724


In [None]:
# Save mapping and new matrix

# Derive the output stem by dropping only the final extension. Splitting on the
# first '.' would truncate a path such as './data/ratings.csv' to an empty
# string and write the outputs to the wrong place.
dataset_name = os.path.splitext(filename)[0]

# np.save appends '.npy' when the name does not already end with it.
np.save(dataset_name, M)

mapping = {'row': row2new_row, 'col': col2new_col}

# Note: JSON object keys are strings, so the integer ids used as dict keys are
# written as strings and must be converted back with int() after loading.
with open('{}_map.json'.format(dataset_name), 'w') as map_file:
    json.dump(mapping, map_file)

In [13]:
# Load the matrix
# Reads back the '<dataset_name>.npy' array from disk into M.

matrix_path = '{}.npy'.format(dataset_name)
M = np.load(matrix_path)

In [14]:
# Shuffle and split
# A fixed seed makes the train/test split (and every RMSE reported below)
# reproducible under Restart Kernel -> Run All.

SPLIT_SEED = 42
TRAIN_FRACTION = 0.8

# Shuffles the rows of M in place (axis 0). Re-running only this cell shuffles
# the already shuffled matrix again, so reload M first for a repeatable split.
np.random.default_rng(SPLIT_SEED).shuffle(M)
nnz_count = M.shape[0]

training_count = round(nnz_count * TRAIN_FRACTION)
test_count = nnz_count - training_count

# Basic slices: both are views into M, not copies.
M_train = M[:training_count, :]
M_test = M[training_count:, :]

assert M_train.shape[0] + M_test.shape[0] == nnz_count

In [15]:
def calc_rmse(y_true, y_predict):
    """Return the root-mean-square error between targets and predictions.

    Parameters
    ----------
    y_true : array_like
        Observed values (NumPy array, list or tuple).
    y_predict : array_like
        Predicted values; must broadcast against ``y_true``.

    Returns
    -------
    float
        ``sqrt(mean((y_predict - y_true) ** 2))`` taken over all elements.
        The result is ``nan`` when the inputs are empty.
    """
    # Cast to float first: plain sequences become arrays, and unsigned or
    # narrow integer dtypes can no longer wrap around in the subtraction or
    # the squaring.
    y_true = np.asarray(y_true, dtype=float)
    y_predict = np.asarray(y_predict, dtype=float)

    residual = y_predict - y_true

    # Average over every element so multi-dimensional inputs are normalised by
    # their full size instead of by the length of the first axis only.
    return math.sqrt(np.mean(residual ** 2))

In [16]:
# Global mean rating over the training triples.
mean = M_train[:, 2].mean()

# Size the per-row / per-column tables from the full matrix so that ids which
# only occur in the test split still have an entry.
row_count = int(M[:, 0].max() + 1)
col_count = int(M[:, 1].max() + 1)

# Ids are stored in the same (possibly floating point) array as the ratings,
# so cast them to integer indices once instead of per element.
train_rows = M_train[:, 0].astype(np.intp)
train_cols = M_train[:, 1].astype(np.intp)
train_ratings = M_train[:, 2]

# bincount accumulates in input order: counts without weights, rating sums
# with weights. minlength keeps the tables at the full row/column count.
row2count = np.bincount(train_rows, minlength=row_count).astype(float)
col2count = np.bincount(train_cols, minlength=col_count).astype(float)
row2mean = np.bincount(train_rows, weights=train_ratings, minlength=row_count)
col2mean = np.bincount(train_cols, weights=train_ratings, minlength=col_count)

# Rows / columns with no training rating fall back to the global mean:
# sum := mean and count := 1, so the division below yields exactly `mean`
# and the resulting deviation is 0.
row2mean[row2count == 0] = mean
col2mean[col2count == 0] = mean
row2count[row2count == 0] = 1
col2count[col2count == 0] = 1

# Turn the accumulated sums into averages.
row2mean /= row2count
col2mean /= col2count

# Offset of each row / column average from the global mean.
row2deviation = row2mean - mean
col2deviation = col2mean - mean

In [17]:
print('Mean of the matrix: {:.3f}'.format(mean))

# Ground-truth ratings and integer row/column indices of the test triples.
y_true = M_test[:, 2]
test_rows = M_test[:, 0].astype(np.intp)
test_cols = M_test[:, 1].astype(np.intp)

# Baseline predictors, computed for all test triples at once:
#   mean     -> global mean only
#   rowd     -> global mean + row deviation
#   cold     -> global mean + column deviation
#   rowcold  -> global mean + row deviation + column deviation
y_predict_mean = np.full(len(y_true), mean, dtype=float)
y_predict_rowd = mean + row2deviation[test_rows]
y_predict_cold = mean + col2deviation[test_cols]
y_predict_rowcold = mean + row2deviation[test_rows] + col2deviation[test_cols]

# One report line per predictor, in the same order and with the same text as
# printing each RMSE individually.
for label, y_predict in (
    ('Mean prediction', y_predict_mean),
    ('Row-average adjusted mean prediction', y_predict_rowd),
    ('Col-average adjusted mean prediction', y_predict_cold),
    ('Row-average & Column-average adjusted mean prediction', y_predict_rowcold),
):
    rmse = calc_rmse(y_true, y_predict)
    print('{} RMSE: {:.5f}'.format(label, rmse))

Mean of the matrix: 3.503
Mean prediction RMSE: 1.05184
Row-average adjusted mean prediction RMSE: 0.94703
Col-average adjusted mean prediction RMSE: 0.98017
Row-average & Column-average adjusted mean prediction RMSE: 0.91202
