In [13]:
import numpy as np
import json
import math

import mr.dataset
import mr.io

In [14]:
filename = mr.dataset.download_ml_small()
filename = mr.dataset.download_ml_25m()

In [15]:
M = mr.io.read_movielens_input(filename)

In [16]:
# MAPPING

row2new_row = dict()
col2new_col = dict()

row_counter = 0
col_counter = 0
for i, nnz in enumerate(M):
    row, col, rating = nnz
    row = int(row)
    col = int(col)
    
    if row not in row2new_row:
        row2new_row[row] = row_counter
        row_counter += 1
    
    if col not in col2new_col:
        col2new_col[col] = col_counter
        col_counter += 1
    
    M[i][0] = row2new_row[row]
    M[i][1] = col2new_col[col]
    

print(row_counter)
print(col_counter)

162541
59047


In [17]:
# Save mapping and new matrix

dataset_name = filename.split('.')[0]
np.save(dataset_name, M)

mapping = dict()
mapping['row'] = row2new_row
mapping['col'] = col2new_col

with open('{}_map.json'.format(dataset_name), 'w') as file:
    json.dump(mapping, file)

In [18]:
# Load the matrix

M = np.load('{}.npy'.format(dataset_name))

In [19]:
# Shuffle and split

np.random.shuffle(M)
nnz_count = M.shape[0]

training_count = round(nnz_count * 0.8)
test_count = nnz_count - training_count

M_train = M[:training_count, :]
M_test = M[training_count:, :]

In [20]:
def calc_rmse(y_true, y_predict):
    # parameters should be np.array
    
    return math.sqrt(np.sum((y_predict - y_true) ** 2) / len(y_true))

In [21]:
mean = M_train[:, 2].mean()

print('Mean of the matrix: {:.3f}'.format(mean))

y_true = M_test[:, 2]
y_predict = mean

rmse = calc_rmse(y_true, y_predict)
print('RMSE using the average of matrix: {:.5f}'.format(rmse))

Mean of the matrix: 3.534
RMSE using the average of matrix: 1.06064


In [22]:
row_count = int(M[:, 0].max() + 1)
col_count = int(M[:, 1].max() + 1)

row2mean = np.zeros(row_count)
row2count = np.zeros(row_count)

col2mean = np.zeros(col_count)
col2count = np.zeros(col_count)

for nnz in M:
    row, col, rating = nnz
    row = int(row)
    col = int(col)
    row2count[row] += 1
    row2mean[row] += rating
    
    col2count[col] += 1
    col2mean[col] += rating

row2mean[row] /= row2count[row]
col2mean[col] /= col2count[col]

In [23]:
y_true = M_test[:, 2]
y_predict = np.zeros(len(y_true))

for i, nnz in enumerate(M_test):
    row, col, rating = nnz
    row = int(row)
    col = int(col)
    
    y_predict = row2mean[row]
    
rmse = calc_rmse(y_true, y_predict)
print('RMSE using the average of row: {:.5f}'.format(rmse))

RMSE using the average of row: 1.06197


In [24]:
y_true = M_test[:, 2]
y_predict = np.zeros(len(y_true))

for i, nnz in enumerate(M_test):
    row, col, rating = nnz
    row = int(row)
    col = int(col)
    
    y_predict = col2mean[col]
    
rmse = calc_rmse(y_true, y_predict)
print('RMSE using the average of columns: {:.5f}'.format(rmse))

RMSE using the average of columns: 1.06074
