## Test code for Matrix Factorization with SGD

In [29]:
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

from mf_sgd import SGD

### Load and prepare data

In [30]:
# Read the pre-processed Amazon "Toys and Games" reviews file into a DataFrame.
df = pd.read_json(
    r'D:\Datasets\amazon_reviews\processed\reviews_Toys_and_Games_5.json'
)

# Hold out 30% of the rows for testing. The split is stratified on reviewerID
# and seeded so that it is reproducible from run to run.
df_train, df_test = train_test_split(
    df,
    test_size=0.3,
    stratify=df['reviewerID'],
    random_state=42,
)

In [31]:
# Rebind both splits to explicit copies before new columns are added to them
# in the following cells.
df_train, df_test = df_train.copy(), df_test.copy()

In [32]:
# Lookup tables from raw identifiers to consecutive integer indices (0, 1, 2, ...).
# They are built from the TRAINING split only, so identifiers that occur solely
# in the test split have no entry.
asins_map = {
    asin: item_idx
    for item_idx, asin in enumerate(df_train['asin'].unique())
}
reviewers_map = {
    reviewer_id: user_idx
    for user_idx, reviewer_id in enumerate(df_train['reviewerID'].unique())
}

In [33]:
# Attach the integer item / reviewer indices to both splits.
# Series.map with a dict yields NaN for any identifier missing from the dict,
# which can only happen for test rows (the dicts were built from df_train).
for split_df in (df_train, df_test):
    split_df['in_asin'] = split_df['asin'].map(asins_map)
    split_df['in_revID'] = split_df['reviewerID'].map(reviewers_map)

In [34]:
# Training triples handed to the SGD model, in the column order
# (item index, reviewer index, rating).
dataset = df_train[['in_asin', 'in_revID', 'overall']]

# Test rows whose item OR reviewer never occurred in the training split have a
# NaN index (no entry in asins_map / reviewers_map) and cannot be scored by the
# factor model, so drop them. Both index columns are then cast back to integers:
# a column that contained NaN is float64 after .map().
df_test = df_test.dropna(subset=['in_asin', 'in_revID'])
df_test = df_test.astype({"in_asin": int, "in_revID": int})

### Train MF with SGD

In [35]:
# Build the matrix-factorization model with 15 latent factors, sized by the
# number of distinct reviewers and items seen in the training split.
# NOTE(review): `dataset` columns are ordered (item, reviewer, rating) while the
# keyword arguments name users first - confirm the column order mf_sgd.SGD expects.
sgd = SGD(
    dataset=dataset.values,
    n_users=len(reviewers_map),
    n_items=len(asins_map),
    n_factors=15,
)

In [36]:
# Run 25 training epochs and unpack the five values returned by train().
# NOTE(review): presumably global mean, user factors, item factors, user biases
# and item biases, judging by the names - confirm against mf_sgd.
(mu, pu, qi, bu, bi) = sgd.train(n_epochs=25)

Epoch 1\25
Running SGD...
Epoch 2\25
Running SGD...
Epoch 3\25
Running SGD...
Epoch 4\25
Running SGD...
Epoch 5\25
Running SGD...
Epoch 6\25
Running SGD...
Epoch 7\25
Running SGD...
Epoch 8\25
Running SGD...
Epoch 9\25
Running SGD...
Epoch 10\25
Running SGD...
Epoch 11\25
Running SGD...
Epoch 12\25
Running SGD...
Epoch 13\25
Running SGD...
Epoch 14\25
Running SGD...
Epoch 15\25
Running SGD...
Epoch 16\25
Running SGD...
Epoch 17\25
Running SGD...
Epoch 18\25
Running SGD...
Epoch 19\25
Running SGD...
Epoch 20\25
Running SGD...
Epoch 21\25
Running SGD...
Epoch 22\25
Running SGD...
Epoch 23\25
Running SGD...
Epoch 24\25
Running SGD...
Epoch 25\25
Running SGD...


In [37]:
# Report the error pair returned by the trained model's current_error().
rmse, mae = sgd.current_error()
for metric_name, metric_value in (('Train RMSE', rmse), ('Train MAE', mae)):
    print(metric_name, metric_value)

Train RMSE 0.7403648948497228
Train MAE 0.5440157673812007


### Evaluate on test set

In [41]:
# Score the held-out rows. The array passed in uses the same three columns, in
# the same order, as the training `dataset`.
test_preds = sgd.predict_dataset(
    df_test.loc[:, ['in_asin', 'in_revID', 'overall']].to_numpy()
)

In [43]:
# Held-out error of the predictions against the true ratings.
# mean_squared_error / mean_absolute_error come from sklearn.metrics and must be
# imported in the notebook's import cell for this cell to run on a fresh kernel.
# RMSE is taken as the square root of the MSE.
print('RMSE', mean_squared_error(df_test['overall'], test_preds) ** 0.5)
print('MAE', mean_absolute_error(df_test['overall'], test_preds))

RMSE 0.8954389666073621
MAE 0.6567749586599209
