# CDL for Goodreads datasets

In [1]:
import pickle
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
from helpful_stuff.utils_xp_out import write_to_csv, XPDescription, XPResults, _write_row
from helpful_stuff.utils_metrics import precision_recall_at_k_4arrays
from helpful_stuff.utils_model_out import make_out_dirs
import models.model_cdl_sdae_sgd
import models.mf_sgd
import models.dataset
import os
import itertools as it

# Re-import the project modules so that edits made to their source files are
# picked up without restarting the kernel.
# NOTE(review): models.mf_sgd is imported above but is not reloaded here.
from importlib import reload
reload(models.dataset)
reload(models.model_cdl_sdae_sgd)

<module 'models.model_cdl_sdae_sgd' from 'C:\\Users\\irina\\Dev\\master\\models\\model_cdl_sdae_sgd.py'>

In [15]:
# Create all output directories needed by this experiment and report where
# each kind of artefact will be written.
XP_PATH, U_V_PATH, MODEL_PATH = make_out_dirs(
    model_name='sdae-sgd',
    xp_name='goodreads-children',
)
print("Out dir of experiment: ", XP_PATH)
print("Out dir of U, V matricies: ", U_V_PATH)
print("Out dir of model parameters: ", MODEL_PATH)

Out dir of experiment:  D:/Models/thesis/sdae-sgd/goodreads-children/
Out dir of U, V matricies:  D:/Models/thesis/sdae-sgd/goodreads-children/pickles/
Out dir of model parameters:  D:/Models/thesis/sdae-sgd/goodreads-children/tf/


## Prepare dataset

In [2]:
# Load the pre-processed Goodreads "children" interactions (ratings) and reviews.
# NOTE(review): these are absolute local paths; the notebook only runs on a
# machine with the same D:/Datasets layout.
df_rates = pd.read_json('D:/Datasets/goodreads_reviews/processed/goodreads_interactions_children.json')
df_rev = pd.read_json('D:/Datasets/goodreads_reviews/processed/goodreads_reviews_children.json')

In [3]:
# Preview the first two rows of the ratings frame.
df_rates.head(2)

Unnamed: 0,book_id,rating,user_id
0,23310161,4,8842281e1d1347389f2ab93d60773d4d
1,18296097,5,8842281e1d1347389f2ab93d60773d4d


In [4]:
# Preview the first two rows of the reviews frame.
df_rev.head(2)

Unnamed: 0,book_id,date_added,date_updated,n_comments,n_votes,rating,read_at,review_id,review_text,review_text_proc,started_at,user_id
0,23310161,Tue Nov 17 11:37:35 -0800 2015,Tue Nov 17 11:38:05 -0800 2015,0,7,4,,f4b4b050f4be00e9283c92a814af2670,Fun sequel to the original.,fun sequel original,,8842281e1d1347389f2ab93d60773d4d
1,17290220,Sat Nov 08 08:54:03 -0800 2014,Wed Jan 25 13:56:12 -0800 2017,0,4,5,Tue Jan 24 00:00:00 -0800 2017,22d424a2b0057b18fb6ecf017af7be92,One of my favorite books to read to my 5 year ...,one favorite book read 5 year old rosie learn ...,,8842281e1d1347389f2ab93d60773d4d


In [5]:
# Build one document per book: all processed review texts of a book are joined
# with single spaces, and book_id is turned back into a regular column.
reviews = (
    df_rev
    .groupby('book_id')
    .review_text_proc
    .agg(' '.join)
    .to_frame()
    .reset_index()
)

In [6]:
# Wrap the ratings and the per-book review documents in the project DataSet.
# The column lists tell DataSet which columns of each frame to use.
# NOTE(review): noise_reviews=True presumably triggers the "Noising of reviews"
# step reported in the cell output - confirm in models.dataset.
ds = models.dataset.DataSet(
    df_ratings=df_rates,
    rating_cols=['user_id', 'book_id', 'rating'],
    df_reviews=reviews,
    review_cols=['book_id', 'review_text_proc'],
    noise_reviews=True,
)

Filled in 135 empty reviews: [2817, 127768, 193412, 194874, 265833]...
Noising of reviews


In [7]:
# Drop the notebook-level references to the raw frames; from here on the data
# is accessed through `ds` only (presumably to free memory - the frames are
# released only if DataSet keeps no reference to them).
del reviews, df_rates, df_rev

## Train model

In [16]:
# Flush IPython's output cache (Out[...]) before training.
# NOTE(review): without -f this magic asks for interactive confirmation, which
# blocks an unattended "Run All"; consider `%reset -f Out`.
%reset Out 

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


Flushing output cache (0 entries)


In [18]:
# Configure the CDL model on the prepared dataset and train it.
cdl = models.model_cdl_sdae_sgd.CDL(
    ds,
    out_path=XP_PATH,
    k=150,
    hidden_size=300,
    drop_ratio=0.2,
    epochs=150,
    lambda_w=0.1,
    lambda_v=0.1,
    lambda_n=10,
    lambda_q=0.1,
)
# training() returns five values that are later passed to
# SGD.predict_dataset_with_params; judging by the names they are the global
# mean, user/item factors and user/item biases - confirm in the model module.
mu, pu, qi, bu, bi = cdl.training()  #188910

Start training...
EPOCH 1 / 150
SGD LOSS RMSE = 1.0714980669181984, MAE = 0.8009438148904152
MODEL LOSS 362.16577
AUTOENCODER LOSS 302.5493
VALIDATION LOSS 1.079513009195656
EPOCH 2 / 150
SGD LOSS RMSE = 1.0352718244988028, MAE = 0.7720444198722098
MODEL LOSS 301.8861
AUTOENCODER LOSS 291.24866
VALIDATION LOSS 1.0539168808937842
EPOCH 3 / 150
SGD LOSS RMSE = 1.0110275302243314, MAE = 0.7525496243744311
MODEL LOSS 298.16064
AUTOENCODER LOSS 287.43243
VALIDATION LOSS 1.0388654318072208
EPOCH 4 / 150
SGD LOSS RMSE = 0.9921954423320343, MAE = 0.737482211964869
MODEL LOSS 296.4491
AUTOENCODER LOSS 285.43182
VALIDATION LOSS 1.028483054394367
EPOCH 5 / 150
SGD LOSS RMSE = 0.9764573506559011, MAE = 0.7250069135284689
MODEL LOSS 295.3583
AUTOENCODER LOSS 284.03284
VALIDATION LOSS 1.0207312200022134
EPOCH 6 / 150
SGD LOSS RMSE = 0.9625813771962903, MAE = 0.7141163341808446
MODEL LOSS 294.6526
AUTOENCODER LOSS 283.01712
VALIDATION LOSS 1.0146532550034466
EPOCH 7 / 150
SGD LOSS RMSE = 0.9499546428

In [19]:
# Report the shapes of the learned factor matrices and bias vectors, then
# persist every value returned by training() in a single pickle file.
print("pu shape: %s x %s" % pu.shape)
print("qi shape: %s x %s" % qi.shape)

print("beta_u shape: %s" % bu.shape)
print("beta_i shape: %s" % bi.shape)

learned_params = {'mu':mu, 'pu':pu, 'qi':qi, 'bu':bu, 'bi':bi}
with open(U_V_PATH + 'mx.pickle', 'wb') as handle:
    pickle.dump(learned_params, handle, protocol=pickle.HIGHEST_PROTOCOL)

pu shape: 474296 x 150
qi shape: 123196 x 150
beta_u shape: 474296
beta_i shape: 123196


In [20]:
# Persist the dataset's item-id mapping (ds.iid_map) next to the pickled
# parameters.
item_ids_path = U_V_PATH + 'item_ids_map.pickle'
with open(item_ids_path, 'wb') as handle:
    pickle.dump(ds.iid_map, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Evaluate model


In [21]:
# Predict a rating for every entry of the test rating matrix from the trained
# parameters.
test_preds = models.mf_sgd.SGD.predict_dataset_with_params(
    ds.get_test_rating_matrix(),
    mu, bu, bi, qi, pu,
)

In [22]:
# Error of the test predictions against the true test ratings.
# The square root of the mean squared error is the RMSE, so it is reported as
# such. The variable keeps its historical name `mse` because the results cell
# further down reads it (and stores it as `rmse`).
mse = mean_squared_error(ds.testset.rate__, test_preds) ** 0.5
mae = mean_absolute_error(ds.testset.rate__, test_preds)

print("RMSE: %s" % mse)
print("MAE: %s" % mae)

MSE: 0.968854324729682
MAE: 0.697367712130883


In [23]:
# Mean precision@k and recall@k over users for every cut-off k in [0, 200).
# A rating above `threshold=3` is presumably what counts as relevant - confirm
# in precision_recall_at_k_4arrays.
# NOTE(review): the range starts at k=0, a cut-off with no recommendations;
# check that the helper returns something meaningful for it.
k_prec = {}
k_rec = {}

for k in range(200):
    precisions, recalls = precision_recall_at_k_4arrays(
        ds.testset.uid__, test_preds, ds.testset.rate__, k=k, threshold=3
    )
    p_mean = np.mean(list(precisions.values()))
    r_mean = np.mean(list(recalls.values()))
    k_prec[k], k_rec[k] = p_mean, r_mean

In [25]:
# Collect the evaluation results of this run in one XPResults record.
row = XPResults(
    dataset='children',
    xpdata=XPDescription(predictor=None, label='CDL-SDAE', nfactors=150),
    rmse=mse,
    mae=mae,
    precision=k_prec,
    recall=k_rec,
)

In [26]:
# Write the results record via the project's CSV helper.
write_to_csv(row, 'goodreads', 'sdae-sgd')

## Optimization

In [8]:
# Directory that collects the results of the hyper-parameter search.
parent_path = 'D:/Optimizations/master/goodreads/cdl_sdae/'
# makedirs also creates missing intermediate directories (os.mkdir raises
# FileNotFoundError when a parent does not exist), and exist_ok=True keeps the
# cell safe to re-run.
os.makedirs(parent_path, exist_ok=True)
    
# Flush IPython's output cache (Out[...]) before the hyper-parameter search.
# NOTE(review): without -f this magic asks for interactive confirmation, which
# blocks an unattended "Run All"; consider `%reset -f Out`.
%reset Out

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


Flushing output cache (3 entries)


In [9]:
# Hyper-parameter grid: every key maps to the list of candidate values that is
# combined with all others. Only hidden_size has more than one candidate.
params = dict(
    hidden_size=[250, 300, 350, 400],
    k=[150],
    drop_ratio=[0.2],
    lambda_q=[0.1],
    lambda_v=[0.1],
    lambda_w=[0.1],
    lambda_n=[10],
)

In [10]:
# Fix a deterministic key order, then build the cartesian product of the
# candidate lists; each combination is a tuple aligned with sorted_keys.
sorted_keys = sorted(params.keys())
value_lists = [params[name] for name in sorted_keys]
combinations = list(it.product(*value_lists))
print("Num of combinations: %s" % len(combinations))

Num of combinations: 4


In [11]:
# Write the header row of the results file: the hyper-parameter names in
# sorted order followed by the four metric columns.
results_csv = parent_path + 'cdl_sdae_sgd.csv'
header_row = sorted_keys + ['rmse_train', 'mae_train', 'rmse_test', 'mae_test']
_write_row(results_csv, header_row)

In [12]:
# Grid search: train one CDL model per hyper-parameter combination, score it
# on the train and test splits and append the metrics as a row to the results
# CSV. The best test RMSE / MAE and their parameters are tracked separately.
# NOTE(review): the "best" parameters are selected on the test split, so the
# best test scores are optimistically biased; selecting on a held-out
# validation split would be cleaner.

# Start from +inf so that the first finite score always becomes the best one
# (no magic upper bound on the metric).
best_rmse = float('inf')
best_mae = float('inf')

best_mae_params = None
best_rmse_params = None

# `i` is the 1-based position of the combination, used for progress output only.
for i, ps in enumerate(combinations, start=1):
    # Start every run from a fresh default TensorFlow graph.
    tf.reset_default_graph()
    # Re-attach the parameter names to the values of this combination.
    hyper_params = dict(zip(sorted_keys, ps))

    # The backslash is escaped explicitly: "\%" is an invalid escape sequence
    # that newer Python versions warn about. The printed text is unchanged.
    print("Start testing %s\\%s hyper params %s" % (i, len(combinations), hyper_params))
    cdl = models.model_cdl_sdae_sgd.CDL(ds, out_path=None, epochs=70, **hyper_params)

    mu, pu, qi, bu, bi = cdl.training(verbose=False) #188910

    # Score on the training split.
    preds = models.mf_sgd.SGD.predict_dataset_with_params(ds.get_train_rating_matrix(), mu, bu, bi, qi, pu)
    train_rmse = mean_squared_error(ds.trainset.rate__, preds) ** 0.5
    train_mae = mean_absolute_error(ds.trainset.rate__, preds)

    # Score on the test split.
    preds = models.mf_sgd.SGD.predict_dataset_with_params(ds.get_test_rating_matrix(), mu, bu, bi, qi, pu)
    test_rmse = mean_squared_error(ds.testset.rate__, preds) ** 0.5
    test_mae = mean_absolute_error(ds.testset.rate__, preds)

    if test_rmse < best_rmse:
        print("New best RMSE (test set): %s" % test_rmse)
        best_rmse = test_rmse
        best_rmse_params = hyper_params

    if test_mae < best_mae:
        print("New best MAE (test set): %s" % test_mae)
        best_mae = test_mae
        best_mae_params = hyper_params

    print("Stop testing hyper params: ", hyper_params)

    # Append this run to the results file, columns in the same order as the header.
    _write_row(parent_path + 'cdl_sdae_sgd.csv', [hyper_params[k] for k in sorted_keys] + [train_rmse, train_mae, test_rmse, test_mae])
    # Drop the model and predictions before the next run.
    del cdl, preds

Start testing 1\4 hyper params {'lambda_w': 0.1, 'lambda_q': 0.1, 'lambda_v': 0.1, 'k': 150, 'drop_ratio': 0.2, 'lambda_n': 10, 'hidden_size': 250}
Start training...
EPOCH 1 / 70
EPOCH 2 / 70
EPOCH 3 / 70
EPOCH 4 / 70
EPOCH 5 / 70
EPOCH 6 / 70
EPOCH 7 / 70
EPOCH 8 / 70
EPOCH 9 / 70
EPOCH 10 / 70
EPOCH 11 / 70
EPOCH 12 / 70
EPOCH 13 / 70
EPOCH 14 / 70
EPOCH 15 / 70
EPOCH 16 / 70
EPOCH 17 / 70
EPOCH 18 / 70
EPOCH 19 / 70
EPOCH 20 / 70
EPOCH 21 / 70
EPOCH 22 / 70
EPOCH 23 / 70
EPOCH 24 / 70
EPOCH 25 / 70
EPOCH 26 / 70
EPOCH 27 / 70
EPOCH 28 / 70
EPOCH 29 / 70
EPOCH 30 / 70
EPOCH 31 / 70
EPOCH 32 / 70
EPOCH 33 / 70
EPOCH 34 / 70
EPOCH 35 / 70
EPOCH 36 / 70
EPOCH 37 / 70
EPOCH 38 / 70
EPOCH 39 / 70
EPOCH 40 / 70
EPOCH 41 / 70
EPOCH 42 / 70
EPOCH 43 / 70
EPOCH 44 / 70
EPOCH 45 / 70
EPOCH 46 / 70
EPOCH 47 / 70
EPOCH 48 / 70
EPOCH 49 / 70
EPOCH 50 / 70
EPOCH 51 / 70
EPOCH 52 / 70
EPOCH 53 / 70
EPOCH 54 / 70
EPOCH 55 / 70
EPOCH 56 / 70
EPOCH 57 / 70
EPOCH 58 / 70
EPOCH 59 / 70
EPOCH 60 / 70
EPO

In [None]:
# Summarise the grid search: best test RMSE and best test MAE, each with the
# hyper-parameters that produced it.
rmse_summary = (best_rmse, best_rmse_params)
mae_summary = (best_mae, best_mae_params)
print('BEST RMSE %s for params %s' % rmse_summary)
print('BEST MAE %s for params %s' % mae_summary)