# CDL for Goodreads datasets

In [2]:
import pickle
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
from helpful_stuff.utils_xp_out import write_to_csv, XPDescription, XPResults
from helpful_stuff.utils_metrics import precision_recall_at_k_4arrays
from helpful_stuff.utils_model_out import make_out_dirs
import models.model_cdl_sdae_sgd
import models.mf_sgd
import models.dataset

from importlib import reload
# Re-execute the project modules so edits made to their source are picked up
# without restarting the kernel. Note: models.mf_sgd is not reloaded here.
reload(models.dataset)
reload(models.model_cdl_sdae_sgd)

<module 'models.model_cdl_sdae_sgd' from 'C:\\Users\\irina\\Dev\\master\\models\\model_cdl_sdae_sgd.py'>

In [3]:
# Ask the project helper for this experiment's output locations (per the original
# note it also creates the directories), then report each one.
XP_PATH, U_V_PATH, MODEL_PATH = make_out_dirs(model_name='sdae-sgd', xp_name='goodreads_test')

for label, out_dir in (
    ("Out dir of experiment: ", XP_PATH),
    ("Out dir of U, V matricies: ", U_V_PATH),
    ("Out dir of model parameters: ", MODEL_PATH),
):
    print(label, out_dir)

Out dir of experiment:  D:/Models/thesis/sdae-sgd/goodreads_test/
Out dir of U, V matricies:  D:/Models/thesis/sdae-sgd/goodreads_test/pickles/
Out dir of model parameters:  D:/Models/thesis/sdae-sgd/goodreads_test/tf/


## Prepare dataset

In [4]:
# Locations of the pre-processed Goodreads (children) JSON dumps.
rates_path = 'D:/Datasets/goodreads_reviews/processed/goodreads_interactions_children.json'
reviews_path = 'D:/Datasets/goodreads_reviews/processed/goodreads_reviews_children.json'

# Interactions (user/book/rating) and the review texts, each as a DataFrame.
df_rates = pd.read_json(rates_path)
df_rev = pd.read_json(reviews_path)

In [5]:
# Show the first two rows of the interactions frame as a sanity check.
df_rates.head(2)

Unnamed: 0,book_id,rating,user_id
0,23310161,4,8842281e1d1347389f2ab93d60773d4d
1,18296097,5,8842281e1d1347389f2ab93d60773d4d


In [6]:
# Show the first two rows of the reviews frame as a sanity check.
df_rev.head(2)

Unnamed: 0,book_id,date_added,date_updated,n_comments,n_votes,rating,read_at,review_id,review_text,review_text_proc,started_at,user_id
0,23310161,Tue Nov 17 11:37:35 -0800 2015,Tue Nov 17 11:38:05 -0800 2015,0,7,4,,f4b4b050f4be00e9283c92a814af2670,Fun sequel to the original.,fun sequel original,,8842281e1d1347389f2ab93d60773d4d
1,17290220,Sat Nov 08 08:54:03 -0800 2014,Wed Jan 25 13:56:12 -0800 2017,0,4,5,Tue Jan 24 00:00:00 -0800 2017,22d424a2b0057b18fb6ecf017af7be92,One of my favorite books to read to my 5 year ...,one favorite book read 5 year old rosie learn ...,,8842281e1d1347389f2ab93d60773d4d


In [7]:
# One row per book: every processed review text of that book, joined by single
# spaces; `book_id` is moved from the index back into a regular column.
reviews = (
    df_rev
    .groupby('book_id')['review_text_proc']
    .agg(' '.join)
    .reset_index()
)

In [8]:
# Wrap ratings and per-book review texts in the project's DataSet container,
# telling it which columns of each frame to read.
ds = models.dataset.DataSet(
    df_ratings=df_rates,
    rating_cols=['user_id', 'book_id', 'rating'],
    df_reviews=reviews,
    review_cols=['book_id', 'review_text_proc'],
)

Filled in 135 empty reviews: [2817, 127768, 193412, 194874, 265833]...


In [10]:
# The raw frames are not used directly below; drop their notebook-level names.
del df_rates, df_rev, reviews

## Train model

In [11]:
%reset Out 

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


Flushing output cache (4 entries)


In [12]:
# Hyperparameters for the CDL model, gathered in one place for easy tuning.
cdl_params = dict(
    k=50,
    hidden_size=250,
    matrix_noise=0.3,
    drop_ratio=0.2,
    epochs=60,
    lambda_w=1,
    lambda_v=1,
    lambda_n=10,
    lambda_q=10,
)
cdl = models.model_cdl_sdae_sgd.CDL(ds, out_path=XP_PATH, **cdl_params)

# training() returns five values, unpacked here as mu, pu, qi, bu, bi.
# NOTE(review): the meaning of the trailing number is not evident from this notebook.
mu, pu, qi, bu, bi = cdl.training()  #188910

Noising of reviews
Start training...
EPOCH 1 / 60
ALS LOSS RMSE = 1.0773835784579748, MAE = 0.8043562550003153
MODEL LOSS 832.48773
AUTOENCODER LOSS 306.33704
VALIDATION LOSS 1.0766787445791401
EPOCH 2 / 60
ALS LOSS RMSE = 1.047620455870552, MAE = 0.7803526635693512
MODEL LOSS 315.40857
AUTOENCODER LOSS 303.38553
VALIDATION LOSS 1.051371612212707
EPOCH 3 / 60
ALS LOSS RMSE = 1.028919003686254, MAE = 0.764898507203547
MODEL LOSS 311.3222
AUTOENCODER LOSS 303.2918
VALIDATION LOSS 1.0365650646448084
EPOCH 4 / 60
ALS LOSS RMSE = 1.015280502736367, MAE = 0.7535648524303195
MODEL LOSS 310.3518
AUTOENCODER LOSS 303.2943
VALIDATION LOSS 1.0264365236679605
EPOCH 5 / 60
ALS LOSS RMSE = 1.0045973036391522, MAE = 0.7446678850280272
MODEL LOSS 309.63425
AUTOENCODER LOSS 303.25046
VALIDATION LOSS 1.0189580306768156
EPOCH 6 / 60
ALS LOSS RMSE = 0.9958543351477882, MAE = 0.7373792162057118
MODEL LOSS 309.13
AUTOENCODER LOSS 303.2558
VALIDATION LOSS 1.0131775438225445
EPOCH 7 / 60
ALS LOSS RMSE = 0.988

In [13]:
# Report the shapes of the learned factor matrices and bias vectors,
# then persist all of them in a single pickle file.
print("pu shape: %s x %s" % pu.shape)
print("qi shape: %s x %s" % qi.shape)

print("beta_u shape: %s" % bu.shape)
print("beta_i shape: %s" % bi.shape)

learned_params = {'mu':mu, 'pu':pu, 'qi':qi, 'bu':bu, 'bi':bi}
with open(U_V_PATH + 'mx.pickle', 'wb') as out_file:
    pickle.dump(learned_params, out_file, protocol=pickle.HIGHEST_PROTOCOL)

pu shape: 474296 x 50
qi shape: 123196 x 50
beta_u shape: 474296
beta_i shape: 123196


In [None]:
# FIXME: `ds.` below is an unfinished attribute access, so this cell is a
# SyntaxError and cannot run as written (it was never executed). Fill in the
# DataSet attribute that should be pickled — judging by the file name, an
# item-id mapping; confirm against models.dataset.
with open(U_V_PATH + 'item_ids_map.pickle', 'wb') as handle:
    pickle.dump(ds., handle, protocol=pickle.HIGHEST_PROTOCOL)

## Evaluate model


In [14]:
# Predict the test-set ratings from the learned parameters.
# Argument order after the rating matrix is mu, bu, bi, qi, pu.
test_preds = models.mf_sgd.SGD.predict_dataset_with_params(
    ds.get_test_rating_matrix(), mu, bu, bi, qi, pu
)

In [15]:
# Error metrics on the held-out ratings. The first value is the square root of
# the mean squared error, i.e. RMSE — it was previously stored and printed as "MSE".
rmse = mean_squared_error(ds.testset.rate__, test_preds) ** 0.5
mae = mean_absolute_error(ds.testset.rate__, test_preds)

# Legacy alias: later cells still read `mse`; the value it holds is the RMSE.
mse = rmse

print("RMSE: %s" % rmse)
print("MAE: %s" % mae)

MSE: 0.9815240577602424
MAE: 0.7168159081593866


In [None]:
# Mean precision@k and recall@k over the values returned by the helper,
# for every cutoff k in 0..199 (ratings above `threshold` handling is the helper's).
# NOTE(review): the sweep starts at k=0 — confirm the helper handles an empty top-k list.
k_prec, k_rec = {}, {}

for k in range(200):
    precisions, recalls = precision_recall_at_k_4arrays(
        ds.testset.uid__, test_preds, ds.testset.rate__, k=k, threshold=3
    )
    k_prec[k] = np.mean(list(precisions.values()))
    k_rec[k] = np.mean(list(recalls.values()))

In [None]:
# Assemble the results record for this experiment.
# nfactors is read from the learned user-factor matrix (its column count) rather
# than hard-coded: the previous literal 25 did not match the k=50 used for training.
# `mse` holds the root of the mean squared error, hence it is passed as rmse.
row = XPResults(
    dataset='Goodreads_Kids',
    xpdata=XPDescription(predictor=None, label='CDL-SDAE-SGD', nfactors=pu.shape[1]),
    rmse=mse,
    mae=mae,
    precision=k_prec,
    recall=k_rec,
)

In [None]:
# Persist the results row via the project helper.
# NOTE(review): the two string arguments presumably select the output folder and
# file name — confirm against helpful_stuff.utils_xp_out.write_to_csv.
write_to_csv(row, 'goodreads', 'sdae_sgd_optim')