# CDL for Goodreads datasets

In [1]:
import pickle
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
from helpful_stuff.utils_xp_out import write_to_csv, XPDescription, XPResults
from helpful_stuff.utils_metrics import precision_recall_at_k_4df
from helpful_stuff.utils_model_out import make_out_dirs
import models.model_cdl_sdae_sgd
import models.mf_sgd
import models.dataset

from importlib import reload
# Re-import the project modules so that edits made to them on disk are picked up
# without restarting the kernel. Only these two modules are reloaded here;
# models.mf_sgd is imported above but not reloaded.
reload(models.dataset)
# This call is the cell's last expression, so the module object it returns is
# echoed as the cell output.
reload(models.model_cdl_sdae_sgd)

<module 'models.model_cdl_sdae_sgd' from 'C:\\Users\\irina\\Dev\\master\\models\\model_cdl_sdae_sgd.py'>

In [2]:
### Set up every output directory this experiment writes to ###

# make_out_dirs returns three locations, unpacked in this order:
# experiment root, U/V matrix dumps, saved model parameters.
XP_PATH, U_V_PATH, MODEL_PATH = make_out_dirs(
    model_name='sdae-sgd',
    xp_name='goodreads_test',
)

# Report each location on its own line.
for _label, _out_dir in (
    ("Out dir of experiment: ", XP_PATH),
    ("Out dir of U, V matricies: ", U_V_PATH),
    ("Out dir of model parameters: ", MODEL_PATH),
):
    print(_label, _out_dir)

Out dir of experiment:  D:/Models/thesis/sdae-sgd/goodreads_test/
Out dir of U, V matricies:  D:/Models/thesis/sdae-sgd/goodreads_test/pickles/
Out dir of model parameters:  D:/Models/thesis/sdae-sgd/goodreads_test/tf/


## Prepare dataset

In [None]:
# Load the pre-processed Goodreads "children" files: user/book interactions
# first, then the review texts.
# NOTE(review): both paths are absolute and machine-specific.
df_rates = pd.read_json(
    path_or_buf='D:/Datasets/goodreads_reviews/processed/goodreads_interactions_children.json'
)
df_rev = pd.read_json(
    path_or_buf='D:/Datasets/goodreads_reviews/processed/goodreads_reviews_children.json'
)

In [None]:
# Preview the first two rows of the interactions frame to check what was loaded.
df_rates.head(2)

In [None]:
# Preview the first two rows of the reviews frame to check what was loaded.
df_rev.head(2)

In [None]:
# Collapse the reviews to one row per book: all processed review texts of a
# book are joined into a single space-separated string, and book_id is turned
# back into a regular column.
reviews = (
    df_rev
    .groupby('book_id')['review_text_proc']
    .agg(' '.join)
    .to_frame()
    .reset_index()
)

In [None]:
# Build the project's DataSet from the interactions frame and the per-book
# review texts, naming the columns each frame contributes.
ds = models.dataset.DataSet(
    df_ratings=df_rates,
    rating_cols=['user_id', 'book_id', 'rating'],
    df_reviews=reviews,
    review_cols=['book_id', 'review_text_proc'],
)

In [None]:
# Display the value returned by the dataset's train_item_num()
# (presumably the number of items in the training split; confirm in models.dataset).
ds.train_item_num()

In [None]:
# Display the dimensions of the review matrix held by the dataset.
ds.review_matrix.shape

In [None]:
# Remove the raw frames from the notebook namespace now that `ds` has been built
# from them. After this cell the dataset-preparation cells must be re-run from
# the JSON load before they can be executed again.
# NOTE(review): memory is only released if DataSet keeps no reference to these
# frames; confirm in models.dataset.
del reviews, df_rates, df_rev

## Train model

In [None]:
# IPython magic: clear the cached cell outputs (the Out history) before training,
# so results displayed by earlier cells are no longer referenced by the kernel.
# NOTE(review): without -f this magic may ask for confirmation when run interactively.
%reset Out 

In [None]:
# Configure the CDL model on the prepared dataset; everything it writes goes
# under XP_PATH. All hyperparameters are passed straight through to the
# project's CDL class.
cdl = models.model_cdl_sdae_sgd.CDL(
    ds,
    out_path=XP_PATH,
    k=50,
    hidden_size=250,
    matrix_noise=0.3,
    drop_ratio=0.2,
    epochs=60,
    lambda_w=1,
    lambda_v=1,
    lambda_n=10,
    lambda_q=10,
)

# training() returns five values, unpacked here as mu, pu, qi, bu, bi.
# NOTE(review): the recorded log below reports "/ 50" epochs and a 25-wide
# v_batch, which does not match epochs=60 / k=50 above. The output looks stale
# and should be re-run.
mu, pu, qi, bu, bi = cdl.training()  # 188910 (original author's note; meaning not documented)

Noising of reviews
Start training...
EPOCH 1 / 50
v_batch (28, 25)
x_batch (28, 10000)
y_batch (28, 10000)
ALS LOSS RMSE = 1.0777410432015937, MAE = 0.8045781860611942
MODEL LOSS 194.34715
AUTOENCODER LOSS 152.82315
VALIDATION LOSS 1.0766537580560767
EPOCH 2 / 50
v_batch (28, 25)
x_batch (28, 10000)
y_batch (28, 10000)
ALS LOSS RMSE = 1.047743944313262, MAE = 0.7803514890220751
MODEL LOSS 155.99887
AUTOENCODER LOSS 153.0903
VALIDATION LOSS 1.051096351345143
EPOCH 3 / 50
v_batch (28, 25)
x_batch (28, 10000)
y_batch (28, 10000)
ALS LOSS RMSE = 1.0288393297004748, MAE = 0.7647227453668742
MODEL LOSS 155.69511
AUTOENCODER LOSS 153.10727
VALIDATION LOSS 1.0361149065310051
EPOCH 4 / 50


In [None]:
## Report the shapes of the learned matrices and dump them to a pickle file

# Format each shape by joining its dimensions with " x ". Applying `%` directly
# to a shape tuple only works when the tuple length equals the number of %s
# placeholders, so a bias stored as (n, 1) would raise TypeError. Joining the
# dimensions prints the same text as before for the ranks that worked and
# handles any other rank.
for _label, _arr in (
    ("pu", pu),
    ("qi", qi),
    ("beta_u", bu),
    ("beta_i", bi),
):
    print("%s shape: %s" % (_label, " x ".join(str(_dim) for _dim in _arr.shape)))

# Persist everything training returned in one dictionary under U_V_PATH
# (U_V_PATH is concatenated as-is, so it must end with a path separator).
with open(U_V_PATH + 'mx.pickle', 'wb') as handle:
    pickle.dump(
        {'mu': mu, 'pu': pu, 'qi': qi, 'bu': bu, 'bi': bi},
        handle,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

In [19]:
# Echo the review matrix itself. NumPy abbreviates large arrays, so only the
# corner values are shown in the output.
ds.review_matrix


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [12]:
# Re-check the value returned by train_item_num().
# NOTE(review): the recorded result (123197) is one more than the recorded
# len(ds.iid_map) (123196); confirm whether that offset is intentional.
ds.train_item_num()

123197

In [14]:
# Number of entries in the dataset's iid_map (presumably the item-id mapping;
# confirm in models.dataset).
# NOTE(review): the recorded result (123196) is one less than the recorded
# ds.train_item_num() (123197); confirm whether that offset is intentional.
len(ds.iid_map)

123196