# CDL for Amazon datasets

In [1]:
import itertools as it
import os
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import mean_squared_error, mean_absolute_error

from helpful_stuff.utils_xp_out import write_to_csv, XPDescription, XPResults
from helpful_stuff.utils_metrics import precision_recall_at_k_4arrays
from helpful_stuff.utils_model_out import make_out_dirs
from models.model_cdl_sdae_sgd import CDL
import models.mf_sgd
from models.dataset import DataSet

In [2]:
### Create every output directory this experiment writes to ###

XP_PATH, U_V_PATH, MODEL_PATH = make_out_dirs(model_name='sdae-sgd', xp_name='test')

# Echo the three returned locations so the run log shows where artefacts go.
for label, out_dir in (
    ("Out dir of experiment: ", XP_PATH),
    ("Out dir of U, V matricies: ", U_V_PATH),
    ("Out dir of model parameters: ", MODEL_PATH),
):
    print(label, out_dir)

Out dir of experiment:  D:/Models/thesis/sdae-sgd/test/
Out dir of U, V matricies:  D:/Models/thesis/sdae-sgd/test/pickles/
Out dir of model parameters:  D:/Models/thesis/sdae-sgd/test/tf/


## Prepare Data

In [3]:
# Load the pre-processed Video Games review file into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path; the
# notebook only runs where that file exists at exactly this location.
df = pd.read_json(
    r'D:\Datasets\amazon_reviews\processed\reviews_Video_Games_5.json'
)

In [4]:
# Show the first two rows as a quick check of the loaded columns.
df.head(2)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTextProc,reviewTime,reviewerID,reviewerName,summary,summaryProc,unixReviewTime
0,700099867,"[8, 12]",1,Installing the game was a struggle (because of...,instal game struggle game window live bugs).so...,"07 9, 2012",A2HD75EMZR8QLN,123,Pay to unlock content? I don't think so.,pay unlock content i not think,1341792000
1,700099867,"[0, 0]",4,If you like rally cars get this game you will ...,if like rally car game fun it orient 34;europe...,"06 30, 2013",A3UR8NLLY1ZHCX,"Alejandro Henao ""Electronic Junky""",Good rally game,good rally game,1372550400


In [5]:
# Per rating row: processed review body and processed summary, joined by a space.
df['review'] = df['reviewTextProc'] + ' ' + df['summaryProc']

# Per item (`asin`): concatenate all of its review texts with spaces, then
# turn the grouped Series back into a two-column frame (asin, review).
reviews = (
    df.groupby('asin')['review']
    .agg(' '.join)
    .to_frame()
    .reset_index()
)

In [6]:
# Wrap the ratings frame and the per-item review frame in the project's
# DataSet container; the *_cols lists name the columns to read from each frame.
# NOTE(review): presumably the column order is (user, item, rating) and
# (item, text) — confirm against models.dataset.DataSet.
ds = DataSet(
    df_ratings=df,
    rating_cols=['reviewerID', 'asin', 'overall'],
    df_reviews=reviews,
    review_cols=['asin', 'review'],
)

Filled in 0 empty reviews: []...


In [9]:
# Remove the notebook-level names of the raw frames; the cells that follow
# work through `ds`. The memory is only reclaimed if nothing else (e.g. `ds`)
# still holds a reference to these objects.
del reviews, df

## Training

In [10]:
%reset Out

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


Flushing output cache (1 entries)


In [11]:
# Configure the CDL model on the prepared dataset; artefacts go to XP_PATH.
cdl = CDL(
    ds,
    out_path=XP_PATH,
    k=25,
    hidden_size=100,
    matrix_noise=0.3,
    drop_ratio=0.1,
    epochs=50,
    lambda_w=1,
    lambda_v=1,
    lambda_n=10,
    lambda_q=10,
)

# training() returns five values; they are fed to the predictor and pickled
# further down under these names.
mu, pu, qi, bu, bi = cdl.training()  # 188910

Noising of reviews
Start training...
EPOCH 1 / 50
ALS LOSS RMSE = 1.1505558827879898, MAE = 0.9037118074328621
MODEL LOSS 576.12103
AUTOENCODER LOSS 135.31882
VALIDATION LOSS 1.1592505657665906
EPOCH 2 / 50
ALS LOSS RMSE = 1.1197453489949587, MAE = 0.8779702279376952
MODEL LOSS 135.97928
AUTOENCODER LOSS 132.1906
VALIDATION LOSS 1.1390084504144624
EPOCH 3 / 50
ALS LOSS RMSE = 1.0968466609434462, MAE = 0.8586397172751435
MODEL LOSS 134.65172
AUTOENCODER LOSS 132.1922
VALIDATION LOSS 1.1254126374585418
EPOCH 4 / 50
ALS LOSS RMSE = 1.0783039919143662, MAE = 0.8429767355623358
MODEL LOSS 134.40883
AUTOENCODER LOSS 132.23149
VALIDATION LOSS 1.1153087984142394
EPOCH 5 / 50
ALS LOSS RMSE = 1.06260920690077, MAE = 0.8296863494894282
MODEL LOSS 134.3356
AUTOENCODER LOSS 132.30417
VALIDATION LOSS 1.1073054268318248
EPOCH 6 / 50
ALS LOSS RMSE = 1.0489278196253082, MAE = 0.8180280797593228
MODEL LOSS 134.18665
AUTOENCODER LOSS 132.23872
VALIDATION LOSS 1.1007958763428867
EPOCH 7 / 50
ALS LOSS RMSE

In [12]:
## Report the shapes of the learned parameters, then dump them to a pickle file

# Each template is %-formatted with the array's shape tuple, so the number of
# %s placeholders must match the array's rank.
for template, array in (
    ("pu shape: %s x %s", pu),
    ("qi shape: %s x %s", qi),
    ("beta_u shape: %s", bu),
    ("beta_i shape: %s", bi),
):
    print(template % array.shape)

# Everything needed to rebuild predictions later, keyed by parameter name.
learned_params = {'mu': mu, 'pu': pu, 'qi': qi, 'bu': bu, 'bi': bi}

with open(U_V_PATH + 'mx.pickle', 'wb') as handle:
    pickle.dump(learned_params, handle, protocol=pickle.HIGHEST_PROTOCOL)

pu shape: 24303 x 25
qi shape: 10672 x 25
beta_u shape: 24303
beta_i shape: 10672


## Evaluation

In [13]:
# Score the held-out ratings with the learned parameters.
# Note the argument order expected by the predictor: mu, bu, bi, qi, pu.
test_preds = models.mf_sgd.SGD.predict_dataset_with_params(
    ds.get_test_rating_matrix(),
    mu,
    bu,
    bi,
    qi,
    pu,
)

In [14]:
# Test-set error of the predictions against the true ratings.
# mean_squared_error returns the MSE; the ** 0.5 turns it into the RMSE.
# The variable keeps its historical name `mse` because the results row built
# later in the notebook reads it (as `rmse=mse`).
mse = mean_squared_error(ds.testset.rate__, test_preds) ** 0.5
mae = mean_absolute_error(ds.testset.rate__, test_preds)

# Label the value for what it is: the root of the MSE, not the MSE itself.
print("RMSE: %s" % mse)
print("MAE: %s" % mae)

MSE: 1.0673504415211237
MAE: 0.8100296409187724


In [28]:
# Mean precision@k and recall@k for every cut-off k in [0, 200), with ratings
# above/below threshold=3 deciding relevance inside the helper.
# NOTE(review): the range starts at k=0, a cut-off with no recommendations;
# confirm that the helper's value for k=0 is meaningful.
k_prec, k_rec = {}, {}

for k in range(200):
    precisions, recalls = precision_recall_at_k_4arrays(
        ds.testset.uid__,
        test_preds,
        ds.testset.rate__,
        k=k,
        threshold=3,
    )
    # The helper returns two dicts (presumably keyed by user id); average
    # their values to get a single number per cut-off.
    k_prec[k] = np.mean(list(precisions.values()))
    k_rec[k] = np.mean(list(recalls.values()))

In [29]:
# Collect this run's metrics and its description into one results record.
# `mse` holds the root-mean-squared error computed in the metrics cell.
row = XPResults(
    dataset='Video Games',
    xpdata=XPDescription(
        predictor=None,
        label='CDL-SDAE-SGD',
        nfactors=25,
    ),
    rmse=mse,
    mae=mae,
    precision=k_prec,
    recall=k_rec,
)

In [30]:
# Hand the results record to the project's CSV helper.
# NOTE(review): the meaning of the 'amazon' and 'sdae_sgd_optim' arguments is
# not visible here — presumably they select the output file; confirm against
# helpful_stuff.utils_xp_out.write_to_csv.
write_to_csv(row, 'amazon', 'sdae_sgd_optim')

## Optimization

In [195]:
# Make sure the directory for the grid-search results exists.
# exist_ok=True makes this idempotent without a separate isdir() check, so
# re-running the cell (or a concurrent creation) does not raise.
os.makedirs('./optimiz/', exist_ok=True)
    
%reset Out

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


Flushing output cache (12 entries)


In [196]:
# Hyper-parameter grid for the search below: each key is passed to the CDL
# constructor as a keyword argument, each value lists the candidates to try.
params = dict(
    hidden_size=[100, 150],
    k=[25, 50],
    matrix_noise=[0.3],
    drop_ratio=[0.1],
    lambda_q=[1, 10, 100],
    lambda_v=[0.1, 1],
    lambda_w=[1],
    lambda_n=[10, 100],
)

# Another configuration, kept for reference:
# k=50, hidden_size=150, matrix_noise=0.3, drop_ratio=0.1,
# lambda_w=20, lambda_v=100, lambda_n=10, lambda_q = 0.01

In [197]:
# Fix one (alphabetical) key order so every combination tuple can later be
# zipped back onto the parameter names.
sorted_keys = sorted(params)

# Cartesian product of the candidate lists, taken in that same key order.
combinations = list(it.product(*[params[name] for name in sorted_keys]))

In [198]:
# Report the grid size, i.e. how many hyper-parameter tuples the search loop
# will iterate over.
print("Num of combinations: %s" % len(combinations))

Num of combinations: 48


In [199]:
# Write the header row of the results file: one column per hyper-parameter (in
# sorted-key order) followed by the four error metrics.
# NOTE(review): `write_row` is not imported in any visible cell — confirm
# where it comes from and add it to the import cell.
write_row(
    './optimiz/cdl_sdae_sgd_5.csv',
    sorted_keys + ['rmse_train', 'mae_train', 'rmse_test', 'mae_test'],
)

In [None]:
# Grid search: train one CDL model per hyper-parameter combination and append
# its train/test errors to the results CSV.
# NOTE(review): `write_row` is not imported in any visible cell — confirm
# where it comes from and add it to the import cell.
for ps in combinations:
    # Start every run from a fresh default TensorFlow graph.
    tf.reset_default_graph()
    hyper_params = dict(zip(sorted_keys, ps))

    print("Start testing hyper params: ", hyper_params)
    cdl = CDL(ds, out_path=None, epochs=50, **hyper_params)

    mu, pu, qi, bu, bi = cdl.training(verbose=False)  # 188910

    # Train-set error.
    # NOTE(review): `dataset` and `df_train` are not defined in any visible
    # cell (hidden kernel state). Replace them with the training rating matrix
    # and training ratings exposed by `ds` — accessor names to be confirmed.
    preds = models.mf_sgd.SGD.predict_dataset_with_params(dataset.values, mu, bu, bi, qi, pu)
    train_rmse = mean_squared_error(df_train.overall, preds) ** 0.5
    train_mae = mean_absolute_error(df_train.overall, preds)
    print("RMSE (non zero, train set): %s" % train_rmse)
    print("MAE (non zero, train set): %s" % train_mae)

    # Test-set error, computed the same way as in the evaluation cells: the
    # predictor is reached through the imported `models.mf_sgd` module and the
    # inputs come from `ds` (the earlier `testset` / `df_test` names are not
    # defined in this notebook).
    preds = models.mf_sgd.SGD.predict_dataset_with_params(ds.get_test_rating_matrix(), mu, bu, bi, qi, pu)
    test_rmse = mean_squared_error(ds.testset.rate__, preds) ** 0.5
    test_mae = mean_absolute_error(ds.testset.rate__, preds)
    print("RMSE (test set): %s" % test_rmse)
    print("MAE (test set): %s" % test_mae)

    print("Stop testing hyper params: ", hyper_params)

    # Append one result row; column order matches the header (sorted keys,
    # then train RMSE/MAE, then test RMSE/MAE).
    write_row(
        './optimiz/cdl_sdae_sgd_5.csv',
        [hyper_params[key] for key in sorted_keys] + [train_rmse, train_mae, test_rmse, test_mae],
    )