# CDL

In [110]:
import numpy as np
import pickle
import tensorflow as tf
import pandas as pd
from keras.preprocessing.text import Tokenizer, one_hot
from pandas.api.types import CategoricalDtype
from scipy.sparse import csr_matrix
#init random seed
np.random.seed(5)
from sklearn.model_selection import train_test_split
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, mean_absolute_error
from collections import defaultdict, namedtuple
from experiment_out_utils import precision_recall_at_k_4df, write_to_csv, XPData, XPRow, write_row
import itertools as it
from model_out_utils import make_out_dirs
import mf_sgd
from imp import reload
reload(mf_sgd)

<module 'mf_sgd' from 'C:\\Users\\irina\\Dev\\master\\mf_sgd.py'>

In [111]:
### create all necessary dirs for output ###

XP_PATH, U_V_PATH, MODEL_PATH = make_out_dirs(model_name='sdae-sgd', xp_name='sdae_sgd_optimized4') 
print("Out dir of experiment: ", XP_PATH)
print("Out dir of U, V matricies: ", U_V_PATH)
print("Out dir of model parameters: ", MODEL_PATH)

Out dir of experiment:  D:/Models/thesis/sdae-sgd/sdae_sgd_optimized4/
Out dir of U, V matricies:  D:/Models/thesis/sdae-sgd/sdae_sgd_optimized4/pickles/
Out dir of model parameters:  D:/Models/thesis/sdae-sgd/sdae_sgd_optimized4/tf/


## 1. Data Preprocessing

In [139]:
#df = pd.read_json('/home/neopux/UHH/datasets/Video_Games_5_proc.json')
df = pd.read_json(r'D:\Datasets\amazon_reviews\processed\reviews_Toys_and_Games_5.json')

In [140]:
df.head(2)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTextProc,reviewTime,reviewerID,reviewerName,summary,summaryProc,unixReviewTime
0,439893577,"[0, 0]",5,I like the item pricing. My granddaughter want...,i like item pricing my granddaughter want mark...,"01 29, 2014",A1VXOAVRGKGEAK,Angie,Magnetic board,magnetic board,1390953600
1,439893577,"[1, 1]",4,Love the magnet easel... great for moving to d...,love magnet easel great move different area wi...,"03 28, 2014",A8R62G708TSCM,Candace,it works pretty good for moving to different a...,work pretty good move different area,1395964800


In [141]:
df_train, df_test = train_test_split(df, test_size = 0.3, stratify=df['reviewerID'], random_state=42)

### Prepare Reviews

In [142]:
train_reviews = df_train.groupby('asin').reviewTextProc.agg(' '.join)
test_reviews = df_test.groupby('asin').reviewTextProc.agg(' '.join)

In [143]:
train_summaries = df_train.groupby('asin').summaryProc.agg(' '.join)
test_summaries = df_test.groupby('asin').summaryProc.agg(' '.join)

In [144]:
total_train_rev = train_summaries + train_reviews
total_test_rev = test_summaries + test_reviews

In [145]:
vectorizer = TfidfVectorizer(max_features=10000)
item_infomation_matrix = vectorizer.fit_transform(total_train_rev.values)

In [146]:
del total_train_rev, total_test_rev, test_summaries, train_summaries, train_reviews, test_reviews

In [147]:
print(item_infomation_matrix.shape)

(11917, 10000)


In [148]:
sorted(vectorizer.vocabulary_.items(), reverse=True)[5940:5960]

[('had', 4059),
 ('hack', 4058),
 ('habitat', 4057),
 ('habit', 4056),
 ('haba', 4055),
 ('ha', 4054),
 ('gyroscope', 4053),
 ('gyro', 4052),
 ('gymnastic', 4051),
 ('gym', 4050),
 ('guys', 4049),
 ('guy', 4048),
 ('gut', 4047),
 ('gus', 4046),
 ('guppy', 4045),
 ('guppies', 4044),
 ('guppie', 4043),
 ('gup', 4042),
 ('guns', 4041),
 ('gunner', 4040)]

In [149]:
list(vectorizer.stop_words_)[0:10]

['hourage',
 'onebuy',
 'haothis',
 'zirndorf',
 'missles',
 'dishonesty',
 'breeds',
 'willlet',
 'soother',
 'punks']

In [150]:
item_infomation_matrix = np.array(item_infomation_matrix.todense())

### Prepare Rating Matrix

In [152]:
df_train = df_train.copy()
asins_map = {v:k for k,v in enumerate(df_train['asin'].unique())}
reviewers_map = {v:k for k,v in enumerate(df_train['reviewerID'].unique())}

df_train['in_asin'] = df_train['asin'].map(asins_map)
df_train['in_revID'] = df_train['reviewerID'].map(reviewers_map)

dataset = df_train[['in_asin', 'in_revID', 'overall']]

trainset, valset = train_test_split(dataset, test_size = 0.2, stratify=dataset['in_revID'], random_state=42)

df_test = df_test.copy()
df_test['value'] = 0
df_test['in_asin'] = df_test['asin'].map(asins_map)
df_test['in_revID'] = df_test['reviewerID'].map(reviewers_map)
df_test = df_test.dropna(subset=['in_asin'])
df_test = df_test.astype({"in_asin": int})

testset = df_test[['in_asin', 'in_revID', 'overall']]

## 2. Building Model

### Masking Noise 

In [153]:
def mask(corruption_level, shape):
    return np.random.binomial(1, 1 - corruption_level, shape)

def add_noise(x , corruption_level ):
    mask_ = mask(corruption_level , x.shape)
    print("Mask shape: " + str(mask_.shape))
    x = np.multiply(x, mask_)
    print("Noising completed..:" + str(x.shape))
    return x

### SDAE & CDL

In [154]:
class CDL():
    def __init__(self , rating_matrix, valid_set, item_infomation_matrix, n_users, n_items, out_path = None, k=10, 
                 epochs=50, batch_size=32, lr=0.001, hidden_size=25, matrix_noise = 0.3,
                drop_ratio=0.1, lambda_w = 1, lambda_n = 1, lambda_v = 1, lambda_q = 0.01):
        self.out_path = out_path
        
        self.k = k
        self.n_input = item_infomation_matrix.shape[1] # dimensionality of text representations - 1000
        self.n_hidden1 = hidden_size
        self.n_hidden2 = self.k
        
        # lambdas for loss calc
        self.lambda_w = lambda_w
        self.lambda_n = lambda_n
        self.lambda_v = lambda_v
        self.lambda_q = lambda_q
        
        self.drop_ratio = drop_ratio
        self.learning_rate = lr
        self.epochs = epochs
        self.batch_size = batch_size
        
        self.num_u = n_users
        self.num_v = n_items
        
        intializer = tf.variance_scaling_initializer()
        self.non_zero_idx = rating_matrix > 0
        
        self.Weights = { 
            'w1' : tf.Variable(intializer([self.n_input, self.n_hidden1]), dtype=tf.float32),
            'w2' : tf.Variable(intializer([self.n_hidden1, self.n_hidden2]), dtype=tf.float32),
            'w3' : tf.Variable(intializer([self.n_hidden2, self.n_hidden1]), dtype=tf.float32),
            'w4' : tf.Variable(intializer([self.n_hidden1, self.n_input]), dtype=tf.float32)   
        }
        self.Biases = {
            #'b1' : tf.Variable(tf.random_normal( [self.n_hidden1] , mean=0.0, stddev=1 / self.lambda_w )),
            #'b2' : tf.Variable(tf.random_normal( [self.n_hidden2] , mean=0.0, stddev=1 / self.lambda_w )),
            #'b3' : tf.Variable(tf.random_normal( [self.n_hidden1] , mean=0.0, stddev=1 / self.lambda_w )),
            #'b4' : tf.Variable(tf.random_normal( [self.n_input] , mean=0.0, stddev=1 / self.lambda_w ))
            'b1' : tf.Variable(tf.zeros(self.n_hidden1)),
            'b2' : tf.Variable(tf.zeros(self.n_hidden2)),
            'b3' : tf.Variable(tf.zeros(self.n_hidden1)),
            'b4' : tf.Variable(tf.zeros(self.n_input))
        }
        
        self.item_infomation_matrix = item_infomation_matrix
        self.item_infomation_matrix_noise = add_noise(self.item_infomation_matrix , matrix_noise)
        self.rating_matrix = rating_matrix
        self.valid_set = valid_set
    
        self.build_model()
        self.saver = tf.train.Saver()
        
    def encoder(self , x , drop_ratio):
        w1 = self.Weights['w1']
        b1 = self.Biases['b1']
        L1 = tf.nn.relu(tf.matmul(x, w1) + b1)
        L1 = tf.nn.dropout( L1 , keep_prob= 1 - drop_ratio)
        
        w2 = self.Weights['w2']
        b2 = self.Biases['b2']
        L2 = tf.nn.relu(tf.matmul(L1, w2) + b2)
        L2 = tf.nn.dropout(L2 , keep_prob= 1 - drop_ratio)
        return L2
    
    def decoder(self , x , drop_ratio):
        w3 = self.Weights['w3']
        b3 = self.Biases['b3']
        L3 = tf.nn.relu(tf.matmul(x, w3) + b3)
        L3 = tf.nn.dropout(L3 , keep_prob= 1 - drop_ratio)

        w4 = self.Weights['w4']
        b4 = self.Biases['b4']
        L4 = tf.nn.relu(tf.matmul(L3, w4) + b4)
        L4 = tf.nn.dropout(L4 , keep_prob= 1 - drop_ratio)
        return L4
    
    def build_model(self):
        self.model_X_0 = tf.placeholder(tf.float32 , shape=(None , self.n_input))
        self.model_X_c = tf.placeholder(tf.float32 , shape=(None , self.n_input))
        
        self.model_V = tf.placeholder(tf.float32 , shape=(None , self.k))
        self.model_drop_ratio = tf.placeholder(tf.float32)
        
        self.V_sdae = self.encoder(self.model_X_0 , self.model_drop_ratio)
        self.y_pred = self.decoder(self.V_sdae , self.model_drop_ratio)
        
        self.Regularization = tf.reduce_sum([tf.nn.l2_loss(w) + tf.nn.l2_loss(b) 
                                             for w,b in zip(self.Weights.values() , self.Biases.values())])
        loss_r = 1/2 * self.lambda_w * self.Regularization
        self.loss_a = 1/2 * self.lambda_n * tf.reduce_sum(tf.pow(self.model_X_c - self.y_pred , 2))
        loss_v = 1/2 * self.lambda_v * tf.reduce_sum(tf.pow(self.model_V - self.V_sdae , 2))
        
        self.Loss = loss_r + self.loss_a + loss_v
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.Loss)
        
    
    def training(self):
        sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
        
        ## define dirs for tensorboard if needed
        if self.out_path != None:
            train_writer = tf.summary.FileWriter('%s/tf/train' % self.out_path, sess.graph)
            test_writer = tf.summary.FileWriter('%s/tf/test' % self.out_path, sess.graph)
            
        val_losses = []
        
        sess.run(tf.global_variables_initializer())
        mf = mf_sgd.SGD(dataset=self.rating_matrix, n_factors=self.k, 
                        n_items=self.num_v, n_users=self.num_u, 
                        lambda_q=self.lambda_q)
        
        for epoch in range(0, self.epochs):
            print("EPOCH %s / %s" % (epoch + 1, self.epochs))
            
            V_sdae = sess.run(self.V_sdae , feed_dict={self.model_X_0 : self.item_infomation_matrix_noise , 
                                                       self.model_drop_ratio : self.drop_ratio})
            # calc and print ALS loss every N epochs
            mu, pu, qi, bu, bi = mf.run_epoch(qi_cdl = V_sdae)
            err_rmse, err_mae = mf.current_error()
            
            val_loss =  mean_squared_error(self.valid_set[:, -1], mf.predict_dataset(self.valid_set)) ** 0.5
            val_losses.append(val_loss)
            
            ## stop early if during last 3 epochs error is only increasing
            if val_losses[-3:] and all(loss > val_losses[-3:][0] for loss in val_losses[-2:]):
                print('Stopping early because %s is larger than %s' % (val_losses[-3:][0], val_losses[-2:]))
                break
            
            auto_losses = []
            model_losses = []
            for i in range(0 , self.item_infomation_matrix.shape[0] , self.batch_size):
                X_train_batch = self.item_infomation_matrix_noise[i : i+self.batch_size]
                y_train_batch = self.item_infomation_matrix[i : i+self.batch_size]
                
                V_batch = qi[i : i + self.batch_size]
                _ , my_loss, auto_loss = sess.run([self.optimizer, self.Loss, self.loss_a] , 
                                       feed_dict={self.model_X_0: X_train_batch , 
                                                  self.model_X_c: y_train_batch, 
                                                  self.model_V: V_batch, 
                                                  self.model_drop_ratio : self.drop_ratio})
                auto_losses.append(auto_loss)
                model_losses.append(my_loss)
            
            print("ALS LOSS RMSE = %s, MAE = %s" % (err_rmse, err_mae)) 
            print("MODEL LOSS %s" % np.mean(model_losses))
            print("AUTOENCODER LOSS %s" % np.mean(auto_losses))  
            print("VALIDATION LOSS %s" % val_loss)
            
            # save log files
            if self.out_path != None:
                # dump summaries
                summary = tf.Summary();
                summary.value.add(tag='Autoencoder Loss', simple_value=np.mean(auto_losses))
                summary.value.add(tag='Model Loss', simple_value=np.mean(model_losses))
                summary.value.add(tag='ALS Loss', simple_value=err_rmse)
                summary.value.add(tag='Val Loss', simple_value=val_loss)
                train_writer.add_summary(summary, epoch + 1)
                # dump model and pickles
                if epoch % 5 == 0:
                    # save tensorflow model
                    self.saver.save(sess, '%s/tf/model_epoch_%s.ckpt' % (self.out_path, epoch))
                    # save matricies and biases
                    with open('%s/pickles/mx_epoch_%s.pickle' % (self.out_path, epoch), 'wb') as handle:
                        pickle.dump({'mu':mu, 'pu':pu, 'qi':qi, 'bu':bu, 'bi':bi}, handle, protocol=pickle.HIGHEST_PROTOCOL)
        
        sess.close()
        return mu, pu, qi, bu, bi

In [155]:
item_infomation_matrix.shape

(11917, 10000)

In [156]:
dataset.shape

(117317, 3)

In [157]:
testset.shape

(50245, 3)

### Training

In [158]:
%reset Out
tf.reset_default_graph()

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


Flushing output cache (6 entries)


In [159]:
cdl = CDL(trainset.values, valset.values, item_infomation_matrix, n_users=len(reviewers_map), n_items=len(asins_map),
          out_path=XP_PATH, k=25, hidden_size=100, 
          matrix_noise=0.3, drop_ratio=0.1, epochs=50,
          lambda_w=1, lambda_v=1, lambda_n=10, lambda_q = 10)
mu, pu, qi, bu, bi = cdl.training() #188910

Mask shape: (11917, 10000)
Noising completed..:(11917, 10000)
EPOCH 1 / 50
ALS LOSS RMSE = 0.9672284246053714, MAE = 0.7636993889826998
MODEL LOSS 546.3411
AUTOENCODER LOSS 150.94588
VALIDATION LOSS 0.9775386104392274
EPOCH 2 / 50
ALS LOSS RMSE = 0.9454426783442945, MAE = 0.7424814635280944
MODEL LOSS 152.40305
AUTOENCODER LOSS 150.60194
VALIDATION LOSS 0.9645576385203087
EPOCH 3 / 50
ALS LOSS RMSE = 0.9271677479359064, MAE = 0.72457765491275
MODEL LOSS 151.61737
AUTOENCODER LOSS 150.63403
VALIDATION LOSS 0.9542944538057154
EPOCH 4 / 50
ALS LOSS RMSE = 0.9113937737261483, MAE = 0.7090915862235802
MODEL LOSS 151.341
AUTOENCODER LOSS 150.6282
VALIDATION LOSS 0.9459461828721625
EPOCH 5 / 50
ALS LOSS RMSE = 0.8975303356724822, MAE = 0.695520195370166
MODEL LOSS 151.15845
AUTOENCODER LOSS 150.62498
VALIDATION LOSS 0.9390390837815858
EPOCH 6 / 50
ALS LOSS RMSE = 0.8851820704252794, MAE = 0.6834553598856111
MODEL LOSS 151.0142
AUTOENCODER LOSS 150.60057
VALIDATION LOSS 0.9332533706018639
EPOC

In [160]:
## dump U and V matricies to pickle files
print("pu shape: %s x %s" % pu.shape)
print("qi shape: %s x %s" % qi.shape)

print("beta_u shape: %s" % bu.shape)
print("beta_i shape: %s" % bi.shape)

with open(U_V_PATH + 'mx.pickle', 'wb') as handle:
    pickle.dump({'mu':mu, 'pu':pu, 'qi':qi, 'bu':bu, 'bi':bi}, handle, protocol=pickle.HIGHEST_PROTOCOL)

pu shape: 19412 x 25
qi shape: 11917 x 25
beta_u shape: 19412
beta_i shape: 11917


## Evaluation

In [161]:
test_preds = mf_sgd.SGD.predict_dataset_with_params(testset.values, mu, bu, bi, qi, pu)

In [162]:
df_test['value'] = test_preds

In [163]:
mse = mean_squared_error(df_test.overall, df_test.value) ** 0.5
mae = mean_absolute_error(df_test.overall, df_test.value)

print("MSE: %s" % mse)
print("MAE: %s" % mae)

MSE: 0.9024861162977229
MAE: 0.6614456544927189


In [164]:
k_prec = {}
k_rec = {}

for k in range(0, 200):
    precisions, recalls = precision_recall_at_k_4df(df_test, k=k, threshold=3) 
    p_mean = np.mean(list(precisions.values()))
    r_mean = np.mean(list(recalls.values()))
    k_prec[k] = p_mean
    k_rec[k] = r_mean

In [165]:
row = XPRow(dataset='Toys and Games', xpdata=XPData(predictor=None, label='CDL-SDAE-SGD', nfactors=25), rmse=mse, mae=mae, precision=k_prec, recall=k_rec)

In [166]:
write_to_csv(row, 'sdae_sgd_optimized4')

## Optimization

In [195]:
if not os.path.isdir('./optimiz/'):
    os.mkdir('./optimiz/')
    
%reset Out

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


Flushing output cache (12 entries)


In [196]:
params = {
    "hidden_size": [100, 150],
    "k": [25, 50],
    "matrix_noise": [0.3],
    "drop_ratio": [0.1],
    "lambda_q": [1, 10, 100],
    "lambda_v": [0.1, 1],
    "lambda_w": [1],
    "lambda_n": [10, 100]
}

# k=50, hidden_size=150, matrix_noise=0.3, drop_ratio=0.1, 
# lambda_w=20, lambda_v=100, lambda_n=10, lambda_q = 0.01

In [197]:
sorted_keys = sorted(params)
combinations = list(it.product(*(params[key] for key in sorted_keys)))

In [198]:
print("Num of combinations: %s" % len(combinations))

Num of combinations: 48


In [199]:
# write header row
write_row('./optimiz/cdl_sdae_sgd_5.csv', sorted_keys + ['rmse_train', 'mae_train', 'rmse_test', 'mae_test'])

In [200]:
for ps in combinations:
    tf.reset_default_graph()
    hyper_params = dict(zip(sorted_keys, ps))
    
    print("Start testing hyper params: ", hyper_params)
    cdl = CDL(dataset.values, testset.values, item_infomation_matrix,
              n_users=len(reviewers_map), n_items=len(asins_map),
              out_path=None, epochs=50, **hyper_params)
    
    mu, pu, qi, bu, bi = cdl.training() #188910
    
    preds = mf_sgd.SGD.predict_dataset_with_params(dataset.values, mu, bu, bi, qi, pu)
    train_rmse = mean_squared_error(df_train.overall, preds) ** 0.5
    train_mae = mean_absolute_error(df_train.overall, preds)
    print("MSE (non zero, train set): %s" % train_rmse)
    print("MAE (non zero, train set): %s" % train_mae)
    
    preds = mf_sgd.SGD.predict_dataset_with_params(testset.values, mu, bu, bi, qi, pu)
    test_rmse = mean_squared_error(df_test.overall, preds) ** 0.5
    test_mae = mean_absolute_error(df_test.overall, preds)
    print("MSE (test set): %s" % test_rmse)
    print("MAE (test set): %s" % test_mae)
    
    print("Stop testing hyper params: ", hyper_params)
    
    # write to file
    write_row('./optimiz/cdl_sdae_sgd_5.csv', [hyper_params[k] for k in sorted_keys] + [train_rmse, train_mae, test_rmse, test_mae] )

Start testing hyper params:  {'k': 25, 'lambda_v': 0.1, 'lambda_n': 10, 'drop_ratio': 0.1, 'hidden_size': 100, 'lambda_q': 1, 'matrix_noise': 0.3, 'lambda_w': 1}
Mask shape: (10668, 10000)
Noising completed..:(10668, 10000)
EPOCH 1 / 50
Running SGD...
ALS LOSS RMSE = 1.146258261541519, MAE = 0.8998693705933507
MODEL LOSS 578.9956
AUTOENCODER LOSS 140.20537
VALIDATION LOSS 1.1578175818141412
EPOCH 2 / 50
Running SGD...
ALS LOSS RMSE = 1.113357593106672, MAE = 0.8721600789981285
MODEL LOSS 138.97192
AUTOENCODER LOSS 136.30075
VALIDATION LOSS 1.1346009947588271
EPOCH 3 / 50
Running SGD...
ALS LOSS RMSE = 1.0892960046037186, MAE = 0.851714444789876
MODEL LOSS 138.12167
AUTOENCODER LOSS 136.357
VALIDATION LOSS 1.1191726398161224
EPOCH 4 / 50
Running SGD...
ALS LOSS RMSE = 1.0700055683768157, MAE = 0.8353285029059948
MODEL LOSS 138.07527
AUTOENCODER LOSS 136.34114
VALIDATION LOSS 1.1076883180561743
EPOCH 5 / 50
Running SGD...
ALS LOSS RMSE = 1.0537868619558617, MAE = 0.821493072049333
MODEL 