In [1]:
import os 
import sys
from tqdm import tqdm
import importlib
import numpy as np
import pickle
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime

In [3]:
BATCH_SIZE = 256
T = 20
VOCAB = ['start', 'view', 'click', 'install']
EVENT_VOCAB_DIM = len(VOCAB)
EMB_DIM = 16
HIDDEN_DIM = 128

## Classifier Model

In [4]:
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model
from tensorflow.keras import Sequential
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.layers import Input, LSTM, Embedding, Reshape, Dense, Dropout, Activation, Multiply, Add, Lambda
from tensorflow.keras import regularizers

from tensorflow.keras.metrics import AUC, BinaryAccuracy, Precision, Recall


def build_classifier(batch_size, T, event_vocab_dim, emb_dim, hidden_dim, dropout_rate=0.25):
    # normal LSTM
    i_et = Input(batch_shape=(batch_size, None, event_vocab_dim), name='event_type')  # input of discrete feature event type
    i_ts = Input(batch_shape=(batch_size, None, 1), name='time_delta_in')  # input of continuous feature timestamp
    
    embed0 = Dense(emb_dim, name='dense_emb')(i_et) # dense matrix size: 6*16
    merged0 = tf.concat([embed0, i_ts], axis=2)
    
    hm = LSTM(hidden_dim,
          name='lstm_token',
          stateful=False,
          return_sequences=False, 
          kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.1),
          recurrent_initializer=tf.keras.initializers.RandomNormal(stddev=0.1),
          bias_initializer=tf.keras.initializers.RandomNormal(stddev=0.1))(merged0)

    tm = LSTM(hidden_dim,
          name='lstm_time',
          stateful=False,
          return_sequences=False,
          kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.1),
          recurrent_initializer=tf.keras.initializers.RandomNormal(stddev=0.1),
          bias_initializer=tf.keras.initializers.RandomNormal(stddev=0.1))(merged0)
    
    token_time_comb = tf.keras.layers.concatenate([hm, tm], axis=1)
    
    dropped = Dropout(rate=dropout_rate)(token_time_comb)
    
    prob = Dense(1, 
             activation='sigmoid',
             name='final',
             kernel_initializer=tf.keras.initializers.TruncatedNormal(mean=0.0, stddev=0.1, seed=None),
             bias_initializer=tf.keras.initializers.Constant(value=0.1))(dropped)
        
    classifier = Model(
        inputs=[i_et, i_ts],
        outputs=prob)
        
    metrics = [
        BinaryAccuracy(name='accuracy'),
        Precision(name='precision'),
        Recall(name='recall'),
        AUC(num_thresholds=500, curve='PR', name='auc_pr'),
        AUC(num_thresholds=500, curve='ROC', name='auc_roc')
    ]
    
    classifier.compile(
        loss='binary_crossentropy',
        optimizer=tf.keras.optimizers.Adam(lr=0.001),
        metrics=metrics)
    
    return classifier

In [5]:
classifier = build_classifier(batch_size=BATCH_SIZE,
                              T=T,
                              event_vocab_dim=EVENT_VOCAB_DIM,
                              emb_dim=EMB_DIM,
                              hidden_dim=HIDDEN_DIM,    
                              dropout_rate=0.25)

In [6]:
classifier.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
event_type (InputLayer)         [(256, None, 4)]     0                                            
__________________________________________________________________________________________________
dense_emb (Dense)               (256, None, 16)      80          event_type[0][0]                 
__________________________________________________________________________________________________
time_delta_in (InputLayer)      [(256, None, 1)]     0                                            
__________________________________________________________________________________________________
tf.concat (TFOpLambda)          (256, None, 17)      0           dense_emb[0][0]                  
                                                                 time_delta_in[0][0]          

## Load Saved TF Datasets

In [7]:
path_prefix = '/home/lun/project-basileus/multitype-sequence-generation-by-tlstm-gan/data/classifier_tf_datasets'

In [44]:
train_names_gs_100 = [
'1-to-100-gs-im',
'1-to-100-gs-up',
'1-to-100-gs-G0',
'1-to-100-gs-G1',
'1-to-100-gs-G2']

train_names_gs_1000 = [
'1-to-1000-gs-im',
'1-to-1000-gs-up',
'1-to-1000-gs-G0',
'1-to-1000-gs-G1',
'1-to-1000-gs-G2']

val_data_path_gs_100 = os.path.join(path_prefix, '1-to-100-gs-val.tf.data')

val_data_path_gs_1000 = os.path.join(path_prefix, '1-to-1000-gs-val.tf.data')

In [45]:
results_save_path_prefix = '/home/lun/project-basileus/multitype-sequence-generation-by-tlstm-gan/experiments'

In [46]:
# For each dataset, run 10 trails; At each Trail, build a new model and train to get best metrics

TRIALS = 3

epochs = 20
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5
)

In [47]:
from tensorflow import TensorSpec
element_spec = ((TensorSpec(shape=(BATCH_SIZE, T, 4), dtype=tf.float64, name=None), 
                 TensorSpec(shape=(BATCH_SIZE, T, 1), dtype=tf.float64, name=None)),
                TensorSpec(shape=(BATCH_SIZE, 1), dtype=tf.float64, name=None))

## Experiment for 1-to-100-gs

In [48]:
val_dataset_gs_100 = tf.data.experimental.load(val_data_path_gs_100, element_spec)

In [49]:
for name in train_names_gs_100:
    dataset_path = os.path.join(path_prefix, name + '.tf.data')
    train_dataset = tf.data.experimental.load(dataset_path, element_spec)
    
    print('Processing Data:', dataset_path)
    ten_run_bests = []
    
    for run in range(TRIALS):        
        print('Trial:', run)
        
        classifier_run = build_classifier(batch_size=BATCH_SIZE,
                                          T=T,
                                          event_vocab_dim=EVENT_VOCAB_DIM,
                                          emb_dim=EMB_DIM,
                                          hidden_dim=HIDDEN_DIM,    
                                          dropout_rate=0.25)
        
        run_results = classifier_run.fit(train_dataset, 
                                        epochs=epochs,
                                        validation_data=val_dataset_gs_100,
                                        shuffle=True,
                                        callbacks=[early_stopping])
        
        run_history = run_results.history
        best_ind = np.argmin(run_history['val_loss'])
        run_best = {k : v_list[best_ind] for k, v_list in run_history.items()}
        
        ten_run_bests.append(run_best)
    
    df_ten_bests = pd.DataFrame(ten_run_bests)
    results_save_path = os.path.join(results_save_path_prefix, name + '_ten_bests.csv')
    df_ten_bests.to_csv(results_save_path)

Processing Data: /home/lun/project-basileus/multitype-sequence-generation-by-tlstm-gan/data/classifier_tf_datasets/1-to-100-gs-im.tf.data
Trial: 0
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Trial: 1
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20


Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Trial: 2
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Processing Data: /home/lun/project-basileus/multitype-sequence-generation-by-tlstm-gan/data/classifier_tf_datasets/1-to-100-gs-up.tf.data
Trial: 0
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


Epoch 8/20
Epoch 9/20
Epoch 10/20
Trial: 1
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Trial: 2
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Processing Data: /home/lun/project-basileus/multitype-sequence-generation-by-tlstm-gan/data/classifier_tf_datasets/1-to-100-gs-G0.tf.data
Trial: 0
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Trial: 1
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Trial: 2
Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Processing Data: /home/lun/project-basileus/multitype-sequence-generation-by-tlstm-gan/data/classifier_tf_datasets/1-to-100-gs-G1.tf.data
Trial: 0
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20


Epoch 13/20
Epoch 14/20
Trial: 1
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Trial: 2
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20


Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Processing Data: /home/lun/project-basileus/multitype-sequence-generation-by-tlstm-gan/data/classifier_tf_datasets/1-to-100-gs-G2.tf.data
Trial: 0
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Trial: 1
Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Trial: 2
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20


Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20


In [51]:
TRIALS

3

## Experiment for 1-to-1000-gs

In [None]:
val_dataset_gs_1000 = tf.data.experimental.load(val_data_path_gs_1000, element_spec)

In [None]:
for name in train_names_gs_1000:
    dataset_path = os.path.join(path_prefix, name + '.tf.data')
    train_dataset = tf.data.experimental.load(dataset_path, element_spec)
    
    print('Processing Data:', dataset_path)
    ten_run_bests = []
    
    for run in range(TRIALS):
        print('Trial:', run)
        classifier_run = build_classifier(batch_size=BATCH_SIZE,
                                          T=T,
                                          event_vocab_dim=EVENT_VOCAB_DIM,
                                          emb_dim=EMB_DIM,
                                          hidden_dim=HIDDEN_DIM,    
                                          dropout_rate=0.25)
        
        run_results = classifier_run.fit(train_dataset, 
                                        epochs=epochs,
                                        validation_data=val_dataset_gs_1000,
                                        shuffle=True,
                                        callbacks=[early_stopping])
        
        run_history = run_results.history
        best_ind = np.argmin(run_history['val_loss'])
        run_best = {k : v_list[best_ind] for k, v_list in run_history.items()}
        
        ten_run_bests.append(run_best)
    
    df_ten_bests = pd.DataFrame(ten_run_bests)
    results_save_path = os.path.join(results_save_path_prefix, name + '_ten_bests.csv')
    df_ten_bests.to_csv(results_save_path)