In [1]:
import os 
import sys
from tqdm import tqdm
import importlib
import numpy as np
import pickle
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime

module_path = "/home/lun/project-basileus/multitype-sequence-generation-by-tlstm-gan/"
data_path_train = "/home/lun/project-basileus/multitype-sequence-generation-by-tlstm-gan/data/long_seqs_v11/"
data_path_val = "/home/lun/project-basileus/multitype-sequence-generation-by-tlstm-gan/data/long_seqs_v11_val/"

if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
BATCH_SIZE = 256
N_DATA = 400
T = 20
VOCAB = ['start', 'view', 'click', 'install']
EVENT_VOCAB_DIM = len(VOCAB)
EMB_DIM = 16
HIDDEN_DIM = 128

## Load Data

In [3]:
def load_sequence_from_pickle_to_numpy(pickle_file_path, use_init_token=False):
    """
        A list of sequence in format of (event_type, time_delta)
    :param pickle_file_path: e.g. /.../project-basileus/seq-gan/data/fixed_length/valid_sequences.pickle
    :return: (event_type_seqs, time_delta)
    """
    with open(pickle_file_path, 'rb') as f:
        raw_seqs = pickle.load(f)

    if not raw_seqs or not raw_seqs[0]:
        return np.array([]), np.array([])

    N = len(raw_seqs)
    T = len(raw_seqs[0])
    
    seqs = np.array(raw_seqs)
        
    et_seqs = seqs[:, :, 0].astype(np.float64).reshape((N, T, 1))
    ts_seqs = seqs[:, :, 1].astype(np.float64).reshape((N, T, 1))
    
    return et_seqs, ts_seqs

def zcore_norm(data):
    N = data.shape[0]
    
    _mean = tf.reduce_mean(data, axis=0)
    _mean = tf.expand_dims(_mean, axis=0)
    _mean = tf.repeat(_mean, N, axis=0)

    _std = tf.math.reduce_std(data, axis=0)
    _std = tf.expand_dims(_std, axis=0)
    _std = tf.repeat(_std, N, axis=0)

    return (data - _mean) / _std, _mean[0,:,:], _std[0,:,:]

def zcore_norm_2(data, _mean, _std):
    N = data.shape[0]
    
    _mean = tf.expand_dims(_mean, axis=0)
    _mean = tf.repeat(_mean, N, axis=0)

    _std = tf.expand_dims(_std, axis=0)
    _std = tf.repeat(_std, N, axis=0)

    return (data - _mean) / _std

def get_mean_std(data):
    _mean = tf.reduce_mean(data, axis=0)
    _mean = tf.expand_dims(_mean, axis=0)

    _std = tf.math.reduce_std(data, axis=0)
    _std = tf.expand_dims(_std, axis=0)

    return _mean, _std

def apply_mean_std(data, _mean, _std):
    N = data.shape[0]
    
    _mean = tf.reshape(_mean, (1, T, 1))
    _mean = tf.repeat(_mean, N, axis=0)
    
    _std = tf.reshape(_std, (1, T, 1))
    _std = tf.repeat(_std, N, axis=0)
    
    return data * _std + _mean

# pos_timestamp_seqs, GLOBAL_MEAN_POS, GLOBAL_STD_POS = zcore_norm(raw_pos_timestamp_seqs)
# neg_timestamp_seqs, GLOBAL_MEAN_NEG, GLOBAL_STD_NEG = zcore_norm(raw_neg_timestamp_seqs)

In [4]:
def load_data_and_process(pos_data_path, neg_data_path, ratio=500):
    raw_pos_event_type_seqs, raw_pos_timestamp_seqs = load_sequence_from_pickle_to_numpy(pos_data_path, use_init_token=False)
    raw_neg_event_type_seqs, raw_neg_timestamp_seqs = load_sequence_from_pickle_to_numpy(neg_data_path, use_init_token=False)
    
    # cast indicator data into one-hot
    pos_event_type_seqs = tf.cast(raw_pos_event_type_seqs, tf.int32)
    pos_event_type_seqs = tf.one_hot(pos_event_type_seqs, depth=EVENT_VOCAB_DIM, axis=2, dtype=tf.float64)
    pos_event_type_seqs = tf.squeeze(pos_event_type_seqs, axis=3)

    # cast indicator data into one-hot
    neg_event_type_seqs = tf.cast(raw_neg_event_type_seqs, tf.int32)
    neg_event_type_seqs = tf.one_hot(neg_event_type_seqs, depth=EVENT_VOCAB_DIM, axis=2, dtype=tf.float64)
    neg_event_type_seqs = tf.squeeze(neg_event_type_seqs, axis=3)
    
    pos_timestamp_seqs, _mean_pos, _std_pos = zcore_norm(raw_pos_timestamp_seqs)
    neg_timestamp_seqs, _mean_neg, _std_neg = zcore_norm(raw_neg_timestamp_seqs)
    
    pos_event_type_seqs, pos_timestamp_seqs = pos_event_type_seqs[:N_DATA, :, :], pos_timestamp_seqs[:N_DATA, :, :]
    neg_event_type_seqs, neg_timestamp_seqs = neg_event_type_seqs[:int(N_DATA*ratio), :, :], neg_timestamp_seqs[:int(N_DATA*ratio), :, :]
    
    return pos_event_type_seqs, pos_timestamp_seqs, neg_event_type_seqs, neg_timestamp_seqs, _mean_pos, _std_pos, _mean_neg, _std_neg

In [5]:
pos_data_path_train = os.path.join(data_path_train, 'positive_long_sequences.pickle')
neg_data_path_train = os.path.join(data_path_train, 'negative_long_sequences.pickle')

pos_data_path_val = os.path.join(data_path_val, 'positive_long_sequences.pickle')
neg_data_path_val = os.path.join(data_path_val, 'negative_long_sequences.pickle')

In [7]:
pos_event_type_seqs_train, pos_timestamp_seqs_train, neg_event_type_seqs_train, neg_timestamp_seqs_train, GLOBAL_MEAN_POS, GLOBAL_STD_POS, GLOBAL_MEAN_NEG, GLOBAL_STD_NEG = load_data_and_process(pos_data_path_train, neg_data_path_train)

In [8]:
pos_event_type_seqs_val, pos_timestamp_seqs_val, neg_event_type_seqs_val, neg_timestamp_seqs_val, _, _, _, _ = load_data_and_process(pos_data_path_val, neg_data_path_val)

## Create multitype SeqGan

In [9]:
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model
from tensorflow.keras import Sequential
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.layers import Input, LSTM, Embedding, Reshape, Dense, Dropout, Activation, Multiply, Add, Lambda
from tensorflow.keras import regularizers

from tensorflow.keras.metrics import AUC, BinaryAccuracy, Precision, Recall


def build_classifier(batch_size, T, event_vocab_dim, emb_dim, hidden_dim, dropout_rate=0.25):
    # normal LSTM
    i_et = Input(batch_shape=(batch_size, None, event_vocab_dim), name='event_type')  # input of discrete feature event type
    i_ts = Input(batch_shape=(batch_size, None, 1), name='time_delta_in')  # input of continuous feature timestamp
    
    embed0 = Dense(emb_dim, name='dense_emb')(i_et) # dense matrix size: 6*16
    merged0 = tf.concat([embed0, i_ts], axis=2)
    
    hm = LSTM(hidden_dim,
          name='lstm_token',
          stateful=False,
          return_sequences=False, 
          kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.1),
          recurrent_initializer=tf.keras.initializers.RandomNormal(stddev=0.1),
          bias_initializer=tf.keras.initializers.RandomNormal(stddev=0.1))(merged0)

    tm = LSTM(hidden_dim,
          name='lstm_time',
          stateful=False,
          return_sequences=False,
          kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.1),
          recurrent_initializer=tf.keras.initializers.RandomNormal(stddev=0.1),
          bias_initializer=tf.keras.initializers.RandomNormal(stddev=0.1))(merged0)
    
    token_time_comb = tf.keras.layers.concatenate([hm, tm], axis=1)
    
    dropped = Dropout(rate=dropout_rate)(token_time_comb)
    
    prob = Dense(1, 
             activation='sigmoid',
             name='final',
             kernel_initializer=tf.keras.initializers.TruncatedNormal(mean=0.0, stddev=0.1, seed=None),
             bias_initializer=tf.keras.initializers.Constant(value=0.1))(dropped)
        
    classifier = Model(
        inputs=[i_et, i_ts],
        outputs=prob)
        
    metrics = [
        BinaryAccuracy(name='accuracy'),
        Precision(name='precision'),
        Recall(name='recall'),
        AUC(num_thresholds=500, curve='PR', name='auc_pr'),
        AUC(num_thresholds=500, curve='ROC', name='auc_roc')
    ]
    
    classifier.compile(
        loss='binary_crossentropy',
        optimizer=tf.keras.optimizers.Adam(lr=0.001),
        metrics=metrics)
    
    return classifier

In [10]:
classifier = build_classifier(batch_size=BATCH_SIZE,
                              T=T,
                              event_vocab_dim=EVENT_VOCAB_DIM,
                              emb_dim=EMB_DIM,
                              hidden_dim=HIDDEN_DIM,    
                              dropout_rate=0.25)

In [11]:
classifier.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
event_type (InputLayer)         [(256, None, 4)]     0                                            
__________________________________________________________________________________________________
dense_emb (Dense)               (256, None, 16)      80          event_type[0][0]                 
__________________________________________________________________________________________________
time_delta_in (InputLayer)      [(256, None, 1)]     0                                            
__________________________________________________________________________________________________
tf.concat (TFOpLambda)          (256, None, 17)      0           dense_emb[0][0]                  
                                                                 time_delta_in[0][0]          

## Load Train and Val

In [12]:
N_pos = pos_event_type_seqs_train.shape[0]
N_neg = neg_event_type_seqs_train.shape[0]

# def split(data):
#     N = data.shape[0]
#     train_data, val_data, test_data = data[:int(0.6*N),:,:], data[int(0.6*N):int(1*N),:,:], data[int(1*N):,:,:]
#     return train_data, val_data, test_data

train_pos_et, val_pos_et = pos_event_type_seqs_train, pos_event_type_seqs_val
train_pos_ts, val_pos_ts = pos_timestamp_seqs_train, pos_timestamp_seqs_val

train_neg_et, val_neg_et = neg_event_type_seqs_train, neg_event_type_seqs_val
train_neg_ts, val_neg_ts = neg_timestamp_seqs_train, neg_timestamp_seqs_val

In [13]:
def create_tf_dataset(features: np.array, labels: np.array, batch_size=2, epochs=10, buffer_size=10000):
    """
    Create TF dataset from
    """
    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    dataset = dataset.shuffle(buffer_size)
    dataset = dataset.repeat(epochs)
    dataset = dataset.batch(batch_size, drop_remainder=True)
    return dataset


def combine_seqs_to_dataset(pos_et, pos_ts, neg_et, neg_ts, batch_size, epochs):
    _N_pos = pos_et.shape[0]
    _N_neg = neg_et.shape[0]

    _ets = tf.concat([pos_et, neg_et], axis=0)
    _tss = tf.concat([pos_ts, neg_ts], axis=0)
    _labels = tf.concat([np.ones((_N_pos, 1)), np.zeros((_N_neg, 1))], axis=0)

    _dataset = create_tf_dataset((_ets, _tss), 
                                 _labels,
                                 batch_size=batch_size,
                                 epochs=epochs,
                                 buffer_size=_N_pos + _N_neg) # shuffle the entire Dataset
    
    return _dataset

In [14]:
val_dataset = combine_seqs_to_dataset(val_pos_et, val_pos_ts, val_neg_et, val_neg_ts, BATCH_SIZE, 1)

In [15]:
from tensorflow import TensorSpec
element_spec = ((TensorSpec(shape=(BATCH_SIZE, T, 4), dtype=tf.float64, name=None), 
                 TensorSpec(shape=(BATCH_SIZE, T, 1), dtype=tf.float64, name=None)),
                TensorSpec(shape=(BATCH_SIZE, 1), dtype=tf.float64, name=None))

In [None]:
element_spec

In [None]:
path_prefix = '/home/lun/project-basileus/multitype-sequence-generation-by-tlstm-gan/data/classifier_tf_datasets'
dataset_save_path = os.path.join(path_prefix, '1-to-500-standalone-val.tf.data')
tf.data.experimental.save(val_dataset, dataset_save_path)
val_dataset = tf.data.experimental.load(dataset_save_path, element_spec)

## Train with imbalanced data

In [None]:
# combining imbalanced data, no augmentations
train_dataset_im = combine_seqs_to_dataset(train_pos_et, train_pos_ts, train_neg_et, train_neg_ts, BATCH_SIZE, 1)

In [None]:
path_prefix = '/home/lun/project-basileus/multitype-sequence-generation-by-tlstm-gan/data/classifier_tf_datasets'
dataset_save_path = os.path.join(path_prefix, '1-to-500-im.tf.data')
tf.data.experimental.save(train_dataset_im, dataset_save_path)
print(dataset_save_path)
train_dataset_im = tf.data.experimental.load(dataset_save_path, element_spec)

In [None]:
classifier_im = build_classifier(batch_size=BATCH_SIZE,
                              T=T,
                              event_vocab_dim=EVENT_VOCAB_DIM,
                              emb_dim=EMB_DIM,
                              hidden_dim=HIDDEN_DIM,    
                              dropout_rate=0.25)

In [None]:
# _TOTAL_STEPS = int(epochs * (train_pos_ts.shape[0] + train_neg_et.shape[0]) / BATCH_SIZE)
epochs = 20
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=3
)
history_gs_im = classifier_im.fit(train_dataset_im, 
                        epochs=epochs,
                        validation_data=val_dataset,
                        shuffle=True,
                        callbacks=[early_stopping]
                       )

## Train with upsampling

In [None]:
N_train_pos = train_pos_et.shape[0]
N_train_neg = train_neg_et.shape[0]

In [None]:
upsampled_indices = np.random.choice(N_train_pos, size=N_train_neg, replace=True).tolist()

In [None]:
upsampled_train_pos_et = tf.gather(train_pos_et, upsampled_indices)
upsampled_train_pos_ts = tf.gather(train_pos_ts, upsampled_indices)

In [None]:
train_dataset_up = combine_seqs_to_dataset(upsampled_train_pos_et, upsampled_train_pos_ts, train_neg_et, train_neg_ts, BATCH_SIZE, 1)

In [None]:
path_prefix = '/home/lun/project-basileus/multitype-sequence-generation-by-tlstm-gan/data/classifier_tf_datasets'
dataset_save_path = os.path.join(path_prefix, '1-to-500-up.tf.data')
tf.data.experimental.save(train_dataset_up, dataset_save_path)
print(dataset_save_path)
train_dataset_up = tf.data.experimental.load(dataset_save_path, element_spec)

In [None]:
classifier_up = build_classifier(batch_size=BATCH_SIZE,
                              T=T,
                              event_vocab_dim=EVENT_VOCAB_DIM,
                              emb_dim=EMB_DIM,
                              hidden_dim=HIDDEN_DIM,    
                              dropout_rate=0.25)

In [None]:
# _TOTAL_STEPS = int(epochs * (train_pos_ts.shape[0] + train_neg_et.shape[0]) / BATCH_SIZE)
epochs = 20
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=3
)
history_up = classifier_up.fit(train_dataset_up, 
                        epochs=epochs,
                        validation_data=val_dataset,
                        shuffle=True,
                        callbacks=[early_stopping]
                       )

## Train Using Gumbel-Softmax generated Data

In [16]:
syn_pos_et_G2_gs_path = '/home/lun/project-basileus/multitype-sequence-generation-by-tlstm-gan/syn_data/gumbel_softmax_2021-03-30-09-39-43/G2/syn_type_sequences.pickle'
syn_pos_ts_G2_gs_path = '/home/lun/project-basileus/multitype-sequence-generation-by-tlstm-gan/syn_data/gumbel_softmax_2021-03-30-09-39-43/G2/syn_time_sequences.pickle'

In [17]:
with open(syn_pos_et_G2_gs_path, 'rb') as f:
    syn_pos_et_G2_gs = pickle.load(f)
    syn_pos_et_G2_gs = tf.one_hot(syn_pos_et_G2_gs, depth=EVENT_VOCAB_DIM, axis=2, dtype=tf.float64)
    syn_pos_et_G2_gs = tf.squeeze(syn_pos_et_G2_gs, axis=3)
    
with open(syn_pos_ts_G2_gs_path, 'rb') as f:
    syn_pos_ts_G2_gs = pickle.load(f)
    syn_pos_ts_G2_gs = zcore_norm_2(syn_pos_ts_G2_gs, GLOBAL_MEAN_POS, GLOBAL_STD_POS)

In [18]:
def sample_from_syn_data(syn_pos_et, syn_pos_ts, train_pos_et, train_pos_ts):
    N_train_pos = train_pos_et.shape[0]
    N_train_neg = train_neg_et.shape[0]
    N_syn_sample = N_train_neg - N_train_pos
    
    syn_sample_indices = np.random.choice(syn_pos_et.shape[0], size=N_syn_sample, replace=False).tolist()    
    
    sampled_syn_pos_et = tf.gather(syn_pos_et, syn_sample_indices)
    sampled_syn_pos_ts = tf.gather(syn_pos_ts, syn_sample_indices)
    
    aug_pos_et = tf.concat([train_pos_et, sampled_syn_pos_et], axis=0)
    aug_pos_ts = tf.concat([train_pos_ts, sampled_syn_pos_ts], axis=0)
    
    assert(aug_pos_et.shape[0] == aug_pos_ts.shape[0] ==N_train_neg)
    
    return aug_pos_et, aug_pos_ts

In [19]:
aug_pos_et_gs_G2, aug_pos_ts_gs_G2 = sample_from_syn_data(syn_pos_et_G2_gs, syn_pos_ts_G2_gs, train_pos_et, train_pos_ts)

In [20]:
train_dataset_gs_G2 = combine_seqs_to_dataset(aug_pos_et_gs_G2, aug_pos_ts_gs_G2, train_neg_et, train_neg_ts, BATCH_SIZE, 1)

In [None]:
path_prefix = '/home/lun/project-basileus/multitype-sequence-generation-by-tlstm-gan/data/classifier_tf_datasets'
dataset_save_path = os.path.join(path_prefix, '1-to-500-gs-G2.tf.data')
tf.data.experimental.save(train_dataset_gs_G2, dataset_save_path)
print(dataset_save_path)
train_dataset_gs_G2 = tf.data.experimental.load(dataset_save_path, element_spec)

In [21]:
classifier_gs_G2 = build_classifier(batch_size=BATCH_SIZE,
                              T=T,
                              event_vocab_dim=EVENT_VOCAB_DIM,
                              emb_dim=EMB_DIM,
                              hidden_dim=HIDDEN_DIM,    
                              dropout_rate=0.25)

In [22]:
# _TOTAL_STEPS = int(epochs * (train_pos_ts.shape[0] + train_neg_et.shape[0]) / BATCH_SIZE)
epochs = 20
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=3
)
history_gs_g2 = classifier_gs_G2.fit(train_dataset_gs_G2, 
                        epochs=epochs,
                        validation_data=val_dataset,
                        shuffle=True,
                        callbacks=[early_stopping]
                       )

Epoch 1/20

KeyboardInterrupt: 

## Train Using GS MLE generated Data

In [None]:
syn_pos_et_G1_gs_path = '/home/lun/project-basileus/multitype-sequence-generation-by-tlstm-gan/syn_data/gumbel_softmax_2021-03-30-09-39-43/G1/syn_type_sequences.pickle'
syn_pos_ts_G1_gs_path = '/home/lun/project-basileus/multitype-sequence-generation-by-tlstm-gan/syn_data/gumbel_softmax_2021-03-30-09-39-43/G1/syn_time_sequences.pickle'

In [None]:
with open(syn_pos_et_G1_gs_path, 'rb') as f:
    syn_pos_et_G1_gs = pickle.load(f)
    syn_pos_et_G1_gs = tf.one_hot(syn_pos_et_G1_gs, depth=EVENT_VOCAB_DIM, axis=2, dtype=tf.float64)
    syn_pos_et_G1_gs = tf.squeeze(syn_pos_et_G1_gs, axis=3)
    
with open(syn_pos_ts_G1_gs_path, 'rb') as f:
    syn_pos_ts_G1_gs = pickle.load(f)
    syn_pos_ts_G1_gs = zcore_norm_2(syn_pos_ts_G1_gs, GLOBAL_MEAN_POS, GLOBAL_STD_POS)

In [None]:
aug_pos_et_gs_G1, aug_pos_ts_gs_G1 = sample_from_syn_data(syn_pos_et_G1_gs, syn_pos_ts_G1_gs, train_pos_et, train_pos_ts)

In [None]:
train_dataset_gs_G1 = combine_seqs_to_dataset(aug_pos_et_gs_G1, aug_pos_ts_gs_G1, train_neg_et, train_neg_ts, BATCH_SIZE, 1)

In [None]:
path_prefix = '/home/lun/project-basileus/multitype-sequence-generation-by-tlstm-gan/data/classifier_tf_datasets'
dataset_save_path = os.path.join(path_prefix, '1-to-500-gs-G1.tf.data')
tf.data.experimental.save(train_dataset_gs_G1, dataset_save_path)
print(dataset_save_path)
train_dataset_gs_G1 = tf.data.experimental.load(dataset_save_path, element_spec)

In [None]:
classifier_gs_G1 = build_classifier(batch_size=BATCH_SIZE,
                              T=T,
                              event_vocab_dim=EVENT_VOCAB_DIM,
                              emb_dim=EMB_DIM,
                              hidden_dim=HIDDEN_DIM,    
                              dropout_rate=0.25)

In [None]:
# _TOTAL_STEPS = int(epochs * (train_pos_ts.shape[0] + train_neg_et.shape[0]) / BATCH_SIZE)
epochs = 20
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=3
)
history_g1 = classifier_gs_G1.fit(train_dataset_gs_G1, 
                        epochs=epochs,
                        validation_data=val_dataset,
                        shuffle=True,
                        callbacks=[early_stopping]
                       )

## Train Using GS Random G0 generated Data

In [None]:
syn_pos_et_G0_gs_path = '/home/lun/project-basileus/multitype-sequence-generation-by-tlstm-gan/syn_data/gumbel_softmax_2021-03-30-09-39-43/G0/syn_type_sequences.pickle'
syn_pos_ts_G0_gs_path = '/home/lun/project-basileus/multitype-sequence-generation-by-tlstm-gan/syn_data/gumbel_softmax_2021-03-30-09-39-43/G0/syn_time_sequences.pickle'

In [None]:
with open(syn_pos_et_G0_gs_path, 'rb') as f:
    syn_pos_et_G0_gs = pickle.load(f)
    syn_pos_et_G0_gs = tf.one_hot(syn_pos_et_G0_gs, depth=EVENT_VOCAB_DIM, axis=2, dtype=tf.float64)
    syn_pos_et_G0_gs = tf.squeeze(syn_pos_et_G0_gs, axis=3)
    
with open(syn_pos_ts_G0_gs_path, 'rb') as f:
    syn_pos_ts_G0_gs = pickle.load(f)
    syn_pos_ts_G0_gs = zcore_norm_2(syn_pos_ts_G0_gs, GLOBAL_MEAN_POS, GLOBAL_STD_POS)

In [None]:
aug_pos_et_gs_G0, aug_pos_ts_gs_G0 = sample_from_syn_data(syn_pos_et_G0_gs, syn_pos_ts_G0_gs, train_pos_et, train_pos_ts)
train_dataset_gs_G0 = combine_seqs_to_dataset(aug_pos_et_gs_G0, aug_pos_ts_gs_G0, train_neg_et, train_neg_ts, BATCH_SIZE, 1)

In [None]:
path_prefix = '/home/lun/project-basileus/multitype-sequence-generation-by-tlstm-gan/data/classifier_tf_datasets'
dataset_save_path = os.path.join(path_prefix, '1-to-500-gs-G0.tf.data')
tf.data.experimental.save(train_dataset_gs_G0, dataset_save_path)
print(dataset_save_path)
train_dataset_gs_G0 = tf.data.experimental.load(dataset_save_path, element_spec)

In [None]:
classifier_gs_G0 = build_classifier(batch_size=BATCH_SIZE,
                              T=T,
                              event_vocab_dim=EVENT_VOCAB_DIM,
                              emb_dim=EMB_DIM,
                              hidden_dim=HIDDEN_DIM,    
                              dropout_rate=0.25)

In [None]:
# _TOTAL_STEPS = int(epochs * (train_pos_ts.shape[0] + train_neg_et.shape[0]) / BATCH_SIZE)
epochs = 20
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=3
)
history_g0 = classifier_gs_G0.fit(train_dataset_gs_G0,
                        epochs=epochs,
                        validation_data=val_dataset,
                        shuffle=True,
                        callbacks=[early_stopping]
                       )

## Train Using RL MCC generated Data

In [None]:
syn_pos_et_G2_mcc_path = '/home/lun/project-basileus/multitype-sequence-generation-by-tlstm-gan/syn_data/monte_carlo_critic_2021-04-02-09-25-47/G2/syn_type_sequences.pickle'
syn_pos_ts_G2_mcc_path = '/home/lun/project-basileus/multitype-sequence-generation-by-tlstm-gan/syn_data/monte_carlo_critic_2021-04-02-09-25-47/G2/syn_time_sequences.pickle'

In [None]:
with open(syn_pos_et_G2_mcc_path, 'rb') as f:
    syn_pos_et_G2_mcc = pickle.load(f)
    syn_pos_et_G2_mcc = tf.one_hot(syn_pos_et_G2_mcc, depth=EVENT_VOCAB_DIM, axis=2, dtype=tf.float64)
    syn_pos_et_G2_mcc = tf.squeeze(syn_pos_et_G2_mcc, axis=3)
    
with open(syn_pos_ts_G2_mcc_path, 'rb') as f:
    syn_pos_ts_G2_mcc = pickle.load(f)
    syn_pos_ts_G2_mcc = zcore_norm_2(syn_pos_ts_G2_mcc, GLOBAL_MEAN_POS, GLOBAL_STD_POS)

In [None]:
def sample_from_syn_data(syn_pos_et, syn_pos_ts, train_pos_et, train_pos_ts):
    N_train_pos = train_pos_et.shape[0]
    N_train_neg = train_neg_et.shape[0]
    N_syn_sample = N_train_neg - N_train_pos
    
    syn_sample_indices = np.random.choice(syn_pos_et.shape[0], size=N_syn_sample, replace=False).tolist()    
    
    sampled_syn_pos_et = tf.gather(syn_pos_et, syn_sample_indices)
    sampled_syn_pos_ts = tf.gather(syn_pos_ts, syn_sample_indices)
    
    aug_pos_et = tf.concat([train_pos_et, sampled_syn_pos_et], axis=0)
    aug_pos_ts = tf.concat([train_pos_ts, sampled_syn_pos_ts], axis=0)
    
    assert(aug_pos_et.shape[0] == aug_pos_ts.shape[0] ==N_train_neg)
    
    return aug_pos_et, aug_pos_ts

In [None]:
aug_pos_et_mcc_G2, aug_pos_ts_mcc_G2 = sample_from_syn_data(syn_pos_et_G2_mcc, syn_pos_ts_G2_mcc, train_pos_et, train_pos_ts)

In [None]:
train_dataset_mcc_G2 = combine_seqs_to_dataset(aug_pos_et_mcc_G2, aug_pos_ts_mcc_G2, train_neg_et, train_neg_ts, BATCH_SIZE, 1)

In [None]:
path_prefix = '/home/lun/project-basileus/multitype-sequence-generation-by-tlstm-gan/data/classifier_tf_datasets'
dataset_save_path = os.path.join(path_prefix, '1-to-500-mcc-G2.tf.data')
tf.data.experimental.save(train_dataset_mcc_G2, dataset_save_path)
print(dataset_save_path)
train_dataset_mcc_G2 = tf.data.experimental.load(dataset_save_path, element_spec)

In [None]:
classifier_mcc_G2 = build_classifier(batch_size=BATCH_SIZE,
                              T=T,
                              event_vocab_dim=EVENT_VOCAB_DIM,
                              emb_dim=EMB_DIM,
                              hidden_dim=HIDDEN_DIM,    
                              dropout_rate=0.25)

In [None]:
# _TOTAL_STEPS = int(epochs * (train_pos_ts.shape[0] + train_neg_et.shape[0]) / BATCH_SIZE)
epochs = 20
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=3
)
history_mcc_g2 = classifier_mcc_G2.fit(train_dataset_mcc_G2, 
                        epochs=epochs,
                        validation_data=val_dataset,
                        shuffle=True,
                        callbacks=[early_stopping]
                       )

## Train Using MCC MLE generated Data

In [None]:
syn_pos_et_G1_mcc_path = '/home/lun/project-basileus/multitype-sequence-generation-by-tlstm-gan/syn_data/monte_carlo_critic_2021-04-02-09-25-47/G1/syn_type_sequences.pickle'
syn_pos_ts_G1_mcc_path = '/home/lun/project-basileus/multitype-sequence-generation-by-tlstm-gan/syn_data/monte_carlo_critic_2021-04-02-09-25-47/G1/syn_time_sequences.pickle'

In [None]:
with open(syn_pos_et_G1_mcc_path, 'rb') as f:
    syn_pos_et_G1_mcc = pickle.load(f)
    syn_pos_et_G1_mcc = tf.one_hot(syn_pos_et_G1_mcc, depth=EVENT_VOCAB_DIM, axis=2, dtype=tf.float64)
    syn_pos_et_G1_mcc = tf.squeeze(syn_pos_et_G1_mcc, axis=3)
    
with open(syn_pos_ts_G1_mcc_path, 'rb') as f:
    syn_pos_ts_G1_mcc = pickle.load(f)
    syn_pos_ts_G1_mcc = zcore_norm_2(syn_pos_ts_G1_mcc, GLOBAL_MEAN_POS, GLOBAL_STD_POS)

In [None]:
aug_pos_et_mcc_G1, aug_pos_ts_mcc_G1 = sample_from_syn_data(syn_pos_et_G1_mcc, syn_pos_ts_G1_mcc, train_pos_et, train_pos_ts)

In [None]:
train_dataset_mcc_G1 = combine_seqs_to_dataset(aug_pos_et_mcc_G1, aug_pos_ts_mcc_G1, train_neg_et, train_neg_ts, BATCH_SIZE, 1)

In [None]:
path_prefix = '/home/lun/project-basileus/multitype-sequence-generation-by-tlstm-gan/data/classifier_tf_datasets'
dataset_save_path = os.path.join(path_prefix, '1-to-500-mcc-G1.tf.data')
tf.data.experimental.save(train_dataset_mcc_G1, dataset_save_path)
print(dataset_save_path)
train_dataset_mcc_G1 = tf.data.experimental.load(dataset_save_path, element_spec)

In [None]:
classifier_mcc_G1 = build_classifier(batch_size=BATCH_SIZE,
                              T=T,
                              event_vocab_dim=EVENT_VOCAB_DIM,
                              emb_dim=EMB_DIM,
                              hidden_dim=HIDDEN_DIM,    
                              dropout_rate=0.25)

In [None]:
# _TOTAL_STEPS = int(epochs * (train_pos_ts.shape[0] + train_neg_et.shape[0]) / BATCH_SIZE)
epochs = 20
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=3
)
history_mcc_g1 = classifier_mcc_G1.fit(train_dataset_mcc_G1, 
                        epochs=epochs,
                        validation_data=val_dataset,
                        shuffle=True,
                        callbacks=[early_stopping]
                       )