In [847]:
import os
import sys
import glob
import math
import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
from sklearn import metrics
from rnn_attention import attention
from tensorflow.contrib.rnn import GRUCell
from tensorflow.python.ops.rnn import bidirectional_dynamic_rnn as bi_rnn

In [848]:
# Model and training hyper-parameters shared by the cells below.

# Number of units in each GRU cell of the bidirectional RNN.
HIDDEN_SIZE=100
# Number of token ids per input row (second dimension of the batch placeholder).
SEQUENCE_LENGTH=100
# Width of each row of the embedding matrix.
EMBEDDING_DIM=128
# Size argument handed to the attention layer.
ATTENTION_SIZE=50
# Dropout keep probability fed to the graph during training.
KEEP_PROB=0.5
# Number of rows in the embedding matrix.
# NOTE(review): token ids in the data are presumably < vocabulary_size -- confirm.
vocabulary_size=5385
# Number of passes over the training data.
num_epochs=5
# Weight of the newest batch loss in the running training-loss average.
DELTA=0.5
# Clear the default graph so re-running the graph-building cells starts clean.
tf.reset_default_graph() 

In [849]:
def read_data(data_path):
    """Load every ``part-*`` file under ``data_path`` into NumPy arrays.

    Each file is parsed as header-less text whose fields are separated by
    either ``#`` or ``,``. The first field of every row is the label and the
    remaining fields form the sequence.

    Files are read in sorted name order so the row order (and therefore any
    seeded shuffle applied afterwards) is reproducible; ``glob`` alone returns
    names in arbitrary, filesystem-dependent order.

    Args:
        data_path: Directory containing the ``part-*`` files, or ``None``.

    Returns:
        ``None`` when ``data_path`` is ``None``; otherwise a tuple
        ``(sequences, labels)`` where ``sequences`` has one row per example
        and ``labels`` has shape ``(1, number_of_examples)``.

    Raises:
        ValueError: If no ``part-*`` file exists under ``data_path``.
    """
    if data_path is None:
        return None
    data_files = sorted(glob.glob(os.path.join(data_path, 'part-*')))
    if not data_files:
        # pd.concat would raise ValueError("No objects to concatenate") here;
        # keep the exception type but say which path was empty.
        raise ValueError("No 'part-*' files found in {!r}".format(data_path))
    frames = [
        pd.read_csv(path, sep='#|,', header=None, engine='python')
        for path in data_files
    ]
    data = pd.concat(frames, axis=0, ignore_index=True)
    # Wrapped in a list so labels come out as a single row: (1, m).
    labels = np.array([data.iloc[:, 0].values])
    sequences = np.array(data.iloc[:, 1:].values)
    return sequences, labels

In [850]:
def random_mini_batches(X, Y, mini_batch_size = 64, seed=0):
    """Shuffle the examples and cut them into consecutive mini-batches.

    ``X`` is indexed by example along axis 0 and ``Y`` along axis 1; both are
    reordered with the same permutation so rows of ``X`` stay aligned with
    columns of ``Y``.

    Args:
        X: 2-D array with one example per row.
        Y: 2-D array with one example per column.
        mini_batch_size: Number of examples in every full batch.
        seed: Value used to reseed NumPy's global random generator before
            drawing the permutation (same seed -> same shuffle).

    Returns:
        A list of ``(mini_batch_X, mini_batch_Y)`` tuples. All batches hold
        ``mini_batch_size`` examples except possibly the last, which holds
        the remainder.
    """
    example_count = X.shape[0]

    np.random.seed(seed)
    order = list(np.random.permutation(example_count))
    X_shuffled = X[order, :]
    Y_shuffled = Y[:, order]

    full_batches = math.floor(example_count/mini_batch_size)
    # (start, stop) example indices of every full-sized batch.
    bounds = [
        (i * mini_batch_size, i * mini_batch_size + mini_batch_size)
        for i in range(0, full_batches)
    ]
    # Leftover examples form one final, smaller batch.
    if example_count % mini_batch_size != 0:
        bounds.append((full_batches * mini_batch_size, example_count))

    return [(X_shuffled[lo:hi, :], Y_shuffled[:, lo:hi]) for lo, hi in bounds]

In [851]:
# Input placeholders: every value the graph needs from a feed_dict.
with tf.name_scope('Inputs'):
    # Token-id matrix: one row per example, SEQUENCE_LENGTH int32 ids per row.
    batch_ph = tf.placeholder(tf.int32, [None, SEQUENCE_LENGTH], name='batch_ph')
    # One float target per example; used as the labels of the sigmoid cross-entropy loss.
    target_ph = tf.placeholder(tf.float32, [None], name='target_ph')
    # One length per example, passed to the RNN as its sequence_length.
    seq_len_ph = tf.placeholder(tf.int32, [None], name='seq_len_ph')
    # Dropout keep probability; no shape is given, so it is fed as a plain number.
    keep_prob_ph = tf.placeholder(tf.float32, name='keep_prob_ph')

In [852]:
# Embedding layer: trainable lookup table mapping token ids to dense vectors.
with tf.name_scope('Embedding_layer'):
    # One EMBEDDING_DIM-wide row per vocabulary entry, initialised uniformly in [-1, 1).
    embeddings_var = tf.Variable(
        initial_value=tf.random_uniform(
            shape=[vocabulary_size, EMBEDDING_DIM],
            minval=-1.0,
            maxval=1.0,
        ),
        trainable=True,
    )
    # Replace every id in the batch with its embedding row.
    batch_embedded = tf.nn.embedding_lookup(params=embeddings_var, ids=batch_ph)

In [853]:
# RNN layer: bidirectional GRU over the embedded batch.
with tf.name_scope('RNN'):
    # Forward and backward cells both use HIDDEN_SIZE units; the final states
    # returned alongside the outputs are discarded.
    rnn_outputs, _ = bi_rnn(
        cell_fw=GRUCell(HIDDEN_SIZE),
        cell_bw=GRUCell(HIDDEN_SIZE),
        inputs=batch_embedded,
        sequence_length=seq_len_ph,
        dtype=tf.float32,
    )

In [854]:
# Attention layer: applies the project-local rnn_attention.attention helper to the RNN outputs.
with tf.name_scope('Attention_layer'):
    # NOTE(review): attention() is defined outside this notebook. It presumably
    # returns a pooled per-example vector plus the attention weights -- confirm
    # against rnn_attention.py. The first return value must be at least rank 2,
    # because its shape[1] is read when the output layer is built.
    attention_output, alphas = attention(rnn_outputs, ATTENTION_SIZE)

In [855]:
# Dropout on the attention output; the keep probability is supplied through
# keep_prob_ph on every sess.run (KEEP_PROB while training).
with tf.name_scope('Drop_out'):
    drop = tf.nn.dropout(attention_output, keep_prob_ph)

In [856]:
# Fully connected layer: maps each example's attention vector to one logit.
with tf.name_scope('Fully_connected_layer'):
    # Input width is read from the attention output's static shape rather than
    # hard-coded. NOTE(review): presumably 2 * HIDDEN_SIZE for the Bi-RNN --
    # depends on what rnn_attention.attention returns.
    hidden_size = attention_output.shape[1].value
    W = tf.Variable(tf.truncated_normal([hidden_size, 1], stddev=0.1))
    b = tf.Variable(tf.constant(0., shape=[1]))
    y_hat = tf.nn.xw_plus_b(drop, W, b)
    # Drop only the unit output axis. A bare tf.squeeze would also remove the
    # batch axis when a batch holds a single example, turning y_hat into a
    # scalar and breaking code that iterates over the per-example predictions.
    y_hat = tf.squeeze(y_hat, axis=1)

In [857]:
# Metrics: loss, training op and accuracy.
with tf.name_scope('Metrics'):
    # Mean sigmoid cross-entropy between the logits (y_hat) and the targets.
    loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(labels=target_ph, logits=y_hat)
    )
    # Running this op performs one Adam update step on `loss`.
    optimizer = tf.train.AdamOptimizer(learning_rate=1e-3).minimize(loss)
    # Fraction of examples whose rounded sigmoid output equals the target.
    accuracy = tf.reduce_mean(
        tf.cast(
            tf.equal(tf.round(tf.sigmoid(y_hat)), target_ph),
            tf.float32,
        )
    )
   

In [858]:
def getSeqLength(x):
    """Return the length of a sequence up to and including its first 0.

    Args:
        x: An iterable of token ids, or ``None``.

    Returns:
        ``None`` if ``x`` is ``None``; the 1-based position of the first 0 if
        ``x`` contains one; otherwise the total number of items in ``x``.
    """
    if x is None:
        return None
    # Materialise once: the membership test and the index lookup must see the
    # same items, which also makes one-shot iterables (generators) work.
    tokens = list(x)
    if 0 in tokens:
        return tokens.index(0) + 1
    return len(tokens)
        

In [859]:
# Training run: trains for num_epochs epochs and records, per epoch, the
# smoothed training loss and the ROC curve / AUC of the training predictions.
with tf.Session() as sess:
    # Both splits are loaded, but only the training split is consumed by the
    # active code below (the evaluation pass at the end is disabled).
    sequences_train, labels_train = read_data('data/train')
    sequences_test, labels_test = read_data('data/test')
    sess.run(tf.global_variables_initializer())

    # One entry is appended to each list per epoch.
    epoch_loss = []
    epoch_thresholds = []
    epoch_fpr = []
    epoch_tpr = []
    epoch_auc = []
    for epoch in range(num_epochs):
        loss_list = []
        pred_list = []
        lab_list = []
        loss_train = 0
        # NOTE(review): loss_test is not read anywhere in the visible cells.
        loss_test = 0
        # Seed the shuffle with the epoch index. With the default seed every
        # epoch would replay exactly the same batch order; this stays
        # reproducible while giving each epoch its own permutation.
        minibatches = random_mini_batches(sequences_train, labels_train, seed=epoch)
        for minibatch in minibatches:
            (minibatch_X, minibatch_Y) = minibatch
            # Per-row lengths for the RNN; rows without a 0 get their full length.
            seq_len = np.array([getSeqLength(x) for x in minibatch_X])
            feed_dict = {batch_ph: minibatch_X,
                         # Labels arrive as a single row (1, k); the placeholder wants (k,).
                         target_ph: minibatch_Y.flatten('C'),
                         seq_len_ph: seq_len,
                         keep_prob_ph: KEEP_PROB}
            _, loss_tr, pred, lab = sess.run([optimizer, loss, y_hat, target_ph], feed_dict=feed_dict)
            # Running average of the batch loss, weighted by DELTA and starting
            # from 0 at the beginning of every epoch.
            loss_train = loss_tr * DELTA + loss_train * (1 - DELTA)
            loss_list.append(loss_train)
            # np.ravel keeps this working even if pred arrives as a 0-d value
            # (e.g. a single-example batch squeezed down to a scalar).
            pred_list.extend(np.ravel(pred))
            lab_list.extend(lab)
        # Epoch loss is the mean of the running-average values, not of the raw batch losses.
        loss_train = sum(loss_list)/len(loss_list)
        # pred_list holds y_hat, i.e. the values passed as logits to the loss,
        # so the returned thresholds are on that scale rather than probabilities.
        fpr_train, tpr_train, thresholds_train = metrics.roc_curve(lab_list, pred_list)
        auc_train = metrics.auc(fpr_train, tpr_train)
        epoch_loss.append(loss_train)
        epoch_thresholds.append(thresholds_train)
        epoch_fpr.append(fpr_train)
        epoch_tpr.append(tpr_train)
        epoch_auc.append(auc_train)

    print('########################')
    print(epoch_loss)
    print(epoch_fpr)
    print(epoch_tpr)
    print(epoch_thresholds)
    print(epoch_auc)
    print('########################')
# Disabled evaluation pass over the test split, kept for reference.
# NOTE(review): as written it feeds x_batch / y_batch, which appear nowhere
# else in the visible cells; it presumably should feed minibatch_X and
# minibatch_Y.flatten('C') as the training loop does. Its seq_len line also
# raises for rows that contain no 0; getSeqLength handles that case.
#         loss_list = []
#         lab_list = []
#         pred_list = []
#         sig_acts = []
#         emb_list = []
#         act_scores = []
#         all_activities = []
#         minibatches = random_mini_batches(sequences_test, labels_test)
#         for minibatch in minibatches:
#             (minibatch_X, minibatch_Y) = minibatch
#             seq_len = np.array([list(x).index(0) + 1 for x in minibatch_X])
#             feed_dict = {batch_ph: x_batch,
#                          target_ph: y_batch,
#                          seq_len_ph: seq_len,
#                          keep_prob_ph: 1.0
#                         }
#             loss_te, lab, pred, alp, u_emb = sess.run([loss, target_ph, y_hat, alphas, attention_output],
#                                                                  feed_dict=feed_dict)
#             emb_list += list(u_emb)                        
#             pred_list += list(pred)
#             lab_list += list(lab)
#             loss_list.append(loss_te)
        
        
            

########################
[0.43063702672668785, 0.4112151782378957, 0.4044541634321028, 0.3977318465433289, 0.39048972000027243]
[array([0.        , 0.        , 0.        , ..., 0.99942385, 0.99942385,
       1.        ]), array([0.        , 0.        , 0.        , ..., 0.99810307, 0.99810981,
       1.        ]), array([0.        , 0.        , 0.        , ..., 0.99776277, 0.99776951,
       1.        ]), array([0.        , 0.        , 0.        , ..., 0.99367916, 0.9936859 ,
       1.        ]), array([0.        , 0.        , 0.        , ..., 0.99028289, 0.99028963,
       1.        ])]
[array([0.00000000e+00, 3.38162291e-06, 1.52173031e-03, ...,
       9.99996618e-01, 1.00000000e+00, 1.00000000e+00]), array([0.00000000e+00, 3.38162291e-06, 6.11735584e-03, ...,
       1.00000000e+00, 1.00000000e+00, 1.00000000e+00]), array([0.00000000e+00, 3.38162291e-06, 7.92652410e-03, ...,
       1.00000000e+00, 1.00000000e+00, 1.00000000e+00]), array([0.00000000e+00, 3.38162291e-06, 3.29031909e-03,