In [1]:
import sys
import regex as re
import numpy as np
import io
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import pandas as pd
import math
from numpy.linalg import norm
from scipy.spatial.distance import cosine
from sklearn.preprocessing import normalize
from sklearn.model_selection import KFold
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, Dropout, Lambda, TimeDistributed
from keras.layers import LSTM, Bidirectional, SimpleRNN, BatchNormalization, GRU, Masking
from keras.models import Model, load_model
from keras import regularizers
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.activations import softmax
from keras.metrics import AUC
from keras.backend import clear_session
from copy import deepcopy
from google.colab import drive
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.ensemble import RandomForestClassifier

# --- Storage -----------------------------------------------------------------
# Attach Google Drive to the Colab VM. force_remount=True re-attaches the drive
# even when it is already mounted from a previous run of this cell.
drive.mount('/content/drive', force_remount=True)

# Folder holding the notebook's data files. Point this at '' to read the files
# from the current working directory instead (e.g. when running outside Colab).
BASE_DIR = '/content/drive/My Drive/Colab Notebooks/thesis/'

# --- Cross validation --------------------------------------------------------
K = 5                # number of folds

# --- Training parameters -----------------------------------------------------
TIME_STEPS = 25      # sequence length fed to the recurrent layer
EMBEDDINGS_DIM = 5   # the model's feature width is EMBEDDINGS_DIM * 2 + 1
BATCH_SIZE = 128
EPOCHS = 4
PADDING = 0

Mounted at /content/drive


In [2]:
# Read the saved model inputs (X) and target outputs (Y) from BASE_DIR.
inputs_path = BASE_DIR + 'inputs.npy'
outputs_path = BASE_DIR + 'outputs.npy'

with open(inputs_path, 'rb') as inputs_file:
    X = np.load(inputs_file)

with open(outputs_path, 'rb') as outputs_file:
    Y = np.load(outputs_file)

## Split into K folds

In [3]:
# Splitter producing K consecutive (unshuffled) train/validation partitions.
kf = KFold(n_splits=K, shuffle=False)

In [4]:
# Build the network once up front so its layer shapes and parameter counts can
# be inspected in the summary printed below.
n_features = EMBEDDINGS_DIM * 2 + 1

model = Sequential([
    # Masking makes the LSTM ignore time steps whose features are all 0.
    Masking(mask_value=0, input_shape=(TIME_STEPS, n_features)),
    LSTM(200),
    # Single sigmoid unit: binary classification output.
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', metrics=[AUC(), 'acc'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking (Masking)            (None, 25, 11)            0         
_________________________________________________________________
lstm (LSTM)                  (None, 200)               169600    
_________________________________________________________________
dense (Dense)                (None, 1)                 201       
Total params: 169,801
Trainable params: 169,801
Non-trainable params: 0
_________________________________________________________________


In [None]:
# K-fold cross-validation: train a fresh model on every fold and record the
# validation AUC reached at the end of training.
#
# The Keras metric is created inline for each model instead of being stored in
# a notebook variable called `auc`: that name is already bound to
# sklearn.metrics.auc by the import cell, and rebinding it breaks the later
# ROC cell that calls auc(fpr, tpr).
val_aucs = []
for train_index, val_index in kf.split(X):
    # Release the previous fold's model state so memory does not accumulate
    # across folds.
    clear_session()

    X_train, Y_train = X[train_index], Y[train_index]
    X_val, Y_val = X[val_index], Y[val_index]

    model = Sequential()
    model.add(Masking(mask_value=0, input_shape=(TIME_STEPS, EMBEDDINGS_DIM * 2 + 1)))
    model.add(LSTM(200))
    model.add(Dense(1, activation='sigmoid'))
    # A separate metric instance per model; the explicit name keeps the
    # history key stable ('auc' / 'val_auc') for every fold.
    model.compile(loss='binary_crossentropy', metrics=[AUC(name='auc')])

    history = model.fit(X_train, Y_train,
                        validation_data=(X_val, Y_val),
                        epochs=EPOCHS,
                        batch_size=BATCH_SIZE)

    # Keep only the last epoch's score: it describes the model as it stands
    # after training, whereas averaging over all epochs would mix in the
    # scores of the early, under-trained epochs.
    val_aucs.append(history.history['val_auc'][-1])

Epoch 1/4

In [None]:
# Summarise cross-validation with the average of the collected AUC values.
mean_val_auc = np.mean(val_aucs)
print("Mean validation AUC: %f" % mean_val_auc)

## Plot ROC and AUC graphs

In [None]:
# Score the most recently trained model on the first 30% of the samples and
# draw its ROC curve (full view plus a zoom on the top-left corner).
#
# NOTE(review): the cross-validation splitter is created with shuffle=False,
# so the model left over from the last fold was presumably trained on the
# leading part of X. This "test" slice then overlaps its training data and the
# curve would be optimistic. Confirm whether a held-out split was intended.
test_index = range(math.floor(0.3 * len(X)))
X_test = X[test_index]
Y_test = Y[test_index]
Y_pred = model.predict(X_test)

# Import sklearn's area-under-curve function under an alias so the call below
# cannot be captured by a notebook variable named `auc` (for example a Keras
# AUC metric instance), which would shadow the function.
from sklearn.metrics import auc as sklearn_auc

# roc_curve works on 1-D scores; Y_pred itself keeps its (n, 1) shape because
# later cells index it as Y_pred[:, 0].
fpr_keras, tpr_keras, thresholds_keras = roc_curve(Y_test, np.ravel(Y_pred))
auc_keras = sklearn_auc(fpr_keras, tpr_keras)


def _plot_roc(figure_number, title, fpr, tpr, area, xlim=None, ylim=None):
    """Draw one ROC figure.

    figure_number -- matplotlib figure id to draw into
    title         -- figure title
    fpr, tpr      -- false/true positive rates as returned by roc_curve
    area          -- area under the curve, shown in the legend
    xlim, ylim    -- optional (low, high) axis limits used to zoom in
    """
    plt.figure(figure_number)
    if xlim is not None:
        plt.xlim(*xlim)
    if ylim is not None:
        plt.ylim(*ylim)
    # Diagonal = performance of a random classifier.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr, tpr, label='ROC (area = {:.3f})'.format(area))
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title(title)
    plt.legend(loc='best')
    plt.show()


_plot_roc(1, 'ROC curve', fpr_keras, tpr_keras, auc_keras)

# Zoom in view of the upper left corner.
_plot_roc(2, 'ROC curve (zoomed in at top left)', fpr_keras, tpr_keras, auc_keras,
          xlim=(0, 0.35), ylim=(0.65, 1))

In [None]:
# Show a random window of consecutive test samples as a 3-row RGB strip:
#   row 0 - three input features taken from the last time step, rescaled to [0, 1]
#   row 1 - ground truth   (green channel = Y, blue channel = 1 - Y)
#   row 2 - prediction     (same colour coding)
length = min(50, len(X_test))

# Draw the start so that the whole window lies inside X_test; drawing it from
# the full range lets start + length run past the end and raise an IndexError.
# Replace with a fixed integer to pin the window for a reproducible figure.
start = np.random.randint(0, len(X_test) - length + 1)
sample_indices = range(start, start + length)

img = np.zeros((3, length, 3))

# Work on a float copy: this leaves X_test untouched and keeps the scaling
# valid even if the stored inputs use an integer dtype.
X_samples = X_test[sample_indices, TIME_STEPS - 1, EMBEDDINGS_DIM:EMBEDDINGS_DIM + 3].astype(float)
X_samples = X_samples - np.min(X_samples)
value_range = np.max(X_samples)
if value_range > 0:
    # Skip the division for a constant window, which would otherwise be 0/0.
    X_samples = X_samples / value_range

img[0, :, :] = X_samples

Y_truth = Y_test[sample_indices]
img[1, :, 1] = Y_truth
img[1, :, 2] = 1. - Y_truth

Y_prediction = Y_pred[sample_indices]
img[2, :, 1] = Y_prediction[:, 0]
img[2, :, 2] = 1. - Y_prediction[:, 0]

fig3 = plt.figure(3, figsize=(12, 8), dpi=100, constrained_layout=True)

image = plt.imshow(img)

# Row labels, placed just right of the last column.
plt.text(length, 0.15, "Category")
plt.text(length, 1.15, "Ground truth")
plt.text(length, 2.15, "Prediction")

In [None]:
# Display the dimensions of the full input array.
np.shape(X)