In [1]:
##################################################################################
##### Define all parameters for model tuning
##################################################################################

n_fold = 5
expName = "NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN"
outPath = "Results"
foldName = "folds.pickle"

epochs = 100
batch_size = 128
shuffle = True
seed = None

input_data_folder = "Data"
training_data_file = "Training-datasets-PredNTS.txt"
independent_data_file = "independent dataset-PredNTS.txt"

In [2]:
import os 
import pickle
import numpy as np
import pandas as pd

import tensorflow as tf

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, confusion_matrix
from sklearn.metrics import roc_auc_score, matthews_corrcoef

import math

In [3]:
# print(tf.test.is_gpu_available(cuda_only=True))
# physical_devices = tf.config.experimental.list_physical_devices('GPU')
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)
tf.config.experimental.set_memory_growth(physical_devices[0], True)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [4]:
##################################################################################
##### define all CUSTOM functions
##################################################################################

def one_hot_encode_nt(sequence, char_dict):
    
    seq_encoded = np.zeros((len(sequence),len(char_dict)))
    
    i = 0
    for single_character in sequence:
        if(single_character.upper() in char_dict.keys()):
            seq_encoded[i][char_dict[single_character.upper()]] = 1
            i = i+1
        else:
            raise ValueError('Incorrect character in NT sequence: '+sequence)
    return seq_encoded

In [5]:
##################################################################################
##### Build k-fold functions
##################################################################################

## Build the K-fold from dataset
def build_kfold(features, labels, k=10, shuffle=False, seed=None):
    
    skf = StratifiedKFold(n_splits=k, shuffle=shuffle, random_state=seed)
    kfoldList = []
    for train_index, test_index in skf.split(features, labels):
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        kfoldList.append({
            "X_train": X_train,
            "X_test": X_test,
            "y_train":y_train,
            "y_test":y_test
        })
    return kfoldList

In [6]:
##################################################################################
##### define evaluator functions
##################################################################################

def pred2label(y_pred):
    y_pred = np.round(y_pred)
    return y_pred

In [7]:
# ##################################################################################
# ##### Function to customize the DLNN architecture with parameters
# ##################################################################################

# def DLNN_CORENup(input_seq_shape = (41, 21),
#                  conv_filters_per_layer_1 = 10, kernel_length_1 = 10, conv_strides_1 = 1, ## 1st Convolutional layer parameters
#                  max_pool_width_1 = 3, max_pool_stride_1 = 3, ## 1st Maxpool layer parameters
#                  lstm_decode_units = 5, ## LSTM layer parameters
#                  conv_filters_per_layer_2 = 10,  kernel_length_2 = 5, conv_strides_2 = 1, ## 2nd Convolutional layer parameters
#                  max_pool_width_2 = 3, max_pool_stride_2 = 3, ## 2nd Maxpool layer parameters
#                  dense_decode_units = 128, ## Dense layer parameters
#                  prob = 0.5, learn_rate = 0.0005, 
#                  loss = 'binary_crossentropy', metrics = None):
    
#     beta = 0.001
    
#     ######################################################################################################
#     ########  SEQUENCE  ##################################################################################
#     ######################################################################################################
    
#     input1 = tf.keras.layers.Input(shape=input_seq_shape)

#     ## LSTM Path
    
# #     x1 = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(20, return_sequences = True, 
# #                                                    kernel_regularizer = tf.keras.regularizers.l2(beta)))(input1)

#     x1 = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(10, return_sequences = True, 
#                                                    kernel_regularizer = tf.keras.regularizers.l2(beta)))(x1)
    
#     x1 = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(5, return_sequences = True, 
#                                                    kernel_regularizer = tf.keras.regularizers.l2(beta)))(x1)
    
#     x1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(3, return_sequences = True, 
#                                                    kernel_regularizer = tf.keras.regularizers.l2(beta)))(x1)
    
# #     x1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(1, return_sequences = True, 
# #                                                    kernel_regularizer = tf.keras.regularizers.l2(beta)))(x1)
    
#     x1 = tf.keras.layers.Dropout(prob)(x1)
    
#     x1 = tf.keras.layers.Flatten()(x1)

#     ## Conv Path
    
#     ######################################################################################################
#     ########  Classifier  ################################################################################
#     ######################################################################################################
    
#     y = tf.keras.layers.Dense(100, 
#                               kernel_regularizer = tf.keras.regularizers.l2(beta), 
# #                               activation = 'relu'
#                              )(x1)
    
#     y = tf.keras.layers.Dropout(prob)(y)
    
#     y = tf.keras.layers.Dense(1, 
#                               kernel_regularizer = tf.keras.regularizers.l2(beta), 
#                               activation = 'sigmoid')(y)

#     ## Generate Model from input and output
#     model = tf.keras.models.Model(inputs=input1, outputs=y)
    
#     ## Compile model
#     if(metrics != None):
#         model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), loss = loss, metrics = metrics)
#     else:
#         model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), loss = loss)

#     return model

In [8]:
# # parallel 1

# ##################################################################################
# ##### Function to customize the DLNN architecture with parameters
# ##################################################################################

# def DLNN_CORENup(input_seq_shape = (41, 21),
#                  conv_filters_per_layer_1 = 10, kernel_length_1 = 10, conv_strides_1 = 1, ## 1st Convolutional layer parameters
#                  max_pool_width_1 = 3, max_pool_stride_1 = 3, ## 1st Maxpool layer parameters
#                  lstm_decode_units = 5, ## LSTM layer parameters
#                  conv_filters_per_layer_2 = 10,  kernel_length_2 = 5, conv_strides_2 = 1, ## 2nd Convolutional layer parameters
#                  max_pool_width_2 = 3, max_pool_stride_2 = 3, ## 2nd Maxpool layer parameters
#                  dense_decode_units = 128, ## Dense layer parameters
#                  prob = 0.5, learn_rate = 0.0005, 
#                  loss = 'binary_crossentropy', metrics = None):
    
#     beta = 0.001
    
#     ######################################################################################################
#     ########  SEQUENCE  ##################################################################################
#     ######################################################################################################
    
#     input1 = tf.keras.layers.Input(shape=input_seq_shape)
    
#     ######################################################################################################
#     ########  RNN  ##################################################################################
#     ######################################################################################################
    
# #     x1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(20, return_sequences = True, 
# #                                                    kernel_regularizer = tf.keras.regularizers.l2(beta)))(input1)

#     x1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(10, return_sequences = True, 
# #                                                             kernel_regularizer = tf.keras.regularizers.l2(beta)
#                                                            ))(input1)
    
#     x1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(5, return_sequences = True,
# #                                                             kernel_regularizer = tf.keras.regularizers.l2(beta)
#                                                            ))(x1)
    
#     x1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(3, return_sequences = True, 
# #                                                             kernel_regularizer = tf.keras.regularizers.l2(beta)
#                                                            ))(x1)
    
# #     x1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(1, return_sequences = True, 
# #                                                    kernel_regularizer = tf.keras.regularizers.l2(beta)))(x1)
    
#     x1 = tf.keras.layers.Flatten()(x1)
    
#     x1 = tf.keras.layers.Dense(100, 
# #                               kernel_regularizer = tf.keras.regularizers.l2(beta), 
# #                               activation = 'relu'
#                              )(x1)
    
#     x1 = tf.keras.layers.Dropout(prob)(x1)
    
#     x1 = tf.keras.layers.Dense(100, 
# #                               kernel_regularizer = tf.keras.regularizers.l2(beta), 
# #                               activation = 'relu'
#                              )(x1)
    
#     x1 = tf.keras.layers.Dropout(prob)(x1)
    
#     x1 = tf.keras.layers.Dense(1, 
# #                               kernel_regularizer = tf.keras.regularizers.l2(beta), 
#                               activation = 'sigmoid')(x1)
    
#     ######################################################################################################
#     ########  CONV  ##################################################################################
#     ######################################################################################################

#     x2 = tf.keras.layers.Conv1D(conv_filters_per_layer_1, kernel_length_1, strides = conv_strides_1, 
# #                                 kernel_regularizer = tf.keras.regularizers.l2(beta), 
#                                 padding = "same")(input1)
#     x2 = tf.keras.layers.Activation('relu')(x2)
#     x2 = tf.keras.layers.MaxPool1D(pool_size = max_pool_width_1, strides = max_pool_stride_1)(x2)
#     x2 = tf.keras.layers.Dropout(prob)(x2)

#     ## Conv Path

#     x2 = tf.keras.layers.Conv1D(conv_filters_per_layer_2, kernel_length_2, strides = conv_strides_2, 
# #                                 kernel_regularizer = tf.keras.regularizers.l2(beta), 
#                                 padding = 'same')(x2)
#     x2 = tf.keras.layers.Activation('relu')(x2)
#     x2 = tf.keras.layers.MaxPooling1D(pool_size = max_pool_width_2, strides = max_pool_stride_2)(x2)
#     x2 = tf.keras.layers.Dropout(prob)(x2)
    
#     x2 = tf.keras.layers.Flatten()(x2)
    
#     x2 = tf.keras.layers.Dense(100, 
# #                               kernel_regularizer = tf.keras.regularizers.l2(beta), 
# #                               activation = 'relu'
#                              )(x2)
    
#     x2 = tf.keras.layers.Dropout(prob)(x2)
    
#     x2 = tf.keras.layers.Dense(100, 
# #                               kernel_regularizer = tf.keras.regularizers.l2(beta), 
# #                               activation = 'relu'
#                              )(x2)
    
#     x2 = tf.keras.layers.Dropout(prob)(x2)
    
#     x2 = tf.keras.layers.Dense(1, 
# #                               kernel_regularizer = tf.keras.regularizers.l2(beta), 
#                               activation = 'sigmoid')(x2)

    
#     ######################################################################################################
#     ########  Classifier  ################################################################################
#     ######################################################################################################
    
#     y = tf.keras.layers.Concatenate()([x1,x2])
    
#     y = tf.keras.layers.Dense(1, 
#                               activation = 'sigmoid')(y)

#     ## Generate Model from input and output
#     model = tf.keras.models.Model(inputs=input1, outputs=y)
    
#     ## Compile model
#     if(metrics != None):
#         model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), loss = loss, metrics = metrics)
#     else:
#         model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), loss = loss)

#     return model

In [9]:
# parallel 2

##################################################################################
##### Function to customize the DLNN architecture with parameters
##################################################################################

def DLNN_CORENup(input_seq_shape = (41, 21),
                 conv_filters_per_layer_1 = 10, kernel_length_1 = 10, conv_strides_1 = 1, ## 1st Convolutional layer parameters
                 max_pool_width_1 = 3, max_pool_stride_1 = 3, ## 1st Maxpool layer parameters
                 lstm_decode_units = 5, ## LSTM layer parameters
                 conv_filters_per_layer_2 = 10,  kernel_length_2 = 5, conv_strides_2 = 1, ## 2nd Convolutional layer parameters
                 max_pool_width_2 = 3, max_pool_stride_2 = 3, ## 2nd Maxpool layer parameters
                 dense_decode_units = 128, ## Dense layer parameters
                 prob = 0.5, learn_rate = 0.001, 
                 loss = 'binary_crossentropy', metrics = None):
    
    beta = 0.001
    
    ######################################################################################################
    ########  SEQUENCE  ##################################################################################
    ######################################################################################################
    
    input1 = tf.keras.layers.Input(shape=input_seq_shape)
    
    ######################################################################################################
    ########  RNN  ##################################################################################
    ######################################################################################################
    
#     x1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(20, return_sequences = True, 
#                                                    kernel_regularizer = tf.keras.regularizers.l2(beta)))(input1)

    x1 = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(10, return_sequences = True, 
#                                                             kernel_regularizer = tf.keras.regularizers.l2(beta)
                                                           ))(input1)
    
    x1 = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(5, return_sequences = True,
#                                                             kernel_regularizer = tf.keras.regularizers.l2(beta)
                                                           ))(x1)
    
    x1 = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(3, return_sequences = True, 
#                                                             kernel_regularizer = tf.keras.regularizers.l2(beta)
                                                           ))(x1)
    
#     x1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(1, return_sequences = True, 
#                                                    kernel_regularizer = tf.keras.regularizers.l2(beta)))(x1)
    
    x1 = tf.keras.layers.Flatten()(x1)
    
    x1 = tf.keras.layers.Dense(100, 
#                               kernel_regularizer = tf.keras.regularizers.l2(beta), 
#                               activation = 'relu'
                             )(x1)
    
    x1 = tf.keras.layers.Dropout(prob)(x1)
    
    x1 = tf.keras.layers.Dense(100, 
#                               kernel_regularizer = tf.keras.regularizers.l2(beta), 
#                               activation = 'relu'
                             )(x1)
    
    x1 = tf.keras.layers.Dropout(prob)(x1)
    
    x1 = tf.keras.layers.Dense(1, 
#                               kernel_regularizer = tf.keras.regularizers.l2(beta), 
                              activation = 'sigmoid')(x1)
    
    ######################################################################################################
    ########  CONV  ##################################################################################
    ######################################################################################################

    x2 = tf.keras.layers.Conv1D(conv_filters_per_layer_1, kernel_length_1, strides = conv_strides_1, 
#                                 kernel_regularizer = tf.keras.regularizers.l2(beta), 
                                padding = "same")(input1)
#     x2 = tf.keras.layers.Activation('relu')(x2)
    x2 = tf.keras.layers.MaxPool1D(pool_size = max_pool_width_1, strides = max_pool_stride_1)(x2)

    ## Conv Path

    x2 = tf.keras.layers.Conv1D(conv_filters_per_layer_2, kernel_length_2, strides = conv_strides_2, 
#                                 kernel_regularizer = tf.keras.regularizers.l2(beta), 
                                padding = 'same')(x2)
#     x2 = tf.keras.layers.Activation('relu')(x2)
    x2 = tf.keras.layers.MaxPooling1D(pool_size = max_pool_width_2, strides = max_pool_stride_2)(x2)
    
    x2 = tf.keras.layers.Flatten()(x2)
    
    x2 = tf.keras.layers.Dense(100, 
#                               kernel_regularizer = tf.keras.regularizers.l2(beta), 
#                               activation = 'relu'
                             )(x2)
    
    x2 = tf.keras.layers.Dropout(prob)(x2)
    
    x2 = tf.keras.layers.Dense(100, 
#                               kernel_regularizer = tf.keras.regularizers.l2(beta), 
#                               activation = 'relu'
                             )(x2)
    
    x2 = tf.keras.layers.Dropout(prob)(x2)
    
    x2 = tf.keras.layers.Dense(1, 
#                               kernel_regularizer = tf.keras.regularizers.l2(beta), 
                              activation = 'sigmoid')(x2)

    
    ######################################################################################################
    ########  Classifier  ################################################################################
    ######################################################################################################
    
    y = tf.keras.layers.Concatenate()([x1,x2])
    
    y = tf.keras.layers.Dense(1, 
                              activation = 'sigmoid')(y)

    ## Generate Model from input and output
    model = tf.keras.models.Model(inputs=input1, outputs=y)
    
    ## Compile model
    if(metrics != None):
        model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), loss = loss, metrics = metrics)
    else:
        model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), loss = loss)

    return model

In [10]:
DLNN_CORENup().summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 41, 21)]     0           []                               
                                                                                                  
 conv1d (Conv1D)                (None, 41, 10)       2110        ['input_1[0][0]']                
                                                                                                  
 bidirectional (Bidirectional)  (None, 41, 20)       1980        ['input_1[0][0]']                
                                                                                                  
 max_pooling1d (MaxPooling1D)   (None, 13, 10)       0           ['conv1d[0][0]']                 
                                                                                              

In [11]:
# for step in range(10):
#     initial_learning_rate=1e-1
#     decay_steps=10000
#     decay_rate=0.9
#     print(step, ':', initial_learning_rate * decay_rate ** (step / decay_steps))

# Training

In [12]:
##################################################################################
##### read training file
##################################################################################
train_file_path = os.path.join(input_data_folder, training_data_file)
train_data = pd.read_csv(train_file_path, sep='\t', header=None)
train_data.columns = ['Sequence', 'name', 'id', 'flag', 'label_original', 'type']
train_data.head()

##################################################################################
##### Create dictionary of all characters in the NT sequence 
##################################################################################
all_char_set = set({})
for val in [set(val) for val in train_data['Sequence']]:
    all_char_set = all_char_set.union(val)
all_char_list = list(all_char_set)
all_char_list.sort()
all_char_dict = {}
for i in range(len(all_char_list)):
    all_char_dict[all_char_list[i]] = i
    
##################################################################################
##### Create OHE of sequence
##################################################################################
train_data['OHE_Sequence'] = pd.Series([one_hot_encode_nt(val, all_char_dict) 
                                        for val in train_data["Sequence"]])

##################################################################################
##### Fix the labels
##################################################################################
train_data['label'] = pd.Series([1 if val == 1 else 0 
                                 for val in train_data["label_original"]])

##################################################################################
##### Extract features and labels, create folds
##################################################################################

features = np.array(list(train_data['OHE_Sequence']))
labels = np.array(list(train_data['label']))
labels = labels.reshape((labels.shape[0], 1))

input_seq_shape = features[0].shape

folds = build_kfold(features, labels, k=n_fold, shuffle=shuffle, seed=seed)

## Write the k-fold dataset to file
foldPath = os.path.join(outPath, expName, "{}fold".format(n_fold))
if(not os.path.isdir(foldPath)):
    os.makedirs(foldPath)
pickle.dump(folds, open(os.path.join(foldPath, foldName), "wb"))

In [13]:
##################################################################################
##### For each input file, train model and generate different outputs in a structured folder
##################################################################################

## create the evaluation data structure for all iterations
evaluations = {
    "Fold" : [],
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Train/Test model on all folds, generate evaluations
##################################################################################

## Create and set directory to save model
modelPath = os.path.join(outPath, expName, "{}fold".format(n_fold), "models")
if(not os.path.isdir(modelPath)):
    os.makedirs(modelPath)

i = -1
for fold in folds:
    i += 1
    
    print("\nTrain/Test model on Fold #"+str(i)+".")
    
    model = DLNN_CORENup(input_seq_shape = input_seq_shape)
    
    ## Define the model callbacks for early stopping and saving the model. Then train model
    current_model_path = os.path.join(modelPath, "bestModel-fold{}.hdf5".format(i))
    modelCallbacks = [
        tf.keras.callbacks.ModelCheckpoint(current_model_path,
                                           monitor = 'val_loss', verbose = 1, save_best_only = True, 
                                           save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
    ]
    
    # adding random shuffling of the dataset for training purpose
    index_arr = np.arange(fold["X_train"].shape[0])
    index_arr = np.random.permutation(index_arr)
    
    model.fit(x = fold["X_train"][index_arr], y = fold["y_train"][index_arr], batch_size = batch_size, epochs = epochs, verbose = 1, 
              callbacks = modelCallbacks, validation_data = (fold["X_test"], fold["y_test"]))
    
    model = tf.keras.models.load_model(current_model_path)
    
    ##################################################################################
    ##### Prediction and metrics for TRAIN dataset
    ##################################################################################

    y_pred = model.predict(fold["X_train"])
    label_pred = pred2label(y_pred)
    
    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(fold["y_train"], label_pred)
    prec = precision_score(fold["y_train"],label_pred)
    mcc = matthews_corrcoef(fold["y_train"], label_pred)

    conf = confusion_matrix(fold["y_train"], label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    
    fpr, tpr, thresholds = roc_curve(fold["y_train"], y_pred)
    auc = roc_auc_score(fold["y_train"], y_pred)
    
    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Train")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)
    
    ##################################################################################
    ##### Prediction and metrics for TEST dataset
    ##################################################################################

    y_pred = model.predict(fold["X_test"])
    label_pred = pred2label(y_pred)
    
    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(fold["y_test"], label_pred)
    prec = precision_score(fold["y_test"],label_pred)
    mcc = matthews_corrcoef(fold["y_test"], label_pred)

    conf = confusion_matrix(fold["y_test"], label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    
    fpr, tpr, thresholds = roc_curve(fold["y_test"], y_pred)
    auc = roc_auc_score(fold["y_test"], y_pred)
    
    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Test")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)
    
    del model
    tf.keras.backend.clear_session()


Train/Test model on Fold #0.
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.68426, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold0.hdf5
Epoch 2/100
Epoch 2: val_loss improved from 0.68426 to 0.67161, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold0.hdf5
Epoch 3/100
Epoch 3: val_loss improved from 0.67161 to 0.65940, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold0.hdf5
Epoch 4/100
Epoch 4: val_loss improved from 0.65940 to 0.64791, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold0.hdf5
Epoch 5/100
Epoch 5: val_loss improved from 0.64791 to 0.63725, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold0.hdf5
Epoch 6/100
Epoch 6: val_loss improved from 0.63725 to 0.63333, saving model to Results\NT_Site_PredNTS_

Epoch 28/100
Epoch 28: val_loss did not improve from 0.56305
Epoch 29/100
Epoch 29: val_loss did not improve from 0.56305
Epoch 30/100
Epoch 30: val_loss improved from 0.56305 to 0.56230, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold0.hdf5
Epoch 31/100
Epoch 31: val_loss did not improve from 0.56230
Epoch 32/100
Epoch 32: val_loss did not improve from 0.56230
Epoch 33/100
Epoch 33: val_loss improved from 0.56230 to 0.55341, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold0.hdf5
Epoch 34/100
Epoch 34: val_loss did not improve from 0.55341
Epoch 35/100
Epoch 35: val_loss did not improve from 0.55341
Epoch 36/100
Epoch 36: val_loss did not improve from 0.55341
Epoch 37/100
Epoch 37: val_loss did not improve from 0.55341
Epoch 38/100
Epoch 38: val_loss did not improve from 0.55341
Epoch 39/100
Epoch 39: val_loss improved from 0.55341 to 0.54556, saving model to Results\NT_Site_P

Epoch 60/100
Epoch 60: val_loss did not improve from 0.52009
Epoch 61/100
Epoch 61: val_loss did not improve from 0.52009
Epoch 62/100
Epoch 62: val_loss did not improve from 0.52009
Epoch 63/100
Epoch 63: val_loss did not improve from 0.52009
Epoch 64/100
Epoch 64: val_loss did not improve from 0.52009
Epoch 65/100
Epoch 65: val_loss did not improve from 0.52009
Epoch 66/100
Epoch 66: val_loss did not improve from 0.52009
Epoch 67/100
Epoch 67: val_loss did not improve from 0.52009
Epoch 68/100
Epoch 68: val_loss did not improve from 0.52009
Epoch 69/100
Epoch 69: val_loss did not improve from 0.52009
Epoch 70/100
Epoch 70: val_loss did not improve from 0.52009
Epoch 71/100
Epoch 71: val_loss did not improve from 0.52009
Epoch 72/100
Epoch 72: val_loss did not improve from 0.52009
Epoch 73/100
Epoch 73: val_loss did not improve from 0.52009
Epoch 74/100
Epoch 74: val_loss did not improve from 0.52009
Epoch 75/100
Epoch 75: val_loss improved from 0.52009 to 0.51789, saving model to Res

Epoch 95: val_loss did not improve from 0.51004
Epoch 96/100
Epoch 96: val_loss did not improve from 0.51004
Epoch 97/100
Epoch 97: val_loss did not improve from 0.51004
Epoch 98/100
Epoch 98: val_loss did not improve from 0.51004
Epoch 99/100
Epoch 99: val_loss did not improve from 0.51004
Epoch 100/100
Epoch 100: val_loss did not improve from 0.51004

Train/Test model on Fold #1.
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.68650, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold1.hdf5
Epoch 2/100
Epoch 2: val_loss improved from 0.68650 to 0.66618, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold1.hdf5
Epoch 3/100
Epoch 3: val_loss improved from 0.66618 to 0.63444, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold1.hdf5
Epoch 4/100
Epoch 4: val_loss improved from 0.63444 to 0.60471, saving model to Results\NT_Site_P

Epoch 24/100
Epoch 24: val_loss did not improve from 0.54385
Epoch 25/100
Epoch 25: val_loss did not improve from 0.54385
Epoch 26/100
Epoch 26: val_loss did not improve from 0.54385
Epoch 27/100
Epoch 27: val_loss did not improve from 0.54385
Epoch 28/100
Epoch 28: val_loss did not improve from 0.54385
Epoch 29/100
Epoch 29: val_loss did not improve from 0.54385
Epoch 30/100
Epoch 30: val_loss did not improve from 0.54385
Epoch 31/100
Epoch 31: val_loss did not improve from 0.54385
Epoch 32/100
Epoch 32: val_loss improved from 0.54385 to 0.54108, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold1.hdf5
Epoch 33/100
Epoch 33: val_loss did not improve from 0.54108
Epoch 34/100
Epoch 34: val_loss did not improve from 0.54108
Epoch 35/100
Epoch 35: val_loss did not improve from 0.54108
Epoch 36/100
Epoch 36: val_loss did not improve from 0.54108
Epoch 37/100
Epoch 37: val_loss did not improve from 0.54108
Epoch 38/100
Epoch 38: val_loss

Epoch 62/100
Epoch 62: val_loss did not improve from 0.54108
Epoch 63/100
Epoch 63: val_loss did not improve from 0.54108
Epoch 64/100
Epoch 64: val_loss did not improve from 0.54108
Epoch 65/100
Epoch 65: val_loss did not improve from 0.54108
Epoch 66/100
Epoch 66: val_loss did not improve from 0.54108
Epoch 67/100
Epoch 67: val_loss did not improve from 0.54108
Epoch 68/100
Epoch 68: val_loss did not improve from 0.54108
Epoch 69/100
Epoch 69: val_loss did not improve from 0.54108
Epoch 70/100
Epoch 70: val_loss did not improve from 0.54108
Epoch 71/100
Epoch 71: val_loss did not improve from 0.54108
Epoch 72/100
Epoch 72: val_loss did not improve from 0.54108
Epoch 73/100
Epoch 73: val_loss did not improve from 0.54108
Epoch 74/100
Epoch 74: val_loss did not improve from 0.54108
Epoch 75/100
Epoch 75: val_loss did not improve from 0.54108
Epoch 76/100
Epoch 76: val_loss did not improve from 0.54108
Epoch 77/100
Epoch 77: val_loss did not improve from 0.54108
Epoch 78/100
Epoch 78: v


Train/Test model on Fold #2.
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.69104, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold2.hdf5
Epoch 2/100
Epoch 2: val_loss improved from 0.69104 to 0.68823, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold2.hdf5
Epoch 3/100
Epoch 3: val_loss improved from 0.68823 to 0.67457, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold2.hdf5
Epoch 4/100
Epoch 4: val_loss improved from 0.67457 to 0.65117, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold2.hdf5
Epoch 5/100
Epoch 5: val_loss improved from 0.65117 to 0.63640, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold2.hdf5
Epoch 6/100
Epoch 6: val_loss improved from 0.63640 to 0.63302, saving model to Results\NT_Site_PredNTS_

Epoch 28: val_loss did not improve from 0.58672
Epoch 29/100
Epoch 29: val_loss improved from 0.58672 to 0.58608, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold2.hdf5
Epoch 30/100
Epoch 30: val_loss improved from 0.58608 to 0.58373, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold2.hdf5
Epoch 31/100
Epoch 31: val_loss did not improve from 0.58373
Epoch 32/100
Epoch 32: val_loss improved from 0.58373 to 0.58075, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold2.hdf5
Epoch 33/100
Epoch 33: val_loss improved from 0.58075 to 0.57982, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold2.hdf5
Epoch 34/100
Epoch 34: val_loss improved from 0.57982 to 0.57600, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold2.hdf5
Epoch 35/10

Epoch 61: val_loss improved from 0.56143 to 0.56063, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold2.hdf5
Epoch 62/100
Epoch 62: val_loss did not improve from 0.56063
Epoch 63/100
Epoch 63: val_loss did not improve from 0.56063
Epoch 64/100
Epoch 64: val_loss did not improve from 0.56063
Epoch 65/100
Epoch 65: val_loss did not improve from 0.56063
Epoch 66/100
Epoch 66: val_loss did not improve from 0.56063
Epoch 67/100
Epoch 67: val_loss did not improve from 0.56063
Epoch 68/100
Epoch 68: val_loss did not improve from 0.56063
Epoch 69/100
Epoch 69: val_loss did not improve from 0.56063
Epoch 70/100
Epoch 70: val_loss improved from 0.56063 to 0.55923, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold2.hdf5
Epoch 71/100
Epoch 71: val_loss did not improve from 0.55923
Epoch 72/100
Epoch 72: val_loss did not improve from 0.55923
Epoch 73/100
Epoch 73: val_loss did not improve fro

Epoch 98/100
Epoch 98: val_loss did not improve from 0.54595
Epoch 99/100
Epoch 99: val_loss did not improve from 0.54595
Epoch 100/100
Epoch 100: val_loss did not improve from 0.54595

Train/Test model on Fold #3.
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.68350, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold3.hdf5
Epoch 2/100
Epoch 2: val_loss improved from 0.68350 to 0.67353, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold3.hdf5
Epoch 3/100
Epoch 3: val_loss improved from 0.67353 to 0.66189, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold3.hdf5
Epoch 4/100
Epoch 4: val_loss improved from 0.66189 to 0.65309, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold3.hdf5
Epoch 5/100
Epoch 5: val_loss improved from 0.65309 to 0.64028, saving model to Results\NT_Si

Epoch 25: val_loss improved from 0.57310 to 0.56976, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold3.hdf5
Epoch 26/100
Epoch 26: val_loss did not improve from 0.56976
Epoch 27/100
Epoch 27: val_loss improved from 0.56976 to 0.56902, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold3.hdf5
Epoch 28/100
Epoch 28: val_loss improved from 0.56902 to 0.56598, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold3.hdf5
Epoch 29/100
Epoch 29: val_loss did not improve from 0.56598
Epoch 30/100
Epoch 30: val_loss did not improve from 0.56598
Epoch 31/100
Epoch 31: val_loss did not improve from 0.56598
Epoch 32/100
Epoch 32: val_loss improved from 0.56598 to 0.56247, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold3.hdf5
Epoch 33/100
Epoch 33: val_loss did not improve from 0.56247
Epoc

Epoch 56/100
Epoch 56: val_loss did not improve from 0.53522
Epoch 57/100
Epoch 57: val_loss did not improve from 0.53522
Epoch 58/100
Epoch 58: val_loss did not improve from 0.53522
Epoch 59/100
Epoch 59: val_loss improved from 0.53522 to 0.53284, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold3.hdf5
Epoch 60/100
Epoch 60: val_loss did not improve from 0.53284
Epoch 61/100
Epoch 61: val_loss did not improve from 0.53284
Epoch 62/100
Epoch 62: val_loss did not improve from 0.53284
Epoch 63/100
Epoch 63: val_loss did not improve from 0.53284
Epoch 64/100
Epoch 64: val_loss did not improve from 0.53284
Epoch 65/100
Epoch 65: val_loss did not improve from 0.53284
Epoch 66/100
Epoch 66: val_loss did not improve from 0.53284
Epoch 67/100
Epoch 67: val_loss did not improve from 0.53284
Epoch 68/100
Epoch 68: val_loss did not improve from 0.53284
Epoch 69/100
Epoch 69: val_loss did not improve from 0.53284
Epoch 70/100
Epoch 70: val_loss

Epoch 92/100
Epoch 92: val_loss did not improve from 0.51277
Epoch 93/100
Epoch 93: val_loss did not improve from 0.51277
Epoch 94/100
Epoch 94: val_loss did not improve from 0.51277
Epoch 95/100
Epoch 95: val_loss did not improve from 0.51277
Epoch 96/100
Epoch 96: val_loss did not improve from 0.51277
Epoch 97/100
Epoch 97: val_loss did not improve from 0.51277
Epoch 98/100
Epoch 98: val_loss did not improve from 0.51277
Epoch 99/100
Epoch 99: val_loss did not improve from 0.51277
Epoch 100/100
Epoch 100: val_loss did not improve from 0.51277

Train/Test model on Fold #4.
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.68665, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold4.hdf5
Epoch 2/100
Epoch 2: val_loss improved from 0.68665 to 0.66833, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold4.hdf5
Epoch 3/100
Epoch 3: val_loss improved from 0.66833 to 0.64920, saving mode

Epoch 24/100
Epoch 24: val_loss improved from 0.57672 to 0.57587, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold4.hdf5
Epoch 25/100
Epoch 25: val_loss improved from 0.57587 to 0.57435, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold4.hdf5
Epoch 26/100
Epoch 26: val_loss did not improve from 0.57435
Epoch 27/100
Epoch 27: val_loss did not improve from 0.57435
Epoch 28/100
Epoch 28: val_loss improved from 0.57435 to 0.57432, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold4.hdf5
Epoch 29/100
Epoch 29: val_loss improved from 0.57432 to 0.57292, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\bestModel-fold4.hdf5
Epoch 30/100
Epoch 30: val_loss did not improve from 0.57292
Epoch 31/100
Epoch 31: val_loss did not improve from 0.57292
Epoch 32/100
Epoch 32: val_loss did not improve from

Epoch 60/100
Epoch 60: val_loss did not improve from 0.57292
Epoch 61/100
Epoch 61: val_loss did not improve from 0.57292
Epoch 62/100
Epoch 62: val_loss did not improve from 0.57292
Epoch 63/100
Epoch 63: val_loss did not improve from 0.57292
Epoch 64/100
Epoch 64: val_loss did not improve from 0.57292
Epoch 65/100
Epoch 65: val_loss did not improve from 0.57292
Epoch 66/100
Epoch 66: val_loss did not improve from 0.57292
Epoch 67/100
Epoch 67: val_loss did not improve from 0.57292
Epoch 68/100
Epoch 68: val_loss did not improve from 0.57292
Epoch 69/100
Epoch 69: val_loss did not improve from 0.57292
Epoch 70/100
Epoch 70: val_loss did not improve from 0.57292
Epoch 71/100
Epoch 71: val_loss did not improve from 0.57292
Epoch 72/100
Epoch 72: val_loss did not improve from 0.57292
Epoch 73/100
Epoch 73: val_loss did not improve from 0.57292
Epoch 74/100
Epoch 74: val_loss did not improve from 0.57292
Epoch 75/100
Epoch 75: val_loss did not improve from 0.57292
Epoch 76/100
Epoch 76: v

Epoch 99/100
Epoch 99: val_loss did not improve from 0.57292
Epoch 100/100
Epoch 100: val_loss did not improve from 0.57292


## k-fold Training evaluation

In [14]:
evaluations_df = pd.DataFrame.from_dict(evaluations)

evaluations_df_grouped = evaluations_df.groupby(["Train_Test"]).mean().filter(['Accuracy', 
                                                                               'Precision', 
                                                                               'AUC', 
                                                                               'Sensitivity', 
                                                                               'Specificity', 
                                                                               'MCC'])

evaluations_df_grouped

Unnamed: 0_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Train_Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Test,0.74936,0.764719,0.809883,0.720379,0.778338,0.499774
Train,0.901129,0.910276,0.926549,0.888965,0.9133,0.802977


In [15]:
# parallel 1
# 	Accuracy	Precision	AUC	Sensitivity	Specificity	MCC
# Train_Test						
# Test	0.777914	0.772954	0.852681	0.787560	0.768254	0.556444
# Train	0.890117	0.889315	0.941954	0.891271	0.888962	0.780636

In [16]:
# rnn, 20-10-5
# Accuracy	Precision	AUC	Sensitivity	Specificity	MCC
# Train_Test						
# Test	0.744752	0.731433	0.825816	0.776636	0.712830	0.491745
# Train	0.787784	0.778217	0.871499	0.806677	0.768885	0.576924

In [17]:
evaluations_df[evaluations_df["Train_Test"] == "Test"]

Unnamed: 0,Fold,Train_Test,Accuracy,Precision,TPR,FPR,TPR_FPR_Thresholds,AUC,Sensitivity,Specificity,MCC
1,0,Test,0.781971,0.802691,"[0.0, 0.4560669456066946, 0.4811715481171548, ...","[0.0, 0.0546218487394958, 0.06302521008403361,...","[1.8020394, 0.8020394, 0.8020393, 0.80203927, ...",0.836495,0.748954,0.815126,0.565274
3,1,Test,0.742138,0.760181,"[0.0, 0.226890756302521, 0.23949579831932774, ...","[0.0, 0.02092050209205021, 0.02510460251046025...","[1.8141658, 0.81416583, 0.8141658, 0.8141657, ...",0.812155,0.705882,0.778243,0.485432
5,2,Test,0.741597,0.744681,"[0.0, 0.5798319327731093, 0.5882352941176471, ...","[0.0, 0.15126050420168066, 0.15546218487394958...","[1.7551525, 0.75515246, 0.7551524, 0.7551522, ...",0.802786,0.735294,0.747899,0.483232
7,3,Test,0.760504,0.769565,"[0.0, 0.3697478991596639, 0.3739495798319328, ...","[0.0, 0.025210084033613446, 0.0252100840336134...","[1.8905818, 0.8905818, 0.89058167, 0.8905815, ...",0.812513,0.743697,0.777311,0.521303
9,4,Test,0.720588,0.746479,"[0.0, 0.2605042016806723, 0.2689075630252101, ...","[0.0, 0.029411764705882353, 0.0294117647058823...","[1.7516885, 0.75168854, 0.7516884, 0.7516883, ...",0.785467,0.668067,0.773109,0.443631


In [18]:
# rnn, 20-10-5
# Fold	Train_Test	Accuracy	Precision	TPR	FPR	TPR_FPR_Thresholds	AUC	Sensitivity	Specificity	MCC
# 1	0	Test	0.721174	0.690647	[0.0, 0.0041841004184100415, 0.066945606694560...	[0.0, 0.0, 0.0, 0.008403361344537815, 0.008403...	[1.9654377, 0.9654376, 0.94470316, 0.94159025,...	0.822097	0.803347	0.638655	0.448191
# 3	1	Test	0.769392	0.750000	[0.0, 0.004201680672268907, 0.1134453781512605...	[0.0, 0.0, 0.0, 0.0041841004184100415, 0.00418...	[1.9864669, 0.9864668, 0.94132906, 0.9403602, ...	0.833023	0.806723	0.732218	0.540396
# 5	2	Test	0.758403	0.770925	[0.0, 0.004201680672268907, 0.0042016806722689...	[0.0, 0.0, 0.004201680672268907, 0.00420168067...	[1.9832542, 0.98325425, 0.979345, 0.8914033, 0...	0.829073	0.735294	0.781513	0.517360
# 7	3	Test	0.760504	0.734848	[0.0, 0.004201680672268907, 0.1008403361344537...	[0.0, 0.0, 0.0, 0.004201680672268907, 0.004201...	[1.978077, 0.97807705, 0.9421845, 0.9375342, 0...	0.857796	0.815126	0.705882	0.524145
# 9	4	Test	0.714286	0.710744	[0.0, 0.004201680672268907, 0.0462184873949579...	[0.0, 0.0, 0.0, 0.008403361344537815, 0.008403...	[1.997149, 0.99714893, 0.9781189, 0.974313, 0....	0.787091	0.722689	0.705882	0.428632

# Independent data

In [19]:
train_features = features
train_labels = labels

In [20]:
##################################################################################
##### read independent data file
##################################################################################
indpe_file_path = os.path.join(input_data_folder, independent_data_file)
indpe_data = pd.read_csv(indpe_file_path, sep='\t', header=None)
indpe_data.columns = ['Sequence', 'name', 'id', 'flag', 'label_original', 'type']
indpe_data.head()
    
##################################################################################
##### Create OHE of sequence
##################################################################################
indpe_data['OHE_Sequence'] = pd.Series([one_hot_encode_nt(val, all_char_dict) 
                                        for val in indpe_data["Sequence"]])

##################################################################################
##### Fix the labels
##################################################################################
indpe_data['label'] = pd.Series([1 if val == 1 else 0 
                                 for val in indpe_data["label_original"]])

##################################################################################
##### Extract features and labels, create folds
##################################################################################

indpe_features = np.array(list(indpe_data['OHE_Sequence']))
indpe_labels = np.array(list(indpe_data['label']))
indpe_labels = indpe_labels.reshape((indpe_labels.shape[0], 1))

input_seq_shape = indpe_features[0].shape

## Using k-fold Models

### Performance of each k-fold model

In [21]:
## create the evaluation data structure for all iterations
evaluations = {
    "Fold" : [],
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Prediction and metrics for Independent dataset
##################################################################################

for i in range(n_fold):
    
    current_model_path = os.path.join(modelPath, "bestModel-fold{}.hdf5".format(i))
    model = tf.keras.models.load_model(current_model_path)

    y_pred = model.predict(indpe_features)
    label_pred = pred2label(y_pred)

    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(indpe_labels, label_pred)
    prec = precision_score(indpe_labels,label_pred)
    mcc = matthews_corrcoef(indpe_labels, label_pred)

    conf = confusion_matrix(indpe_labels, label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)

    fpr, tpr, thresholds = roc_curve(indpe_labels, y_pred)
    auc = roc_auc_score(indpe_labels, y_pred)

    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Independent")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)

##################################################################################

evaluations_df = pd.DataFrame.from_dict(evaluations)

evaluations_df_grouped = evaluations_df.groupby(["Train_Test"]).mean().filter(['Accuracy', 
                                                                               'Precision', 
                                                                               'AUC', 
                                                                               'Sensitivity', 
                                                                               'Specificity', 
                                                                               'MCC'])

evaluations_df_grouped

Unnamed: 0_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Train_Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Independent,0.633306,0.244401,0.640599,0.57931,0.644031,0.17015


In [22]:
evaluations_df

Unnamed: 0,Fold,Train_Test,Accuracy,Precision,TPR,FPR,TPR_FPR_Thresholds,AUC,Sensitivity,Specificity,MCC
0,0,Independent,0.645714,0.249458,"[0.0, 0.270935960591133, 0.2857142857142857, 0...","[0.0, 0.16731898238747553, 0.17906066536203522...","[1.8020394, 0.8020394, 0.8020393, 0.80203927, ...",0.632429,0.566502,0.661448,0.174951
1,1,Independent,0.643265,0.248927,"[0.0, 0.16748768472906403, 0.18719211822660098...","[0.0, 0.0812133072407045, 0.08512720156555773,...","[1.8141658, 0.81416583, 0.8141658, 0.8141656, ...",0.653861,0.571429,0.657534,0.175358
2,2,Independent,0.626122,0.241379,"[0.0, 0.41379310344827586, 0.4187192118226601,...","[0.0, 0.24951076320939333, 0.2573385518590998,...","[1.7551525, 0.75515246, 0.7551524, 0.75515234,...",0.629614,0.586207,0.634051,0.167004
3,3,Independent,0.613878,0.241379,"[0.0, 0.2315270935960591, 0.24630541871921183,...","[0.0, 0.12720156555772993, 0.12915851272015655...","[1.8905818, 0.8905818, 0.89058167, 0.8905814, ...",0.640806,0.62069,0.612524,0.175354
4,4,Independent,0.637551,0.24086,"[0.0, 0.11822660098522167, 0.12315270935960591...","[0.0, 0.0821917808219178, 0.08414872798434442,...","[1.7516885, 0.75168854, 0.7516885, 0.7516884, ...",0.646284,0.551724,0.654599,0.158084


### Mean score with k-fold models

In [23]:
## create the evaluation data structure for all iterations
evaluations = {
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Prediction and metrics for Independent dataset
##################################################################################

total_pred = np.zeros(indpe_labels.shape)
all_preds = []

for i in range(n_fold):
    
    current_model_path = os.path.join(modelPath, "bestModel-fold{}.hdf5".format(i))
    model = tf.keras.models.load_model(current_model_path)

    y_pred = model.predict(indpe_features)
    total_pred += y_pred
    all_preds.append(y_pred)
    
total_pred = total_pred / n_fold
label_pred = pred2label(total_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(indpe_labels, label_pred)
prec = precision_score(indpe_labels,label_pred)
mcc = matthews_corrcoef(indpe_labels, label_pred)

conf = confusion_matrix(indpe_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(indpe_labels, total_pred)
auc = roc_auc_score(indpe_labels, total_pred)

evaluations["Train_Test"].append("Independent")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################

evaluations_df = pd.DataFrame.from_dict(evaluations)

evaluations_df_grouped = evaluations_df.groupby(["Train_Test"]).mean().filter(['Accuracy', 
                                                                               'Precision', 
                                                                               'AUC', 
                                                                               'Sensitivity', 
                                                                               'Specificity', 
                                                                               'MCC'])

evaluations_df_grouped

Unnamed: 0_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Train_Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Independent,0.630204,0.244898,0.657891,0.591133,0.637965,0.173881


### Voting score with k-fold models

In [24]:
## create the evaluation data structure for all iterations
evaluations = {
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Prediction and metrics for Independent dataset
##################################################################################

total_pred = np.zeros(indpe_labels.shape)
all_preds = []

for i in range(n_fold):
    
    current_model_path = os.path.join(modelPath, "bestModel-fold{}.hdf5".format(i))
    model = tf.keras.models.load_model(current_model_path)

    y_pred = model.predict(indpe_features)
    vote_pred = pred2label(y_pred)
    total_pred += vote_pred
    all_preds.append(vote_pred)
    
total_pred = total_pred / n_fold
label_pred = pred2label(total_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(indpe_labels, label_pred)
prec = precision_score(indpe_labels,label_pred)
mcc = matthews_corrcoef(indpe_labels, label_pred)

conf = confusion_matrix(indpe_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(indpe_labels, total_pred)
auc = roc_auc_score(indpe_labels, total_pred)

evaluations["Train_Test"].append("Independent")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################

evaluations_df = pd.DataFrame.from_dict(evaluations)

evaluations_df_grouped = evaluations_df.groupby(["Train_Test"]).mean().filter(['Accuracy', 
                                                                               'Precision', 
                                                                               'AUC', 
                                                                               'Sensitivity', 
                                                                               'Specificity', 
                                                                               'MCC'])

evaluations_df_grouped

Unnamed: 0_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Train_Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Independent,0.644082,0.254737,0.658031,0.596059,0.65362,0.190537


## Using New Model

Train model on full data from training. Predict and evaluate on Independent data.

In [25]:
pos_indexes = np.where(indpe_labels==1)[0]
neg_indexes = np.random.permutation(np.where(indpe_labels==0)[0])[0:pos_indexes.shape[0]]
indpe_val_indexes = np.concatenate((pos_indexes, neg_indexes))

In [26]:
model = DLNN_CORENup(input_seq_shape = input_seq_shape)
    
## Define the model callbacks for early stopping and saving the model. Then train model
current_model_path = os.path.join(modelPath, "_fullModel.hdf5")
modelCallbacks = [
    tf.keras.callbacks.ModelCheckpoint(current_model_path,
                                       monitor = 'val_loss', verbose = 1, save_best_only = True, 
                                       save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
]

# adding random shuffling of the dataset for training purpose
index_arr = np.arange(train_features.shape[0])
index_arr = np.random.permutation(index_arr)

model.fit(x = train_features[index_arr], y = train_labels[index_arr], batch_size = batch_size, epochs = epochs, verbose = 1, 
          callbacks = modelCallbacks, validation_data = (indpe_features[indpe_val_indexes], indpe_labels[indpe_val_indexes]))
# model.fit(x = train_features[index_arr], y = train_labels[index_arr], batch_size = batch_size, epochs = epochs, verbose = 1, 
#           callbacks = modelCallbacks, validation_split = 0.2)

model = tf.keras.models.load_model(current_model_path)

Epoch 1/100
Epoch 1: val_loss improved from inf to 0.68639, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\_fullModel.hdf5
Epoch 2/100
Epoch 2: val_loss improved from 0.68639 to 0.67962, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_deepRNN\5fold\models\_fullModel.hdf5
Epoch 3/100
Epoch 3: val_loss did not improve from 0.67962
Epoch 4/100
Epoch 4: val_loss did not improve from 0.67962
Epoch 5/100
Epoch 5: val_loss did not improve from 0.67962
Epoch 6/100
Epoch 6: val_loss did not improve from 0.67962
Epoch 7/100
Epoch 7: val_loss did not improve from 0.67962
Epoch 8/100
Epoch 8: val_loss did not improve from 0.67962
Epoch 9/100
Epoch 9: val_loss did not improve from 0.67962
Epoch 10/100
Epoch 10: val_loss did not improve from 0.67962
Epoch 11/100
Epoch 11: val_loss did not improve from 0.67962
Epoch 12/100
Epoch 12: val_loss did not improve from 0.67962
Epoch 13/100
Epoch 13: val_loss did not improve from 0.67962
Epoch 14/

Epoch 39/100
Epoch 39: val_loss did not improve from 0.67962
Epoch 40/100
Epoch 40: val_loss did not improve from 0.67962
Epoch 41/100
Epoch 41: val_loss did not improve from 0.67962
Epoch 42/100
Epoch 42: val_loss did not improve from 0.67962
Epoch 43/100
Epoch 43: val_loss did not improve from 0.67962
Epoch 44/100
Epoch 44: val_loss did not improve from 0.67962
Epoch 45/100
Epoch 45: val_loss did not improve from 0.67962
Epoch 46/100
Epoch 46: val_loss did not improve from 0.67962
Epoch 47/100
Epoch 47: val_loss did not improve from 0.67962
Epoch 48/100
Epoch 48: val_loss did not improve from 0.67962
Epoch 49/100
Epoch 49: val_loss did not improve from 0.67962
Epoch 50/100
Epoch 50: val_loss did not improve from 0.67962
Epoch 51/100
Epoch 51: val_loss did not improve from 0.67962
Epoch 52/100
Epoch 52: val_loss did not improve from 0.67962
Epoch 53/100
Epoch 53: val_loss did not improve from 0.67962
Epoch 54/100
Epoch 54: val_loss did not improve from 0.67962
Epoch 55/100
Epoch 55: v

Epoch 77: val_loss did not improve from 0.67962
Epoch 78/100
Epoch 78: val_loss did not improve from 0.67962
Epoch 79/100
Epoch 79: val_loss did not improve from 0.67962
Epoch 80/100
Epoch 80: val_loss did not improve from 0.67962
Epoch 81/100
Epoch 81: val_loss did not improve from 0.67962
Epoch 82/100
Epoch 82: val_loss did not improve from 0.67962
Epoch 83/100
Epoch 83: val_loss did not improve from 0.67962
Epoch 84/100
Epoch 84: val_loss did not improve from 0.67962
Epoch 85/100
Epoch 85: val_loss did not improve from 0.67962
Epoch 86/100
Epoch 86: val_loss did not improve from 0.67962
Epoch 87/100
Epoch 87: val_loss did not improve from 0.67962
Epoch 88/100
Epoch 88: val_loss did not improve from 0.67962
Epoch 89/100
Epoch 89: val_loss did not improve from 0.67962
Epoch 90/100
Epoch 90: val_loss did not improve from 0.67962
Epoch 91/100
Epoch 91: val_loss did not improve from 0.67962
Epoch 92/100
Epoch 92: val_loss did not improve from 0.67962
Epoch 93/100
Epoch 93: val_loss did n

In [27]:
## create the evaluation data structure for all iterations
evaluations = {
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Prediction and metrics for Independent dataset
##################################################################################

y_pred = model.predict(indpe_features)
label_pred = pred2label(y_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(indpe_labels, label_pred)
prec = precision_score(indpe_labels,label_pred)
mcc = matthews_corrcoef(indpe_labels, label_pred)

conf = confusion_matrix(indpe_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(indpe_labels, y_pred)
auc = roc_auc_score(indpe_labels, y_pred)

evaluations["Train_Test"].append("Independent")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################

evaluations_df = pd.DataFrame.from_dict(evaluations)

evaluations_df_grouped = evaluations_df.groupby(["Train_Test"]).mean().filter(['Accuracy', 
                                                                               'Precision', 
                                                                               'AUC', 
                                                                               'Sensitivity', 
                                                                               'Specificity', 
                                                                               'MCC'])

evaluations_df_grouped

Unnamed: 0_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Train_Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Independent,0.66449,0.24,0.607714,0.472906,0.702544,0.139114


In [28]:
# rnn, 20-10-5
# 	Accuracy	Precision	AUC	Sensitivity	Specificity	MCC
# Train_Test						
# Independent	0.647347	0.253763	0.675021	0.581281	0.66047	0.185228