In [1]:
##################################################################################
##### Define all parameters for model tuning
##################################################################################

n_fold = 5
expName = "NT_Site_PredNTS_Classification_DLNN_Kmer_CNN"
outPath = "Results"
foldName = "folds.pickle"

shuffle = True
seed = None

input_data_folder = "PredNTS_MathFeature_ENC"

monitor = 'val_loss'

In [2]:
train_data_filename = 'Training-datasets-PredNTS_kmer.csv'
indpe_data_filename = 'independent-dataset-PredNTS_kmer.csv'

In [3]:
import os 
import pickle
import numpy as np
import pandas as pd

import tensorflow as tf

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, confusion_matrix
from sklearn.metrics import roc_auc_score, classification_report, matthews_corrcoef

import math

In [4]:
# print(tf.test.is_gpu_available(cuda_only=True))
# physical_devices = tf.config.experimental.list_physical_devices('GPU')
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)
tf.config.experimental.set_memory_growth(physical_devices[0], True)

gpus = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(gpus[0], True)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [5]:
# ##################################################################################
# ##### define all CUSTOM functions
# ##################################################################################

# def one_hot_encode_nt(sequence, char_dict):
    
#     seq_encoded = np.zeros((len(sequence),len(char_dict)))
    
#     i = 0
#     for single_character in sequence:
#         if(single_character.upper() in char_dict.keys()):
#             seq_encoded[i][char_dict[single_character.upper()]] = 1
#             i = i+1
#         else:
#             raise ValueError('Incorrect character in NT sequence: '+sequence)
#     return seq_encoded

In [6]:
##################################################################################
##### Build k-fold functions
##################################################################################

## Build the K-fold from dataset
def build_kfold(features, labels, k=10, shuffle=False, seed=None):
    
    skf = StratifiedKFold(n_splits=k, shuffle=shuffle, random_state=seed)
    kfoldList = []
    for train_index, test_index in skf.split(features, labels):
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        kfoldList.append({
            "X_train": X_train,
            "X_test": X_test,
            "y_train":y_train,
            "y_test":y_test
        })
    return kfoldList

In [7]:
##################################################################################
##### define evaluator functions
##################################################################################

def pred2label(y_pred):
    y_pred = np.round(y_pred)
    return y_pred

In [8]:
# epochs = 100
# batch_size = 16

# ##################################################################################
# ##### Function to customize the DLNN architecture with parameters
# ##################################################################################

# def DLNN_Classifier(input_vec_shape,
#                     dense_decode_units = 10, ## Dense layer parameters,
#                     dense_layers = 2,
#                     prob = 0.5, learn_rate = 0.0001, loss = 'binary_crossentropy', metrics = 'accuracy'):
    
#     beta = 0.001
    
#     input1 = tf.keras.layers.Input(shape=input_vec_shape)
    
#     ######################################################################################################
#     ########  CNN layers  ################################################################################
#     ######################################################################################################
    
#     cnn = tf.keras.layers.Conv1D(filters = 10, 
#                                  kernel_size = 100,
#                                  kernel_regularizer = tf.keras.regularizers.l2(beta)
#                                 )(input1)
#     cnn = tf.keras.layers.Activation('relu')(cnn)
#     cnn = tf.keras.layers.MaxPool1D(pool_size = 10, strides = 10)(cnn)
#     cnn = tf.keras.layers.Dropout(prob)(cnn)
    
#     cnn = tf.keras.layers.Conv1D(filters = 10, 
#                                  kernel_size = 50,
#                                  kernel_regularizer = tf.keras.regularizers.l2(beta)
#                                 )(cnn)
#     cnn = tf.keras.layers.Activation('relu')(cnn)
#     cnn = tf.keras.layers.MaxPool1D(pool_size = 5, strides = 5)(cnn)
#     cnn = tf.keras.layers.Dropout(prob)(cnn)
    
#     cnn = tf.keras.layers.Flatten()(cnn)
    
#     ######################################################################################################
#     ########  Classifier  ################################################################################
#     ######################################################################################################
    
#     y = tf.keras.layers.Dense(dense_decode_units, 
#                               kernel_regularizer = tf.keras.regularizers.l2(beta),
#                               activation = 'relu'
#                              )(cnn)
#     y = tf.keras.layers.BatchNormalization()(y)
#     y = tf.keras.layers.Dropout(prob)(y)
    
#     y = tf.keras.layers.Dense(1, 
#                               kernel_regularizer = tf.keras.regularizers.l2(beta), 
#                               activation = 'sigmoid'
#                              )(y)

#     ## Generate Model from input and output
#     model = tf.keras.models.Model(inputs=input1, outputs=y)
    
#     ## Compile model
#     if(metrics != None):
#         model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), 
#                       loss = loss, metrics = metrics)
#     else:
#         model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), 
#                       loss = loss)

#     return model

In [9]:
# epochs = 100
# batch_size = 16

# ##################################################################################
# ##### Function to customize the DLNN architecture with parameters
# ##################################################################################

# def DLNN_Classifier(input_vec_shape,
#                     dense_decode_units = 10, ## Dense layer parameters,
#                     dense_layers = 2,
#                     prob = 0.5, learn_rate = 0.0005, loss = 'binary_crossentropy', metrics = 'accuracy'):
    
#     beta = 0.001
    
#     input1 = tf.keras.layers.Input(shape=input_vec_shape)
    
#     ######################################################################################################
#     ########  CNN layers  ################################################################################
#     ######################################################################################################
    
#     cnn = tf.keras.layers.Conv1D(filters = 10, 
#                                  kernel_size = 100,
#                                  kernel_regularizer = tf.keras.regularizers.l2(beta)
#                                 )(input1)
#     cnn = tf.keras.layers.Activation('relu')(cnn)
#     cnn = tf.keras.layers.MaxPool1D(pool_size = 10, strides = 10)(cnn)
#     cnn = tf.keras.layers.Dropout(prob)(cnn)
    
#     cnn = tf.keras.layers.Conv1D(filters = 10, 
#                                  kernel_size = 50,
#                                  kernel_regularizer = tf.keras.regularizers.l2(beta)
#                                 )(cnn)
#     cnn = tf.keras.layers.Activation('relu')(cnn)
#     cnn = tf.keras.layers.MaxPool1D(pool_size = 5, strides = 5)(cnn)
#     cnn = tf.keras.layers.Dropout(prob)(cnn)
    
#     cnn = tf.keras.layers.Flatten()(cnn)
    
#     ######################################################################################################
#     ########  Classifier  ################################################################################
#     ######################################################################################################
    
#     y = tf.keras.layers.Dense(dense_decode_units, 
#                               kernel_regularizer = tf.keras.regularizers.l2(beta),
#                               activation = 'relu'
#                              )(cnn)
#     y = tf.keras.layers.BatchNormalization()(y)
#     y = tf.keras.layers.Dropout(prob)(y)
    
#     y = tf.keras.layers.Dense(1, 
#                               kernel_regularizer = tf.keras.regularizers.l2(beta), 
#                               activation = 'sigmoid'
#                              )(y)

#     ## Generate Model from input and output
#     model = tf.keras.models.Model(inputs=input1, outputs=y)
    
#     ## Compile model
#     if(metrics != None):
#         model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), 
#                       loss = loss, metrics = metrics)
#     else:
#         model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), 
#                       loss = loss)

#     return model

In [10]:
# epochs = 100
# batch_size = 16

# ##################################################################################
# ##### Function to customize the DLNN architecture with parameters
# ##################################################################################

# def DLNN_Classifier(input_vec_shape,
#                     dense_decode_units = 128, ## Dense layer parameters,
#                     dense_layers = 2,
#                     prob = 0.5, learn_rate = 0.0001, loss = 'binary_crossentropy', metrics = 'accuracy'):
    
#     beta = 0.001
    
#     input1 = tf.keras.layers.Input(shape=input_vec_shape)
    
#     ######################################################################################################
#     ########  CNN layers  ################################################################################
#     ######################################################################################################
    
#     cnn = tf.keras.layers.Conv1D(filters = 25, 
#                                  kernel_size = 100,
#                                  strides = 10,
#                                  kernel_regularizer = tf.keras.regularizers.l2(beta)
#                                 )(input1)
#     cnn = tf.keras.layers.Activation('relu')(cnn)
# #     cnn = tf.keras.layers.MaxPool1D(pool_size = 10, strides = 10)(cnn)
# #     cnn = tf.keras.layers.Dropout(prob)(cnn)
    
#     cnn = tf.keras.layers.Conv1D(filters = 25, 
#                                  kernel_size = 50,
#                                  strides = 10,
#                                  kernel_regularizer = tf.keras.regularizers.l2(beta)
#                                 )(cnn)
#     cnn = tf.keras.layers.Activation('relu')(cnn)
    
# #     cnn = tf.keras.layers.MaxPool1D(pool_size = 5, strides = 5)(cnn)
#     cnn = tf.keras.layers.Dropout(prob)(cnn)
    
#     cnn = tf.keras.layers.Flatten()(cnn)
    
#     ######################################################################################################
#     ########  Classifier  ################################################################################
#     ######################################################################################################
    
#     y = tf.keras.layers.Dense(dense_decode_units, 
#                               kernel_regularizer = tf.keras.regularizers.l2(beta),
#                               activation = 'relu'
#                              )(cnn)
#     y = tf.keras.layers.BatchNormalization()(y)
#     y = tf.keras.layers.Dropout(prob)(y)
    
#     y = tf.keras.layers.Dense(int(dense_decode_units/2), 
#                               kernel_regularizer = tf.keras.regularizers.l2(beta),
#                               activation = 'relu'
#                              )(y)
#     y = tf.keras.layers.BatchNormalization()(y)
#     y = tf.keras.layers.Dropout(prob)(y)
    
#     y = tf.keras.layers.Dense(1, 
#                               kernel_regularizer = tf.keras.regularizers.l2(beta), 
#                               activation = 'sigmoid'
#                              )(y)

#     ## Generate Model from input and output
#     model = tf.keras.models.Model(inputs=input1, outputs=y)
    
#     ## Compile model
#     if(metrics != None):
#         model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), 
#                       loss = loss, metrics = metrics)
#     else:
#         model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), 
#                       loss = loss)

#     return model

In [11]:
epochs = 100
batch_size = 64

##################################################################################
##### Function to customize the DLNN architecture with parameters
##################################################################################

def DLNN_Classifier(input_vec_shape,
                    dense_decode_units = 128, ## Dense layer parameters,
                    dense_layers = 2,
                    prob = 0.5, learn_rate = 0.0005, loss = 'binary_crossentropy', metrics = 'accuracy'):
    
    beta = 0.001
    
    input1 = tf.keras.layers.Input(shape=input_vec_shape)
    
    ######################################################################################################
    ########  CNN layers  ################################################################################
    ######################################################################################################
    
    cnn = tf.keras.layers.Conv1D(filters = 15, 
                                 kernel_size = 100,
                                 strides = 5,
                                 kernel_regularizer = tf.keras.regularizers.l2(beta)
                                )(input1)
#     cnn = tf.keras.layers.Activation('relu')(cnn)
#     cnn = tf.keras.layers.MaxPool1D(pool_size = 10, strides = 10)(cnn)
#     cnn = tf.keras.layers.Dropout(prob)(cnn)
    
    cnn = tf.keras.layers.Conv1D(filters = 15, 
                                 kernel_size = 50,
                                 strides = 5,
                                 kernel_regularizer = tf.keras.regularizers.l2(beta)
                                )(cnn)
#     cnn = tf.keras.layers.Activation('relu')(cnn)
#     cnn = tf.keras.layers.MaxPool1D(pool_size = 5, strides = 5)(cnn)
#     cnn = tf.keras.layers.Dropout(prob)(cnn)
    
    cnn = tf.keras.layers.Flatten()(cnn)
    
    cnn = tf.keras.layers.GaussianNoise(stddev=0.001)(cnn)
    
    ######################################################################################################
    ########  Classifier  ################################################################################
    ######################################################################################################
    
    y = tf.keras.layers.Dense(dense_decode_units, 
                              kernel_regularizer = tf.keras.regularizers.l2(beta),
                              activation = 'relu'
                             )(cnn)
    y = tf.keras.layers.BatchNormalization()(y)
    y = tf.keras.layers.Dropout(prob)(y)
    
    y = tf.keras.layers.GaussianNoise(stddev=0.001)(y)
    
    y = tf.keras.layers.Dense(int(dense_decode_units/2), 
                              kernel_regularizer = tf.keras.regularizers.l2(beta),
                              activation = 'relu'
                             )(y)
    y = tf.keras.layers.BatchNormalization()(y)
    y = tf.keras.layers.Dropout(prob)(y)
    
    y = tf.keras.layers.GaussianNoise(stddev=0.001)(y)
    
    y = tf.keras.layers.Dense(1, 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
                              activation = 'sigmoid'
                             )(y)

    ## Generate Model from input and output
    model = tf.keras.models.Model(inputs=input1, outputs=y)
    
    ## Compile model
    if(metrics != None):
        model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), 
                      loss = loss, metrics = metrics)
    else:
        model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), 
                      loss = loss)

    return model

In [12]:
DLNN_Classifier((8420,1)).summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 8420, 1)]         0         
                                                                 
 conv1d (Conv1D)             (None, 1665, 15)          1515      
                                                                 
 conv1d_1 (Conv1D)           (None, 324, 15)           11265     
                                                                 
 flatten (Flatten)           (None, 4860)              0         
                                                                 
 gaussian_noise (GaussianNoi  (None, 4860)             0         
 se)                                                             
                                                                 
 dense (Dense)               (None, 128)               622208    
                                                             

# Train data preparation

In [13]:
##################################################################################
##### Read CSV data
##################################################################################

train_data_filepath = os.path.join(input_data_folder, train_data_filename)
train_data = pd.read_csv(train_data_filepath, sep=',', header=0)
train_data = train_data.drop('label', axis=1)

train_data['label'] = pd.Series([int(val.split('_')[-2])
                                 for val in train_data['nameseq']])

train_data = train_data.drop('nameseq', axis=1)

##################################################################################
##### Extract features and labels, create folds
##################################################################################

train_features = np.array(train_data.drop('label', axis=1))
train_features = train_features[:, :, np.newaxis]
train_labels = np.array(train_data['label'])
train_labels = train_labels.reshape((train_labels.shape[0], 1))

input_vec_shape = train_features[0].shape

folds = build_kfold(train_features, train_labels, k=n_fold, shuffle=shuffle, seed=seed)

## Write the k-fold dataset to file
foldPath = os.path.join(outPath, expName, "{}fold".format(n_fold))
if(not os.path.isdir(foldPath)):
    os.makedirs(foldPath)
pickle.dump(folds, open(os.path.join(foldPath, foldName), "wb"))

# Training

In [14]:
##################################################################################
##### For each input file, train model and generate different outputs in a structured folder
##################################################################################

## create the evaluation data structure for all iterations
evaluations = {
    "Fold" : [],
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Train/Test model on all folds, generate evaluations
##################################################################################

## Create and set directory to save model
modelPath = os.path.join(outPath, expName, "{}fold".format(n_fold), "models")
if(not os.path.isdir(modelPath)):
    os.makedirs(modelPath)

i = -1
for fold in folds:
    i += 1
    
    print("\nTrain/Test model on Fold #"+str(i)+".")
    
    model = DLNN_Classifier(input_vec_shape = input_vec_shape)
    
    ## Define the model callbacks for early stopping and saving the model. Then train model
    current_model_path = os.path.join(modelPath, "bestModel-fold{}.hdf5".format(i))
    modelCallbacks = [
        tf.keras.callbacks.ModelCheckpoint(current_model_path,
                                           monitor = monitor, verbose = 1, save_best_only = True, 
                                           save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
    ]
    
    # adding random shuffling of the dataset for training purpose
    index_arr = np.arange(fold["X_train"].shape[0])
    index_arr = np.random.permutation(index_arr)
    
    model.fit(x = fold["X_train"][index_arr], y = fold["y_train"][index_arr], batch_size = batch_size, epochs = epochs, verbose = 1, 
              callbacks = modelCallbacks, validation_data = (fold["X_test"], fold["y_test"]))
    
    model = tf.keras.models.load_model(current_model_path)
    
    ##################################################################################
    ##### Prediction and metrics for TRAIN dataset
    ##################################################################################

    y_pred = model.predict(fold["X_train"])
    label_pred = pred2label(y_pred)
    
    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(fold["y_train"], label_pred)
    prec = precision_score(fold["y_train"],label_pred)
    mcc = matthews_corrcoef(fold["y_train"], label_pred)

    conf = confusion_matrix(fold["y_train"], label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    
    fpr, tpr, thresholds = roc_curve(fold["y_train"], y_pred)
    auc = roc_auc_score(fold["y_train"], y_pred)
    
    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Train")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)
    
    ##################################################################################
    ##### Prediction and metrics for TEST dataset
    ##################################################################################

    y_pred = model.predict(fold["X_test"])
    label_pred = pred2label(y_pred)
    
    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(fold["y_test"], label_pred)
    prec = precision_score(fold["y_test"],label_pred)
    mcc = matthews_corrcoef(fold["y_test"], label_pred)

    conf = confusion_matrix(fold["y_test"], label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    
    fpr, tpr, thresholds = roc_curve(fold["y_test"], y_pred)
    auc = roc_auc_score(fold["y_test"], y_pred)
    
    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Test")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)
    
    del model
    tf.keras.backend.clear_session()


Train/Test model on Fold #0.
Epoch 1/100
Epoch 1: val_loss improved from inf to 1.04454, saving model to Results\NT_Site_PredNTS_Classification_DLNN_Kmer_CNN\5fold\models\bestModel-fold0.hdf5
Epoch 2/100
Epoch 2: val_loss improved from 1.04454 to 1.03804, saving model to Results\NT_Site_PredNTS_Classification_DLNN_Kmer_CNN\5fold\models\bestModel-fold0.hdf5
Epoch 3/100
Epoch 3: val_loss improved from 1.03804 to 1.02966, saving model to Results\NT_Site_PredNTS_Classification_DLNN_Kmer_CNN\5fold\models\bestModel-fold0.hdf5
Epoch 4/100
Epoch 4: val_loss improved from 1.02966 to 1.02061, saving model to Results\NT_Site_PredNTS_Classification_DLNN_Kmer_CNN\5fold\models\bestModel-fold0.hdf5
Epoch 5/100
Epoch 5: val_loss improved from 1.02061 to 1.01014, saving model to Results\NT_Site_PredNTS_Classification_DLNN_Kmer_CNN\5fold\models\bestModel-fold0.hdf5
Epoch 6/100
Epoch 6: val_loss improved from 1.01014 to 0.99912, saving model to Results\NT_Site_PredNTS_Classification_DLNN_Kmer_CNN\5fold\

Epoch 25/100
Epoch 25: val_loss did not improve from 0.74541
Epoch 26/100
Epoch 26: val_loss did not improve from 0.74541
Epoch 27/100
Epoch 27: val_loss did not improve from 0.74541
Epoch 28/100
Epoch 28: val_loss did not improve from 0.74541
Epoch 29/100
Epoch 29: val_loss did not improve from 0.74541
Epoch 30/100
Epoch 30: val_loss did not improve from 0.74541
Epoch 31/100
Epoch 31: val_loss did not improve from 0.74541
Epoch 32/100
Epoch 32: val_loss did not improve from 0.74541
Epoch 33/100
Epoch 33: val_loss did not improve from 0.74541
Epoch 34/100
Epoch 34: val_loss did not improve from 0.74541
Epoch 35/100
Epoch 35: val_loss did not improve from 0.74541
Epoch 36/100
Epoch 36: val_loss did not improve from 0.74541
Epoch 37/100
Epoch 37: val_loss did not improve from 0.74541
Epoch 38/100
Epoch 38: val_loss did not improve from 0.74541
Epoch 39/100
Epoch 39: val_loss did not improve from 0.74541
Epoch 40/100
Epoch 40: val_loss did not improve from 0.74541
Epoch 41/100
Epoch 41: v

Epoch 55/100
Epoch 55: val_loss did not improve from 0.74541
Epoch 56/100
Epoch 56: val_loss did not improve from 0.74541
Epoch 57/100
Epoch 57: val_loss did not improve from 0.74541
Epoch 58/100
Epoch 58: val_loss did not improve from 0.74541
Epoch 59/100
Epoch 59: val_loss did not improve from 0.74541
Epoch 60/100
Epoch 60: val_loss did not improve from 0.74541
Epoch 61/100
Epoch 61: val_loss did not improve from 0.74541
Epoch 62/100
Epoch 62: val_loss did not improve from 0.74541
Epoch 63/100
Epoch 63: val_loss did not improve from 0.74541
Epoch 64/100
Epoch 64: val_loss did not improve from 0.74541
Epoch 65/100
Epoch 65: val_loss did not improve from 0.74541
Epoch 66/100
Epoch 66: val_loss did not improve from 0.74541
Epoch 67/100
Epoch 67: val_loss did not improve from 0.74541
Epoch 68/100
Epoch 68: val_loss did not improve from 0.74541
Epoch 69/100
Epoch 69: val_loss did not improve from 0.74541
Epoch 70/100
Epoch 70: val_loss did not improve from 0.74541
Epoch 71/100
Epoch 71: v

Epoch 85/100
Epoch 85: val_loss did not improve from 0.74541
Epoch 86/100
Epoch 86: val_loss did not improve from 0.74541
Epoch 87/100
Epoch 87: val_loss did not improve from 0.74541
Epoch 88/100
Epoch 88: val_loss did not improve from 0.74541
Epoch 89/100
Epoch 89: val_loss did not improve from 0.74541
Epoch 90/100
Epoch 90: val_loss did not improve from 0.74541
Epoch 91/100
Epoch 91: val_loss did not improve from 0.74541
Epoch 92/100
Epoch 92: val_loss did not improve from 0.74541
Epoch 93/100
Epoch 93: val_loss did not improve from 0.74541
Epoch 94/100
Epoch 94: val_loss did not improve from 0.74541
Epoch 95/100
Epoch 95: val_loss did not improve from 0.74541
Epoch 96/100
Epoch 96: val_loss did not improve from 0.74541
Epoch 97/100
Epoch 97: val_loss did not improve from 0.74541
Epoch 98/100
Epoch 98: val_loss did not improve from 0.74541
Epoch 99/100
Epoch 99: val_loss did not improve from 0.74541
Epoch 100/100
Epoch 100: val_loss did not improve from 0.74541

Train/Test model on F

Epoch 11/100
Epoch 11: val_loss improved from 0.92916 to 0.90231, saving model to Results\NT_Site_PredNTS_Classification_DLNN_Kmer_CNN\5fold\models\bestModel-fold1.hdf5
Epoch 12/100
Epoch 12: val_loss improved from 0.90231 to 0.86430, saving model to Results\NT_Site_PredNTS_Classification_DLNN_Kmer_CNN\5fold\models\bestModel-fold1.hdf5
Epoch 13/100
Epoch 13: val_loss improved from 0.86430 to 0.81869, saving model to Results\NT_Site_PredNTS_Classification_DLNN_Kmer_CNN\5fold\models\bestModel-fold1.hdf5
Epoch 14/100
Epoch 14: val_loss did not improve from 0.81869
Epoch 15/100
Epoch 15: val_loss improved from 0.81869 to 0.74913, saving model to Results\NT_Site_PredNTS_Classification_DLNN_Kmer_CNN\5fold\models\bestModel-fold1.hdf5
Epoch 16/100
Epoch 16: val_loss improved from 0.74913 to 0.73934, saving model to Results\NT_Site_PredNTS_Classification_DLNN_Kmer_CNN\5fold\models\bestModel-fold1.hdf5
Epoch 17/100
Epoch 17: val_loss did not improve from 0.73934
Epoch 18/100
Epoch 18: val_loss i

Epoch 38/100
Epoch 38: val_loss did not improve from 0.61472
Epoch 39/100
Epoch 39: val_loss did not improve from 0.61472
Epoch 40/100
Epoch 40: val_loss did not improve from 0.61472
Epoch 41/100
Epoch 41: val_loss did not improve from 0.61472
Epoch 42/100
Epoch 42: val_loss did not improve from 0.61472
Epoch 43/100
Epoch 43: val_loss did not improve from 0.61472
Epoch 44/100
Epoch 44: val_loss did not improve from 0.61472
Epoch 45/100
Epoch 45: val_loss did not improve from 0.61472
Epoch 46/100
Epoch 46: val_loss did not improve from 0.61472
Epoch 47/100
Epoch 47: val_loss did not improve from 0.61472
Epoch 48/100
Epoch 48: val_loss did not improve from 0.61472
Epoch 49/100
Epoch 49: val_loss did not improve from 0.61472
Epoch 50/100
Epoch 50: val_loss did not improve from 0.61472
Epoch 51/100
Epoch 51: val_loss did not improve from 0.61472
Epoch 52/100
Epoch 52: val_loss did not improve from 0.61472
Epoch 53/100
Epoch 53: val_loss did not improve from 0.61472
Epoch 54/100
Epoch 54: v

Epoch 68/100
Epoch 68: val_loss did not improve from 0.61472
Epoch 69/100
Epoch 69: val_loss did not improve from 0.61472
Epoch 70/100
Epoch 70: val_loss did not improve from 0.61472
Epoch 71/100
Epoch 71: val_loss did not improve from 0.61472
Epoch 72/100
Epoch 72: val_loss did not improve from 0.61472
Epoch 73/100
Epoch 73: val_loss did not improve from 0.61472
Epoch 74/100
Epoch 74: val_loss did not improve from 0.61472
Epoch 75/100
Epoch 75: val_loss did not improve from 0.61472
Epoch 76/100
Epoch 76: val_loss did not improve from 0.61472
Epoch 77/100
Epoch 77: val_loss did not improve from 0.61472
Epoch 78/100
Epoch 78: val_loss did not improve from 0.61472
Epoch 79/100
Epoch 79: val_loss did not improve from 0.61472
Epoch 80/100
Epoch 80: val_loss did not improve from 0.61472
Epoch 81/100
Epoch 81: val_loss did not improve from 0.61472
Epoch 82/100
Epoch 82: val_loss did not improve from 0.61472
Epoch 83/100
Epoch 83: val_loss did not improve from 0.61472
Epoch 84/100
Epoch 84: v

Epoch 97/100
Epoch 97: val_loss did not improve from 0.54634
Epoch 98/100
Epoch 98: val_loss did not improve from 0.54634
Epoch 99/100
Epoch 99: val_loss did not improve from 0.54634
Epoch 100/100
Epoch 100: val_loss did not improve from 0.54634

Train/Test model on Fold #2.
Epoch 1/100
Epoch 1: val_loss improved from inf to 1.04476, saving model to Results\NT_Site_PredNTS_Classification_DLNN_Kmer_CNN\5fold\models\bestModel-fold2.hdf5
Epoch 2/100
Epoch 2: val_loss improved from 1.04476 to 1.03799, saving model to Results\NT_Site_PredNTS_Classification_DLNN_Kmer_CNN\5fold\models\bestModel-fold2.hdf5
Epoch 3/100
Epoch 3: val_loss improved from 1.03799 to 1.02971, saving model to Results\NT_Site_PredNTS_Classification_DLNN_Kmer_CNN\5fold\models\bestModel-fold2.hdf5
Epoch 4/100
Epoch 4: val_loss improved from 1.02971 to 1.02030, saving model to Results\NT_Site_PredNTS_Classification_DLNN_Kmer_CNN\5fold\models\bestModel-fold2.hdf5
Epoch 5/100
Epoch 5: val_loss improved from 1.02030 to 1.010

Epoch 20: val_loss did not improve from 0.74022
Epoch 21/100
Epoch 21: val_loss did not improve from 0.74022
Epoch 22/100
Epoch 22: val_loss did not improve from 0.74022
Epoch 23/100
Epoch 23: val_loss did not improve from 0.74022
Epoch 24/100
Epoch 24: val_loss did not improve from 0.74022
Epoch 25/100
Epoch 25: val_loss did not improve from 0.74022
Epoch 26/100
Epoch 26: val_loss did not improve from 0.74022
Epoch 27/100
Epoch 27: val_loss did not improve from 0.74022
Epoch 28/100
Epoch 28: val_loss did not improve from 0.74022
Epoch 29/100
Epoch 29: val_loss did not improve from 0.74022
Epoch 30/100
Epoch 30: val_loss did not improve from 0.74022
Epoch 31/100
Epoch 31: val_loss did not improve from 0.74022
Epoch 32/100
Epoch 32: val_loss did not improve from 0.74022
Epoch 33/100
Epoch 33: val_loss did not improve from 0.74022
Epoch 34/100
Epoch 34: val_loss did not improve from 0.74022
Epoch 35/100
Epoch 35: val_loss did not improve from 0.74022
Epoch 36/100
Epoch 36: val_loss did n

Epoch 50/100
Epoch 50: val_loss did not improve from 0.74022
Epoch 51/100
Epoch 51: val_loss did not improve from 0.74022
Epoch 52/100
Epoch 52: val_loss did not improve from 0.74022
Epoch 53/100
Epoch 53: val_loss did not improve from 0.74022
Epoch 54/100
Epoch 54: val_loss did not improve from 0.74022
Epoch 55/100
Epoch 55: val_loss did not improve from 0.74022
Epoch 56/100
Epoch 56: val_loss did not improve from 0.74022
Epoch 57/100
Epoch 57: val_loss did not improve from 0.74022
Epoch 58/100
Epoch 58: val_loss did not improve from 0.74022
Epoch 59/100
Epoch 59: val_loss did not improve from 0.74022
Epoch 60/100
Epoch 60: val_loss did not improve from 0.74022
Epoch 61/100
Epoch 61: val_loss did not improve from 0.74022
Epoch 62/100
Epoch 62: val_loss did not improve from 0.74022
Epoch 63/100
Epoch 63: val_loss did not improve from 0.74022
Epoch 64/100
Epoch 64: val_loss did not improve from 0.74022
Epoch 65/100
Epoch 65: val_loss did not improve from 0.74022
Epoch 66/100
Epoch 66: v

Epoch 80/100
Epoch 80: val_loss did not improve from 0.74022
Epoch 81/100
Epoch 81: val_loss did not improve from 0.74022
Epoch 82/100
Epoch 82: val_loss did not improve from 0.74022
Epoch 83/100
Epoch 83: val_loss did not improve from 0.74022
Epoch 84/100
Epoch 84: val_loss did not improve from 0.74022
Epoch 85/100
Epoch 85: val_loss did not improve from 0.74022
Epoch 86/100
Epoch 86: val_loss did not improve from 0.74022
Epoch 87/100
Epoch 87: val_loss did not improve from 0.74022
Epoch 88/100
Epoch 88: val_loss did not improve from 0.74022
Epoch 89/100
Epoch 89: val_loss did not improve from 0.74022
Epoch 90/100
Epoch 90: val_loss did not improve from 0.74022
Epoch 91/100
Epoch 91: val_loss did not improve from 0.74022
Epoch 92/100
Epoch 92: val_loss did not improve from 0.74022
Epoch 93/100
Epoch 93: val_loss did not improve from 0.74022
Epoch 94/100
Epoch 94: val_loss did not improve from 0.74022
Epoch 95/100
Epoch 95: val_loss did not improve from 0.74022
Epoch 96/100
Epoch 96: v

Epoch 8/100
Epoch 8: val_loss improved from 1.00143 to 0.99091, saving model to Results\NT_Site_PredNTS_Classification_DLNN_Kmer_CNN\5fold\models\bestModel-fold3.hdf5
Epoch 9/100
Epoch 9: val_loss improved from 0.99091 to 0.97595, saving model to Results\NT_Site_PredNTS_Classification_DLNN_Kmer_CNN\5fold\models\bestModel-fold3.hdf5
Epoch 10/100
Epoch 10: val_loss improved from 0.97595 to 0.95595, saving model to Results\NT_Site_PredNTS_Classification_DLNN_Kmer_CNN\5fold\models\bestModel-fold3.hdf5
Epoch 11/100
Epoch 11: val_loss improved from 0.95595 to 0.93436, saving model to Results\NT_Site_PredNTS_Classification_DLNN_Kmer_CNN\5fold\models\bestModel-fold3.hdf5
Epoch 12/100
Epoch 12: val_loss improved from 0.93436 to 0.90545, saving model to Results\NT_Site_PredNTS_Classification_DLNN_Kmer_CNN\5fold\models\bestModel-fold3.hdf5
Epoch 13/100
Epoch 13: val_loss improved from 0.90545 to 0.87744, saving model to Results\NT_Site_PredNTS_Classification_DLNN_Kmer_CNN\5fold\models\bestModel-f

Epoch 34/100
Epoch 34: val_loss did not improve from 0.75117
Epoch 35/100
Epoch 35: val_loss did not improve from 0.75117
Epoch 36/100
Epoch 36: val_loss did not improve from 0.75117
Epoch 37/100
Epoch 37: val_loss did not improve from 0.75117
Epoch 38/100
Epoch 38: val_loss did not improve from 0.75117
Epoch 39/100
Epoch 39: val_loss did not improve from 0.75117
Epoch 40/100
Epoch 40: val_loss did not improve from 0.75117
Epoch 41/100
Epoch 41: val_loss did not improve from 0.75117
Epoch 42/100
Epoch 42: val_loss did not improve from 0.75117
Epoch 43/100
Epoch 43: val_loss did not improve from 0.75117
Epoch 44/100
Epoch 44: val_loss did not improve from 0.75117
Epoch 45/100
Epoch 45: val_loss did not improve from 0.75117
Epoch 46/100
Epoch 46: val_loss did not improve from 0.75117
Epoch 47/100
Epoch 47: val_loss did not improve from 0.75117
Epoch 48/100
Epoch 48: val_loss did not improve from 0.75117
Epoch 49/100
Epoch 49: val_loss did not improve from 0.75117
Epoch 50/100
Epoch 50: v

Epoch 64/100
Epoch 64: val_loss did not improve from 0.75117
Epoch 65/100
Epoch 65: val_loss did not improve from 0.75117
Epoch 66/100
Epoch 66: val_loss did not improve from 0.75117
Epoch 67/100
Epoch 67: val_loss did not improve from 0.75117
Epoch 68/100
Epoch 68: val_loss did not improve from 0.75117
Epoch 69/100
Epoch 69: val_loss did not improve from 0.75117
Epoch 70/100
Epoch 70: val_loss did not improve from 0.75117
Epoch 71/100
Epoch 71: val_loss did not improve from 0.75117
Epoch 72/100
Epoch 72: val_loss did not improve from 0.75117
Epoch 73/100
Epoch 73: val_loss did not improve from 0.75117
Epoch 74/100
Epoch 74: val_loss did not improve from 0.75117
Epoch 75/100
Epoch 75: val_loss did not improve from 0.75117
Epoch 76/100
Epoch 76: val_loss did not improve from 0.75117
Epoch 77/100
Epoch 77: val_loss did not improve from 0.75117
Epoch 78/100
Epoch 78: val_loss did not improve from 0.75117
Epoch 79/100
Epoch 79: val_loss did not improve from 0.75117
Epoch 80/100
Epoch 80: v

Epoch 94/100
Epoch 94: val_loss did not improve from 0.75117
Epoch 95/100
Epoch 95: val_loss did not improve from 0.75117
Epoch 96/100
Epoch 96: val_loss did not improve from 0.75117
Epoch 97/100
Epoch 97: val_loss did not improve from 0.75117
Epoch 98/100
Epoch 98: val_loss did not improve from 0.75117
Epoch 99/100
Epoch 99: val_loss did not improve from 0.75117
Epoch 100/100
Epoch 100: val_loss did not improve from 0.75117

Train/Test model on Fold #4.
Epoch 1/100
Epoch 1: val_loss improved from inf to 1.04568, saving model to Results\NT_Site_PredNTS_Classification_DLNN_Kmer_CNN\5fold\models\bestModel-fold4.hdf5
Epoch 2/100
Epoch 2: val_loss improved from 1.04568 to 1.03878, saving model to Results\NT_Site_PredNTS_Classification_DLNN_Kmer_CNN\5fold\models\bestModel-fold4.hdf5
Epoch 3/100
Epoch 3: val_loss improved from 1.03878 to 1.02983, saving model to Results\NT_Site_PredNTS_Classification_DLNN_Kmer_CNN\5fold\models\bestModel-fold4.hdf5
Epoch 4/100
Epoch 4: val_loss improved from 

Epoch 17: val_loss improved from 0.79192 to 0.72288, saving model to Results\NT_Site_PredNTS_Classification_DLNN_Kmer_CNN\5fold\models\bestModel-fold4.hdf5
Epoch 18/100
Epoch 18: val_loss improved from 0.72288 to 0.69782, saving model to Results\NT_Site_PredNTS_Classification_DLNN_Kmer_CNN\5fold\models\bestModel-fold4.hdf5
Epoch 19/100
Epoch 19: val_loss did not improve from 0.69782
Epoch 20/100
Epoch 20: val_loss did not improve from 0.69782
Epoch 21/100
Epoch 21: val_loss did not improve from 0.69782
Epoch 22/100
Epoch 22: val_loss did not improve from 0.69782
Epoch 23/100
Epoch 23: val_loss did not improve from 0.69782
Epoch 24/100
Epoch 24: val_loss did not improve from 0.69782
Epoch 25/100
Epoch 25: val_loss did not improve from 0.69782
Epoch 26/100
Epoch 26: val_loss did not improve from 0.69782
Epoch 27/100
Epoch 27: val_loss did not improve from 0.69782
Epoch 28/100
Epoch 28: val_loss did not improve from 0.69782
Epoch 29/100
Epoch 29: val_loss did not improve from 0.69782
Epoc

Epoch 46: val_loss did not improve from 0.69782
Epoch 47/100
Epoch 47: val_loss did not improve from 0.69782
Epoch 48/100
Epoch 48: val_loss did not improve from 0.69782
Epoch 49/100
Epoch 49: val_loss did not improve from 0.69782
Epoch 50/100
Epoch 50: val_loss did not improve from 0.69782
Epoch 51/100
Epoch 51: val_loss did not improve from 0.69782
Epoch 52/100
Epoch 52: val_loss did not improve from 0.69782
Epoch 53/100
Epoch 53: val_loss did not improve from 0.69782
Epoch 54/100
Epoch 54: val_loss did not improve from 0.69782
Epoch 55/100
Epoch 55: val_loss did not improve from 0.69782
Epoch 56/100
Epoch 56: val_loss did not improve from 0.69782
Epoch 57/100
Epoch 57: val_loss did not improve from 0.69782
Epoch 58/100
Epoch 58: val_loss did not improve from 0.69782
Epoch 59/100
Epoch 59: val_loss did not improve from 0.69782
Epoch 60/100
Epoch 60: val_loss did not improve from 0.69782
Epoch 61/100
Epoch 61: val_loss did not improve from 0.69782
Epoch 62/100
Epoch 62: val_loss did n

Epoch 76/100
Epoch 76: val_loss did not improve from 0.69782
Epoch 77/100
Epoch 77: val_loss did not improve from 0.69782
Epoch 78/100
Epoch 78: val_loss did not improve from 0.69782
Epoch 79/100
Epoch 79: val_loss did not improve from 0.69782
Epoch 80/100
Epoch 80: val_loss did not improve from 0.69782
Epoch 81/100
Epoch 81: val_loss did not improve from 0.69782
Epoch 82/100
Epoch 82: val_loss did not improve from 0.69782
Epoch 83/100
Epoch 83: val_loss did not improve from 0.69782
Epoch 84/100
Epoch 84: val_loss did not improve from 0.69782
Epoch 85/100
Epoch 85: val_loss did not improve from 0.69782
Epoch 86/100
Epoch 86: val_loss did not improve from 0.69782
Epoch 87/100
Epoch 87: val_loss did not improve from 0.69782
Epoch 88/100
Epoch 88: val_loss did not improve from 0.69782
Epoch 89/100
Epoch 89: val_loss did not improve from 0.69782
Epoch 90/100
Epoch 90: val_loss did not improve from 0.69782
Epoch 91/100
Epoch 91: val_loss did not improve from 0.69782
Epoch 92/100
Epoch 92: v

## k-fold Training evaluation

In [15]:
evaluations_df = pd.DataFrame.from_dict(evaluations)

evaluations_df_grouped = evaluations_df.groupby(["Train_Test"]).mean().filter(['Accuracy', 
                                                                               'Precision', 
                                                                               'AUC', 
                                                                               'Sensitivity', 
                                                                               'Specificity', 
                                                                               'MCC'])

evaluations_df_grouped

Unnamed: 0_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Train_Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Test,0.76993,0.779469,0.856647,0.757322,0.782515,0.542609
Train,0.983104,0.986742,0.999007,0.979643,0.986568,0.966619


In [16]:
# 	Accuracy	Precision	AUC	Sensitivity	Specificity	MCC
# Train_Test						
# Test	0.748948	0.744609	0.823913	0.758201	0.739696	0.498256
# Train	0.817065	0.810864	0.895026	0.827457	0.806676	0.634837

In [17]:
evaluations_df[evaluations_df["Train_Test"] == 'Test']

Unnamed: 0,Fold,Train_Test,Accuracy,Precision,TPR,FPR,TPR_FPR_Thresholds,AUC,Sensitivity,Specificity,MCC
1,0,Test,0.754717,0.740157,"[0.0, 0.0041841004184100415, 0.025104602510460...","[0.0, 0.0, 0.0, 0.004201680672268907, 0.004201...","[1.9976615, 0.9976615, 0.97916555, 0.9785064, ...",0.835686,0.786611,0.722689,0.510378
3,1,Test,0.811321,0.808333,"[0.0, 0.004201680672268907, 0.0126050420168067...","[0.0, 0.0, 0.0, 0.0, 0.008368200836820083, 0.0...","[2.0, 1.0, 0.9999927, 0.99484164, 0.99316144, ...",0.892804,0.815126,0.807531,0.622668
5,2,Test,0.752101,0.819149,"[0.0, 0.004201680672268907, 0.1512605042016806...","[0.0, 0.0, 0.0, 0.004201680672268907, 0.004201...","[1.9914165, 0.9914164, 0.9158379, 0.91386926, ...",0.852985,0.647059,0.857143,0.515711
7,3,Test,0.762605,0.749004,"[0.0, 0.004201680672268907, 0.1134453781512605...","[0.0, 0.0, 0.0, 0.004201680672268907, 0.004201...","[1.9731324, 0.9731323, 0.89994794, 0.8992969, ...",0.840777,0.789916,0.735294,0.525995
9,4,Test,0.768908,0.780702,"[0.0, 0.004201680672268907, 0.1092436974789916...","[0.0, 0.0, 0.0, 0.004201680672268907, 0.004201...","[1.9904468, 0.9904468, 0.92771536, 0.9271604, ...",0.860983,0.747899,0.789916,0.53829


In [18]:
evaluations_df

Unnamed: 0,Fold,Train_Test,Accuracy,Precision,TPR,FPR,TPR_FPR_Thresholds,AUC,Sensitivity,Specificity,MCC
0,0,Train,0.983202,0.970348,"[0.0, 0.0010504201680672268, 0.319327731092436...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.9977503, 0.99775034, 0.9255025, 0.9249926, ...",0.999376,0.996849,0.96957,0.966765
1,0,Test,0.754717,0.740157,"[0.0, 0.0041841004184100415, 0.025104602510460...","[0.0, 0.0, 0.0, 0.004201680672268907, 0.004201...","[1.9976615, 0.9976615, 0.97916555, 0.9785064, ...",0.835686,0.786611,0.722689,0.510378
2,1,Train,0.995276,0.995798,"[0.0, 0.001049317943336831, 0.323189926547744,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.9999986, 0.99999857, 0.99046844, 0.99044305...",0.999408,0.994753,0.995798,0.990552
3,1,Test,0.811321,0.808333,"[0.0, 0.004201680672268907, 0.0126050420168067...","[0.0, 0.0, 0.0, 0.0, 0.008368200836820083, 0.0...","[2.0, 1.0, 0.9999927, 0.99484164, 0.99316144, ...",0.892804,0.815126,0.807531,0.622668
4,2,Train,0.970619,0.998888,"[0.0, 0.001049317943336831, 0.0766002098635886...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.9966245, 0.99662447, 0.94983214, 0.9496415,...",0.999436,0.942288,0.998951,0.942753
5,2,Test,0.752101,0.819149,"[0.0, 0.004201680672268907, 0.1512605042016806...","[0.0, 0.0, 0.0, 0.004201680672268907, 0.004201...","[1.9914165, 0.9914164, 0.9158379, 0.91386926, ...",0.852985,0.647059,0.857143,0.515711
6,3,Train,0.981637,0.976141,"[0.0, 0.001049317943336831, 0.2675760755508919...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.9929612, 0.9929611, 0.87987566, 0.8795663, ...",0.998765,0.987408,0.975866,0.963338
7,3,Test,0.762605,0.749004,"[0.0, 0.004201680672268907, 0.1134453781512605...","[0.0, 0.0, 0.0, 0.004201680672268907, 0.004201...","[1.9731324, 0.9731323, 0.89994794, 0.8992969, ...",0.840777,0.789916,0.735294,0.525995
8,4,Train,0.984785,0.992537,"[0.0, 0.001049317943336831, 0.1353620146904512...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.997844, 0.997844, 0.93598413, 0.93540037, 0...",0.998049,0.976915,0.992655,0.96969
9,4,Test,0.768908,0.780702,"[0.0, 0.004201680672268907, 0.1092436974789916...","[0.0, 0.0, 0.0, 0.004201680672268907, 0.004201...","[1.9904468, 0.9904468, 0.92771536, 0.9271604, ...",0.860983,0.747899,0.789916,0.53829


# Independent data

In [19]:
##################################################################################
##### Read CSV data
##################################################################################

indpe_data_filepath = os.path.join(input_data_folder, indpe_data_filename)
indpe_data = pd.read_csv(indpe_data_filepath, sep=',', header=0)
indpe_data = indpe_data.drop('label', axis=1)

indpe_data['label'] = pd.Series([int(val.split('_')[-2])
                                 for val in indpe_data['nameseq']])

indpe_data = indpe_data.drop('nameseq', axis=1)

##################################################################################
##### Extract features and labels, create folds
##################################################################################

indpe_features = np.array(indpe_data.drop('label', axis=1))
indpe_features = indpe_features[:, :, np.newaxis]
indpe_labels = np.array(indpe_data['label'])
indpe_labels = indpe_labels.reshape((indpe_labels.shape[0], 1))

## Using k-fold Models

### Performance of each k-fold model

In [20]:
## create the evaluation data structure for all iterations
evaluations = {
    "Fold" : [],
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Prediction and metrics for Independent dataset
##################################################################################

for i in range(n_fold):
    
    current_model_path = os.path.join(modelPath, "bestModel-fold{}.hdf5".format(i))
    model = tf.keras.models.load_model(current_model_path)

    y_pred = model.predict(indpe_features)
    label_pred = pred2label(y_pred)

    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(indpe_labels, label_pred)
    prec = precision_score(indpe_labels,label_pred)
    mcc = matthews_corrcoef(indpe_labels, label_pred)

    conf = confusion_matrix(indpe_labels, label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    
    fpr, tpr, thresholds = roc_curve(indpe_labels, y_pred)
    auc = roc_auc_score(indpe_labels, y_pred)

    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Independent")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)
    
    del model
    tf.keras.backend.clear_session()

##################################################################################

evaluations_df = pd.DataFrame.from_dict(evaluations)

evaluations_df_grouped = evaluations_df.groupby(["Train_Test"]).mean().filter(['Accuracy', 
                                                                               'Precision', 
                                                                               'AUC', 
                                                                               'Sensitivity', 
                                                                               'Specificity', 
                                                                               'MCC'])

evaluations_df_grouped

Unnamed: 0_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Train_Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Independent,0.579918,0.222653,0.633278,0.608867,0.574168,0.13783


In [21]:
evaluations_df

Unnamed: 0,Fold,Train_Test,Accuracy,Precision,TPR,FPR,TPR_FPR_Thresholds,AUC,Sensitivity,Specificity,MCC
0,0,Independent,0.542857,0.218009,"[0.0, 0.0049261083743842365, 0.009852216748768...","[0.0, 0.0, 0.0, 0.0009784735812133072, 0.00097...","[1.9940883, 0.9940883, 0.99345136, 0.9931758, ...",0.631299,0.679803,0.515656,0.145434
1,1,Independent,0.532245,0.202572,"[0.0, 0.0, 0.0049261083743842365, 0.0049261083...","[0.0, 0.0009784735812133072, 0.000978473581213...","[1.9999995, 0.9999995, 0.99999285, 0.99998796,...",0.617624,0.62069,0.514677,0.100677
2,2,Independent,0.653061,0.238208,"[0.0, 0.0049261083743842365, 0.004926108374384...","[0.0, 0.0, 0.0009784735812133072, 0.0009784735...","[1.9949415, 0.99494153, 0.99180156, 0.98789155...",0.639146,0.497537,0.683953,0.141849
3,3,Independent,0.567347,0.220513,"[0.0, 0.0049261083743842365, 0.009852216748768...","[0.0, 0.0, 0.0, 0.0029354207436399216, 0.00293...","[1.9888995, 0.9888994, 0.9818885, 0.96961826, ...",0.636316,0.635468,0.553816,0.140903
4,4,Independent,0.604082,0.233962,"[0.0, 0.0049261083743842365, 0.024630541871921...","[0.0, 0.0, 0.0, 0.0009784735812133072, 0.00097...","[1.9911834, 0.9911834, 0.983288, 0.9774393, 0....",0.642004,0.610837,0.60274,0.160287


### Mean score with k-fold models

In [22]:
## create the evaluation data structure for all iterations
evaluations = {
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Prediction and metrics for Independent dataset
##################################################################################

total_pred = np.zeros(indpe_labels.shape)
all_preds = []

for i in range(n_fold):
    
    current_model_path = os.path.join(modelPath, "bestModel-fold{}.hdf5".format(i))
    model = tf.keras.models.load_model(current_model_path)

    y_pred = model.predict(indpe_features)
    total_pred += y_pred
    all_preds.append(y_pred)
    
    del model
    tf.keras.backend.clear_session()
    
total_pred = total_pred / n_fold
label_pred = pred2label(total_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(indpe_labels, label_pred)
prec = precision_score(indpe_labels,label_pred)
mcc = matthews_corrcoef(indpe_labels, label_pred)

conf = confusion_matrix(indpe_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(indpe_labels, total_pred)
auc = roc_auc_score(indpe_labels, total_pred)

evaluations["Train_Test"].append("Independent")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################

evaluations_df = pd.DataFrame.from_dict(evaluations)

evaluations_df_grouped = evaluations_df.groupby(["Train_Test"]).mean().filter(['Accuracy', 
                                                                               'Precision', 
                                                                               'AUC', 
                                                                               'Sensitivity', 
                                                                               'Specificity', 
                                                                               'MCC'])

evaluations_df_grouped

Unnamed: 0_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Train_Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Independent,0.595918,0.224528,0.64371,0.586207,0.597847,0.13813


### Voting score with k-fold models

In [23]:
## create the evaluation data structure for all iterations
evaluations = {
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Prediction and metrics for Independent dataset
##################################################################################

total_pred = np.zeros(indpe_labels.shape)
all_preds = []

for i in range(n_fold):
    
    current_model_path = os.path.join(modelPath, "bestModel-fold{}.hdf5".format(i))
    model = tf.keras.models.load_model(current_model_path)

    y_pred = model.predict(indpe_features)
    vote_pred = pred2label(y_pred)
    total_pred += vote_pred
    all_preds.append(vote_pred)
    
    del model
    tf.keras.backend.clear_session()
    
total_pred = total_pred / n_fold
label_pred = pred2label(total_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(indpe_labels, label_pred)
prec = precision_score(indpe_labels,label_pred)
mcc = matthews_corrcoef(indpe_labels, label_pred)

conf = confusion_matrix(indpe_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(indpe_labels, total_pred)
auc = roc_auc_score(indpe_labels, total_pred)

evaluations["Train_Test"].append("Independent")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################

evaluations_df = pd.DataFrame.from_dict(evaluations)

evaluations_df_grouped = evaluations_df.groupby(["Train_Test"]).mean().filter(['Accuracy', 
                                                                               'Precision', 
                                                                               'AUC', 
                                                                               'Sensitivity', 
                                                                               'Specificity', 
                                                                               'MCC'])

evaluations_df_grouped

Unnamed: 0_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Train_Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Independent,0.597551,0.226415,0.631017,0.591133,0.598826,0.142562


## Using New Model

Train model on full data from training. Predict and evaluate on Independent data.

In [24]:
model = DLNN_Classifier(input_vec_shape = input_vec_shape)
    
## Define the model callbacks for early stopping and saving the model. Then train model
current_model_path = os.path.join(modelPath, "_fullModel.hdf5")
modelCallbacks = [
    tf.keras.callbacks.ModelCheckpoint(current_model_path,
                                       monitor = monitor, verbose = 1, save_best_only = True, 
                                       save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
]

# adding random shuffling of the dataset for training purpose
index_arr = np.arange(train_features.shape[0])
index_arr = np.random.permutation(index_arr)

model.fit(x = train_features[index_arr], y = train_labels[index_arr], batch_size = batch_size, epochs = epochs, verbose = 1, 
          callbacks = modelCallbacks, validation_data = (indpe_features, indpe_labels))
# model.fit(x = train_features[index_arr], y = train_labels[index_arr], batch_size = batch_size, epochs = epochs, verbose = 1, 
#           callbacks = modelCallbacks, validation_split = 0.2)

del model
tf.keras.backend.clear_session()

model = tf.keras.models.load_model(current_model_path)

Epoch 1/100
Epoch 1: val_loss improved from inf to 1.05239, saving model to Results\NT_Site_PredNTS_Classification_DLNN_Kmer_CNN\5fold\models\_fullModel.hdf5
Epoch 2/100
Epoch 2: val_loss improved from 1.05239 to 1.04584, saving model to Results\NT_Site_PredNTS_Classification_DLNN_Kmer_CNN\5fold\models\_fullModel.hdf5
Epoch 3/100
Epoch 3: val_loss improved from 1.04584 to 1.03403, saving model to Results\NT_Site_PredNTS_Classification_DLNN_Kmer_CNN\5fold\models\_fullModel.hdf5
Epoch 4/100
Epoch 4: val_loss improved from 1.03403 to 1.02341, saving model to Results\NT_Site_PredNTS_Classification_DLNN_Kmer_CNN\5fold\models\_fullModel.hdf5
Epoch 5/100
Epoch 5: val_loss improved from 1.02341 to 1.00181, saving model to Results\NT_Site_PredNTS_Classification_DLNN_Kmer_CNN\5fold\models\_fullModel.hdf5
Epoch 6/100
Epoch 6: val_loss improved from 1.00181 to 0.98956, saving model to Results\NT_Site_PredNTS_Classification_DLNN_Kmer_CNN\5fold\models\_fullModel.hdf5
Epoch 7/100
Epoch 7: val_loss im

Epoch 28/100
Epoch 28: val_loss did not improve from 0.91188
Epoch 29/100
Epoch 29: val_loss did not improve from 0.91188
Epoch 30/100
Epoch 30: val_loss did not improve from 0.91188
Epoch 31/100
Epoch 31: val_loss did not improve from 0.91188
Epoch 32/100
Epoch 32: val_loss did not improve from 0.91188
Epoch 33/100
Epoch 33: val_loss did not improve from 0.91188
Epoch 34/100
Epoch 34: val_loss did not improve from 0.91188
Epoch 35/100
Epoch 35: val_loss did not improve from 0.91188
Epoch 36/100
Epoch 36: val_loss did not improve from 0.91188
Epoch 37/100
Epoch 37: val_loss did not improve from 0.91188
Epoch 38/100
Epoch 38: val_loss did not improve from 0.91188
Epoch 39/100
Epoch 39: val_loss did not improve from 0.91188
Epoch 40/100
Epoch 40: val_loss did not improve from 0.91188
Epoch 41/100
Epoch 41: val_loss did not improve from 0.91188
Epoch 42/100
Epoch 42: val_loss did not improve from 0.91188
Epoch 43/100
Epoch 43: val_loss did not improve from 0.91188
Epoch 44/100
Epoch 44: v

Epoch 58/100
Epoch 58: val_loss did not improve from 0.91188
Epoch 59/100
Epoch 59: val_loss did not improve from 0.91188
Epoch 60/100
Epoch 60: val_loss did not improve from 0.91188
Epoch 61/100
Epoch 61: val_loss did not improve from 0.91188
Epoch 62/100
Epoch 62: val_loss did not improve from 0.91188
Epoch 63/100
Epoch 63: val_loss did not improve from 0.91188
Epoch 64/100
Epoch 64: val_loss did not improve from 0.91188
Epoch 65/100
Epoch 65: val_loss did not improve from 0.91188
Epoch 66/100
Epoch 66: val_loss did not improve from 0.91188
Epoch 67/100
Epoch 67: val_loss did not improve from 0.91188
Epoch 68/100
Epoch 68: val_loss did not improve from 0.91188
Epoch 69/100
Epoch 69: val_loss did not improve from 0.91188
Epoch 70/100
Epoch 70: val_loss did not improve from 0.91188
Epoch 71/100
Epoch 71: val_loss did not improve from 0.91188
Epoch 72/100
Epoch 72: val_loss did not improve from 0.91188
Epoch 73/100
Epoch 73: val_loss did not improve from 0.91188
Epoch 74/100
Epoch 74: v

Epoch 88/100
Epoch 88: val_loss did not improve from 0.91188
Epoch 89/100
Epoch 89: val_loss did not improve from 0.91188
Epoch 90/100
Epoch 90: val_loss did not improve from 0.91188
Epoch 91/100
Epoch 91: val_loss did not improve from 0.91188
Epoch 92/100
Epoch 92: val_loss did not improve from 0.91188
Epoch 93/100
Epoch 93: val_loss did not improve from 0.91188
Epoch 94/100
Epoch 94: val_loss did not improve from 0.91188
Epoch 95/100
Epoch 95: val_loss did not improve from 0.91188
Epoch 96/100
Epoch 96: val_loss did not improve from 0.91188
Epoch 97/100
Epoch 97: val_loss did not improve from 0.91188
Epoch 98/100
Epoch 98: val_loss did not improve from 0.91188
Epoch 99/100
Epoch 99: val_loss did not improve from 0.91188
Epoch 100/100
Epoch 100: val_loss did not improve from 0.91188


In [25]:
## create the evaluation data structure for all iterations
evaluations = {
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Prediction and metrics for Independent dataset
##################################################################################

y_pred = model.predict(indpe_features)
label_pred = pred2label(y_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(indpe_labels, label_pred)
prec = precision_score(indpe_labels,label_pred)
mcc = matthews_corrcoef(indpe_labels, label_pred)

conf = confusion_matrix(indpe_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(indpe_labels, y_pred)
auc = roc_auc_score(indpe_labels, y_pred)

evaluations["Train_Test"].append("Independent")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

del model
tf.keras.backend.clear_session()

##################################################################################

evaluations_df = pd.DataFrame.from_dict(evaluations)

evaluations_df_grouped = evaluations_df.groupby(["Train_Test"]).mean().filter(['Accuracy', 
                                                                               'Precision', 
                                                                               'AUC', 
                                                                               'Sensitivity', 
                                                                               'Specificity', 
                                                                               'MCC'])

evaluations_df_grouped

Unnamed: 0_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Train_Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Independent,0.678367,0.270983,0.66527,0.55665,0.702544,0.203388


In [26]:
# 	Accuracy	Precision	AUC	Sensitivity	Specificity	MCC
# Train_Test						
# Independent	0.772245	0.324074	0.65458	0.344828	0.857143	0.197056

In [27]:
print(classification_report(indpe_labels, np.round(y_pred).astype(int)))

              precision    recall  f1-score   support

           0       0.89      0.70      0.78      1022
           1       0.27      0.56      0.36       203

    accuracy                           0.68      1225
   macro avg       0.58      0.63      0.57      1225
weighted avg       0.79      0.68      0.72      1225



In [28]:
# Model: "model"
# _________________________________________________________________
#  Layer (type)                Output Shape              Param #   
# =================================================================
#  input_1 (InputLayer)        [(None, 8420, 1)]         0         
                                                                 
#  conv1d (Conv1D)             (None, 833, 25)           2525      
                                                                 
#  activation (Activation)     (None, 833, 25)           0         
                                                                 
#  conv1d_1 (Conv1D)           (None, 79, 25)            31275     
                                                                 
#  activation_1 (Activation)   (None, 79, 25)            0         
                                                                 
#  dropout (Dropout)           (None, 79, 25)            0         
                                                                 
#  flatten (Flatten)           (None, 1975)              0         
                                                                 
#  dense (Dense)               (None, 25)                49400     
                                                                 
#  batch_normalization (BatchN  (None, 25)               100       
#  ormalization)                                                   
                                                                 
#  dropout_1 (Dropout)         (None, 25)                0         
                                                                 
#  dense_1 (Dense)             (None, 1)                 26        
                                                                 
# =================================================================
# Total params: 83,326
# Trainable params: 83,276
# Non-trainable params: 50
# _________________________________________________________________