In [1]:
##################################################################################
##### Define all parameters for model tuning
##################################################################################

# Number of stratified folds; also used to name the "<n_fold>fold" output directory.
n_fold = 5
# Experiment name; becomes a sub-directory of outPath.
expName = "NT_Site_PredNTS_Classification_DLNN_CORENup_v3"
# Root directory for everything the notebook writes (fold pickle, model checkpoints).
outPath = "Results"
# File name of the pickled list of folds.
foldName = "folds.pickle"

# Training settings passed to model.fit().
epochs = 50
batch_size = 64
# Passed to build_kfold(), which forwards them to StratifiedKFold.
shuffle = True
# None means the fold split is not seeded, so it differs between runs.
seed = None

# Input data location: folder plus the training and independent dataset file names.
input_data_folder = "Data"
training_data_file = "Training-datasets-PredNTS.txt"
independent_data_file = "independent dataset-PredNTS.txt"

In [2]:
import os 
import pickle
import numpy as np
import pandas as pd

import tensorflow as tf

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, confusion_matrix
from sklearn.metrics import roc_auc_score, matthews_corrcoef

import math

In [3]:
# Allocate GPU memory on demand instead of reserving it all up front.
# Iterating over the device list keeps this cell working on CPU-only machines
# (empty list) and applies the same setting to every GPU when several exist.
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [4]:
##################################################################################
##### define all CUSTOM functions
##################################################################################

def one_hot_encode_nt(sequence, char_dict):
    """One-hot encode a sequence, one row per character.

    Args:
        sequence: string to encode; each character is upper-cased before lookup.
        char_dict: mapping from character to its column index.

    Returns:
        A float array of shape (len(sequence), len(char_dict)) holding a single 1
        per row, in the column assigned to that row's character.

    Raises:
        ValueError: if an upper-cased character is not a key of ``char_dict``.
    """
    encoded = np.zeros((len(sequence), len(char_dict)))

    for position, symbol in enumerate(sequence):
        key = symbol.upper()
        if key not in char_dict:
            raise ValueError('Incorrect character in NT sequence: '+sequence)
        encoded[position, char_dict[key]] = 1

    return encoded

In [5]:
##################################################################################
##### Build k-fold functions
##################################################################################

## Build the K-fold from dataset
def build_kfold(features, labels, k=10, shuffle=False, seed=None):
    """Split a dataset into ``k`` stratified train/test folds.

    Args:
        features: array of samples, indexable by an integer index array.
        labels: array of class labels aligned with ``features``.
        k: number of folds.
        shuffle: whether StratifiedKFold shuffles the samples before splitting.
        seed: random state for the shuffle; ignored when ``shuffle`` is False.

    Returns:
        A list of ``k`` dicts with the keys "X_train", "X_test", "y_train"
        and "y_test".
    """
    # StratifiedKFold raises ValueError when a random_state is given without
    # shuffling, so only forward the seed when it can have an effect.
    skf = StratifiedKFold(n_splits=k, shuffle=shuffle,
                          random_state=seed if shuffle else None)
    return [
        {
            "X_train": features[train_index],
            "X_test": features[test_index],
            "y_train": labels[train_index],
            "y_test": labels[test_index],
        }
        for train_index, test_index in skf.split(features, labels)
    ]

In [6]:
##################################################################################
##### define evaluator functions
##################################################################################

def pred2label(y_pred):
    """Turn predicted scores into hard labels by rounding to the nearest integer.

    NumPy rounds exact halves to the nearest even value, so a score of exactly
    0.5 becomes 0.
    """
    return np.round(y_pred)

In [7]:
##################################################################################
##### Function to customize the DLNN architecture with parameters
##################################################################################

def DLNN_CORENup(input_seq_shape = (41, 21),
                 conv_filters_per_layer = 10, kernel_length = (10, 5), conv_strides = 1,
                 max_pool_width = 3, max_pool_stride = 3, ## Max-pool parameters (applied after every conv layer)
                 rnn_decode_units = 25, ## GRU layer parameters
                 dense_decode_units = 128, ## Dense layer parameters
                 prob = 0.5, learn_rate = 0.001,
                 loss = 'binary_crossentropy', metrics = None,
                 beta = 0.001):
    """Build and compile the two-branch (convolutional + recurrent) classifier.

    Both branches read the same input. Each ends in a single sigmoid unit; the
    two branch scores are concatenated and fed to a final single-unit sigmoid
    layer that produces the model output.

    Args:
        input_seq_shape: shape of one encoded sequence (positions, channels).
        conv_filters_per_layer: number of filters in every Conv1D layer.
        kernel_length: kernel sizes, one Conv1D layer per entry; must not be empty.
        conv_strides: stride of every Conv1D layer.
        max_pool_width: pool size of the MaxPool1D following each Conv1D layer.
        max_pool_stride: stride of those MaxPool1D layers.
        rnn_decode_units: number of GRU units.
        dense_decode_units: width of the hidden Dense layer in each branch head.
        prob: dropout rate used in each branch head.
        learn_rate: learning rate of the Adam optimizer.
        loss: loss passed to ``model.compile``.
        metrics: metrics passed to ``model.compile`` (None for no extra metrics).
        beta: L2 regularisation factor applied to every kernel.

    Returns:
        The compiled ``tf.keras`` model.

    Raises:
        ValueError: if ``kernel_length`` is empty.
    """
    if len(kernel_length) == 0:
        raise ValueError("kernel_length must contain at least one kernel size")

    def l2_penalty():
        # A fresh regularizer instance per layer, as in the original layout.
        return tf.keras.regularizers.l2(beta)

    def score_head(branch):
        """Dense(relu) -> Dropout -> Dense(1, sigmoid) head shared by both branches."""
        branch = tf.keras.layers.Dense(dense_decode_units,
                                       kernel_regularizer = l2_penalty(),
                                       activation = 'relu')(branch)
        branch = tf.keras.layers.Dropout(prob)(branch)
        return tf.keras.layers.Dense(1,
                                     kernel_regularizer = l2_penalty(),
                                     activation = 'sigmoid')(branch)

    ######################################################################################################
    ########  SEQUENCE input  ############################################################################
    ######################################################################################################

    input1 = tf.keras.layers.Input(shape=input_seq_shape)

    ######################################################################################################
    ########  CONV Path  #################################################################################
    ######################################################################################################

    ## Stack of Conv1D -> ReLU -> MaxPool1D, one stage per kernel size
    conv = input1
    for kernel_size in kernel_length:
        conv = tf.keras.layers.Conv1D(conv_filters_per_layer, kernel_size,
                                      strides = conv_strides,
                                      kernel_regularizer = l2_penalty())(conv)
        conv = tf.keras.layers.Activation('relu')(conv)
        conv = tf.keras.layers.MaxPool1D(pool_size = max_pool_width, strides = max_pool_stride)(conv)

    conv = tf.keras.layers.Flatten()(conv)

    ## Conv scorer
    conv = score_head(conv)

    ######################################################################################################
    ########  RNN Path  ##################################################################################
    ######################################################################################################

    rnn = tf.keras.layers.GRU(rnn_decode_units, return_sequences = True,
                              kernel_regularizer = l2_penalty())(input1)
    rnn = tf.keras.layers.Flatten()(rnn)

    ## RNN scorer
    rnn = score_head(rnn)

    ######################################################################################################
    ########  Classifier  ################################################################################
    ######################################################################################################

    y = tf.keras.layers.Concatenate()([conv, rnn])
    y = tf.keras.layers.Dense(1,
                              kernel_regularizer = l2_penalty(),
                              activation = 'sigmoid')(y)

    ## Generate Model from input and output
    model = tf.keras.models.Model(inputs=input1, outputs=y)

    ## Compile model; metrics=None is compile()'s own default, so one call covers both cases
    model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate),
                  loss = loss, metrics = metrics)

    return model

In [8]:
# Build the model with its default hyper-parameters and print its layer summary.
DLNN_CORENup().summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 41, 21)]     0           []                               
                                                                                                  
 conv1d (Conv1D)                (None, 32, 10)       2110        ['input_1[0][0]']                
                                                                                                  
 activation (Activation)        (None, 32, 10)       0           ['conv1d[0][0]']                 
                                                                                                  
 max_pooling1d (MaxPooling1D)   (None, 10, 10)       0           ['activation[0][0]']             
                                                                                              

# Training

In [9]:
##################################################################################
##### read training file
##################################################################################
train_file_path = os.path.join(input_data_folder, training_data_file)
train_data = pd.read_csv(train_file_path, sep='\t', header=None)
train_data.columns = ['Sequence', 'name', 'id', 'flag', 'label_original', 'type']

##################################################################################
##### Create dictionary of all characters in the NT sequence 
##################################################################################
# Every distinct character seen in the training sequences gets a column index,
# assigned in sorted order.
all_char_set = set().union(*train_data['Sequence'])
all_char_list = sorted(all_char_set)
all_char_dict = {char: index for index, char in enumerate(all_char_list)}

##################################################################################
##### Create OHE of sequence
##################################################################################
train_data['OHE_Sequence'] = pd.Series([one_hot_encode_nt(val, all_char_dict) 
                                        for val in train_data["Sequence"]])

##################################################################################
##### Fix the labels
##################################################################################
# Anything other than 1 in the original label column is mapped to class 0.
train_data['label'] = pd.Series([1 if val == 1 else 0 
                                 for val in train_data["label_original"]])

##################################################################################
##### Extract features and labels, create folds
##################################################################################

features = np.array(list(train_data['OHE_Sequence']))
labels = np.array(list(train_data['label']))
labels = labels.reshape((labels.shape[0], 1))

input_seq_shape = features[0].shape

folds = build_kfold(features, labels, k=n_fold, shuffle=shuffle, seed=seed)

## Write the k-fold dataset to file
foldPath = os.path.join(outPath, expName, "{}fold".format(n_fold))
os.makedirs(foldPath, exist_ok=True)
# The context manager closes the file even if pickling fails.
with open(os.path.join(foldPath, foldName), "wb") as fold_file:
    pickle.dump(folds, fold_file)

In [10]:
##################################################################################
##### For each input file, train model and generate different outputs in a structured folder
##################################################################################

def _score_predictions(y_true, y_prob):
    """Compute the evaluation metrics for one set of predicted scores.

    Args:
        y_true: ground-truth 0/1 labels.
        y_prob: predicted scores as returned by ``model.predict``.

    Returns:
        A dict keyed by the metric columns of ``evaluations`` ("Accuracy",
        "Precision", "TPR", "FPR", "TPR_FPR_Thresholds", "AUC", "Sensitivity",
        "Specificity", "MCC").
    """
    label_pred = pred2label(y_prob)

    # Pinning the label order keeps the matrix 2x2 even when one class is absent,
    # so the four-way unpack below cannot fail.
    tn, fp, fn, tp = confusion_matrix(y_true, label_pred, labels=[0, 1]).ravel()
    fpr, tpr, thresholds = roc_curve(y_true, y_prob)

    return {
        "Accuracy": accuracy_score(y_true, label_pred),
        "Precision": precision_score(y_true, label_pred),
        "TPR": tpr,
        "FPR": fpr,
        "TPR_FPR_Thresholds": thresholds,
        "AUC": roc_auc_score(y_true, y_prob),
        "Sensitivity": tp/(tp+fn),
        "Specificity": tn/(tn+fp),
        "MCC": matthews_corrcoef(y_true, label_pred),
    }

## create the evaluation data structure for all iterations
evaluations = {
    "Fold" : [],
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Train/Test model on all folds, generate evaluations
##################################################################################

## Create and set directory to save model
modelPath = os.path.join(outPath, expName, "{}fold".format(n_fold), "models")
os.makedirs(modelPath, exist_ok=True)

for i, fold in enumerate(folds):
    
    print("\nTrain/Test model on Fold #"+str(i)+".")
    
    model = DLNN_CORENup(input_seq_shape = input_seq_shape)
    
    ## Checkpoint callback: keep only the epoch with the lowest validation loss
    current_model_path = os.path.join(modelPath, "bestModel-fold{}.hdf5".format(i))
    modelCallbacks = [
        tf.keras.callbacks.ModelCheckpoint(current_model_path,
                                           monitor = 'val_loss', verbose = 1, save_best_only = True, 
                                           save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
    ]
    
    # Random shuffling of the training samples before fitting
    index_arr = np.random.permutation(fold["X_train"].shape[0])
    
    # NOTE(review): the checkpoint is selected on the same fold that is reported
    # as "Test" below, so the Test metrics are optimistically biased.
    model.fit(x = fold["X_train"][index_arr], y = fold["y_train"][index_arr], 
              batch_size = batch_size, epochs = epochs, verbose = 1, 
              callbacks = modelCallbacks, validation_data = (fold["X_test"], fold["y_test"]))
    
    ## Evaluate the best checkpoint, not the weights of the last epoch
    model = tf.keras.models.load_model(current_model_path)
    
    ##################################################################################
    ##### Prediction and metrics for TRAIN and TEST datasets
    ##################################################################################
    
    for split_name, split_features, split_labels in (("Train", fold["X_train"], fold["y_train"]),
                                                     ("Test", fold["X_test"], fold["y_test"])):
        split_scores = _score_predictions(split_labels, model.predict(split_features))
        
        evaluations["Fold"].append(i)
        evaluations["Train_Test"].append(split_name)
        for metric_name, metric_value in split_scores.items():
            evaluations[metric_name].append(metric_value)


Train/Test model on Fold #0.
Epoch 1/50
Epoch 1: val_loss improved from inf to 0.86146, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_v3\5fold\models\bestModel-fold0.hdf5
Epoch 2/50
Epoch 2: val_loss improved from 0.86146 to 0.75570, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_v3\5fold\models\bestModel-fold0.hdf5
Epoch 3/50
Epoch 3: val_loss improved from 0.75570 to 0.69865, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_v3\5fold\models\bestModel-fold0.hdf5
Epoch 4/50
Epoch 4: val_loss improved from 0.69865 to 0.66446, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_v3\5fold\models\bestModel-fold0.hdf5
Epoch 5/50
Epoch 5: val_loss improved from 0.66446 to 0.65146, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_v3\5fold\models\bestModel-fold0.hdf5
Epoch 6/50
Epoch 6: val_loss improved from 0.65146 to 0.62071, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_v3\

Epoch 34/50
Epoch 34: val_loss did not improve from 0.57696
Epoch 35/50
Epoch 35: val_loss did not improve from 0.57696
Epoch 36/50
Epoch 36: val_loss did not improve from 0.57696
Epoch 37/50
Epoch 37: val_loss did not improve from 0.57696
Epoch 38/50
Epoch 38: val_loss did not improve from 0.57696
Epoch 39/50
Epoch 39: val_loss did not improve from 0.57696
Epoch 40/50
Epoch 40: val_loss did not improve from 0.57696
Epoch 41/50
Epoch 41: val_loss did not improve from 0.57696
Epoch 42/50
Epoch 42: val_loss did not improve from 0.57696
Epoch 43/50
Epoch 43: val_loss did not improve from 0.57696
Epoch 44/50
Epoch 44: val_loss did not improve from 0.57696
Epoch 45/50
Epoch 45: val_loss did not improve from 0.57696
Epoch 46/50
Epoch 46: val_loss did not improve from 0.57696
Epoch 47/50
Epoch 47: val_loss did not improve from 0.57696
Epoch 48/50
Epoch 48: val_loss did not improve from 0.57696
Epoch 49/50
Epoch 49: val_loss did not improve from 0.57696
Epoch 50/50
Epoch 50: val_loss did not i

Epoch 17/50
Epoch 17: val_loss did not improve from 0.61406
Epoch 18/50
Epoch 18: val_loss did not improve from 0.61406
Epoch 19/50
Epoch 19: val_loss did not improve from 0.61406
Epoch 20/50
Epoch 20: val_loss did not improve from 0.61406
Epoch 21/50
Epoch 21: val_loss improved from 0.61406 to 0.61399, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_v3\5fold\models\bestModel-fold1.hdf5
Epoch 22/50
Epoch 22: val_loss did not improve from 0.61399
Epoch 23/50
Epoch 23: val_loss did not improve from 0.61399
Epoch 24/50
Epoch 24: val_loss did not improve from 0.61399
Epoch 25/50
Epoch 25: val_loss improved from 0.61399 to 0.61354, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_v3\5fold\models\bestModel-fold1.hdf5
Epoch 26/50
Epoch 26: val_loss did not improve from 0.61354
Epoch 27/50
Epoch 27: val_loss did not improve from 0.61354
Epoch 28/50
Epoch 28: val_loss did not improve from 0.61354
Epoch 29/50
Epoch 29: val_loss did not improve from 0.61354


Epoch 3/50
Epoch 3: val_loss improved from 0.74868 to 0.70016, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_v3\5fold\models\bestModel-fold2.hdf5
Epoch 4/50
Epoch 4: val_loss improved from 0.70016 to 0.67656, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_v3\5fold\models\bestModel-fold2.hdf5
Epoch 5/50
Epoch 5: val_loss improved from 0.67656 to 0.66218, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_v3\5fold\models\bestModel-fold2.hdf5
Epoch 6/50
Epoch 6: val_loss improved from 0.66218 to 0.65283, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_v3\5fold\models\bestModel-fold2.hdf5
Epoch 7/50
Epoch 7: val_loss improved from 0.65283 to 0.64736, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_v3\5fold\models\bestModel-fold2.hdf5
Epoch 8/50
Epoch 8: val_loss improved from 0.64736 to 0.64518, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_v3\5fold\models\bestModel-fol

Epoch 32/50
Epoch 32: val_loss did not improve from 0.62137
Epoch 33/50
Epoch 33: val_loss did not improve from 0.62137
Epoch 34/50
Epoch 34: val_loss improved from 0.62137 to 0.62088, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_v3\5fold\models\bestModel-fold2.hdf5
Epoch 35/50
Epoch 35: val_loss did not improve from 0.62088
Epoch 36/50
Epoch 36: val_loss did not improve from 0.62088
Epoch 37/50
Epoch 37: val_loss did not improve from 0.62088
Epoch 38/50
Epoch 38: val_loss did not improve from 0.62088
Epoch 39/50
Epoch 39: val_loss did not improve from 0.62088
Epoch 40/50
Epoch 40: val_loss did not improve from 0.62088
Epoch 41/50
Epoch 41: val_loss did not improve from 0.62088
Epoch 42/50
Epoch 42: val_loss did not improve from 0.62088
Epoch 43/50
Epoch 43: val_loss did not improve from 0.62088
Epoch 44/50
Epoch 44: val_loss did not improve from 0.62088
Epoch 45/50
Epoch 45: val_loss did not improve from 0.62088
Epoch 46/50
Epoch 46: val_loss did not improve fro

Epoch 15/50
Epoch 15: val_loss improved from 0.58040 to 0.56646, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_v3\5fold\models\bestModel-fold3.hdf5
Epoch 16/50
Epoch 16: val_loss improved from 0.56646 to 0.56336, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_v3\5fold\models\bestModel-fold3.hdf5
Epoch 17/50
Epoch 17: val_loss did not improve from 0.56336
Epoch 18/50
Epoch 18: val_loss improved from 0.56336 to 0.55710, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_v3\5fold\models\bestModel-fold3.hdf5
Epoch 19/50
Epoch 19: val_loss did not improve from 0.55710
Epoch 20/50
Epoch 20: val_loss did not improve from 0.55710
Epoch 21/50
Epoch 21: val_loss did not improve from 0.55710
Epoch 22/50
Epoch 22: val_loss did not improve from 0.55710
Epoch 23/50
Epoch 23: val_loss did not improve from 0.55710
Epoch 24/50
Epoch 24: val_loss did not improve from 0.55710
Epoch 25/50
Epoch 25: val_loss did not improve from 0.55710
Epoch 26/5

Epoch 2/50
Epoch 2: val_loss improved from 0.84994 to 0.74505, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_v3\5fold\models\bestModel-fold4.hdf5
Epoch 3/50
Epoch 3: val_loss improved from 0.74505 to 0.68795, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_v3\5fold\models\bestModel-fold4.hdf5
Epoch 4/50
Epoch 4: val_loss improved from 0.68795 to 0.64755, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_v3\5fold\models\bestModel-fold4.hdf5
Epoch 5/50
Epoch 5: val_loss improved from 0.64755 to 0.61939, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_v3\5fold\models\bestModel-fold4.hdf5
Epoch 6/50
Epoch 6: val_loss improved from 0.61939 to 0.60561, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_v3\5fold\models\bestModel-fold4.hdf5
Epoch 7/50
Epoch 7: val_loss improved from 0.60561 to 0.60192, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_v3\5fold\models\bestModel-fol

Epoch 30/50
Epoch 30: val_loss did not improve from 0.55652
Epoch 31/50
Epoch 31: val_loss did not improve from 0.55652
Epoch 32/50
Epoch 32: val_loss did not improve from 0.55652
Epoch 33/50
Epoch 33: val_loss did not improve from 0.55652
Epoch 34/50
Epoch 34: val_loss did not improve from 0.55652
Epoch 35/50
Epoch 35: val_loss did not improve from 0.55652
Epoch 36/50
Epoch 36: val_loss did not improve from 0.55652
Epoch 37/50
Epoch 37: val_loss did not improve from 0.55652
Epoch 38/50
Epoch 38: val_loss did not improve from 0.55652
Epoch 39/50
Epoch 39: val_loss improved from 0.55652 to 0.55625, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_v3\5fold\models\bestModel-fold4.hdf5
Epoch 40/50
Epoch 40: val_loss did not improve from 0.55625
Epoch 41/50
Epoch 41: val_loss did not improve from 0.55625
Epoch 42/50
Epoch 42: val_loss did not improve from 0.55625
Epoch 43/50
Epoch 43: val_loss did not improve from 0.55625
Epoch 44/50
Epoch 44: val_loss did not improve fro

## k-fold Training evaluation

In [11]:
evaluations_df = pd.DataFrame.from_dict(evaluations)

# Average the scalar metrics per Train/Test split. The columns are selected
# before mean() because TPR/FPR/thresholds hold arrays, and averaging those
# object columns raises TypeError on pandas >= 2.0.
metric_columns = ['Accuracy', 'Precision', 'AUC', 'Sensitivity', 'Specificity', 'MCC']
evaluations_df_grouped = evaluations_df.groupby(["Train_Test"])[metric_columns].mean()

evaluations_df_grouped

Unnamed: 0_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Train_Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Test,0.759451,0.764704,0.832392,0.750631,0.768299,0.519218
Train,0.892733,0.902258,0.931971,0.880135,0.905329,0.785856


In [12]:
# Rows of the evaluation table whose Train_Test value is "Test".
evaluations_df.loc[evaluations_df["Train_Test"] == "Test"]

Unnamed: 0,Fold,Train_Test,Accuracy,Precision,TPR,FPR,TPR_FPR_Thresholds,AUC,Sensitivity,Specificity,MCC
1,0,Test,0.767296,0.778261,"[0.0, 0.0041841004184100415, 0.075313807531380...","[0.0, 0.0, 0.0, 0.004201680672268907, 0.004201...","[1.8303012, 0.83030117, 0.8267717, 0.8260058, ...",0.842797,0.748954,0.785714,0.535007
3,1,Test,0.740042,0.731707,"[0.0, 0.004201680672268907, 0.0504201680672268...","[0.0, 0.0, 0.0, 0.0041841004184100415, 0.00418...","[1.8155707, 0.8155707, 0.81464505, 0.81464154,...",0.819196,0.756303,0.723849,0.480388
5,2,Test,0.714286,0.717949,"[0.0, 0.0, 0.037815126050420166, 0.03781512605...","[0.0, 0.004201680672268907, 0.0042016806722689...","[1.7489326, 0.7489326, 0.7489065, 0.7489033, 0...",0.788821,0.705882,0.722689,0.428632
7,3,Test,0.794118,0.799145,"[0.0, 0.004201680672268907, 0.0420168067226890...","[0.0, 0.0, 0.0, 0.004201680672268907, 0.004201...","[1.7979696, 0.7979695, 0.79654115, 0.79643345,...",0.847151,0.785714,0.802521,0.588318
9,4,Test,0.781513,0.79646,"[0.0, 0.004201680672268907, 0.0588235294117647...","[0.0, 0.0, 0.0, 0.004201680672268907, 0.004201...","[1.9090337, 0.90903366, 0.9078441, 0.9077635, ...",0.863993,0.756303,0.806723,0.563742


# Independent data

In [13]:
# Bind the full training arrays to explicit train_* names.
train_features, train_labels = features, labels

In [14]:
##################################################################################
##### Read the independent data file (tab-separated, no header row)
##################################################################################
indpe_file_path = os.path.join(input_data_folder, independent_data_file)
indpe_data = pd.read_csv(indpe_file_path, sep='\t', header=None)
indpe_data.columns = ['Sequence', 'name', 'id', 'flag', 'label_original', 'type']

##################################################################################
##### One-hot encode every sequence with the existing character dictionary
##################################################################################
indpe_data['OHE_Sequence'] = pd.Series(
    [one_hot_encode_nt(sequence, all_char_dict) for sequence in indpe_data["Sequence"]]
)

##################################################################################
##### Binarise the labels: 1 stays 1, everything else becomes 0
##################################################################################
indpe_data['label'] = pd.Series(
    [int(original == 1) for original in indpe_data["label_original"]]
)

##################################################################################
##### Extract features and labels as arrays
##################################################################################

indpe_features = np.array(indpe_data['OHE_Sequence'].tolist())
indpe_labels = np.array(indpe_data['label'].tolist()).reshape((-1, 1))

# Rebinds input_seq_shape to the shape of one independent sample.
input_seq_shape = indpe_features[0].shape

## Using k-fold Models

### Performance of each k-fold model

In [15]:
## create the evaluation data structure for all iterations
evaluations = {
    "Fold" : [],
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Prediction and metrics for Independent dataset
##################################################################################

for i in range(n_fold):
    
    ## Reload the best checkpoint saved for this fold
    current_model_path = os.path.join(modelPath, "bestModel-fold{}.hdf5".format(i))
    model = tf.keras.models.load_model(current_model_path)

    y_pred = model.predict(indpe_features)
    label_pred = pred2label(y_pred)

    # Pinning the label order keeps the matrix 2x2 even when one class is absent,
    # so the four-way unpack cannot fail.
    tn, fp, fn, tp = confusion_matrix(indpe_labels, label_pred, labels=[0, 1]).ravel()
    fpr, tpr, thresholds = roc_curve(indpe_labels, y_pred)

    # Collected in a dict so no variable named `auc` is rebound here; that name
    # is imported from sklearn.metrics at the top of the notebook.
    fold_scores = {
        "Fold": i,
        "Train_Test": "Independent",
        "Accuracy": accuracy_score(indpe_labels, label_pred),
        "Precision": precision_score(indpe_labels, label_pred),
        "TPR": tpr,
        "FPR": fpr,
        "TPR_FPR_Thresholds": thresholds,
        "AUC": roc_auc_score(indpe_labels, y_pred),
        "Sensitivity": tp/(tp+fn),
        "Specificity": tn/(tn+fp),
        "MCC": matthews_corrcoef(indpe_labels, label_pred),
    }
    for column_name, column_value in fold_scores.items():
        evaluations[column_name].append(column_value)

##################################################################################

evaluations_df = pd.DataFrame.from_dict(evaluations)

# Select the scalar metric columns before mean(): TPR/FPR/thresholds hold arrays,
# and averaging those object columns raises TypeError on pandas >= 2.0.
metric_columns = ['Accuracy', 'Precision', 'AUC', 'Sensitivity', 'Specificity', 'MCC']
evaluations_df_grouped = evaluations_df.groupby(["Train_Test"])[metric_columns].mean()

evaluations_df_grouped

Unnamed: 0_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Train_Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Independent,0.628408,0.243447,0.658152,0.590148,0.636008,0.171553


In [16]:
# Display the full evaluation table (one row per appended evaluation).
evaluations_df

Unnamed: 0,Fold,Train_Test,Accuracy,Precision,TPR,FPR,TPR_FPR_Thresholds,AUC,Sensitivity,Specificity,MCC
0,0,Independent,0.635102,0.253036,"[0.0, 0.0, 0.009852216748768473, 0.00985221674...","[0.0, 0.0009784735812133072, 0.000978473581213...","[1.830446, 0.83044606, 0.8303743, 0.83005667, ...",0.66422,0.615764,0.638943,0.19306
1,1,Independent,0.617143,0.248106,"[0.0, 0.0, 0.0049261083743842365, 0.0049261083...","[0.0, 0.0009784735812133072, 0.000978473581213...","[1.8155556, 0.8155555, 0.81550664, 0.8154487, ...",0.671753,0.64532,0.611546,0.192862
2,2,Independent,0.625306,0.238776,"[0.0, 0.0049261083743842365, 0.009852216748768...","[0.0, 0.0, 0.0, 0.004892367906066536, 0.004892...","[1.7489367, 0.74893665, 0.748936, 0.74893045, ...",0.646188,0.576355,0.635029,0.160437
3,3,Independent,0.635102,0.241525,"[0.0, 0.0049261083743842365, 0.004926108374384...","[0.0, 0.0, 0.003913894324853229, 0.00391389432...","[1.7981067, 0.7981066, 0.79802597, 0.79801047,...",0.654531,0.561576,0.649706,0.161424
4,4,Independent,0.629388,0.235789,"[0.0, 0.0049261083743842365, 0.004926108374384...","[0.0, 0.0, 0.0029354207436399216, 0.0029354207...","[1.909081, 0.90908104, 0.9089147, 0.908883, 0....",0.654069,0.551724,0.644814,0.149983


### Mean score with k-fold models

In [17]:
## Mean-score ensemble: average the predicted scores of the n_fold saved fold models on
## the independent dataset, then compute a single set of metrics from the averaged scores.

## create the evaluation data structure for all iterations
evaluations = {
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Prediction and metrics for Independent dataset
##################################################################################

# Running sum of the per-fold scores. It is allocated with the shape of the label array,
# so each model output must be addable to it in place.
total_pred = np.zeros(indpe_labels.shape)
all_preds = []

for i in range(n_fold):

    current_model_path = os.path.join(modelPath, "bestModel-fold{}.hdf5".format(i))
    model = tf.keras.models.load_model(current_model_path)

    y_pred = model.predict(indpe_features)
    total_pred += y_pred
    all_preds.append(y_pred)

# Mean score over the folds, then converted to labels by the notebook's pred2label helper.
total_pred = total_pred / n_fold
label_pred = pred2label(total_pred)

# Compute accuracy, precision, sensitivity, specificity, mcc
acc = accuracy_score(indpe_labels, label_pred)
prec = precision_score(indpe_labels,label_pred)
mcc = matthews_corrcoef(indpe_labels, label_pred)

conf = confusion_matrix(indpe_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(indpe_labels, total_pred)
# Stored as auc_score so the `auc` function imported from sklearn.metrics is not overwritten.
auc_score = roc_auc_score(indpe_labels, total_pred)

evaluations["Train_Test"].append("Independent")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc_score)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################

evaluations_df = pd.DataFrame.from_dict(evaluations)

# Pick the scalar metric columns BEFORE averaging. TPR / FPR / TPR_FPR_Thresholds hold
# arrays (object dtype), and recent pandas versions raise on non-numeric columns in
# groupby().mean() instead of silently dropping them.
metric_columns = ['Accuracy', 'Precision', 'AUC', 'Sensitivity', 'Specificity', 'MCC']
evaluations_df_grouped = evaluations_df.groupby(["Train_Test"])[metric_columns].mean()

evaluations_df_grouped

Unnamed: 0_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Train_Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Independent,0.629388,0.245436,0.669469,0.596059,0.636008,0.175957


### Voting score with k-fold models

In [18]:
## Voting ensemble: convert each fold model's scores to labels first, average those
## labels over the n_fold models, then compute a single set of metrics from the result.

## create the evaluation data structure for all iterations
evaluations = {
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Prediction and metrics for Independent dataset
##################################################################################

# Running sum of the per-fold votes. It is allocated with the shape of the label array,
# so each pred2label output must be addable to it in place.
total_pred = np.zeros(indpe_labels.shape)
all_preds = []

for i in range(n_fold):

    current_model_path = os.path.join(modelPath, "bestModel-fold{}.hdf5".format(i))
    model = tf.keras.models.load_model(current_model_path)

    y_pred = model.predict(indpe_features)
    vote_pred = pred2label(y_pred)
    total_pred += vote_pred
    all_preds.append(vote_pred)

# Mean of the per-fold labels, converted to a final label by pred2label.
# NOTE(review): presumably pred2label yields 0/1 labels, which would make total_pred the
# fraction of models voting positive - confirm against the helper's definition.
total_pred = total_pred / n_fold
label_pred = pred2label(total_pred)

# Compute accuracy, precision, sensitivity, specificity, mcc
acc = accuracy_score(indpe_labels, label_pred)
prec = precision_score(indpe_labels,label_pred)
mcc = matthews_corrcoef(indpe_labels, label_pred)

conf = confusion_matrix(indpe_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

# ROC / AUC are computed from the averaged votes, not from the raw model scores.
fpr, tpr, thresholds = roc_curve(indpe_labels, total_pred)
# Stored as auc_score so the `auc` function imported from sklearn.metrics is not overwritten.
auc_score = roc_auc_score(indpe_labels, total_pred)

evaluations["Train_Test"].append("Independent")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc_score)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################

evaluations_df = pd.DataFrame.from_dict(evaluations)

# Pick the scalar metric columns BEFORE averaging. TPR / FPR / TPR_FPR_Thresholds hold
# arrays (object dtype), and recent pandas versions raise on non-numeric columns in
# groupby().mean() instead of silently dropping them.
metric_columns = ['Accuracy', 'Precision', 'AUC', 'Sensitivity', 'Specificity', 'MCC']
evaluations_df_grouped = evaluations_df.groupby(["Train_Test"])[metric_columns].mean()

evaluations_df_grouped

Unnamed: 0_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Train_Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Independent,0.635918,0.245283,0.654977,0.576355,0.64775,0.170889


## Using New Model

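Train a new model on the full training dataset, then predict and evaluate on the independent dataset. Note that the independent dataset is also supplied as the validation data during training, so the saved checkpoint is selected using the same data it is later evaluated on.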

In [19]:
## Build a fresh model and train it on the whole training set (no k-fold split).
model = DLNN_CORENup(input_seq_shape = input_seq_shape)

## Define the model callback that saves the best model seen so far (lowest val_loss).
## There is no early-stopping callback, so training always runs for all `epochs`.
current_model_path = os.path.join(modelPath, "_fullModel.hdf5")
modelCallbacks = [
    tf.keras.callbacks.ModelCheckpoint(current_model_path,
                                       monitor = 'val_loss', verbose = 1, save_best_only = True, 
                                       save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
]

# Random shuffling of the training samples. The permutation is drawn from a generator
# built from `seed` (parameter cell), so setting a seed there makes the order reproducible;
# with seed = None the order is random on every run, as before.
index_arr = np.random.default_rng(seed).permutation(train_features.shape[0])

# NOTE(review): val_loss is measured on the independent dataset, so the checkpoint kept by
# ModelCheckpoint is chosen using the same data the next cell evaluates on. Those metrics
# are therefore likely optimistic. Holding out part of the training data instead (e.g.
# `validation_split = 0.2` in place of `validation_data`) would avoid this.
model.fit(x = train_features[index_arr], y = train_labels[index_arr], batch_size = batch_size, epochs = epochs, verbose = 1, 
          callbacks = modelCallbacks, validation_data = (indpe_features, indpe_labels))

# Reload the best checkpoint written during training; `model` no longer holds the
# last-epoch weights after this line.
model = tf.keras.models.load_model(current_model_path)

Epoch 1/50
Epoch 1: val_loss improved from inf to 0.83806, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_v3\5fold\models\_fullModel.hdf5
Epoch 2/50
Epoch 2: val_loss improved from 0.83806 to 0.82565, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_v3\5fold\models\_fullModel.hdf5
Epoch 3/50
Epoch 3: val_loss improved from 0.82565 to 0.71941, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_v3\5fold\models\_fullModel.hdf5
Epoch 4/50
Epoch 4: val_loss improved from 0.71941 to 0.71235, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_v3\5fold\models\_fullModel.hdf5
Epoch 5/50
Epoch 5: val_loss did not improve from 0.71235
Epoch 6/50
Epoch 6: val_loss did not improve from 0.71235
Epoch 7/50
Epoch 7: val_loss improved from 0.71235 to 0.69940, saving model to Results\NT_Site_PredNTS_Classification_DLNN_CORENup_v3\5fold\models\_fullModel.hdf5
Epoch 8/50
Epoch 8: val_loss did not improve from 0.69940
Epoch 9/50
Epoc

Epoch 37: val_loss did not improve from 0.69940
Epoch 38/50
Epoch 38: val_loss did not improve from 0.69940
Epoch 39/50
Epoch 39: val_loss did not improve from 0.69940
Epoch 40/50
Epoch 40: val_loss did not improve from 0.69940
Epoch 41/50
Epoch 41: val_loss did not improve from 0.69940
Epoch 42/50
Epoch 42: val_loss did not improve from 0.69940
Epoch 43/50
Epoch 43: val_loss did not improve from 0.69940
Epoch 44/50
Epoch 44: val_loss did not improve from 0.69940
Epoch 45/50
Epoch 45: val_loss did not improve from 0.69940
Epoch 46/50
Epoch 46: val_loss did not improve from 0.69940
Epoch 47/50
Epoch 47: val_loss did not improve from 0.69940
Epoch 48/50
Epoch 48: val_loss did not improve from 0.69940
Epoch 49/50
Epoch 49: val_loss did not improve from 0.69940
Epoch 50/50
Epoch 50: val_loss did not improve from 0.69940


In [20]:
## Evaluate the single model currently bound to `model` on the independent dataset.

## create the evaluation data structure for all iterations
evaluations = {
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Prediction and metrics for Independent dataset
##################################################################################

# Raw model scores, converted to labels by the notebook's pred2label helper.
y_pred = model.predict(indpe_features)
label_pred = pred2label(y_pred)

# Compute accuracy, precision, sensitivity, specificity, mcc
acc = accuracy_score(indpe_labels, label_pred)
prec = precision_score(indpe_labels,label_pred)
mcc = matthews_corrcoef(indpe_labels, label_pred)

conf = confusion_matrix(indpe_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(indpe_labels, y_pred)
# Stored as auc_score so the `auc` function imported from sklearn.metrics is not overwritten.
auc_score = roc_auc_score(indpe_labels, y_pred)

evaluations["Train_Test"].append("Independent")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc_score)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################

evaluations_df = pd.DataFrame.from_dict(evaluations)

# Pick the scalar metric columns BEFORE averaging. TPR / FPR / TPR_FPR_Thresholds hold
# arrays (object dtype), and recent pandas versions raise on non-numeric columns in
# groupby().mean() instead of silently dropping them.
metric_columns = ['Accuracy', 'Precision', 'AUC', 'Sensitivity', 'Specificity', 'MCC']
evaluations_df_grouped = evaluations_df.groupby(["Train_Test"])[metric_columns].mean()

evaluations_df_grouped

Unnamed: 0_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Train_Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Independent,0.627755,0.243408,0.653958,0.591133,0.635029,0.171481
