In [1]:
##################################################################################
##### Define all parameters for model tuning
##################################################################################

n_fold = 5
expName = "NT_Site_PredNTS_Classification_Ensemble_7"
outPath = "Results"
foldName = "folds.pickle"

shuffle = True
seed = None

ohe_input_data_folder = "Data"
ohe_training_data_file = "Training-datasets-PredNTS.txt"
ohe_independent_data_file = "independent dataset-PredNTS.txt"

enc_data_folder = "PredNTS_MathFeature_ENC"
kmer_train_data_filename = 'Training-datasets-PredNTS_kmer.csv'
kmer_indpe_data_filename = 'independent-dataset-PredNTS_kmer.csv'

kgap_max = 4
kgap_train_data_filename = 'Training-datasets-PredNTS_kgap_{}.csv'
kgap_indpe_data_filename = 'independent-dataset-PredNTS_kgap_{}.csv'

asim_enc_input_data_folder = "Data_from_Asim"
dde_filename = "Protein_DDE[100, 0, 0, 0]-st-simplesequence.csv"
dpr_filename = "Protein_DistancePair[100, 0, 0, 0]-st-simplesequence.csv"
dpc_filename = "Protein_DPC[100, 0, 0, 0]-st-simplesequence.csv"
tpc_filename = "TPC[100, 0, 0, 0]-st-simplesequence.csv"

callback_monitor = 'val_loss'

no_of_models = 7

In [2]:
import os 
import pickle
import numpy as np
import pandas as pd

import tensorflow as tf

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, confusion_matrix
from sklearn.metrics import roc_auc_score, matthews_corrcoef
from sklearn.linear_model import LinearRegression, LogisticRegression

import math

In [3]:
# print(tf.test.is_gpu_available(cuda_only=True))
# physical_devices = tf.config.experimental.list_physical_devices('GPU')
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)
tf.config.experimental.set_memory_growth(physical_devices[0], True)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [4]:
##################################################################################
##### define all CUSTOM functions
##################################################################################

def one_hot_encode_nt(sequence, char_dict):
    
    seq_encoded = np.zeros((len(sequence),len(char_dict)))
    
    i = 0
    for single_character in sequence:
        if(single_character.upper() in char_dict.keys()):
            seq_encoded[i][char_dict[single_character.upper()]] = 1
            i = i+1
        else:
            raise ValueError('Incorrect character in NT sequence: '+sequence)
    return seq_encoded

In [5]:
##################################################################################
##### Build k-fold functions
##################################################################################

## Build the K-fold from dataset
def build_kfold(ohe_features, kmer_features, kgap_features, 
                dde_features, dpr_features, dpc_features, tpc_features,
                labels, k=10, shuffle=False, seed=None):
    
    skf = StratifiedKFold(n_splits=k, shuffle=shuffle, random_state=seed)
    kfoldList = []
    for train_index, test_index in skf.split(ohe_features, labels):
        kfoldList.append({
            "X_OHE_train": ohe_features[train_index],
            "X_OHE_test": ohe_features[test_index],
            "X_Kmer_train": kmer_features[train_index],
            "X_Kmer_test": kmer_features[test_index],
            "X_Kgap_train": kgap_features[train_index],
            "X_Kgap_test": kgap_features[test_index],
            "X_DDE_train": dde_features[train_index],
            "X_DDE_test": dde_features[test_index],
            "X_DPR_train": dpr_features[train_index],
            "X_DPR_test": dpr_features[test_index],
            "X_DPC_train": dpc_features[train_index],
            "X_DPC_test": dpc_features[test_index],
            "X_TPC_train": tpc_features[train_index],
            "X_TPC_test": tpc_features[test_index],
            "y_train": labels[train_index],
            "y_test": labels[test_index]
        })
    return kfoldList

In [6]:
##################################################################################
##### define evaluator functions
##################################################################################

def pred2label(y_pred):
    y_pred = np.round(y_pred).astype(int)
    return y_pred

# Neural network models

In [7]:
ohe_epochs = 200
ohe_batch_size = 16
    
##################################################################################
##### Function to customize the DLNN architecture with parameters
##################################################################################

def OHE_DLNN_CORENup(input_seq_shape = (41, 21),
                 conv_filters_per_layer_1 = 25, kernel_length_1 = 10, conv_strides_1 = 1, ## 1st Convolutional layer parameters
                 max_pool_width_1 = 3, max_pool_stride_1 = 3, ## 1st Maxpool layer parameters
                 lstm_decode_units = 25, ## LSTM layer parameters
                 conv_filters_per_layer_2 = 25,  kernel_length_2 = 5, conv_strides_2 = 1, ## 2nd Convolutional layer parameters
                 max_pool_width_2 = 3, max_pool_stride_2 = 3, ## 2nd Maxpool layer parameters
                 dense_decode_units = 256, ## Dense layer parameters
                 prob = 0.5, learn_rate = 0.0005, 
                 loss = 'binary_crossentropy', metrics = 'accuracy'):
    
    beta = 0.001
    
    ######################################################################################################
    ########  SEQUENCE  ##################################################################################
    ######################################################################################################
    
    input1 = tf.keras.layers.Input(shape=input_seq_shape)

    x1 = tf.keras.layers.Conv1D(conv_filters_per_layer_1, kernel_length_1,
                                strides = conv_strides_1, kernel_regularizer = tf.keras.regularizers.l2(beta), 
                                padding = "same")(input1)
    x1 = tf.keras.layers.Activation('relu')(x1)
    x1 = tf.keras.layers.MaxPool1D(pool_size = max_pool_width_1, strides = max_pool_stride_1)(x1)
    x1 = tf.keras.layers.Dropout(prob)(x1)
    
    x1 = tf.keras.layers.GaussianNoise(stddev=0.1)(x1)

    ## LSTM Path

    x2 = tf.keras.layers.LSTM(lstm_decode_units, return_sequences = True, 
                              kernel_regularizer = tf.keras.regularizers.l2(beta))(x1)
    
    x2 = tf.keras.layers.Dropout(prob)(x2)
    
    x2 = tf.keras.layers.Flatten()(x2)
    
    x2 = tf.keras.layers.GaussianNoise(stddev=0.1)(x2)

    ## Conv Path

    x3 = tf.keras.layers.Conv1D(conv_filters_per_layer_2, kernel_length_2, strides = conv_strides_2, 
                                kernel_regularizer = tf.keras.regularizers.l2(beta), padding = 'same')(x1)
    x3 = tf.keras.layers.Activation('relu')(x3)
    x3 = tf.keras.layers.MaxPooling1D(pool_size = max_pool_width_2, strides = max_pool_stride_2)(x3)
    x3 = tf.keras.layers.Dropout(prob)(x3)
    
    x3 = tf.keras.layers.Flatten()(x3)
    
    x3 = tf.keras.layers.GaussianNoise(stddev=0.1)(x3)
    
    x4 = tf.keras.layers.Concatenate(1)([x2,x3])
    
    ######################################################################################################
    ########  Classifier  ################################################################################
    ######################################################################################################
    
    y = tf.keras.layers.Dense(dense_decode_units, 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
                              activation = 'relu')(x4)
    
    y = tf.keras.layers.GaussianNoise(stddev=0.1)(y)
    
    y = tf.keras.layers.Dropout(prob)(y)
    
    y = tf.keras.layers.Dense(1, 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
                              activation = 'sigmoid')(y)

    ## Generate Model from input and output
    model = tf.keras.models.Model(inputs=input1, outputs=y)
    
    ## Compile model
    if(metrics != None):
        model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), loss = loss, metrics = metrics)
    else:
        model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), loss = loss)

    return model

In [8]:
kmer_epochs = 200
kmer_batch_size = 16

##################################################################################
##### Function to customize the DLNN architecture with parameters
##################################################################################

def Kmer_DLNN_Classifier(input_vec_shape,
                    dense_decode_units = 8, ## Dense layer parameters,
                    dense_layers = 2,
                    prob = 0.5, learn_rate = 0.0001, loss = 'binary_crossentropy', metrics = 'accuracy'):
    
    beta = 0.001
    
    input1 = tf.keras.layers.Input(shape=input_vec_shape)
    
    ######################################################################################################
    ########  Classifier  ################################################################################
    ######################################################################################################
    
    y = tf.keras.layers.Dense(dense_decode_units, 
                              kernel_regularizer = tf.keras.regularizers.l2(beta))(input1)
    y = tf.keras.layers.BatchNormalization()(y)
    y = tf.keras.layers.Dropout(prob)(y)
    
    for i in range(1,dense_layers+1):
    
        y = tf.keras.layers.Dense(int(dense_decode_units/(2**i)), 
                                  kernel_regularizer = tf.keras.regularizers.l2(beta), 
                                 )(y)
        y = tf.keras.layers.BatchNormalization()(y)
        y = tf.keras.layers.Dropout(prob)(y)
    
    y = tf.keras.layers.Dense(1, 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
                              activation = 'sigmoid')(y)

    ## Generate Model from input and output
    model = tf.keras.models.Model(inputs=input1, outputs=y)
    
    ## Compile model
    if(metrics != None):
        model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), 
                      loss = loss, metrics = metrics)
    else:
        model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), 
                      loss = loss)

    return model

In [9]:
kgap_epochs = 200
kgap_batch_size = 16

##################################################################################
##### Function to customize the DLNN architecture with parameters
##################################################################################

def Kgap_DLNN_Classifier(input_vec_shape,
                    dense_decode_units = 128, ## Dense layer parameters
                    prob = 0.5, learn_rate = 0.0001, loss = 'binary_crossentropy', metrics = 'accuracy'):
    
    beta = 0.001
    
    input1 = tf.keras.layers.Input(shape=input_vec_shape)
    
    ######################################################################################################
    ########  Classifier  ################################################################################
    ######################################################################################################
    
    y = tf.keras.layers.Dense(dense_decode_units, 
                              kernel_regularizer = tf.keras.regularizers.l2(beta))(input1)
    
    y = tf.keras.layers.Dropout(prob)(y)
    
    y = tf.keras.layers.Dense(int(dense_decode_units/2), 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
                              activation = 'relu')(y)
    
    y = tf.keras.layers.Dropout(prob)(y)
    
    y = tf.keras.layers.Dense(int(dense_decode_units/4), 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
                              activation = 'relu')(y)
    
    y = tf.keras.layers.Dropout(prob)(y)
    
    y = tf.keras.layers.Dense(1, 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
                              activation = 'sigmoid')(y)

    ## Generate Model from input and output
    model = tf.keras.models.Model(inputs=input1, outputs=y)
    
    ## Compile model
    if(metrics != None):
        model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), 
                      loss = loss, metrics = metrics)
    else:
        model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), 
                      loss = loss)

    return model

In [10]:
dde_epochs = 100
dde_batch_size = 16

##################################################################################
##### Function to customize the DLNN architecture with parameters
##################################################################################

def DDE_DLNN_Classifier(input_vec_shape,
                    dense_decode_units = 32, ## Dense layer parameters
                    prob = 0.5, learn_rate = 0.0005, loss = 'binary_crossentropy', metrics = 'accuracy'):
    
    beta = 0.001
    
    input1 = tf.keras.layers.Input(shape=input_vec_shape)
    
    ######################################################################################################
    ########  Classifier  ################################################################################
    ######################################################################################################
    
    y = tf.keras.layers.Dense(dense_decode_units, 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
                              activation = 'relu')(input1)
    
    y = tf.keras.layers.Dropout(prob)(y)
    
    y = tf.keras.layers.Dense(int(dense_decode_units/2), 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
                              activation = 'relu')(y)
    
    y = tf.keras.layers.Dropout(prob)(y)
    
    y = tf.keras.layers.Dense(1, 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
                              activation = 'sigmoid')(y)

    ## Generate Model from input and output
    model = tf.keras.models.Model(inputs=input1, outputs=y)
    
    ## Compile model
    if(metrics != None):
        model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), 
                      loss = loss, metrics = metrics)
    else:
        model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), 
                      loss = loss)

    return model

In [11]:
dpr_epochs = 100
dpr_batch_size = 16

##################################################################################
##### Function to customize the DLNN architecture with parameters
##################################################################################

def DPR_DLNN_Classifier(input_vec_shape,
                    dense_decode_units = 32, ## Dense layer parameters
                    prob = 0.5, learn_rate = 0.0005, loss = 'binary_crossentropy', metrics = 'accuracy'):
    
    beta = 0.001
    
    input1 = tf.keras.layers.Input(shape=input_vec_shape)
    
    ######################################################################################################
    ########  Classifier  ################################################################################
    ######################################################################################################
    
    y = tf.keras.layers.Dense(dense_decode_units, 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
                              activation = 'relu')(input1)
    
    y = tf.keras.layers.Dropout(prob)(y)
    
    y = tf.keras.layers.Dense(int(dense_decode_units/2), 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
                              activation = 'relu')(y)
    
    y = tf.keras.layers.Dropout(prob)(y)
    
    y = tf.keras.layers.Dense(1, 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
                              activation = 'sigmoid')(y)

    ## Generate Model from input and output
    model = tf.keras.models.Model(inputs=input1, outputs=y)
    
    ## Compile model
    if(metrics != None):
        model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), 
                      loss = loss, metrics = metrics)
    else:
        model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), 
                      loss = loss)

    return model

In [12]:
dpc_epochs = 100
dpc_batch_size = 16

##################################################################################
##### Function to customize the DLNN architecture with parameters
##################################################################################

def DPC_DLNN_Classifier(input_vec_shape,
                    dense_decode_units = 32, ## Dense layer parameters
                    prob = 0.5, learn_rate = 0.0005, loss = 'binary_crossentropy', metrics = 'accuracy'):
    
    beta = 0.001
    
    input1 = tf.keras.layers.Input(shape=input_vec_shape)
    
    ######################################################################################################
    ########  Classifier  ################################################################################
    ######################################################################################################
    
    y = tf.keras.layers.Dense(dense_decode_units, 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
                              activation = 'relu'
                             )(input1)
    
    y = tf.keras.layers.Dropout(prob)(y)
    
    y = tf.keras.layers.Dense(int(dense_decode_units/2), 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
                              activation = 'relu'
                             )(y)
    
    y = tf.keras.layers.Dropout(prob)(y)
    
    y = tf.keras.layers.Dense(1, 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
                              activation = 'sigmoid')(y)

    ## Generate Model from input and output
    model = tf.keras.models.Model(inputs=input1, outputs=y)
    
    ## Compile model
    if(metrics != None):
        model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), 
                      loss = loss, metrics = metrics)
    else:
        model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), 
                      loss = loss)

    return model

In [13]:
tpc_epochs = 100
tpc_batch_size = 16

##################################################################################
##### Function to customize the DLNN architecture with parameters
##################################################################################

def TPC_DLNN_Classifier(input_vec_shape,
                    dense_decode_units = 32, ## Dense layer parameters
                    prob = 0.5, learn_rate = 0.0005, loss = 'binary_crossentropy', metrics = 'accuracy'):
    
    beta = 0.001
    
    input1 = tf.keras.layers.Input(shape=input_vec_shape)
    
    ######################################################################################################
    ########  Classifier  ################################################################################
    ######################################################################################################
    
    y = tf.keras.layers.Dense(dense_decode_units, 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
#                               activation = 'relu'
                             )(input1)
    
    y = tf.keras.layers.Dropout(prob)(y)
    
    y = tf.keras.layers.Dense(int(dense_decode_units/2), 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
#                               activation = 'relu'
                             )(y)
    
    y = tf.keras.layers.Dropout(prob)(y)
    
    y = tf.keras.layers.Dense(1, 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
                              activation = 'sigmoid')(y)

    ## Generate Model from input and output
    model = tf.keras.models.Model(inputs=input1, outputs=y)
    
    ## Compile model
    if(metrics != None):
        model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), 
                      loss = loss, metrics = metrics)
    else:
        model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), 
                      loss = loss)

    return model

In [14]:
def logistic_classifier(cw = None):
    model = LogisticRegression(class_weight=cw)
    return model

# Training data preparation

In [15]:
##################################################################################
##### read SEQUENCE training file
##################################################################################
ohe_train_file_path = os.path.join(ohe_input_data_folder, ohe_training_data_file)
ohe_train_data = pd.read_csv(ohe_train_file_path, sep='\t', header=None)
ohe_train_data.columns = ['Sequence', 'name', 'id', 'flag', 'label_original', 'type']
ohe_train_data.head()

##################################################################################
##### Create dictionary of all characters in the NT sequence 
##################################################################################
all_char_set = set({})
for val in [set(val) for val in ohe_train_data['Sequence']]:
    all_char_set = all_char_set.union(val)
all_char_list = list(all_char_set)
all_char_list.sort()
all_char_dict = {}
for i in range(len(all_char_list)):
    all_char_dict[all_char_list[i]] = i
    
##################################################################################
##### Create OHE of sequence
##################################################################################
ohe_train_data['OHE_Sequence'] = pd.Series([one_hot_encode_nt(val, all_char_dict) 
                                        for val in ohe_train_data["Sequence"]])

##################################################################################
##### Fix the labels
##################################################################################
ohe_train_data['label'] = pd.Series([1 if val == 1 else 0 
                                 for val in ohe_train_data["label_original"]])

##################################################################################
##### Extract features and labels, create folds
##################################################################################

ohe_train_features = np.array(list(ohe_train_data['OHE_Sequence']))
train_labels = np.array(list(ohe_train_data['label']))
train_labels = train_labels[:, np.newaxis]

ohe_input_seq_shape = ohe_train_features[0].shape

############################################################################################################
############################################################################################################
##### Read KMER data
############################################################################################################
############################################################################################################

kmer_train_data_filepath = os.path.join(enc_data_folder, kmer_train_data_filename)
kmer_train_data = pd.read_csv(kmer_train_data_filepath, sep=',', header=0)
kmer_train_data = kmer_train_data.drop('label', axis=1)

kmer_train_data['label'] = pd.Series([int(val.split('_')[-2])
                                      for val in kmer_train_data['nameseq']])

##################################################################################
##### Extract features and labels, create folds
##################################################################################

kmer_train_features = np.array(kmer_train_data.drop('label', axis=1).drop('nameseq', axis=1))
# train_labels = np.array(train_data['label'])
# train_labels = train_labels.reshape((train_labels.shape[0], 1))

kmer_input_vec_shape = kmer_train_features[0].shape

############################################################################################################
############################################################################################################
##### Read Kgap data
############################################################################################################
############################################################################################################

for i in range(kgap_max+1):
    
    kgap_current_train_data_filepath = os.path.join(enc_data_folder, kgap_train_data_filename.format(i))
    kgap_current_train_data = pd.read_csv(kgap_current_train_data_filepath, sep=',', header=0)
    kgap_current_train_data = kgap_current_train_data.drop('label', axis=1)
    
    if i == 0:
        kgap_train_data = kgap_current_train_data
    else:
        kgap_train_data = pd.merge(
            kgap_train_data,
            kgap_current_train_data,
            how="inner",
            on='nameseq'
        )

kgap_train_data['label'] = pd.Series([int(val.split('_')[-2])
                                      for val in kgap_train_data['nameseq']])

##################################################################################
##### Extract features and labels, create folds
##################################################################################

kgap_train_features = np.array(kgap_train_data.drop('label', axis=1).drop('nameseq', axis=1))
# train_labels = np.array(train_data['label'])
# train_labels = train_labels.reshape((train_labels.shape[0], 1))

kgap_input_vec_shape = kgap_train_features[0].shape

############################################################################################################
############################################################################################################
##### AsimEmbedding Training data
############################################################################################################
############################################################################################################

dde_data_file = os.path.join(asim_enc_input_data_folder, dde_filename)
dpr_data_file = os.path.join(asim_enc_input_data_folder, dpr_filename)
dpc_data_file = os.path.join(asim_enc_input_data_folder, dpc_filename)
tpc_data_file = os.path.join(asim_enc_input_data_folder, tpc_filename)

dde_data = pd.read_csv(dde_data_file, sep=',', header=0)
dpr_data = pd.read_csv(dpr_data_file, sep=',', header=0)
dpc_data = pd.read_csv(dpc_data_file, sep=',', header=0)
tpc_data = pd.read_csv(tpc_data_file, sep=',', header=0)

dde_train_data = dde_data[dde_data['set'] == 'train'].drop('set', axis=1)
dpr_train_data = dpr_data[dpr_data['set'] == 'train'].drop('set', axis=1)
dpc_train_data = dpc_data[dpc_data['set'] == 'train'].drop('set', axis=1)
tpc_train_data = tpc_data[tpc_data['set'] == 'train'].drop('set', axis=1)

dde_train_features = np.array(dde_train_data.drop('labels', axis=1))
dpr_train_features = np.array(dpr_train_data.drop('labels', axis=1))
dpc_train_features = np.array(dpc_train_data.drop('labels', axis=1))
tpc_train_features = np.array(tpc_train_data.drop('labels', axis=1))

dde_train_features_shape = dde_train_features[0].shape
dpr_train_features_shape = dpr_train_features[0].shape
dpc_train_features_shape = dpc_train_features[0].shape
tpc_train_features_shape = tpc_train_features[0].shape

# train_labels = np.array(dde_train_data["labels"])

############################################################################################################
############################################################################################################
##### Build folds
############################################################################################################
############################################################################################################

folds = build_kfold(ohe_train_features, kmer_train_features, kgap_train_features, 
                    dde_train_features, dpr_train_features, dpc_train_features, tpc_train_features,
                    train_labels, k=n_fold, shuffle=shuffle, seed=seed)

## Write the k-fold dataset to file
foldPath = os.path.join(outPath, expName, "{}fold".format(n_fold))
if(not os.path.isdir(foldPath)):
    os.makedirs(foldPath)
pickle.dump(folds, open(os.path.join(foldPath, foldName), "wb"))

# Independent data preparation

In [16]:
##################################################################################
##### read independent data file
##################################################################################
ohe_indpe_file_path = os.path.join(ohe_input_data_folder, ohe_independent_data_file)
ohe_indpe_data = pd.read_csv(ohe_indpe_file_path, sep='\t', header=None)
ohe_indpe_data.columns = ['Sequence', 'name', 'id', 'flag', 'label_original', 'type']
ohe_indpe_data.head()
    
##################################################################################
##### Create OHE of sequence
##################################################################################
ohe_indpe_data['OHE_Sequence'] = pd.Series([one_hot_encode_nt(val, all_char_dict) 
                                        for val in ohe_indpe_data["Sequence"]])

##################################################################################
##### Fix the labels
##################################################################################
ohe_indpe_data['label'] = pd.Series([1 if val == 1 else 0 
                                 for val in ohe_indpe_data["label_original"]])

##################################################################################
##### Extract features and labels, create folds
##################################################################################

ohe_indpe_features = np.array(list(ohe_indpe_data['OHE_Sequence']))
indpe_labels = np.array(list(ohe_indpe_data['label']))
indpe_labels = indpe_labels.reshape((indpe_labels.shape[0], 1))

############################################################################################################
############################################################################################################
##### Read Kmer data
############################################################################################################
############################################################################################################

kmer_indpe_data_filepath = os.path.join(enc_data_folder, kmer_indpe_data_filename)
kmer_indpe_data = pd.read_csv(kmer_indpe_data_filepath, sep=',', header=0)
kmer_indpe_data = kmer_indpe_data.drop('label', axis=1)

kmer_indpe_data['label'] = pd.Series([int(val.split('_')[-2])
                                 for val in kmer_indpe_data['nameseq']])

kmer_indpe_features = np.array(kmer_indpe_data.drop('label', axis=1).drop('nameseq', axis=1))

############################################################################################################
############################################################################################################
##### Read Kgap data
############################################################################################################
############################################################################################################

for i in range(kgap_max+1):

    kgap_current_indpe_data_filepath = os.path.join(enc_data_folder, kgap_indpe_data_filename.format(i))
    kgap_current_indpe_data = pd.read_csv(kgap_current_indpe_data_filepath, sep=',', header=0)
    kgap_current_indpe_data = kgap_current_indpe_data.drop('label', axis=1)
    
    if i == 0:
        kgap_indpe_data = kgap_current_indpe_data
    else:
        kgap_indpe_data = pd.merge(
            kgap_indpe_data,
            kgap_current_indpe_data,
            how="inner",
            on='nameseq'
        )

kgap_indpe_data['label'] = pd.Series([int(val.split('_')[-2])
                                      for val in kgap_indpe_data['nameseq']])

kgap_indpe_features = np.array(kgap_indpe_data.drop('label', axis=1).drop('nameseq', axis=1))

############################################################################################################
############################################################################################################
##### AsimEmbedding Independent data
############################################################################################################
############################################################################################################

dde_indpe_data = dde_data[dde_data['set'] == 'test'].drop('set', axis=1)
dpr_indpe_data = dpr_data[dpr_data['set'] == 'test'].drop('set', axis=1)
dpc_indpe_data = dpc_data[dpc_data['set'] == 'test'].drop('set', axis=1)
tpc_indpe_data = tpc_data[tpc_data['set'] == 'test'].drop('set', axis=1)

dde_indpe_features = np.array(dde_indpe_data.drop('labels', axis=1))
dpr_indpe_features = np.array(dpr_indpe_data.drop('labels', axis=1))
dpc_indpe_features = np.array(dpc_indpe_data.drop('labels', axis=1))
tpc_indpe_features = np.array(tpc_indpe_data.drop('labels', axis=1))

# indpe_labels = np.array(dde_indpe_data["labels"])


# Training the ensemble

In [17]:
folds[0].keys()

dict_keys(['X_OHE_train', 'X_OHE_test', 'X_Kmer_train', 'X_Kmer_test', 'X_Kgap_train', 'X_Kgap_test', 'X_DDE_train', 'X_DDE_test', 'X_DPR_train', 'X_DPR_test', 'X_DPC_train', 'X_DPC_test', 'X_TPC_train', 'X_TPC_test', 'y_train', 'y_test'])

In [18]:
## Create and set directory to save all models
modelPath = os.path.join(outPath, expName, "{}fold".format(n_fold), "models")
if(not os.path.isdir(modelPath)):
    os.makedirs(modelPath)

In [19]:
##################################################################################
##### For each input file, train model and generate different outputs in a structured folder
##################################################################################

## create the evaluation data structure for all iterations
evaluations = {
    "Fold" : [],
    "Train_Test" : [],
    "Type" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Train/Test model on all folds, generate evaluations
##################################################################################

i = -1
for fold in folds:
    i += 1
    
    print("\nTrain/Test model on Fold #"+str(i)+".")
    
    # adding random shuffling of the dataset for training purpose
    randomized_index_arr = np.arange(fold["X_OHE_train"].shape[0])
    randomized_index_arr = np.random.permutation(randomized_index_arr)
    
    ##################################################################################
    ##### Train OHE network
    ##################################################################################
    
    print("Training OHE network.")
    
    ohe_model = OHE_DLNN_CORENup(input_seq_shape = ohe_input_seq_shape)
    
    ## Define the model callbacks for early stopping and saving the model. Then train model
    ohe_current_model_path = os.path.join(modelPath, "OHE_bestModel-fold{}.hdf5".format(i))
    ohe_modelCallbacks = [
        tf.keras.callbacks.ModelCheckpoint(ohe_current_model_path,
                                           monitor = callback_monitor, verbose = 0, save_best_only = True, 
                                           save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
    ]
    
    ohe_model.fit(x = fold["X_OHE_train"][randomized_index_arr], y = fold["y_train"][randomized_index_arr], 
                  batch_size = ohe_batch_size, epochs = ohe_epochs, verbose = 0, 
                  callbacks = ohe_modelCallbacks, validation_data = (fold["X_OHE_test"], fold["y_test"]))
    
    del ohe_model
    tf.keras.backend.clear_session()
    
    ##################################################################################
    ##### Train Kmer network
    ##################################################################################
    
    print("Training Kmer network.")
    
    kmer_model = Kmer_DLNN_Classifier(input_vec_shape = kmer_input_vec_shape)
    
    ## Define the model callbacks for early stopping and saving the model. Then train model
    kmer_current_model_path = os.path.join(modelPath, "KMER_bestModel-fold{}.hdf5".format(i))
    kmer_modelCallbacks = [
        tf.keras.callbacks.ModelCheckpoint(kmer_current_model_path,
                                           monitor = callback_monitor, verbose = 0, save_best_only = True, 
                                           save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
    ]
    
    kmer_model.fit(x = fold["X_Kmer_train"][randomized_index_arr], y = fold["y_train"][randomized_index_arr], 
                   batch_size = kmer_batch_size, epochs = kmer_epochs, verbose = 0, 
                   callbacks = kmer_modelCallbacks, validation_data = (fold["X_Kmer_test"], fold["y_test"]))
    
    del kmer_model
    tf.keras.backend.clear_session()
    
    ##################################################################################
    ##### Train Kgap network
    ##################################################################################
    
    print("Training Kgap network.")
    
    kgap_model = Kgap_DLNN_Classifier(input_vec_shape = kgap_input_vec_shape)
    
    ## Define the model callbacks for early stopping and saving the model. Then train model
    kgap_current_model_path = os.path.join(modelPath, "KGAP_bestModel-fold{}.hdf5".format(i))
    kgap_modelCallbacks = [
        tf.keras.callbacks.ModelCheckpoint(kgap_current_model_path,
                                           monitor = callback_monitor, verbose = 0, save_best_only = True, 
                                           save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
    ]
    
    kgap_model.fit(x = fold["X_Kgap_train"][randomized_index_arr], y = fold["y_train"][randomized_index_arr], 
                   batch_size = kgap_batch_size, epochs = kgap_epochs, verbose = 0, 
                   callbacks = kgap_modelCallbacks, validation_data = (fold["X_Kgap_test"], fold["y_test"]))
    
    del kgap_model
    tf.keras.backend.clear_session()
    
    ##################################################################################
    ##### TRAIN DDE model
    ##################################################################################
    
    print("Training DDE model.")
    
    dde_model = DDE_DLNN_Classifier(input_vec_shape = dde_train_features_shape)

    dde_model_file_path = os.path.join(modelPath, "DDE_bestModel-fold{}.hdf5".format(i))
    ## Define the model callbacks for early stopping and saving the model. Then train model
    dde_modelCallbacks = [
        tf.keras.callbacks.ModelCheckpoint(dde_model_file_path,
                                           monitor = callback_monitor, verbose = 0, save_best_only = True, 
                                           save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
    ]
    dde_model.fit(x = fold["X_DDE_train"][randomized_index_arr], y = fold["y_train"][randomized_index_arr], 
                  batch_size = dde_batch_size, epochs = dde_epochs, 
                  verbose = 0, callbacks = dde_modelCallbacks, 
                  validation_data = (fold["X_DDE_test"], fold["y_test"]))
    
    del dde_model
    tf.keras.backend.clear_session()
    
    ##################################################################################
    ##### TRAIN DPR model
    ##################################################################################
    
    print("Training DPR model.")
    
    dpr_model = DPR_DLNN_Classifier(input_vec_shape = dpr_train_features_shape)

    dpr_model_file_path = os.path.join(modelPath, "DPR_bestModel-fold{}.hdf5".format(i))
    ## Define the model callbacks for early stopping and saving the model. Then train model
    dpr_modelCallbacks = [
        tf.keras.callbacks.ModelCheckpoint(dpr_model_file_path,
                                           monitor = callback_monitor, verbose = 0, save_best_only = True, 
                                           save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
    ]
    dpr_model.fit(x = fold["X_DPR_train"][randomized_index_arr], y = fold["y_train"][randomized_index_arr], 
                  batch_size = dpr_batch_size, epochs = dpr_epochs, 
                  verbose = 0, callbacks = dpr_modelCallbacks, 
                  validation_data = (fold["X_DPR_test"], fold["y_test"]))
    
    del dpr_model
    tf.keras.backend.clear_session()
    
    ##################################################################################
    ##### TRAIN DPC model
    ##################################################################################
    
    print("Training DPC model.")
    
    dpc_model = DPC_DLNN_Classifier(input_vec_shape = dpc_train_features_shape)

    dpc_model_file_path = os.path.join(modelPath, "DPC_bestModel-fold{}.hdf5".format(i))
    ## Define the model callbacks for early stopping and saving the model. Then train model
    dpc_modelCallbacks = [
        tf.keras.callbacks.ModelCheckpoint(dpc_model_file_path,
                                           monitor = callback_monitor, verbose = 0, save_best_only = True, 
                                           save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
    ]
    dpc_model.fit(x = fold["X_DPC_train"][randomized_index_arr], y = fold["y_train"][randomized_index_arr], 
                  batch_size = dpc_batch_size, epochs = dpc_epochs, 
                  verbose = 0, callbacks = dpc_modelCallbacks, 
                  validation_data = (fold["X_DPC_test"], fold["y_test"]))
    
    del dpc_model
    tf.keras.backend.clear_session()
    
    ##################################################################################
    ##### TRAIN TPC model
    ##################################################################################
    
    print("Training TPC model.")
    
    tpc_model = TPC_DLNN_Classifier(input_vec_shape = tpc_train_features_shape)

    tpc_model_file_path = os.path.join(modelPath, "TPC_bestModel-fold{}.hdf5".format(i))
    ## Define the model callbacks for early stopping and saving the model. Then train model
    tpc_modelCallbacks = [
        tf.keras.callbacks.ModelCheckpoint(tpc_model_file_path,
                                           monitor = callback_monitor, verbose = 0, save_best_only = True, 
                                           save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
    ]
    tpc_model.fit(x = fold["X_TPC_train"][randomized_index_arr], y = fold["y_train"][randomized_index_arr], 
                  batch_size = tpc_batch_size, epochs = tpc_epochs, 
                  verbose = 0, callbacks = tpc_modelCallbacks, 
                  validation_data = (fold["X_TPC_test"], fold["y_test"]))
    
    del tpc_model
    tf.keras.backend.clear_session()
    
    ##################################################################################
    ##### Generate scores for Train dataset
    ##################################################################################
    
    print("Generating the 7 scores.")
    
    ohe_model = tf.keras.models.load_model(ohe_current_model_path)
    ohe_train_y_pred = ohe_model.predict(fold["X_OHE_train"])
    ohe_test_y_pred = ohe_model.predict(fold["X_OHE_test"])
    del ohe_model
    tf.keras.backend.clear_session()
    
    kmer_model = tf.keras.models.load_model(kmer_current_model_path)
    kmer_train_y_pred = kmer_model.predict(fold["X_Kmer_train"])
    kmer_test_y_pred = kmer_model.predict(fold["X_Kmer_test"])
    del kmer_model
    tf.keras.backend.clear_session()
    
    kgap_model = tf.keras.models.load_model(kgap_current_model_path)
    kgap_train_y_pred = kgap_model.predict(fold["X_Kgap_train"])
    kgap_test_y_pred = kgap_model.predict(fold["X_Kgap_test"])
    del kgap_model
    tf.keras.backend.clear_session()
    
    dde_model = tf.keras.models.load_model(dde_model_file_path)
    dde_train_y_pred = dde_model.predict(fold["X_DDE_train"])
    dde_test_y_pred = dde_model.predict(fold["X_DDE_test"])
    del dde_model
    tf.keras.backend.clear_session()
    
    dpr_model = tf.keras.models.load_model(dpr_model_file_path)
    dpr_train_y_pred = dpr_model.predict(fold["X_DPR_train"])
    dpr_test_y_pred = dpr_model.predict(fold["X_DPR_test"])
    del dpr_model
    tf.keras.backend.clear_session()
    
    dpc_model = tf.keras.models.load_model(dpc_model_file_path)
    dpc_train_y_pred = dpc_model.predict(fold["X_DPC_train"])
    dpc_test_y_pred = dpc_model.predict(fold["X_DPC_test"])
    del dpc_model
    tf.keras.backend.clear_session()
    
    tpc_model = tf.keras.models.load_model(tpc_model_file_path)
    tpc_train_y_pred = tpc_model.predict(fold["X_TPC_train"])
    tpc_test_y_pred = tpc_model.predict(fold["X_TPC_test"])
    del tpc_model
    tf.keras.backend.clear_session()
    
    ##################################################################################
    ##### Accumulate the 7 scores
    ##################################################################################
    
    X_lr_train = np.concatenate((ohe_train_y_pred, kmer_train_y_pred, kgap_train_y_pred,
                                 dde_train_y_pred, dpr_train_y_pred, dpc_train_y_pred, tpc_train_y_pred), 
                                axis=1)
    X_lr_test = np.concatenate((ohe_test_y_pred, kmer_test_y_pred, kgap_test_y_pred, 
                                dde_test_y_pred, dpr_test_y_pred, dpc_test_y_pred, tpc_test_y_pred), 
                               axis=1)
    
    ##################################################################################
    ##### Prediction and metrics for TRAIN dataset
    ##################################################################################
    
    print("Generating TRAIN set SUM metrics.")
    
    y_pred = np.sum(X_lr_train, axis=1)/no_of_models
    y_pred = y_pred[:, np.newaxis]
    label_pred = pred2label(y_pred)
    
    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(fold["y_train"], label_pred)
    prec = precision_score(fold["y_train"],label_pred)
    mcc = matthews_corrcoef(fold["y_train"], label_pred)

    conf = confusion_matrix(fold["y_train"], label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    
    fpr, tpr, thresholds = roc_curve(fold["y_train"], y_pred)
    auc = roc_auc_score(fold["y_train"], y_pred)
    
    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Train")
    evaluations["Type"].append("Sum")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)
    
    ##################################################################################
    ##### Prediction and metrics for TEST dataset
    ##################################################################################
    
    print("Generating TEST set SUM metrics.")
    
    y_pred = np.sum(X_lr_test, axis=1)/no_of_models
    y_pred = y_pred[:, np.newaxis]
    label_pred = pred2label(y_pred)
    
    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(fold["y_test"], label_pred)
    prec = precision_score(fold["y_test"],label_pred)
    mcc = matthews_corrcoef(fold["y_test"], label_pred)

    conf = confusion_matrix(fold["y_test"], label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    
    fpr, tpr, thresholds = roc_curve(fold["y_test"], y_pred)
    auc = roc_auc_score(fold["y_test"], y_pred)
    
    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Test")
    evaluations["Type"].append("Sum")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)
    
    ##################################################################################
    ##### Prediction and metrics for TRAIN dataset
    ##################################################################################
    
    print("Generating TRAIN set VOTE metrics.")
    
    y_pred = np.sum(pred2label(X_lr_train), axis=1)/no_of_models
    y_pred = y_pred[:, np.newaxis]
    label_pred = pred2label(y_pred)
    
    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(fold["y_train"], label_pred)
    prec = precision_score(fold["y_train"],label_pred)
    mcc = matthews_corrcoef(fold["y_train"], label_pred)

    conf = confusion_matrix(fold["y_train"], label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    
    fpr, tpr, thresholds = roc_curve(fold["y_train"], y_pred)
    auc = roc_auc_score(fold["y_train"], y_pred)
    
    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Train")
    evaluations["Type"].append("Vote")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)
    
    ##################################################################################
    ##### Prediction and metrics for TEST dataset
    ##################################################################################
    
    print("Generating TEST set VOTE metrics.")
    
    y_pred = np.sum(pred2label(X_lr_test), axis=1)/no_of_models
    y_pred = y_pred[:, np.newaxis]
    label_pred = pred2label(y_pred)
    
    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(fold["y_test"], label_pred)
    prec = precision_score(fold["y_test"],label_pred)
    mcc = matthews_corrcoef(fold["y_test"], label_pred)

    conf = confusion_matrix(fold["y_test"], label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    
    fpr, tpr, thresholds = roc_curve(fold["y_test"], y_pred)
    auc = roc_auc_score(fold["y_test"], y_pred)
    
    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Test")
    evaluations["Type"].append("Vote")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)
    
    ##################################################################################
    ##### LOGISTIC regression using the scores
    ##################################################################################
    
    print("Training LOGISTIC regression.")
    
    lr_model = logistic_classifier()
    lr_model.fit(X_lr_train, fold["y_train"])
    
    lr_current_model_path = os.path.join(modelPath, "LogR_bestModel-fold{}.hdf5".format(i))
    lr_model_file_obj = open(lr_current_model_path, 'wb')
    pickle.dump(lr_model, lr_model_file_obj)
    lr_model_file_obj.close()
    
    ##################################################################################
    ##### Prediction and metrics for TRAIN dataset
    ##################################################################################
    
    print("Generating logistic train set metrics.")
    
    y_pred = lr_model.predict(X_lr_train)
    label_pred = pred2label(y_pred)
    
    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(fold["y_train"], label_pred)
    prec = precision_score(fold["y_train"],label_pred)
    mcc = matthews_corrcoef(fold["y_train"], label_pred)

    conf = confusion_matrix(fold["y_train"], label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    
    fpr, tpr, thresholds = roc_curve(fold["y_train"], y_pred)
    auc = roc_auc_score(fold["y_train"], y_pred)
    
    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Train")
    evaluations["Type"].append("LogR")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)
    
    ##################################################################################
    ##### Prediction and metrics for TEST dataset
    ##################################################################################
    
    print("Generating logistic test set metrics.")

    y_pred = lr_model.predict(X_lr_test)
    label_pred = pred2label(y_pred)
    
    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(fold["y_test"], label_pred)
    prec = precision_score(fold["y_test"],label_pred)
    mcc = matthews_corrcoef(fold["y_test"], label_pred)

    conf = confusion_matrix(fold["y_test"], label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    
    fpr, tpr, thresholds = roc_curve(fold["y_test"], y_pred)
    auc = roc_auc_score(fold["y_test"], y_pred)
    
    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Test")
    evaluations["Type"].append("LogR")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)
    
#     ##################################################################################
#     ##### LINEAR regression using the scores
#     ##################################################################################
    
#     print("Training LINEAR regression.")

#     lr_model = LinearRegression(positive=True)
#     lr_model.fit(X_lr_train, fold["y_train"])
    
#     lr_current_model_path = os.path.join(modelPath, "LinR_bestModel-fold{}.hdf5".format(i))
#     lr_model_file_obj = open(lr_current_model_path, 'wb')
#     pickle.dump(lr_model, lr_model_file_obj)
#     lr_model_file_obj.close()
    
#     ##################################################################################
#     ##### Prediction and metrics for TRAIN dataset
#     ##################################################################################
    
#     print("Generating linear train set metrics.")
    
#     y_pred = lr_model.predict(X_lr_train)
#     label_pred = pred2label(y_pred)
    
#     # Compute precision, recall, sensitivity, specifity, mcc
#     acc = accuracy_score(fold["y_train"], label_pred)
#     prec = precision_score(fold["y_train"],label_pred)
#     mcc = matthews_corrcoef(fold["y_train"], label_pred)

#     conf = confusion_matrix(fold["y_train"], label_pred)
#     tn, fp, fn, tp = conf.ravel()
#     sens = tp/(tp+fn)
#     spec = tn/(tn+fp)
    
#     fpr, tpr, thresholds = roc_curve(fold["y_train"], y_pred)
#     auc = roc_auc_score(fold["y_train"], y_pred)
    
#     evaluations["Fold"].append(i)
#     evaluations["Train_Test"].append("Train")
#     evaluations["Type"].append("LinR")
#     evaluations["Accuracy"].append(acc)
#     evaluations["Precision"].append(prec)
#     evaluations["TPR"].append(tpr)
#     evaluations["FPR"].append(fpr)
#     evaluations["TPR_FPR_Thresholds"].append(thresholds)
#     evaluations["AUC"].append(auc)
#     evaluations["Sensitivity"].append(sens)
#     evaluations["Specificity"].append(spec)
#     evaluations["MCC"].append(mcc)
    
#     ##################################################################################
#     ##### Prediction and metrics for TEST dataset
#     ##################################################################################
    
#     print("Generating linear test set metrics.")

#     y_pred = lr_model.predict(X_lr_test)
#     label_pred = pred2label(y_pred)
    
#     # Compute precision, recall, sensitivity, specifity, mcc
#     acc = accuracy_score(fold["y_test"], label_pred)
#     prec = precision_score(fold["y_test"],label_pred)
#     mcc = matthews_corrcoef(fold["y_test"], label_pred)

#     conf = confusion_matrix(fold["y_test"], label_pred)
#     tn, fp, fn, tp = conf.ravel()
#     sens = tp/(tp+fn)
#     spec = tn/(tn+fp)
    
#     fpr, tpr, thresholds = roc_curve(fold["y_test"], y_pred)
#     auc = roc_auc_score(fold["y_test"], y_pred)
    
#     evaluations["Fold"].append(i)
#     evaluations["Train_Test"].append("Test")
#     evaluations["Type"].append("LinR")
#     evaluations["Accuracy"].append(acc)
#     evaluations["Precision"].append(prec)
#     evaluations["TPR"].append(tpr)
#     evaluations["FPR"].append(fpr)
#     evaluations["TPR_FPR_Thresholds"].append(thresholds)
#     evaluations["AUC"].append(auc)
#     evaluations["Sensitivity"].append(sens)
#     evaluations["Specificity"].append(spec)
#     evaluations["MCC"].append(mcc)


Train/Test model on Fold #0.
Training OHE network.
Training Kmer network.
Training Kgap network.
Training DDE model.
Training DPR model.
Training DPC model.
Training TPC model.
Generating the 7 scores.
Generating TRAIN set SUM metrics.
Generating TEST set SUM metrics.
Generating TRAIN set VOTE metrics.
Generating TEST set VOTE metrics.
Training LOGISTIC regression.
Generating logistic train set metrics.
Generating logistic test set metrics.

Train/Test model on Fold #1.
Training OHE network.


  y = column_or_1d(y, warn=True)


Training Kmer network.
Training Kgap network.
Training DDE model.
Training DPR model.
Training DPC model.
Training TPC model.
Generating the 7 scores.
Generating TRAIN set SUM metrics.
Generating TEST set SUM metrics.
Generating TRAIN set VOTE metrics.
Generating TEST set VOTE metrics.
Training LOGISTIC regression.
Generating logistic train set metrics.
Generating logistic test set metrics.

Train/Test model on Fold #2.
Training OHE network.


  y = column_or_1d(y, warn=True)


Training Kmer network.
Training Kgap network.
Training DDE model.
Training DPR model.
Training DPC model.
Training TPC model.
Generating the 7 scores.
Generating TRAIN set SUM metrics.
Generating TEST set SUM metrics.
Generating TRAIN set VOTE metrics.
Generating TEST set VOTE metrics.
Training LOGISTIC regression.
Generating logistic train set metrics.
Generating logistic test set metrics.

Train/Test model on Fold #3.
Training OHE network.


  y = column_or_1d(y, warn=True)


Training Kmer network.
Training Kgap network.
Training DDE model.
Training DPR model.
Training DPC model.
Training TPC model.
Generating the 7 scores.
Generating TRAIN set SUM metrics.
Generating TEST set SUM metrics.
Generating TRAIN set VOTE metrics.
Generating TEST set VOTE metrics.
Training LOGISTIC regression.
Generating logistic train set metrics.
Generating logistic test set metrics.

Train/Test model on Fold #4.
Training OHE network.


  y = column_or_1d(y, warn=True)


Training Kmer network.
Training Kgap network.
Training DDE model.
Training DPR model.
Training DPC model.
Training TPC model.
Generating the 7 scores.
Generating TRAIN set SUM metrics.
Generating TEST set SUM metrics.
Generating TRAIN set VOTE metrics.
Generating TEST set VOTE metrics.
Training LOGISTIC regression.
Generating logistic train set metrics.
Generating logistic test set metrics.


  y = column_or_1d(y, warn=True)


## k-fold Training evaluation

In [20]:
evaluations_df = pd.DataFrame.from_dict(evaluations)

evaluations_df_grouped_mean = evaluations_df.groupby(["Train_Test", "Type"]).mean().filter(['Accuracy', 
                                                                               'Precision', 
                                                                               'AUC', 
                                                                               'Sensitivity', 
                                                                               'Specificity', 
                                                                               'MCC'])

evaluations_df_grouped_mean

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Train_Test,Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Test,LogR,0.819075,0.829988,0.819068,0.80353,0.834605,0.639233
Test,Sum,0.819078,0.828306,0.895799,0.805211,0.832921,0.638557
Test,Vote,0.810265,0.820555,0.874106,0.794293,0.826216,0.620962
Train,LogR,0.993808,0.994335,0.993807,0.993282,0.994332,0.987622
Train,Sum,0.970718,0.9697,0.996817,0.971872,0.969564,0.941478
Train,Vote,0.966835,0.9644,0.993965,0.969563,0.964105,0.933747


In [21]:
evaluations_df_grouped_std = evaluations_df.groupby(["Train_Test", "Type"]).std().filter(['Accuracy', 
                                                                               'Precision', 
                                                                               'AUC', 
                                                                               'Sensitivity', 
                                                                               'Specificity', 
                                                                               'MCC'])

evaluations_df_grouped_std

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Train_Test,Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Test,LogR,0.02261,0.029142,0.022638,0.038184,0.032812,0.045607
Test,Sum,0.027601,0.02915,0.020769,0.031624,0.029862,0.05506
Test,Vote,0.03372,0.035184,0.022688,0.036966,0.034586,0.067403
Train,LogR,0.001307,0.00282,0.001307,0.00191,0.002836,0.002617
Train,Sum,0.005352,0.009094,0.001006,0.004544,0.009385,0.010699
Train,Vote,0.005615,0.010369,0.00118,0.005503,0.010841,0.0112


In [22]:
# evaluations_df[evaluations_df["Train_Test"] == "Test"]

In [23]:
# evaluations_df

# Independent data

## Using New Model

Train model on full data from training. Predict and evaluate on Independent data.

In [24]:
##################################################################################
##### Preparing Validation data
##################################################################################
pos_indexes = np.where(indpe_labels==1)[0]
neg_indexes = np.random.permutation(np.where(indpe_labels==0)[0])[0:pos_indexes.shape[0]]
indpe_val_indexes = np.concatenate((pos_indexes, neg_indexes))

# ohe_val_data = (ohe_indpe_features[indpe_val_indexes], indpe_labels[indpe_val_indexes])
# kmer_val_data = (kmer_indpe_features[indpe_val_indexes], indpe_labels[indpe_val_indexes])
# kgap_val_data = (kgap_indpe_features[indpe_val_indexes], indpe_labels[indpe_val_indexes])
# dde_val_data = (dde_indpe_features[indpe_val_indexes], indpe_labels[indpe_val_indexes])
# dpr_val_data = (dpr_indpe_features[indpe_val_indexes], indpe_labels[indpe_val_indexes])
# dpc_val_data = (dpc_indpe_features[indpe_val_indexes], indpe_labels[indpe_val_indexes])
# tpc_val_data = (tpc_indpe_features[indpe_val_indexes], indpe_labels[indpe_val_indexes])

ohe_val_data = (ohe_indpe_features, indpe_labels)
kmer_val_data = (kmer_indpe_features, indpe_labels)
kgap_val_data = (kgap_indpe_features, indpe_labels)
dde_val_data = (dde_indpe_features, indpe_labels)
dpr_val_data = (dpr_indpe_features, indpe_labels)
dpc_val_data = (dpc_indpe_features, indpe_labels)
tpc_val_data = (tpc_indpe_features, indpe_labels)

# adding random shuffling of the dataset for training purpose
index_arr = np.arange(ohe_train_features.shape[0])
index_arr = np.random.permutation(index_arr)

##################################################################################
##### Train OHE network
##################################################################################

print("Training OHE network.")

ohe_model = OHE_DLNN_CORENup(input_seq_shape = ohe_input_seq_shape)

## Define the model callbacks for early stopping and saving the model. Then train model
ohe_full_model_path = os.path.join(modelPath, "OHE_fullModel.hdf5".format(i))
ohe_modelCallbacks = [
    tf.keras.callbacks.ModelCheckpoint(ohe_full_model_path,
                                       monitor = callback_monitor, verbose = 0, save_best_only = True, 
                                       save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
]
ohe_model.fit(x = ohe_train_features[index_arr], y = train_labels[index_arr], 
              batch_size = ohe_batch_size, epochs = ohe_epochs, verbose = 0, 
              callbacks = ohe_modelCallbacks, 
              validation_data = ohe_val_data)

del ohe_model
tf.keras.backend.clear_session()

##################################################################################
##### Train Kmer network
##################################################################################

print("Training Kmer network.")

kmer_model = Kmer_DLNN_Classifier(input_vec_shape = kmer_input_vec_shape)

## Define the model callbacks for early stopping and saving the model. Then train model
kmer_full_model_path = os.path.join(modelPath, "KMER_fullModel.hdf5".format(i))
kmer_modelCallbacks = [
    tf.keras.callbacks.ModelCheckpoint(kmer_full_model_path,
                                       monitor = callback_monitor, verbose = 0, save_best_only = True, 
                                       save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
]
kmer_model.fit(x = kmer_train_features[index_arr], y = train_labels[index_arr], 
               batch_size = kmer_batch_size, epochs = kmer_epochs, verbose = 0, 
               callbacks = kmer_modelCallbacks, 
               validation_data = kmer_val_data)

del kmer_model
tf.keras.backend.clear_session()

##################################################################################
##### Train Kgap network
##################################################################################

print("Training Kgap network.")

kgap_model = Kgap_DLNN_Classifier(input_vec_shape = kgap_input_vec_shape)

## Define the model callbacks for early stopping and saving the model. Then train model
kgap_full_model_path = os.path.join(modelPath, "KGAP_fullModel.hdf5".format(i))
kgap_modelCallbacks = [
    tf.keras.callbacks.ModelCheckpoint(kgap_full_model_path,
                                       monitor = callback_monitor, verbose = 0, save_best_only = True, 
                                       save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
]
kgap_model.fit(x = kgap_train_features[index_arr], y = train_labels[index_arr], 
               batch_size = kgap_batch_size, epochs = kgap_epochs, verbose = 0, 
               callbacks = kgap_modelCallbacks, 
               validation_data = kgap_val_data)

del kgap_model
tf.keras.backend.clear_session()

# kgap_model = tf.keras.models.load_model(kgap_current_model_path)

##################################################################################
##### TRAIN DDE model
##################################################################################

print("Training DDE model.")

dde_model = DDE_DLNN_Classifier(input_vec_shape = dde_train_features_shape)

dde_full_model_file_path = os.path.join(modelPath, "DDE_fullModel.hdf5")
## Define the model callbacks for early stopping and saving the model. Then train model
dde_modelCallbacks = [
    tf.keras.callbacks.ModelCheckpoint(dde_full_model_file_path,
                                       monitor = callback_monitor, verbose = 0, save_best_only = True, 
                                       save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
]
dde_model.fit(x = dde_train_features[randomized_index_arr], y = train_labels[randomized_index_arr], 
              batch_size = dde_batch_size, epochs = dde_epochs, 
              verbose = 0, callbacks = dde_modelCallbacks, 
              validation_data = dde_val_data)

del dde_model
tf.keras.backend.clear_session()

##################################################################################
##### TRAIN DPR model
##################################################################################

print("Training DPR model.")

dpr_model = DPR_DLNN_Classifier(input_vec_shape = dpr_train_features_shape)

dpr_full_model_file_path = os.path.join(modelPath, "DPR_fullModel.hdf5")
## Define the model callbacks for early stopping and saving the model. Then train model
dpr_modelCallbacks = [
    tf.keras.callbacks.ModelCheckpoint(dpr_full_model_file_path,
                                       monitor = callback_monitor, verbose = 0, save_best_only = True, 
                                       save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
]
dpr_model.fit(x = dpr_train_features[randomized_index_arr], y = train_labels[randomized_index_arr], 
              batch_size = dpr_batch_size, epochs = dpr_epochs, 
              verbose = 0, callbacks = dpr_modelCallbacks, 
              validation_data = dpr_val_data)

del dpr_model
tf.keras.backend.clear_session()

##################################################################################
##### TRAIN DPC model
##################################################################################

print("Training DPC model.")

dpc_model = DPC_DLNN_Classifier(input_vec_shape = dpc_train_features_shape)

dpc_full_model_file_path = os.path.join(modelPath, "DPC_fullModel.hdf5")
## Define the model callbacks for early stopping and saving the model. Then train model
dpc_modelCallbacks = [
    tf.keras.callbacks.ModelCheckpoint(dpc_full_model_file_path,
                                       monitor = callback_monitor, verbose = 0, save_best_only = True, 
                                       save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
]
dpc_model.fit(x = dpc_train_features[randomized_index_arr], y = train_labels[randomized_index_arr], 
              batch_size = dpc_batch_size, epochs = dpc_epochs, 
              verbose = 0, callbacks = dpc_modelCallbacks, 
              validation_data = dpc_val_data)

del dpc_model
tf.keras.backend.clear_session()

##################################################################################
##### TRAIN TPC model
##################################################################################

print("Training TPC model.")

tpc_model = TPC_DLNN_Classifier(input_vec_shape = tpc_train_features_shape)

tpc_full_model_file_path = os.path.join(modelPath, "TPC_fullModel.hdf5")
## Define the model callbacks for early stopping and saving the model. Then train model
tpc_modelCallbacks = [
    tf.keras.callbacks.ModelCheckpoint(tpc_full_model_file_path,
                                       monitor = callback_monitor, verbose = 0, save_best_only = True, 
                                       save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
]
tpc_model.fit(x = tpc_train_features[randomized_index_arr], y = train_labels[randomized_index_arr], 
              batch_size = tpc_batch_size, epochs = tpc_epochs, 
              verbose = 0, callbacks = tpc_modelCallbacks, 
              validation_data = tpc_val_data)

del tpc_model
tf.keras.backend.clear_session()

Training OHE network.
Training Kmer network.
Training Kgap network.
Training DDE model.
Training DPR model.
Training DPC model.
Training TPC model.


In [25]:
##################################################################################
##### Generate scores for Train and Indpe dataset
##################################################################################

print("Generating the 7 scores.")

ohe_model = tf.keras.models.load_model(ohe_full_model_path)
ohe_train_y_pred = ohe_model.predict(ohe_train_features)
ohe_indpe_y_pred = ohe_model.predict(ohe_indpe_features)
del ohe_model
tf.keras.backend.clear_session()

kmer_model = tf.keras.models.load_model(kmer_full_model_path)
kmer_train_y_pred = kmer_model.predict(kmer_train_features)
kmer_indpe_y_pred = kmer_model.predict(kmer_indpe_features)
del kmer_model
tf.keras.backend.clear_session()

kgap_model = tf.keras.models.load_model(kgap_full_model_path)
kgap_train_y_pred = kgap_model.predict(kgap_train_features)
kgap_indpe_y_pred = kgap_model.predict(kgap_indpe_features)
del kgap_model
tf.keras.backend.clear_session()

dde_model = tf.keras.models.load_model(dde_full_model_file_path)
dde_train_y_pred = dde_model.predict(dde_train_features)
dde_indpe_y_pred = dde_model.predict(dde_indpe_features)
del dde_model
tf.keras.backend.clear_session()

dpr_model = tf.keras.models.load_model(dpr_full_model_file_path)
dpr_train_y_pred = dpr_model.predict(dpr_train_features)
dpr_indpe_y_pred = dpr_model.predict(dpr_indpe_features)
del dpr_model
tf.keras.backend.clear_session()

dpc_model = tf.keras.models.load_model(dpc_full_model_file_path)
dpc_train_y_pred = dpc_model.predict(dpc_train_features)
dpc_indpe_y_pred = dpc_model.predict(dpc_indpe_features)
del dpc_model
tf.keras.backend.clear_session()

tpc_model = tf.keras.models.load_model(tpc_full_model_file_path)
tpc_train_y_pred = tpc_model.predict(tpc_train_features)
tpc_indpe_y_pred = tpc_model.predict(tpc_indpe_features)
del tpc_model
tf.keras.backend.clear_session()

##################################################################################
##### Linear regression using the 3 scores
##################################################################################

X_lr_train = np.concatenate((ohe_train_y_pred, kmer_train_y_pred, kgap_train_y_pred,
                             dde_train_y_pred, dpr_train_y_pred, dpc_train_y_pred, tpc_train_y_pred), 
                            axis=1)
X_lr_indpe = np.concatenate((ohe_indpe_y_pred, kmer_indpe_y_pred, kgap_indpe_y_pred, 
                             dde_indpe_y_pred, dpr_indpe_y_pred, dpc_indpe_y_pred, tpc_indpe_y_pred), 
                            axis=1)

Generating the 7 scores.


In [26]:
## create the evaluation data structure for all iterations
evaluations = {
    "Train_Test": [],
    "Type": [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Prediction and metrics for Train dataset
##################################################################################

print("Generating TRAIN set SUM metrics.")

y_pred = np.sum(X_lr_train, axis=1)/no_of_models
y_pred = y_pred[:, np.newaxis]
label_pred = pred2label(y_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(train_labels, label_pred)
prec = precision_score(train_labels, label_pred)
mcc = matthews_corrcoef(train_labels, label_pred)

conf = confusion_matrix(train_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(train_labels, label_pred)
auc = roc_auc_score(train_labels, y_pred)

evaluations["Train_Test"].append("Train")
evaluations["Type"].append("Sum")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################
##### Prediction and metrics for Train dataset
##################################################################################

print("Generating TRAIN set VOTE metrics.")

y_pred = np.sum(pred2label(X_lr_train), axis=1)/no_of_models
y_pred = y_pred[:, np.newaxis]
label_pred = pred2label(y_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(train_labels, label_pred)
prec = precision_score(train_labels, label_pred)
mcc = matthews_corrcoef(train_labels, label_pred)

conf = confusion_matrix(train_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(train_labels, label_pred)
auc = roc_auc_score(train_labels, y_pred)

evaluations["Train_Test"].append("Train")
evaluations["Type"].append("Vote")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################
##### Prediction and metrics for Independent dataset
##################################################################################

print("Generating Independent set SUM metrics.")

y_pred = np.sum(X_lr_indpe, axis=1)/no_of_models
y_pred = y_pred[:, np.newaxis]
label_pred = pred2label(y_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(indpe_labels, label_pred)
prec = precision_score(indpe_labels,label_pred)
mcc = matthews_corrcoef(indpe_labels, label_pred)

conf = confusion_matrix(indpe_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(indpe_labels, label_pred)
auc = roc_auc_score(indpe_labels, y_pred)

evaluations["Train_Test"].append("Independent")
evaluations["Type"].append("Sum")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################
##### Prediction and metrics for Independent dataset
##################################################################################

print("Generating Independent set VOTE metrics.")

y_pred = np.sum(pred2label(X_lr_indpe), axis=1)/no_of_models
y_pred = y_pred[:, np.newaxis]
label_pred = pred2label(y_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(indpe_labels, label_pred)
prec = precision_score(indpe_labels,label_pred)
mcc = matthews_corrcoef(indpe_labels, label_pred)

conf = confusion_matrix(indpe_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(indpe_labels, label_pred)
auc = roc_auc_score(indpe_labels, y_pred)

evaluations["Train_Test"].append("Independent")
evaluations["Type"].append("Vote")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################
##### LOGISTIC regression using the scores
##################################################################################

print("Training LOGISTIC regression.")

lr_model = logistic_classifier(cw={0:1,1:0.1})
lr_model.fit(X_lr_train, train_labels)

lr_current_model_path = os.path.join(modelPath, "LogR_bestModel-fold{}.hdf5".format(i))
lr_model_file_obj = open(lr_current_model_path, 'wb')
pickle.dump(lr_model, lr_model_file_obj)
lr_model_file_obj.close()

##################################################################################
##### Prediction and metrics for TRAIN dataset
##################################################################################

print("Generating TRAIN set metrics.")

y_pred = lr_model.predict(X_lr_train)
label_pred = pred2label(y_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(train_labels, label_pred)
prec = precision_score(train_labels,label_pred)
mcc = matthews_corrcoef(train_labels, label_pred)

conf = confusion_matrix(train_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(train_labels, y_pred)
auc = roc_auc_score(train_labels, y_pred)

evaluations["Train_Test"].append("Train")
evaluations["Type"].append("LogR")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################
##### Prediction and metrics for INDEPENDENT dataset
##################################################################################

print("Generating INDEPENDENT set metrics.")

y_pred = lr_model.predict(X_lr_indpe)
label_pred = pred2label(y_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(indpe_labels, label_pred)
prec = precision_score(indpe_labels,label_pred)
mcc = matthews_corrcoef(indpe_labels, label_pred)

conf = confusion_matrix(indpe_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(indpe_labels, y_pred)
auc = roc_auc_score(indpe_labels, y_pred)

evaluations["Train_Test"].append("Independent")
evaluations["Type"].append("LogR")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

# ##################################################################################
# ##### LINEAR regression using the scores
# ##################################################################################

# print("Training LINEAR regression.")

# lr_model = LinearRegression()
# lr_model.fit(X_lr_train, train_labels)

# lr_current_model_path = os.path.join(modelPath, "LinR_bestModel-fold{}.hdf5".format(i))
# lr_model_file_obj = open(lr_current_model_path, 'wb')
# pickle.dump(lr_model, lr_model_file_obj)
# lr_model_file_obj.close()

# ##################################################################################
# ##### Prediction and metrics for TRAIN dataset
# ##################################################################################

# print("Generating TRAIN set metrics.")

# y_pred = lr_model.predict(X_lr_train)
# y_pred = (y_pred-y_pred.min())/(y_pred.max() - y_pred.min())
# label_pred = pred2label(y_pred)
# # label_pred = pred2label(np.clip(y_pred, 0, 1))

# # Compute precision, recall, sensitivity, specifity, mcc
# acc = accuracy_score(train_labels, label_pred)
# prec = precision_score(train_labels,label_pred)
# mcc = matthews_corrcoef(train_labels, label_pred)

# conf = confusion_matrix(train_labels, label_pred)
# tn, fp, fn, tp = conf.ravel()
# sens = tp/(tp+fn)
# spec = tn/(tn+fp)

# fpr, tpr, thresholds = roc_curve(train_labels, y_pred)
# auc = roc_auc_score(train_labels, y_pred)

# evaluations["Train_Test"].append("Train")
# evaluations["Type"].append("LinR")
# evaluations["Accuracy"].append(acc)
# evaluations["Precision"].append(prec)
# evaluations["TPR"].append(tpr)
# evaluations["FPR"].append(fpr)
# evaluations["TPR_FPR_Thresholds"].append(thresholds)
# evaluations["AUC"].append(auc)
# evaluations["Sensitivity"].append(sens)
# evaluations["Specificity"].append(spec)
# evaluations["MCC"].append(mcc)

# ##################################################################################
# ##### Prediction and metrics for TEST dataset
# ##################################################################################

# print("Generating INDEPENDENT set metrics.")

# y_pred = lr_model.predict(X_lr_indpe)
# y_pred = (y_pred-y_pred.min())/(y_pred.max() - y_pred.min())
# # y_pred = np.clip(y_pred, 0, 1)
# label_pred = pred2label(y_pred)
# # label_pred = pred2label(np.clip(y_pred, 0, 1))

# # Compute precision, recall, sensitivity, specifity, mcc
# acc = accuracy_score(indpe_labels, label_pred)
# prec = precision_score(indpe_labels,label_pred)
# mcc = matthews_corrcoef(indpe_labels, label_pred)

# conf = confusion_matrix(indpe_labels, label_pred)
# tn, fp, fn, tp = conf.ravel()
# sens = tp/(tp+fn)
# spec = tn/(tn+fp)

# fpr, tpr, thresholds = roc_curve(indpe_labels, y_pred)
# auc = roc_auc_score(indpe_labels, y_pred)

# evaluations["Train_Test"].append("Independent")
# evaluations["Type"].append("LinR")
# evaluations["Accuracy"].append(acc)
# evaluations["Precision"].append(prec)
# evaluations["TPR"].append(tpr)
# evaluations["FPR"].append(fpr)
# evaluations["TPR_FPR_Thresholds"].append(thresholds)
# evaluations["AUC"].append(auc)
# evaluations["Sensitivity"].append(sens)
# evaluations["Specificity"].append(spec)
# evaluations["MCC"].append(mcc)

Generating TRAIN set SUM metrics.
Generating TRAIN set VOTE metrics.
Generating Independent set SUM metrics.
Generating Independent set VOTE metrics.
Training LOGISTIC regression.
Generating TRAIN set metrics.
Generating INDEPENDENT set metrics.


  y = column_or_1d(y, warn=True)


In [27]:
evaluations_df = pd.DataFrame.from_dict(evaluations)

evaluations_df_grouped = evaluations_df.groupby(["Train_Test", "Type"]).mean().filter(['Accuracy', 
                                                                               'Precision', 
                                                                               'AUC', 
                                                                               'Sensitivity', 
                                                                               'Specificity', 
                                                                               'MCC'])

evaluations_df_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Train_Test,Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Independent,LogR,0.776327,0.340807,0.615274,0.374384,0.856164,0.222152
Independent,Sum,0.557551,0.2288,0.672409,0.704433,0.528376,0.173164
Independent,Vote,0.268571,0.174648,0.653567,0.916256,0.139922,0.061987
Train,LogR,0.834173,0.986553,0.834173,0.677582,0.990764,0.703749
Train,Sum,0.897565,0.866203,0.964833,0.940386,0.854744,0.798062
Train,Vote,0.671704,0.604071,0.959281,0.996641,0.346767,0.451829


In [28]:
sub_set_models = [0,1,2]
x_train_lr = X_lr_train[:, sub_set_models]
x_indpe_lr = X_lr_indpe[:, sub_set_models]

## create the evaluation data structure for all iterations
evaluations = {
    "Train_Test": [],
    "Type": [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### LOGISTIC regression using the scores
##################################################################################

print("Training LOGISTIC regression.")

lr_model = logistic_classifier(cw={0:1,1:0.1})
lr_model.fit(x_train_lr, train_labels)

lr_current_model_path = os.path.join(modelPath, "LogR_bestModel-fold{}.hdf5".format(i))
lr_model_file_obj = open(lr_current_model_path, 'wb')
pickle.dump(lr_model, lr_model_file_obj)
lr_model_file_obj.close()

##################################################################################
##### Prediction and metrics for TRAIN dataset
##################################################################################

print("Generating TRAIN set metrics.")

y_pred = lr_model.predict(x_train_lr)
label_pred = pred2label(y_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(train_labels, label_pred)
prec = precision_score(train_labels,label_pred)
mcc = matthews_corrcoef(train_labels, label_pred)

conf = confusion_matrix(train_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(train_labels, y_pred)
auc = roc_auc_score(train_labels, y_pred)

evaluations["Train_Test"].append("Train")
evaluations["Type"].append("LogR")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################
##### Prediction and metrics for INDEPENDENT dataset
##################################################################################

print("Generating INDEPENDENT set metrics.")

y_pred = lr_model.predict(x_indpe_lr)
label_pred = pred2label(y_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(indpe_labels, label_pred)
prec = precision_score(indpe_labels,label_pred)
mcc = matthews_corrcoef(indpe_labels, label_pred)

conf = confusion_matrix(indpe_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(indpe_labels, y_pred)
auc = roc_auc_score(indpe_labels, y_pred)

evaluations["Train_Test"].append("Independent")
evaluations["Type"].append("LogR")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

evaluations_df = pd.DataFrame.from_dict(evaluations)
evaluations_df

Training LOGISTIC regression.
Generating TRAIN set metrics.
Generating INDEPENDENT set metrics.


  y = column_or_1d(y, warn=True)


Unnamed: 0,Train_Test,Type,Accuracy,Precision,TPR,FPR,TPR_FPR_Thresholds,AUC,Sensitivity,Specificity,MCC
0,Train,LogR,0.833753,0.98773,"[0.0, 0.6759026028547439, 1.0]","[0.0, 0.008396305625524769, 1.0]","[2, 1, 0]",0.833753,0.675903,0.991604,0.703483
1,Independent,LogR,0.776327,0.3379,"[0.0, 0.3645320197044335, 1.0]","[0.0, 0.14187866927592954, 1.0]","[2, 1, 0]",0.611327,0.364532,0.858121,0.216064


In [29]:
from sklearn.svm import SVC

In [30]:
svc_model = SVC()

In [31]:
svc_model.fit(X_lr_train, train_labels)

  y = column_or_1d(y, warn=True)


SVC()

In [32]:
y_pred = svc_model.predict(X_lr_indpe)
label_pred = pred2label(y_pred[:, np.newaxis])

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(indpe_labels, label_pred)
prec = precision_score(indpe_labels,label_pred)
mcc = matthews_corrcoef(indpe_labels, label_pred)

conf = confusion_matrix(indpe_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(indpe_labels, label_pred)
auc = roc_auc_score(indpe_labels, y_pred)

In [33]:
acc, mcc, spec, sens

(0.6097959183673469,
 0.17146983530731597,
 0.6076320939334638,
 0.6206896551724138)