In [None]:
##################################################################################
##### Define all parameters for model tuning
##################################################################################

# Number of stratified cross-validation folds built from the training data.
n_fold = 5
# Experiment name; used as a sub-folder of outPath for folds and saved models.
expName = "NT_Site_PredNTS_Classification_Ensemble_OHE_Kmer_Kgap_2_valsplit"
# Root folder for everything this notebook writes.
outPath = "Results"
# File name of the pickled list of folds.
foldName = "folds.pickle"

# Superseded by the per-network settings (ohe_epochs, kmer_epochs, kgap_epochs
# and the matching *_batch_size values) defined next to each model builder.
# epochs = 100
# batch_size = 64
# shuffle/seed are forwarded to build_kfold (and from there to StratifiedKFold).
# NOTE(review): with seed = None the folds differ between runs, and the
# per-fold np.random.permutation in the training loop is not seeded either,
# so results are not exactly reproducible.
shuffle = True
seed = None

# Tab-separated sequence files used for the one-hot-encoded (OHE) input.
ohe_input_data_folder = "Data"
ohe_training_data_file = "Training-datasets-PredNTS.txt"
ohe_independent_data_file = "independent dataset-PredNTS.txt"

# Folder holding the pre-computed Kmer / Kgap feature CSV files.
enc_data_folder = "PredNTS_MathFeature_ENC"
kmer_train_data_filename = 'Training-datasets-PredNTS_kmer.csv'
kmer_indpe_data_filename = 'independent-dataset-PredNTS_kmer.csv'

# Kgap files are read for every gap index 0..kgap_max (inclusive); the "{}"
# in the file names below is replaced by that index.
kgap_max = 4
kgap_train_data_filename = 'Training-datasets-PredNTS_kgap_{}.csv'
kgap_indpe_data_filename = 'independent-dataset-PredNTS_kgap_{}.csv'

# Quantity watched by the ModelCheckpoint callbacks when keeping the best epoch.
callback_monitor = 'val_loss'

# Fraction of each training fold passed to Keras as validation_split.
train_val_split = 0.2

In [None]:
import os 
import pickle
import numpy as np
import pandas as pd

import tensorflow as tf

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, confusion_matrix
from sklearn.metrics import roc_auc_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression

import math

In [None]:
# List the GPUs TensorFlow can see and enable memory growth on the first one.
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)
# Guard against CPU-only machines: indexing an empty device list would raise
# IndexError and stop the notebook before any model is built.
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

In [None]:
##################################################################################
##### define all CUSTOM functions
##################################################################################

def one_hot_encode_nt(sequence, char_dict):
    """One-hot encode a sequence, one row per character.

    Each character is upper-cased and looked up in ``char_dict``, which maps a
    character to its column index.

    Returns a float array of shape ``(len(sequence), len(char_dict))`` with a
    single 1 per row.

    Raises ValueError if an upper-cased character is not a key of ``char_dict``.
    """
    encoded = np.zeros((len(sequence), len(char_dict)))

    for position, raw_char in enumerate(sequence):
        symbol = raw_char.upper()
        if symbol not in char_dict:
            raise ValueError('Incorrect character in NT sequence: '+sequence)
        encoded[position][char_dict[symbol]] = 1

    return encoded

In [None]:
##################################################################################
##### Build k-fold functions
##################################################################################

## Build the K-fold from dataset
def build_kfold(ohe_features, kmer_features, kgap_features, labels, k=10, shuffle=False, seed=None):
    """Split three row-aligned feature sets and their labels into stratified folds.

    The split indices are computed once from ``ohe_features`` and ``labels`` and
    then applied to all three feature arrays, so row ``j`` has to describe the
    same sample in every array.

    Parameters
    ----------
    ohe_features, kmer_features, kgap_features : array-like
        Feature arrays that support indexing with an integer index array.
    labels : array-like
        Class labels, one per row.
    k : int
        Number of folds.
    shuffle : bool
        Whether StratifiedKFold shuffles the samples before splitting.
    seed : int or None
        Random state for the shuffle. Ignored when ``shuffle`` is False.

    Returns
    -------
    list of dict
        One dict per fold with keys ``X_OHE_train``, ``X_OHE_test``,
        ``X_Kmer_train``, ``X_Kmer_test``, ``X_Kgap_train``, ``X_Kgap_test``,
        ``y_train`` and ``y_test``.

    Raises
    ------
    ValueError
        If a feature array does not have the same number of rows as ``labels``.
    """
    # A row-count mismatch would otherwise surface as an IndexError deep in the
    # loop, or go unnoticed when an array is longer than the labels.
    n_samples = len(labels)
    for name, features in (("ohe_features", ohe_features),
                           ("kmer_features", kmer_features),
                           ("kgap_features", kgap_features)):
        if len(features) != n_samples:
            raise ValueError(
                "{} has {} rows but labels has {}".format(name, len(features), n_samples))

    # scikit-learn raises ValueError when random_state is set while shuffle is
    # False, so the seed is only forwarded when it can have an effect.
    skf = StratifiedKFold(n_splits=k, shuffle=shuffle, random_state=seed if shuffle else None)

    kfoldList = []
    for train_index, test_index in skf.split(ohe_features, labels):
        kfoldList.append({
            "X_OHE_train": ohe_features[train_index],
            "X_OHE_test": ohe_features[test_index],
            "X_Kmer_train": kmer_features[train_index],
            "X_Kmer_test": kmer_features[test_index],
            "X_Kgap_train": kgap_features[train_index],
            "X_Kgap_test": kgap_features[test_index],
            "y_train": labels[train_index],
            "y_test": labels[test_index]
        })
    return kfoldList

In [None]:
##################################################################################
##### define evaluator functions
##################################################################################

def pred2label(y_pred):
    """Turn predicted scores into labels by rounding each value with ``np.round``."""
    return np.round(y_pred)

# Neural network models

In [None]:
ohe_epochs = 200
ohe_batch_size = 16

##################################################################################
##### Builder for the network that consumes the one-hot-encoded sequences
##################################################################################

def OHE_DLNN_CORENup(input_seq_shape = (41, 21),
                 conv_filters_per_layer_1 = 25, kernel_length_1 = 10, conv_strides_1 = 1, ## 1st Convolutional layer parameters
                 max_pool_width_1 = 3, max_pool_stride_1 = 3, ## 1st Maxpool layer parameters
                 lstm_decode_units = 25, ## LSTM layer parameters
                 conv_filters_per_layer_2 = 25,  kernel_length_2 = 5, conv_strides_2 = 1, ## 2nd Convolutional layer parameters
                 max_pool_width_2 = 3, max_pool_stride_2 = 3, ## 2nd Maxpool layer parameters
                 dense_decode_units = 256, ## Dense layer parameters
                 prob = 0.5, learn_rate = 0.0005, 
                 loss = 'binary_crossentropy', metrics = 'accuracy'):
    """Build and compile the two-branch Conv/LSTM binary classifier.

    A shared Conv1D -> ReLU -> MaxPool -> Dropout -> GaussianNoise trunk feeds
    an LSTM branch and a second Conv1D branch. Both branches are flattened,
    concatenated and passed through a dense layer to one sigmoid output.
    Every weighted layer carries an L2 kernel penalty of 0.001.

    ``input_seq_shape`` is the shape of one encoded sequence. ``prob`` is the
    dropout rate used throughout and ``learn_rate`` the Adam learning rate.
    ``metrics`` is handed to ``compile`` unchanged; pass ``None`` to compile
    without metrics.

    Returns the compiled ``tf.keras`` model.
    """
    layers = tf.keras.layers
    l2_weight = 0.001

    def l2_penalty():
        # A separate regulariser instance for every layer.
        return tf.keras.regularizers.l2(l2_weight)

    ## Shared trunk
    seq_input = layers.Input(shape=input_seq_shape)

    trunk = layers.Conv1D(conv_filters_per_layer_1, kernel_length_1,
                          strides = conv_strides_1, kernel_regularizer = l2_penalty(),
                          padding = "same")(seq_input)
    trunk = layers.Activation('relu')(trunk)
    trunk = layers.MaxPool1D(pool_size = max_pool_width_1, strides = max_pool_stride_1)(trunk)
    trunk = layers.Dropout(prob)(trunk)
    trunk = layers.GaussianNoise(stddev=0.1)(trunk)

    ## LSTM branch
    lstm_branch = layers.LSTM(lstm_decode_units, return_sequences = True,
                              kernel_regularizer = l2_penalty())(trunk)
    lstm_branch = layers.Dropout(prob)(lstm_branch)
    lstm_branch = layers.Flatten()(lstm_branch)
    lstm_branch = layers.GaussianNoise(stddev=0.1)(lstm_branch)

    ## Convolutional branch
    conv_branch = layers.Conv1D(conv_filters_per_layer_2, kernel_length_2, strides = conv_strides_2,
                                kernel_regularizer = l2_penalty(), padding = 'same')(trunk)
    conv_branch = layers.Activation('relu')(conv_branch)
    conv_branch = layers.MaxPooling1D(pool_size = max_pool_width_2, strides = max_pool_stride_2)(conv_branch)
    conv_branch = layers.Dropout(prob)(conv_branch)
    conv_branch = layers.Flatten()(conv_branch)
    conv_branch = layers.GaussianNoise(stddev=0.1)(conv_branch)

    merged = layers.Concatenate(1)([lstm_branch, conv_branch])

    ## Classifier head
    hidden = layers.Dense(dense_decode_units,
                          kernel_regularizer = l2_penalty(),
                          activation = 'relu')(merged)
    hidden = layers.GaussianNoise(stddev=0.1)(hidden)
    hidden = layers.Dropout(prob)(hidden)
    output = layers.Dense(1,
                          kernel_regularizer = l2_penalty(),
                          activation = 'sigmoid')(hidden)

    model = tf.keras.models.Model(inputs=seq_input, outputs=output)

    ## Compile; metrics are only passed on when the caller supplied some
    compile_kwargs = {"optimizer": tf.keras.optimizers.Adam(learning_rate=learn_rate),
                      "loss": loss}
    if metrics is not None:
        compile_kwargs["metrics"] = metrics
    model.compile(**compile_kwargs)

    return model

In [None]:
kmer_epochs = 200
kmer_batch_size = 16

##################################################################################
##### Builder for the dense network that consumes the Kmer feature vectors
##################################################################################

def Kmer_DLNN_Classifier(input_vec_shape,
                    dense_decode_units = 8, ## Dense layer parameters,
                    dense_layers = 2,
                    prob = 0.5, learn_rate = 0.0001, loss = 'binary_crossentropy', metrics = 'accuracy'):
    """Build and compile a dense binary classifier for fixed-length feature vectors.

    The network is a stack of Dense -> BatchNormalization -> Dropout stages.
    The first stage has ``dense_decode_units`` units; each of the following
    ``dense_layers`` stages has ``int(dense_decode_units / 2**i)`` units for
    ``i = 1..dense_layers``. The hidden Dense layers have no activation. A
    single sigmoid unit produces the output. All Dense kernels carry an L2
    penalty of 0.001.

    ``metrics`` is handed to ``compile`` unchanged; pass ``None`` to compile
    without metrics. Returns the compiled ``tf.keras`` model.
    """
    layers = tf.keras.layers
    l2_weight = 0.001

    feature_input = layers.Input(shape=input_vec_shape)

    # Width of every hidden stage, widest first.
    stage_widths = [dense_decode_units]
    stage_widths += [int(dense_decode_units/(2**depth)) for depth in range(1, dense_layers+1)]

    hidden = feature_input
    for width in stage_widths:
        hidden = layers.Dense(width,
                              kernel_regularizer = tf.keras.regularizers.l2(l2_weight))(hidden)
        hidden = layers.BatchNormalization()(hidden)
        hidden = layers.Dropout(prob)(hidden)

    output = layers.Dense(1,
                          kernel_regularizer = tf.keras.regularizers.l2(l2_weight),
                          activation = 'sigmoid')(hidden)

    model = tf.keras.models.Model(inputs=feature_input, outputs=output)

    ## Compile; metrics are only passed on when the caller supplied some
    compile_kwargs = {"optimizer": tf.keras.optimizers.Adam(learning_rate=learn_rate),
                      "loss": loss}
    if metrics is not None:
        compile_kwargs["metrics"] = metrics
    model.compile(**compile_kwargs)

    return model

In [None]:
kgap_epochs = 200
kgap_batch_size = 16

##################################################################################
##### Builder for the dense network that consumes the Kgap feature vectors
##################################################################################

def Kgap_DLNN_Classifier(input_vec_shape,
                    dense_decode_units = 128, ## Dense layer parameters
                    prob = 0.5, learn_rate = 0.0001, loss = 'binary_crossentropy', metrics = 'accuracy'):
    """Build and compile a three-stage dense binary classifier.

    Stages are Dense -> Dropout with ``dense_decode_units``,
    ``int(dense_decode_units/2)`` and ``int(dense_decode_units/4)`` units. The
    first Dense layer has no activation, the other two use ReLU. A single
    sigmoid unit produces the output. All Dense kernels carry an L2 penalty of
    0.001.

    ``metrics`` is handed to ``compile`` unchanged; pass ``None`` to compile
    without metrics. Returns the compiled ``tf.keras`` model.
    """
    layers = tf.keras.layers
    l2_weight = 0.001

    feature_input = layers.Input(shape=input_vec_shape)

    # (units, activation) of each hidden stage, in order.
    stage_specs = (
        (dense_decode_units, None),
        (int(dense_decode_units/2), 'relu'),
        (int(dense_decode_units/4), 'relu'),
    )

    hidden = feature_input
    for width, activation in stage_specs:
        hidden = layers.Dense(width,
                              kernel_regularizer = tf.keras.regularizers.l2(l2_weight),
                              activation = activation)(hidden)
        hidden = layers.Dropout(prob)(hidden)

    output = layers.Dense(1,
                          kernel_regularizer = tf.keras.regularizers.l2(l2_weight),
                          activation = 'sigmoid')(hidden)

    model = tf.keras.models.Model(inputs=feature_input, outputs=output)

    ## Compile; metrics are only passed on when the caller supplied some
    compile_kwargs = {"optimizer": tf.keras.optimizers.Adam(learning_rate=learn_rate),
                      "loss": loss}
    if metrics is not None:
        compile_kwargs["metrics"] = metrics
    model.compile(**compile_kwargs)

    return model

In [None]:
# Sanity check: build the OHE network with its default arguments and print the layer summary.
OHE_DLNN_CORENup().summary()

In [None]:
# Sanity check only. (8420,) is a hard-coded example input shape; the training
# loop builds the model from kmer_input_vec_shape, which is read from the data.
Kmer_DLNN_Classifier((8420,)).summary()

In [None]:
# Sanity check only. (2000,) is a hard-coded example input shape; the training
# loop builds the model from kgap_input_vec_shape, which is read from the data.
Kgap_DLNN_Classifier((2000,)).summary()

In [None]:
def logistic_classifier(cw = None):
    """Return an unfitted ``LogisticRegression`` whose ``class_weight`` is ``cw``."""
    return LogisticRegression(class_weight=cw)

# Training data preparation

In [None]:
##################################################################################
##### Read the SEQUENCE training file (tab-separated, no header row)
##################################################################################
ohe_train_file_path = os.path.join(ohe_input_data_folder, ohe_training_data_file)
ohe_train_data = pd.read_csv(ohe_train_file_path, sep='\t', header=None)
ohe_train_data.columns = ['Sequence', 'name', 'id', 'flag', 'label_original', 'type']

##################################################################################
##### Create dictionary of all characters in the NT sequence 
##################################################################################
# one_hot_encode_nt() upper-cases every character before looking it up, so the
# alphabet is collected in upper case as well. Built from the raw characters, a
# lower-case letter in the file would either be rejected by the encoder or
# leave a column that is never set.
all_char_set = set()
for sequence in ohe_train_data['Sequence']:
    all_char_set.update(sequence.upper())
all_char_list = sorted(all_char_set)
# Maps each character to its column index in the one-hot encoding.
all_char_dict = {char: index for index, char in enumerate(all_char_list)}

##################################################################################
##### Create OHE of sequence
##################################################################################
ohe_train_data['OHE_Sequence'] = pd.Series([one_hot_encode_nt(val, all_char_dict) 
                                        for val in ohe_train_data["Sequence"]])

##################################################################################
##### Fix the labels: 1 stays 1, every other value becomes 0
##################################################################################
ohe_train_data['label'] = pd.Series([1 if val == 1 else 0 
                                 for val in ohe_train_data["label_original"]])

##################################################################################
##### Extract features and labels
##################################################################################

# Stack the per-sequence matrices into one array; labels become a column vector.
ohe_train_features = np.array(list(ohe_train_data['OHE_Sequence']))
labels = np.array(list(ohe_train_data['label']))
labels = labels.reshape((labels.shape[0], 1))

# Shape of a single encoded sequence, used as the OHE network's input shape.
ohe_input_seq_shape = ohe_train_features[0].shape

############################################################################################################
############################################################################################################
##### Read KMER data
############################################################################################################
############################################################################################################

# Load the Kmer feature table and discard the 'label' column stored in the file.
kmer_train_data_filepath = os.path.join(enc_data_folder, kmer_train_data_filename)
kmer_train_data = pd.read_csv(kmer_train_data_filepath, sep=',', header=0).drop('label', axis=1)

# Rebuild the label from 'nameseq': the second-to-last "_"-separated token, as an int.
kmer_train_data['label'] = pd.Series(
    [int(name.split('_')[-2]) for name in kmer_train_data['nameseq']]
)

##################################################################################
##### Extract features
##################################################################################

# Everything except the label and the sequence name is a feature column.
kmer_train_features = np.array(kmer_train_data.drop(['label', 'nameseq'], axis=1))

# Shape of one feature vector, used as the Kmer network's input shape.
kmer_input_vec_shape = kmer_train_features[0].shape

############################################################################################################
############################################################################################################
##### Read Kgap data
############################################################################################################
############################################################################################################

# Read one Kgap feature file per gap index 0..kgap_max and join them on 'nameseq'.
for i in range(kgap_max+1):

    kgap_current_train_data_filepath = os.path.join(enc_data_folder, kgap_train_data_filename.format(i))
    kgap_current_train_data = pd.read_csv(
        kgap_current_train_data_filepath, sep=',', header=0
    ).drop('label', axis=1)

    # The first file starts the table; every later file is inner-joined onto it.
    kgap_train_data = (
        kgap_current_train_data
        if i == 0
        else kgap_train_data.merge(kgap_current_train_data, how="inner", on='nameseq')
    )

# Rebuild the label from 'nameseq': the second-to-last "_"-separated token, as an int.
kgap_train_data['label'] = pd.Series(
    [int(name.split('_')[-2]) for name in kgap_train_data['nameseq']]
)

##################################################################################
##### Extract features
##################################################################################

# Everything except the label and the sequence name is a feature column.
kgap_train_features = np.array(kgap_train_data.drop(['label', 'nameseq'], axis=1))

# Shape of one feature vector, used as the Kgap network's input shape.
kgap_input_vec_shape = kgap_train_features[0].shape

############################################################################################################
############################################################################################################
##### Build folds
############################################################################################################
############################################################################################################

# Split all three feature sets with the same stratified fold indices.
folds = build_kfold(ohe_train_features, kmer_train_features, kgap_train_features, labels, k=n_fold, shuffle=shuffle, seed=seed)

## Write the k-fold dataset to file
foldPath = os.path.join(outPath, expName, "{}fold".format(n_fold))
os.makedirs(foldPath, exist_ok=True)
# The context manager closes the file even if pickling fails; the original
# passed a bare open() to pickle.dump and never closed the handle.
with open(os.path.join(foldPath, foldName), "wb") as fold_file:
    pickle.dump(folds, fold_file)

# Independent data preparation

In [None]:
##################################################################################
##### Read the independent SEQUENCE file (tab-separated, no header row)
##################################################################################
ohe_indpe_file_path = os.path.join(ohe_input_data_folder, ohe_independent_data_file)
ohe_indpe_data = pd.read_csv(ohe_indpe_file_path, sep='\t', header=None)
ohe_indpe_data.columns = ['Sequence', 'name', 'id', 'flag', 'label_original', 'type']

##################################################################################
##### One-hot encode with the character dictionary built from the training data
##################################################################################
indpe_encoded_sequences = [one_hot_encode_nt(seq, all_char_dict)
                           for seq in ohe_indpe_data["Sequence"]]
ohe_indpe_data['OHE_Sequence'] = pd.Series(indpe_encoded_sequences)

##################################################################################
##### Fix the labels: 1 stays 1, every other value becomes 0
##################################################################################
ohe_indpe_data['label'] = pd.Series([1 if raw_label == 1 else 0
                                     for raw_label in ohe_indpe_data["label_original"]])

##################################################################################
##### Extract features and labels
##################################################################################

# Stack the per-sequence matrices into one array; labels become a column vector.
ohe_indpe_features = np.array(list(ohe_indpe_data['OHE_Sequence']))
indpe_labels = np.array(list(ohe_indpe_data['label']))
indpe_labels = indpe_labels.reshape((indpe_labels.shape[0], 1))

############################################################################################################
############################################################################################################
##### Read Kmer data
############################################################################################################
############################################################################################################

# Load the independent Kmer feature table and discard the 'label' column stored in the file.
kmer_indpe_data_filepath = os.path.join(enc_data_folder, kmer_indpe_data_filename)
kmer_indpe_data = pd.read_csv(kmer_indpe_data_filepath, sep=',', header=0).drop('label', axis=1)

# Rebuild the label from 'nameseq': the second-to-last "_"-separated token, as an int.
kmer_indpe_data['label'] = pd.Series(
    [int(name.split('_')[-2]) for name in kmer_indpe_data['nameseq']]
)

##################################################################################
##### Extract features
##################################################################################

# Everything except the label and the sequence name is a feature column.
kmer_indpe_features = np.array(kmer_indpe_data.drop(['label', 'nameseq'], axis=1))


############################################################################################################
############################################################################################################
##### Read Kgap data
############################################################################################################
############################################################################################################

# Read one independent Kgap file per gap index 0..kgap_max and join them on 'nameseq'.
for i in range(kgap_max+1):

    kgap_current_indpe_data_filepath = os.path.join(enc_data_folder, kgap_indpe_data_filename.format(i))
    kgap_current_indpe_data = pd.read_csv(
        kgap_current_indpe_data_filepath, sep=',', header=0
    ).drop('label', axis=1)

    # The first file starts the table; every later file is inner-joined onto it.
    kgap_indpe_data = (
        kgap_current_indpe_data
        if i == 0
        else kgap_indpe_data.merge(kgap_current_indpe_data, how="inner", on='nameseq')
    )

# Rebuild the label from 'nameseq': the second-to-last "_"-separated token, as an int.
kgap_indpe_data['label'] = pd.Series(
    [int(name.split('_')[-2]) for name in kgap_indpe_data['nameseq']]
)

##################################################################################
##### Extract features
##################################################################################

# Everything except the label and the sequence name is a feature column.
kgap_indpe_features = np.array(kgap_indpe_data.drop(['label', 'nameseq'], axis=1))

# Training the ensemble

In [None]:
# Show which arrays each fold dictionary holds (train/test splits of the three feature sets and the labels).
folds[0].keys()

In [None]:
## Folder for all saved models of this experiment; created on first use
modelPath = os.path.join(outPath, expName, "{}fold".format(n_fold), "models")
os.makedirs(modelPath, exist_ok=True)

In [None]:
##################################################################################
##### For each input file, train model and generate different outputs in a structured folder
##################################################################################

## Per-fold evaluation records: every list gets one entry per (fold, Train/Test) pair
evaluations = {
    "Fold" : [],
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}


def _fit_best_checkpoint(model, features, targets, checkpoint_path, batch_size, epochs):
    """Train ``model`` and save the epoch with the best ``callback_monitor`` value.

    The full model (not only the weights) is written to ``checkpoint_path``.
    ``train_val_split`` is passed to Keras as ``validation_split``.
    """
    checkpoint = tf.keras.callbacks.ModelCheckpoint(checkpoint_path,
                                                    monitor = callback_monitor, verbose = 0, save_best_only = True,
                                                    save_weights_only = False, mode = 'auto', save_freq = 'epoch')
    model.fit(x = features, y = targets,
              batch_size = batch_size, epochs = epochs, verbose = 0,
              callbacks = [checkpoint],
              validation_split = train_val_split)


def _predict_with_saved_model(model_path, train_features, test_features):
    """Load the checkpoint at ``model_path``, score both feature sets, then free the model."""
    model = tf.keras.models.load_model(model_path)
    train_pred = model.predict(train_features)
    test_pred = model.predict(test_features)
    del model
    tf.keras.backend.clear_session()
    return train_pred, test_pred


def _record_evaluation(records, fold_id, split_name, y_true, label_pred, y_score):
    """Append one row of metrics to ``records``.

    ``label_pred`` holds hard 0/1 predictions and drives accuracy, precision,
    MCC, sensitivity and specificity. ``y_score`` holds the positive-class
    probability and drives the ROC curve and the AUC.
    """
    y_true = np.ravel(y_true)

    tn, fp, fn, tp = confusion_matrix(y_true, label_pred).ravel()
    fpr, tpr, thresholds = roc_curve(y_true, y_score)

    records["Fold"].append(fold_id)
    records["Train_Test"].append(split_name)
    records["Accuracy"].append(accuracy_score(y_true, label_pred))
    records["Precision"].append(precision_score(y_true, label_pred))
    records["TPR"].append(tpr)
    records["FPR"].append(fpr)
    records["TPR_FPR_Thresholds"].append(thresholds)
    records["AUC"].append(roc_auc_score(y_true, y_score))
    records["Sensitivity"].append(tp/(tp+fn))
    records["Specificity"].append(tn/(tn+fp))
    records["MCC"].append(matthews_corrcoef(y_true, label_pred))


##################################################################################
##### Train/Test model on all folds, generate evaluations
##################################################################################

for i, fold in enumerate(folds):

    print("\nTrain/Test model on Fold #"+str(i)+".")

    # One random permutation of the training rows, shared by all three networks.
    index_arr = np.random.permutation(np.arange(fold["X_OHE_train"].shape[0]))
    y_train_shuffled = fold["y_train"][index_arr]

    ##################################################################################
    ##### Train OHE network
    ##################################################################################

    print("Training OHE network.")

    ohe_current_model_path = os.path.join(modelPath, "OHE_bestModel-fold{}.hdf5".format(i))
    ohe_model = OHE_DLNN_CORENup(input_seq_shape = ohe_input_seq_shape)
    _fit_best_checkpoint(ohe_model, fold["X_OHE_train"][index_arr], y_train_shuffled,
                         ohe_current_model_path, ohe_batch_size, ohe_epochs)

    # Drop the last-epoch model; the best epoch is reloaded from disk below.
    del ohe_model
    tf.keras.backend.clear_session()

    ##################################################################################
    ##### Train Kmer network
    ##################################################################################

    print("Training Kmer network.")

    kmer_current_model_path = os.path.join(modelPath, "KMER_bestModel-fold{}.hdf5".format(i))
    kmer_model = Kmer_DLNN_Classifier(input_vec_shape = kmer_input_vec_shape)
    _fit_best_checkpoint(kmer_model, fold["X_Kmer_train"][index_arr], y_train_shuffled,
                         kmer_current_model_path, kmer_batch_size, kmer_epochs)

    del kmer_model
    tf.keras.backend.clear_session()

    ##################################################################################
    ##### Train Kgap network
    ##################################################################################

    print("Training Kgap network.")

    kgap_current_model_path = os.path.join(modelPath, "KGAP_bestModel-fold{}.hdf5".format(i))
    kgap_model = Kgap_DLNN_Classifier(input_vec_shape = kgap_input_vec_shape)
    _fit_best_checkpoint(kgap_model, fold["X_Kgap_train"][index_arr], y_train_shuffled,
                         kgap_current_model_path, kgap_batch_size, kgap_epochs)

    del kgap_model
    tf.keras.backend.clear_session()

    ##################################################################################
    ##### Generate the three network scores for the train and test parts of the fold
    ##################################################################################

    print("Generating the 3 scores.")

    ohe_train_y_pred, ohe_test_y_pred = _predict_with_saved_model(
        ohe_current_model_path, fold["X_OHE_train"], fold["X_OHE_test"])
    kmer_train_y_pred, kmer_test_y_pred = _predict_with_saved_model(
        kmer_current_model_path, fold["X_Kmer_train"], fold["X_Kmer_test"])
    kgap_train_y_pred, kgap_test_y_pred = _predict_with_saved_model(
        kgap_current_model_path, fold["X_Kgap_train"], fold["X_Kgap_test"])

    ##################################################################################
    ##### Logistic regression on the 3 scores
    ##################################################################################

    print("Training linear regression.")

    X_lr_train = np.concatenate((ohe_train_y_pred, kmer_train_y_pred, kgap_train_y_pred), axis=1)
    X_lr_test = np.concatenate((ohe_test_y_pred, kmer_test_y_pred, kgap_test_y_pred), axis=1)

    lr_model = logistic_classifier()
    lr_model.fit(X_lr_train, fold["y_train"].reshape((fold["y_train"].shape[0])))

    # The file keeps its ".hdf5" name so existing paths still resolve, but the
    # content is a pickle of the scikit-learn model.
    lr_current_model_path = os.path.join(modelPath, "LR_bestModel-fold{}.hdf5".format(i))
    with open(lr_current_model_path, 'wb') as lr_model_file_obj:
        pickle.dump(lr_model, lr_model_file_obj)

    ##################################################################################
    ##### Prediction and metrics for the TRAIN and TEST parts of the fold
    ##################################################################################

    for split_name, X_lr, y_true in (("Train", X_lr_train, fold["y_train"]),
                                     ("Test", X_lr_test, fold["y_test"])):

        print("Generating {} set metrics.".format(split_name.lower()))

        # Hard labels for the threshold-based metrics.
        label_pred = pred2label(lr_model.predict(X_lr))
        # Positive-class probability for ROC/AUC. predict() only returns 0/1
        # labels, which collapses the ROC curve to a single operating point
        # and makes the AUC equal to (sensitivity + specificity) / 2.
        y_score = lr_model.predict_proba(X_lr)[:, 1]

        _record_evaluation(evaluations, i, split_name, y_true, label_pred, y_score)

## k-fold Training evaluation

In [None]:
# One row per (fold, Train/Test) evaluation collected by the k-fold loop.
evaluations_df = pd.DataFrame.from_dict(evaluations)

# Scalar metrics that are meaningful to average across folds.
summary_metrics = ['Accuracy', 'Precision', 'AUC', 'Sensitivity', 'Specificity', 'MCC']

# Select the scalar columns *before* aggregating. The frame also holds
# object columns with per-fold ROC arrays (TPR / FPR / thresholds); calling
# .mean() on those is not silently skipped by pandas >= 2.0 and can raise
# TypeError, so they must never reach the aggregation.
evaluations_df_grouped = evaluations_df.groupby(["Train_Test"])[summary_metrics].mean()

evaluations_df_grouped

In [None]:
# 	Accuracy	Precision	AUC	Sensitivity	Specificity	MCC
# Train_Test						
# Test	0.805216	0.807666	0.805211	0.802690	0.807732	0.611440
# Train	0.959069	0.961407	0.959068	0.956549	0.961587	0.918157

In [None]:
# no valsplit
# Accuracy	Precision	AUC	Sensitivity	Specificity	MCC
# Train_Test						
# Test	0.816957	0.821317	0.816979	0.811093	0.822865	0.634636
# Train	0.993283	0.993904	0.993283	0.992653	0.993913	0.986567

In [None]:
# Show only the rows tagged "Test" in the per-fold evaluation table.
evaluations_df[evaluations_df["Train_Test"] == "Test"]

In [None]:
# Display the complete evaluation table (every Train_Test value, all columns).
evaluations_df

# Independent data

## Using k-fold Models

### Performance of each k-fold model

In [None]:
## create the evaluation data structure for all iterations
evaluations = {
    "Fold" : [],
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Prediction and metrics for Independent dataset
##################################################################################
# For every fold: score the independent set with the three saved base networks,
# stack the three scores as features for that fold's saved LR model, and record
# the resulting metrics as one "Independent" row.

for i in range(n_fold):

    ### OHE #####
    # Each base network is loaded, used, then deleted and the Keras session
    # cleared, so only one network is held at a time.

    ohe_current_model_path = os.path.join(modelPath, "OHE_bestModel-fold{}.hdf5".format(i))
    ohe_model = tf.keras.models.load_model(ohe_current_model_path)

    ohe_y_pred = ohe_model.predict(ohe_indpe_features)

    del ohe_model
    tf.keras.backend.clear_session()

    ### Kmer #####

    kmer_current_model_path = os.path.join(modelPath, "KMER_bestModel-fold{}.hdf5".format(i))
    kmer_model = tf.keras.models.load_model(kmer_current_model_path)

    kmer_y_pred = kmer_model.predict(kmer_indpe_features)

    del kmer_model
    tf.keras.backend.clear_session()

    ### Kgap #####

    kgap_current_model_path = os.path.join(modelPath, "KGAP_bestModel-fold{}.hdf5".format(i))
    kgap_model = tf.keras.models.load_model(kgap_current_model_path)

    kgap_y_pred = kgap_model.predict(kgap_indpe_features)

    del kgap_model
    tf.keras.backend.clear_session()

    ### LR #####

    # Stacked features: one column block per base network, in OHE, Kmer, Kgap order.
    lr_indpe_features = np.concatenate((ohe_y_pred, kmer_y_pred, kgap_y_pred), axis=1)

    # Despite the ".hdf5" suffix this file is read with pickle, not HDF5.
    # pickle.load can execute arbitrary code: only load files produced by
    # your own training run.
    lr_current_model_path = os.path.join(modelPath, "LR_bestModel-fold{}.hdf5".format(i))
    # Context manager guarantees the handle is closed even if unpickling fails.
    with open(lr_current_model_path, 'rb') as lr_model_file_obj:
        lr_model = pickle.load(lr_model_file_obj)

    ############

    # NOTE(review): if lr_model is a scikit-learn LogisticRegression, predict()
    # returns hard class labels, so the ROC/AUC below are computed from labels
    # rather than probabilities -- confirm whether predict_proba was intended.
    y_pred = lr_model.predict(lr_indpe_features)
    label_pred = pred2label(y_pred)

    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(indpe_labels, label_pred)
    prec = precision_score(indpe_labels,label_pred)
    mcc = matthews_corrcoef(indpe_labels, label_pred)

    # Unpacking four values relies on a 2x2 (binary) confusion matrix.
    conf = confusion_matrix(indpe_labels, label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)

    fpr, tpr, thresholds = roc_curve(indpe_labels, y_pred)
    auc = roc_auc_score(indpe_labels, y_pred)

    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Independent")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)

##################################################################################

evaluations_df = pd.DataFrame.from_dict(evaluations)

# Average only the scalar metrics. The ROC array columns (and "Fold") are
# excluded before .mean() because pandas >= 2.0 does not silently drop
# non-numeric columns and can raise TypeError on them.
summary_metrics = ['Accuracy', 'Precision', 'AUC', 'Sensitivity', 'Specificity', 'MCC']
evaluations_df_grouped = evaluations_df.groupby(["Train_Test"])[summary_metrics].mean()

evaluations_df_grouped

In [None]:
evaluations_df

### Mean score with k-fold models

In [None]:
## create the evaluation data structure for all iterations
evaluations = {
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Prediction and metrics for Independent dataset
##################################################################################
# Mean-score ensemble: the LR outputs of all folds are summed, divided by
# n_fold, and the averaged score is then thresholded and evaluated once.

# Running sum of the per-fold LR outputs, shaped like indpe_labels.
total_pred = np.zeros(indpe_labels.shape)
# Per-fold predictions, kept for later inspection.
all_preds = []

for i in range(n_fold):

    ### OHE #####
    # Each base network is loaded, used, then deleted and the Keras session
    # cleared, so only one network is held at a time.

    ohe_current_model_path = os.path.join(modelPath, "OHE_bestModel-fold{}.hdf5".format(i))
    ohe_model = tf.keras.models.load_model(ohe_current_model_path)

    ohe_y_pred = ohe_model.predict(ohe_indpe_features)

    del ohe_model
    tf.keras.backend.clear_session()

    ### Kmer #####

    kmer_current_model_path = os.path.join(modelPath, "KMER_bestModel-fold{}.hdf5".format(i))
    kmer_model = tf.keras.models.load_model(kmer_current_model_path)

    kmer_y_pred = kmer_model.predict(kmer_indpe_features)

    del kmer_model
    tf.keras.backend.clear_session()

    ### Kgap #####

    kgap_current_model_path = os.path.join(modelPath, "KGAP_bestModel-fold{}.hdf5".format(i))
    kgap_model = tf.keras.models.load_model(kgap_current_model_path)

    kgap_y_pred = kgap_model.predict(kgap_indpe_features)

    del kgap_model
    tf.keras.backend.clear_session()

    ### LR #####

    # Stacked features: one column block per base network, in OHE, Kmer, Kgap order.
    lr_indpe_features = np.concatenate((ohe_y_pred, kmer_y_pred, kgap_y_pred), axis=1)

    # Despite the ".hdf5" suffix this file is read with pickle, not HDF5.
    # pickle.load can execute arbitrary code: only load files produced by
    # your own training run.
    lr_current_model_path = os.path.join(modelPath, "LR_bestModel-fold{}.hdf5".format(i))
    # Context manager guarantees the handle is closed even if unpickling fails.
    with open(lr_current_model_path, 'rb') as lr_model_file_obj:
        lr_model = pickle.load(lr_model_file_obj)

    ############

    y_pred = lr_model.predict(lr_indpe_features)

    # y_pred is treated as 1-D here; the added trailing axis lets it accumulate
    # onto total_pred -- presumably indpe_labels is (N, 1), confirm.
    total_pred += y_pred[:, np.newaxis]
    all_preds.append(y_pred[:, np.newaxis])

total_pred = total_pred / n_fold
label_pred = pred2label(total_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(indpe_labels, label_pred)
prec = precision_score(indpe_labels,label_pred)
mcc = matthews_corrcoef(indpe_labels, label_pred)

# Unpacking four values relies on a 2x2 (binary) confusion matrix.
conf = confusion_matrix(indpe_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(indpe_labels, total_pred)
auc = roc_auc_score(indpe_labels, total_pred)

evaluations["Train_Test"].append("Independent")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################

evaluations_df = pd.DataFrame.from_dict(evaluations)

# Average only the scalar metrics. The ROC array columns are excluded before
# .mean() because pandas >= 2.0 does not silently drop non-numeric columns
# and can raise TypeError on them.
summary_metrics = ['Accuracy', 'Precision', 'AUC', 'Sensitivity', 'Specificity', 'MCC']
evaluations_df_grouped = evaluations_df.groupby(["Train_Test"])[summary_metrics].mean()

evaluations_df_grouped

### Voting score with k-fold models

In [None]:
## create the evaluation data structure for all iterations
evaluations = {
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Prediction and metrics for Independent dataset
##################################################################################
# Voting ensemble: each fold's LR output is first converted to a label with
# pred2label; the labels are summed, divided by n_fold (the fraction of folds
# voting for each sample), and that fraction is thresholded again.

# Running sum of the per-fold votes, shaped like indpe_labels.
total_pred = np.zeros(indpe_labels.shape)
# Per-fold votes, kept for later inspection.
all_preds = []

for i in range(n_fold):

    ### OHE #####
    # Each base network is loaded, used, then deleted and the Keras session
    # cleared, so only one network is held at a time.

    ohe_current_model_path = os.path.join(modelPath, "OHE_bestModel-fold{}.hdf5".format(i))
    ohe_model = tf.keras.models.load_model(ohe_current_model_path)

    ohe_y_pred = ohe_model.predict(ohe_indpe_features)

    del ohe_model
    tf.keras.backend.clear_session()

    ### Kmer #####

    kmer_current_model_path = os.path.join(modelPath, "KMER_bestModel-fold{}.hdf5".format(i))
    kmer_model = tf.keras.models.load_model(kmer_current_model_path)

    kmer_y_pred = kmer_model.predict(kmer_indpe_features)

    del kmer_model
    tf.keras.backend.clear_session()

    ### Kgap #####

    kgap_current_model_path = os.path.join(modelPath, "KGAP_bestModel-fold{}.hdf5".format(i))
    kgap_model = tf.keras.models.load_model(kgap_current_model_path)

    kgap_y_pred = kgap_model.predict(kgap_indpe_features)

    del kgap_model
    tf.keras.backend.clear_session()

    ### LR #####

    # Stacked features: one column block per base network, in OHE, Kmer, Kgap order.
    lr_indpe_features = np.concatenate((ohe_y_pred, kmer_y_pred, kgap_y_pred), axis=1)

    # Despite the ".hdf5" suffix this file is read with pickle, not HDF5.
    # pickle.load can execute arbitrary code: only load files produced by
    # your own training run.
    lr_current_model_path = os.path.join(modelPath, "LR_bestModel-fold{}.hdf5".format(i))
    # Context manager guarantees the handle is closed even if unpickling fails.
    with open(lr_current_model_path, 'rb') as lr_model_file_obj:
        lr_model = pickle.load(lr_model_file_obj)

    ############

    y_pred = lr_model.predict(lr_indpe_features)

    # y_pred is treated as 1-D here; the added trailing axis lets the vote
    # accumulate onto total_pred -- presumably indpe_labels is (N, 1), confirm.
    vote_pred = pred2label(y_pred[:, np.newaxis])
    total_pred += vote_pred
    all_preds.append(vote_pred)

total_pred = total_pred / n_fold
label_pred = pred2label(total_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(indpe_labels, label_pred)
prec = precision_score(indpe_labels,label_pred)
mcc = matthews_corrcoef(indpe_labels, label_pred)

# Unpacking four values relies on a 2x2 (binary) confusion matrix.
conf = confusion_matrix(indpe_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

# ROC/AUC are computed on the vote fraction, not on the final labels.
fpr, tpr, thresholds = roc_curve(indpe_labels, total_pred)
auc = roc_auc_score(indpe_labels, total_pred)

evaluations["Train_Test"].append("Independent")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################

evaluations_df = pd.DataFrame.from_dict(evaluations)

# Average only the scalar metrics. The ROC array columns are excluded before
# .mean() because pandas >= 2.0 does not silently drop non-numeric columns
# and can raise TypeError on them.
summary_metrics = ['Accuracy', 'Precision', 'AUC', 'Sensitivity', 'Specificity', 'MCC']
evaluations_df_grouped = evaluations_df.groupby(["Train_Test"])[summary_metrics].mean()

evaluations_df_grouped

## Using New Model

Train a new model on the full training dataset, then predict and evaluate it on the independent dataset.

In [None]:
# pos_indexes = np.where(indpe_labels==1)[0]
# neg_indexes = np.random.permutation(np.where(indpe_labels==0)[0])[0:pos_indexes.shape[0]]
# indpe_val_indexes = np.concatenate((pos_indexes, neg_indexes))

In [None]:
# Shuffle the training set once and reuse the same permutation for all three
# networks. Keras' validation_split takes its validation samples from the end
# of the arrays it is given, so shuffling first randomizes that held-out part.
index_arr = np.arange(ohe_train_features.shape[0])
index_arr = np.random.permutation(index_arr)

##################################################################################
##### Train OHE network
##################################################################################

print("Training OHE network.")

ohe_model = OHE_DLNN_CORENup(input_seq_shape = ohe_input_seq_shape)

## Checkpoint callback: keep only the best model (by `callback_monitor`) on disk.
## No early-stopping callback is configured, so all epochs are run.
# The file name is a constant; the former `.format(i)` had no placeholder and
# only made this cell depend on a leftover loop variable `i`.
ohe_full_model_path = os.path.join(modelPath, "OHE_fullModel.hdf5")
ohe_modelCallbacks = [
    tf.keras.callbacks.ModelCheckpoint(ohe_full_model_path,
                                       monitor = callback_monitor, verbose = 0, save_best_only = True, 
                                       save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
]

ohe_model.fit(x = ohe_train_features[index_arr], y = labels[index_arr], 
              batch_size = ohe_batch_size, epochs = ohe_epochs, verbose = 0, 
              callbacks = ohe_modelCallbacks, 
              validation_split = train_val_split)

# The in-memory model is the last-epoch state; the best one is reloaded from
# the checkpoint later, so drop this one and reset the Keras session.
del ohe_model
tf.keras.backend.clear_session()

##################################################################################
##### Train Kmer network
##################################################################################

print("Training Kmer network.")

kmer_model = Kmer_DLNN_Classifier(input_vec_shape = kmer_input_vec_shape)

## Checkpoint callback: keep only the best model (by `callback_monitor`) on disk.
kmer_full_model_path = os.path.join(modelPath, "KMER_fullModel.hdf5")
kmer_modelCallbacks = [
    tf.keras.callbacks.ModelCheckpoint(kmer_full_model_path,
                                       monitor = callback_monitor, verbose = 0, save_best_only = True, 
                                       save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
]

kmer_model.fit(x = kmer_train_features[index_arr], y = labels[index_arr], 
               batch_size = kmer_batch_size, epochs = kmer_epochs, verbose = 0, 
               callbacks = kmer_modelCallbacks, 
               validation_split = train_val_split)

del kmer_model
tf.keras.backend.clear_session()

##################################################################################
##### Train Kgap network
##################################################################################

print("Training Kgap network.")

kgap_model = Kgap_DLNN_Classifier(input_vec_shape = kgap_input_vec_shape)

## Checkpoint callback: keep only the best model (by `callback_monitor`) on disk.
kgap_full_model_path = os.path.join(modelPath, "KGAP_fullModel.hdf5")
kgap_modelCallbacks = [
    tf.keras.callbacks.ModelCheckpoint(kgap_full_model_path,
                                       monitor = callback_monitor, verbose = 0, save_best_only = True, 
                                       save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
]

kgap_model.fit(x = kgap_train_features[index_arr], y = labels[index_arr], 
               batch_size = kgap_batch_size, epochs = kgap_epochs, verbose = 0, 
               callbacks = kgap_modelCallbacks, 
               validation_split = train_val_split)

del kgap_model
tf.keras.backend.clear_session()

# kgap_model = tf.keras.models.load_model(kgap_current_model_path)

In [None]:
##################################################################################
##### Generate scores for Train dataset
##################################################################################
# Reload each checkpointed network, score both the training and the
# independent data with it, then release it before loading the next one.

print("Generating the 3 scores.")

ohe_model = tf.keras.models.load_model(ohe_full_model_path)
ohe_train_y_pred = ohe_model.predict(ohe_train_features)
ohe_indpe_y_pred = ohe_model.predict(ohe_indpe_features)

del ohe_model
tf.keras.backend.clear_session()

kmer_model = tf.keras.models.load_model(kmer_full_model_path)
kmer_train_y_pred = kmer_model.predict(kmer_train_features)
kmer_indpe_y_pred = kmer_model.predict(kmer_indpe_features)

del kmer_model
tf.keras.backend.clear_session()

kgap_model = tf.keras.models.load_model(kgap_full_model_path)
kgap_train_y_pred = kgap_model.predict(kgap_train_features)
kgap_indpe_y_pred = kgap_model.predict(kgap_indpe_features)

del kgap_model
tf.keras.backend.clear_session()

##################################################################################
##### Logistic classifier (stacking) using the 3 scores
##################################################################################

print("Training linear regression.")

# Stacked features: one column block per base network, in OHE, Kmer, Kgap order.
X_lr_train = np.concatenate((ohe_train_y_pred, kmer_train_y_pred, kgap_train_y_pred), axis=1)
X_lr_indpe = np.concatenate((ohe_indpe_y_pred, kmer_indpe_y_pred, kgap_indpe_y_pred), axis=1)

# NOTE(review): cw presumably maps class -> weight, which would down-weight
# class 1 heavily (0.075 vs 1) -- confirm against logistic_classifier.
lr_model = logistic_classifier(cw={0:1, 1:0.075})
# Labels are passed as a flat vector of length labels.shape[0].
lr_model.fit(X_lr_train, labels.reshape((labels.shape[0])))

# The file name is a constant; the former `.format(i)` had no placeholder and
# only made this cell depend on a leftover loop variable `i`.
# Despite the ".hdf5" suffix, the file written here is a pickle.
lr_full_model_path = os.path.join(modelPath, "LR_fullModel.hdf5")
# Context manager guarantees the handle is flushed and closed even if
# pickling fails.
with open(lr_full_model_path, 'wb') as lr_model_file_obj:
    pickle.dump(lr_model, lr_model_file_obj)

In [None]:
## create the evaluation data structure for all iterations
evaluations = {
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Prediction and metrics for Train dataset
##################################################################################

y_pred = lr_model.predict(X_lr_train)
label_pred = pred2label(y_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(labels, label_pred)
prec = precision_score(labels,label_pred)
mcc = matthews_corrcoef(labels, label_pred)

# Unpacking four values relies on a 2x2 (binary) confusion matrix.
conf = confusion_matrix(labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

# ROC is built from the same scores as the AUC (y_pred), matching the k-fold
# evaluation cells; previously it was fed the thresholded labels.
fpr, tpr, thresholds = roc_curve(labels, y_pred)
auc = roc_auc_score(labels, y_pred)

evaluations["Train_Test"].append("Train")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################
##### Prediction and metrics for Independent dataset
##################################################################################

y_pred = lr_model.predict(X_lr_indpe)
label_pred = pred2label(y_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(indpe_labels, label_pred)
prec = precision_score(indpe_labels,label_pred)
mcc = matthews_corrcoef(indpe_labels, label_pred)

conf = confusion_matrix(indpe_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(indpe_labels, y_pred)
auc = roc_auc_score(indpe_labels, y_pred)

evaluations["Train_Test"].append("Independent")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################

evaluations_df = pd.DataFrame.from_dict(evaluations)

# Summarize only these four scalar metrics. Selecting them before .mean()
# keeps the ROC array columns out of the aggregation, which pandas >= 2.0
# does not silently drop and can raise TypeError on.
summary_metrics = ['Accuracy', 'Sensitivity', 'Specificity', 'MCC']
evaluations_df_grouped = evaluations_df.groupby(["Train_Test"])[summary_metrics].mean()

evaluations_df_grouped

In [None]:
# Accuracy	Sensitivity	Specificity	MCC
# Train_Test				
# Independent	0.739592	0.428571	0.801370	0.201134
# Train	0.926952	0.862301	0.991604	0.861133

In [None]:
# no valsplit
# 	Accuracy	Sensitivity	Specificity	MCC
# Train_Test				
# Independent	0.766531	0.389163	0.841487	0.215735

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Alternative stacker: a depth-limited Gini decision tree with equal class
# weights, trained on the same three-score features as the LR model.
dt_model = DecisionTreeClassifier(
    criterion="gini",
    max_depth=5,
    class_weight={0: 1, 1: 1},
)
# Labels are passed as a flat vector of length labels.shape[0].
dt_model.fit(X_lr_train, labels.reshape(labels.shape[0]))

In [None]:
## create the evaluation data structure for all iterations
evaluations = {
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Prediction and metrics for Train dataset
##################################################################################

y_pred = dt_model.predict(X_lr_train)
label_pred = pred2label(y_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(labels, label_pred)
prec = precision_score(labels,label_pred)
mcc = matthews_corrcoef(labels, label_pred)

# Unpacking four values relies on a 2x2 (binary) confusion matrix.
conf = confusion_matrix(labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

# ROC is built from the same values as the AUC (y_pred), matching the k-fold
# evaluation cells; previously it was fed label_pred instead.
fpr, tpr, thresholds = roc_curve(labels, y_pred)
auc = roc_auc_score(labels, y_pred)

evaluations["Train_Test"].append("Train")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################
##### Prediction and metrics for Independent dataset
##################################################################################

y_pred = dt_model.predict(X_lr_indpe)
label_pred = pred2label(y_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(indpe_labels, label_pred)
prec = precision_score(indpe_labels,label_pred)
mcc = matthews_corrcoef(indpe_labels, label_pred)

conf = confusion_matrix(indpe_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(indpe_labels, y_pred)
auc = roc_auc_score(indpe_labels, y_pred)

evaluations["Train_Test"].append("Independent")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################

evaluations_df = pd.DataFrame.from_dict(evaluations)

# Summarize only these four scalar metrics. Selecting them before .mean()
# keeps the ROC array columns out of the aggregation, which pandas >= 2.0
# does not silently drop and can raise TypeError on.
summary_metrics = ['Accuracy', 'Sensitivity', 'Specificity', 'MCC']
evaluations_df_grouped = evaluations_df.groupby(["Train_Test"])[summary_metrics].mean()

evaluations_df_grouped

In [None]:
# Importance the fitted tree assigns to each column of X_lr_train
# (columns are the concatenated OHE, Kmer and Kgap scores, in that order).
dt_model.feature_importances_