In [1]:
##################################################################################
##### Define all parameters for model tuning
##################################################################################

n_fold = 5
expName = "NT_Site_PredNTS_Classification_DLNN_AsimEmbedding_ensemble"
outPath = "Results"
foldName = "folds.pickle"

epochs = 100
batch_size = 16
shuffle = True
seed = None

input_data_folder = "Data_from_Asim"

dde_filename = "Protein_DDE[100, 0, 0, 0]-st-simplesequence.csv"
dpr_filename = "Protein_DistancePair[100, 0, 0, 0]-st-simplesequence.csv"
dpc_filename = "Protein_DPC[100, 0, 0, 0]-st-simplesequence.csv"
tpc_filename = "TPC[100, 0, 0, 0]-st-simplesequence.csv"

monitor ="val_loss"

In [2]:
import os 
import pickle
import numpy as np
import pandas as pd

import tensorflow as tf

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, confusion_matrix
from sklearn.metrics import roc_auc_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression, LinearRegression

import math

In [3]:
##################################################################################
##### Build k-fold functions
##################################################################################

## Build the K-fold from dataset
def build_kfold(dde_features, dpr_features, dpc_features, tpc_features, labels, k=10, shuffle=False, seed=None):
    
    skf = StratifiedKFold(n_splits=k, shuffle=shuffle, random_state=seed)
    kfoldList = []
    for train_index, test_index in skf.split(dde_features, labels):
        kfoldList.append({
            "X_DDE_train": dde_features[train_index],
            "X_DDE_test": dde_features[test_index],
            "X_DPR_train": dpr_features[train_index],
            "X_DPR_test": dpr_features[test_index],
            "X_DPC_train": dpc_features[train_index],
            "X_DPC_test": dpc_features[test_index],
            "X_TPC_train": tpc_features[train_index],
            "X_TPC_test": tpc_features[test_index],
            "y_train": labels[train_index],
            "y_test": labels[test_index]
        })
    return kfoldList

In [4]:
##################################################################################
##### define evaluator functions
##################################################################################

def pred2label(y_pred):
    y_pred = np.round(y_pred).astype(int)
    return y_pred

# Model

In [5]:
dde_epochs = 100
dde_batch_size = 16

##################################################################################
##### Function to customize the DLNN architecture with parameters
##################################################################################

def DDE_DLNN_Classifier(input_vec_shape,
                    dense_decode_units = 32, ## Dense layer parameters
                    prob = 0.5, learn_rate = 0.0005, loss = 'binary_crossentropy', metrics = 'accuracy'):
    
    beta = 0.001
    
    input1 = tf.keras.layers.Input(shape=input_vec_shape)
    
    ######################################################################################################
    ########  Classifier  ################################################################################
    ######################################################################################################
    
    y = tf.keras.layers.Dense(dense_decode_units, 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
                              activation = 'relu')(input1)
    
    y = tf.keras.layers.Dropout(prob)(y)
    
    y = tf.keras.layers.Dense(int(dense_decode_units/2), 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
                              activation = 'relu')(y)
    
    y = tf.keras.layers.Dropout(prob)(y)
    
    y = tf.keras.layers.Dense(1, 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
                              activation = 'sigmoid')(y)

    ## Generate Model from input and output
    model = tf.keras.models.Model(inputs=input1, outputs=y)
    
    ## Compile model
    if(metrics != None):
        model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), 
                      loss = loss, metrics = metrics)
    else:
        model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), 
                      loss = loss)

    return model

In [6]:
dpr_epochs = 100
dpr_batch_size = 16

##################################################################################
##### Function to customize the DLNN architecture with parameters
##################################################################################

def DPR_DLNN_Classifier(input_vec_shape,
                    dense_decode_units = 32, ## Dense layer parameters
                    prob = 0.5, learn_rate = 0.0005, loss = 'binary_crossentropy', metrics = 'accuracy'):
    
    beta = 0.001
    
    input1 = tf.keras.layers.Input(shape=input_vec_shape)
    
    ######################################################################################################
    ########  Classifier  ################################################################################
    ######################################################################################################
    
    y = tf.keras.layers.Dense(dense_decode_units, 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
                              activation = 'relu')(input1)
    
    y = tf.keras.layers.Dropout(prob)(y)
    
    y = tf.keras.layers.Dense(int(dense_decode_units/2), 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
                              activation = 'relu')(y)
    
    y = tf.keras.layers.Dropout(prob)(y)
    
    y = tf.keras.layers.Dense(1, 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
                              activation = 'sigmoid')(y)

    ## Generate Model from input and output
    model = tf.keras.models.Model(inputs=input1, outputs=y)
    
    ## Compile model
    if(metrics != None):
        model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), 
                      loss = loss, metrics = metrics)
    else:
        model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), 
                      loss = loss)

    return model

In [7]:
dpc_epochs = 100
dpc_batch_size = 16

##################################################################################
##### Function to customize the DLNN architecture with parameters
##################################################################################

def DPC_DLNN_Classifier(input_vec_shape,
                    dense_decode_units = 32, ## Dense layer parameters
                    prob = 0.5, learn_rate = 0.0005, loss = 'binary_crossentropy', metrics = 'accuracy'):
    
    beta = 0.001
    
    input1 = tf.keras.layers.Input(shape=input_vec_shape)
    
    ######################################################################################################
    ########  Classifier  ################################################################################
    ######################################################################################################
    
    y = tf.keras.layers.Dense(dense_decode_units, 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
                              activation = 'relu'
                             )(input1)
    
    y = tf.keras.layers.Dropout(prob)(y)
    
    y = tf.keras.layers.Dense(int(dense_decode_units/2), 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
                              activation = 'relu'
                             )(y)
    
    y = tf.keras.layers.Dropout(prob)(y)
    
    y = tf.keras.layers.Dense(1, 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
                              activation = 'sigmoid')(y)

    ## Generate Model from input and output
    model = tf.keras.models.Model(inputs=input1, outputs=y)
    
    ## Compile model
    if(metrics != None):
        model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), 
                      loss = loss, metrics = metrics)
    else:
        model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), 
                      loss = loss)

    return model

In [8]:
tpc_epochs = 100
tpc_batch_size = 16

##################################################################################
##### Function to customize the DLNN architecture with parameters
##################################################################################

def TPC_DLNN_Classifier(input_vec_shape,
                    dense_decode_units = 32, ## Dense layer parameters
                    prob = 0.5, learn_rate = 0.0005, loss = 'binary_crossentropy', metrics = 'accuracy'):
    
    beta = 0.001
    
    input1 = tf.keras.layers.Input(shape=input_vec_shape)
    
    ######################################################################################################
    ########  Classifier  ################################################################################
    ######################################################################################################
    
    y = tf.keras.layers.Dense(dense_decode_units, 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
#                               activation = 'relu'
                             )(input1)
    
    y = tf.keras.layers.Dropout(prob)(y)
    
    y = tf.keras.layers.Dense(int(dense_decode_units/2), 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
#                               activation = 'relu'
                             )(y)
    
    y = tf.keras.layers.Dropout(prob)(y)
    
    y = tf.keras.layers.Dense(1, 
                              kernel_regularizer = tf.keras.regularizers.l2(beta), 
                              activation = 'sigmoid')(y)

    ## Generate Model from input and output
    model = tf.keras.models.Model(inputs=input1, outputs=y)
    
    ## Compile model
    if(metrics != None):
        model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), 
                      loss = loss, metrics = metrics)
    else:
        model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate), 
                      loss = loss)

    return model

# Data preparation

### Read all 4 data files

In [9]:
dde_data_file = os.path.join(input_data_folder, dde_filename)
dpr_data_file = os.path.join(input_data_folder, dpr_filename)
dpc_data_file = os.path.join(input_data_folder, dpc_filename)
tpc_data_file = os.path.join(input_data_folder, tpc_filename)

dde_data = pd.read_csv(dde_data_file, sep=',', header=0)
dpr_data = pd.read_csv(dpr_data_file, sep=',', header=0)
dpc_data = pd.read_csv(dpc_data_file, sep=',', header=0)
tpc_data = pd.read_csv(tpc_data_file, sep=',', header=0)

dde_train_data = dde_data[dde_data['set'] == 'train'].drop('set', axis=1)
dde_indpe_data = dde_data[dde_data['set'] == 'test'].drop('set', axis=1)
dpr_train_data = dpr_data[dpr_data['set'] == 'train'].drop('set', axis=1)
dpr_indpe_data = dpr_data[dpr_data['set'] == 'test'].drop('set', axis=1)
dpc_train_data = dpc_data[dpc_data['set'] == 'train'].drop('set', axis=1)
dpc_indpe_data = dpc_data[dpc_data['set'] == 'test'].drop('set', axis=1)
tpc_train_data = tpc_data[tpc_data['set'] == 'train'].drop('set', axis=1)
tpc_indpe_data = tpc_data[tpc_data['set'] == 'test'].drop('set', axis=1)

### Prepare Training folds

In [10]:
dde_train_features = np.array(dde_train_data.drop('labels', axis=1))
dpr_train_features = np.array(dpr_train_data.drop('labels', axis=1))
dpc_train_features = np.array(dpc_train_data.drop('labels', axis=1))
tpc_train_features = np.array(tpc_train_data.drop('labels', axis=1))

dde_train_features_shape = dde_train_features[0].shape
dpr_train_features_shape = dpr_train_features[0].shape
dpc_train_features_shape = dpc_train_features[0].shape
tpc_train_features_shape = tpc_train_features[0].shape

train_labels = np.array(dde_train_data["labels"])

folds = build_kfold(dde_train_features, dpr_train_features, dpc_train_features, tpc_train_features, train_labels, 
                    k=n_fold, shuffle=shuffle, seed=seed)

## Write the k-fold dataset to file
foldPath = os.path.join(outPath, expName, "{}fold".format(n_fold))
if(not os.path.isdir(foldPath)):
    os.makedirs(foldPath)
pickle.dump(folds, open(os.path.join(foldPath, foldName), "wb"))

### Prepare Independent data

In [11]:
dde_indpe_features = np.array(dde_indpe_data.drop('labels', axis=1))
dpr_indpe_features = np.array(dpr_indpe_data.drop('labels', axis=1))
dpc_indpe_features = np.array(dpc_indpe_data.drop('labels', axis=1))
tpc_indpe_features = np.array(tpc_indpe_data.drop('labels', axis=1))

indpe_labels = np.array(dde_indpe_data["labels"])

# Training

In [12]:
##################################################################################
##### Evaluation structure
##################################################################################

## create the evaluation data structure for all iterations
evaluations = {
    "Fold" : [],
    "Train_Test" : [],
    "Type" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

## Create and set directory to save model
modelPath = os.path.join(outPath, expName, "{}fold".format(n_fold), "models")
if(not os.path.isdir(modelPath)):
    os.makedirs(modelPath)

In [13]:
##################################################################################
##### TRAIN and PREDICT for every Fold, using models
##################################################################################

# fold counter
i = -1

for fold in folds:
    
    i = i+1

    # adding random shuffling of the dataset for training purpose
    randomized_index_arr = np.arange(fold["X_DDE_train"].shape[0])
    randomized_index_arr = np.random.permutation(randomized_index_arr)

    print("\nTrain/Test model on Fold #"+str(i)+".")
    
    ##################################################################################
    ##### TRAIN DDE model
    ##################################################################################
    
    print("Training DDE model.")
    
    dde_model = DDE_DLNN_Classifier(input_vec_shape = dde_train_features_shape)

    dde_model_file_path = os.path.join(modelPath, "DDE_bestModel-fold{}.hdf5".format(i))
    ## Define the model callbacks for early stopping and saving the model. Then train model
    dde_modelCallbacks = [
        tf.keras.callbacks.ModelCheckpoint(dde_model_file_path,
                                           monitor = monitor, verbose = 0, save_best_only = True, 
                                           save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
    ]
    dde_model.fit(x = fold["X_DDE_train"][randomized_index_arr], y = fold["y_train"][randomized_index_arr], 
                  batch_size = dde_batch_size, epochs = dde_epochs, 
                  verbose = 0, callbacks = dde_modelCallbacks, 
                  validation_data = (fold["X_DDE_test"], fold["y_test"]))
    
    del dde_model
    tf.keras.backend.clear_session()
    
    ##################################################################################
    ##### TRAIN DPR model
    ##################################################################################
    
    print("Training DPR model.")
    
    dpr_model = DPR_DLNN_Classifier(input_vec_shape = dpr_train_features_shape)

    dpr_model_file_path = os.path.join(modelPath, "DPR_bestModel-fold{}.hdf5".format(i))
    ## Define the model callbacks for early stopping and saving the model. Then train model
    dpr_modelCallbacks = [
        tf.keras.callbacks.ModelCheckpoint(dpr_model_file_path,
                                           monitor = monitor, verbose = 0, save_best_only = True, 
                                           save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
    ]
    dpr_model.fit(x = fold["X_DPR_train"][randomized_index_arr], y = fold["y_train"][randomized_index_arr], 
                  batch_size = dpr_batch_size, epochs = dpr_epochs, 
                  verbose = 0, callbacks = dpr_modelCallbacks, 
                  validation_data = (fold["X_DPR_test"], fold["y_test"]))
    
    del dpr_model
    tf.keras.backend.clear_session()
    
    ##################################################################################
    ##### TRAIN DPC model
    ##################################################################################
    
    print("Training DPC model.")
    
    dpc_model = DPC_DLNN_Classifier(input_vec_shape = dpc_train_features_shape)

    dpc_model_file_path = os.path.join(modelPath, "DPC_bestModel-fold{}.hdf5".format(i))
    ## Define the model callbacks for early stopping and saving the model. Then train model
    dpc_modelCallbacks = [
        tf.keras.callbacks.ModelCheckpoint(dpc_model_file_path,
                                           monitor = monitor, verbose = 0, save_best_only = True, 
                                           save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
    ]
    dpc_model.fit(x = fold["X_DPC_train"][randomized_index_arr], y = fold["y_train"][randomized_index_arr], 
                  batch_size = dpc_batch_size, epochs = dpc_epochs, 
                  verbose = 0, callbacks = dpc_modelCallbacks, 
                  validation_data = (fold["X_DPC_test"], fold["y_test"]))
    
    del dpc_model
    tf.keras.backend.clear_session()
    
    ##################################################################################
    ##### TRAIN TPC model
    ##################################################################################
    
    print("Training TPC model.")
    
    tpc_model = TPC_DLNN_Classifier(input_vec_shape = tpc_train_features_shape)

    tpc_model_file_path = os.path.join(modelPath, "TPC_bestModel-fold{}.hdf5".format(i))
    ## Define the model callbacks for early stopping and saving the model. Then train model
    tpc_modelCallbacks = [
        tf.keras.callbacks.ModelCheckpoint(tpc_model_file_path,
                                           monitor = monitor, verbose = 0, save_best_only = True, 
                                           save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
    ]
    tpc_model.fit(x = fold["X_TPC_train"][randomized_index_arr], y = fold["y_train"][randomized_index_arr], 
                  batch_size = tpc_batch_size, epochs = tpc_epochs, 
                  verbose = 0, callbacks = tpc_modelCallbacks, 
                  validation_data = (fold["X_TPC_test"], fold["y_test"]))
    
    del tpc_model
    tf.keras.backend.clear_session()
    
    ##################################################################################
    ##### Generate scores for Train/Test dataset
    ##################################################################################
    
    dde_model = tf.keras.models.load_model(dde_model_file_path)
    dde_train_y_pred = dde_model.predict(fold["X_DDE_train"])
    dde_test_y_pred = dde_model.predict(fold["X_DDE_test"])
    del dde_model
    tf.keras.backend.clear_session()
    
    dpr_model = tf.keras.models.load_model(dpr_model_file_path)
    dpr_train_y_pred = dpr_model.predict(fold["X_DPR_train"])
    dpr_test_y_pred = dpr_model.predict(fold["X_DPR_test"])
    del dpr_model
    tf.keras.backend.clear_session()
    
    dpc_model = tf.keras.models.load_model(dpc_model_file_path)
    dpc_train_y_pred = dpc_model.predict(fold["X_DPC_train"])
    dpc_test_y_pred = dpc_model.predict(fold["X_DPC_test"])
    del dpc_model
    tf.keras.backend.clear_session()
    
    tpc_model = tf.keras.models.load_model(tpc_model_file_path)
    tpc_train_y_pred = tpc_model.predict(fold["X_TPC_train"])
    tpc_test_y_pred = tpc_model.predict(fold["X_TPC_test"])
    del tpc_model
    tf.keras.backend.clear_session()
    
#     X_lr_train = np.concatenate((dde_train_y_pred[:, np.newaxis], 
#                                  dpr_train_y_pred[:, np.newaxis], 
#                                  dpc_train_y_pred[:, np.newaxis], 
#                                  tpc_train_y_pred[:, np.newaxis]), 
#                                 axis=1)
#     X_lr_test = np.concatenate((dde_test_y_pred[:, np.newaxis], 
#                                 dpr_test_y_pred[:, np.newaxis], 
#                                 dpc_test_y_pred[:, np.newaxis], 
#                                 tpc_test_y_pred[:, np.newaxis]), axis=1)
    
    X_lr_train = np.concatenate((dde_train_y_pred, 
                                 dpr_train_y_pred, 
                                 dpc_train_y_pred, 
                                 tpc_train_y_pred), 
                                axis=1)
    X_lr_test = np.concatenate((dde_test_y_pred, 
                                dpr_test_y_pred, 
                                dpc_test_y_pred, 
                                tpc_test_y_pred), 
                               axis=1)
    
    ##################################################################################
    ##### Prediction and metrics for TRAIN dataset
    ##################################################################################
    
    print("Generating TRAIN set SUM metrics.")
    
    y_pred = np.sum(X_lr_train, axis=1)/3
    y_pred = y_pred[:, np.newaxis]
    label_pred = pred2label(y_pred)
    
    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(fold["y_train"], label_pred)
    prec = precision_score(fold["y_train"],label_pred)
    mcc = matthews_corrcoef(fold["y_train"], label_pred)

    conf = confusion_matrix(fold["y_train"], label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    
    fpr, tpr, thresholds = roc_curve(fold["y_train"], y_pred)
    auc = roc_auc_score(fold["y_train"], y_pred)
    
    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Train")
    evaluations["Type"].append("Sum")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)
    
    ##################################################################################
    ##### Prediction and metrics for TEST dataset
    ##################################################################################
    
    print("Generating TEST set SUM metrics.")

    y_pred = np.sum(X_lr_test, axis=1)/3
    y_pred = y_pred[:, np.newaxis]
    label_pred = pred2label(y_pred)
    
    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(fold["y_test"], label_pred)
    prec = precision_score(fold["y_test"],label_pred)
    mcc = matthews_corrcoef(fold["y_test"], label_pred)

    conf = confusion_matrix(fold["y_test"], label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    
    fpr, tpr, thresholds = roc_curve(fold["y_test"], y_pred)
    auc = roc_auc_score(fold["y_test"], y_pred)
    
    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Test")
    evaluations["Type"].append("Sum")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)
    
    ##################################################################################
    ##### Prediction and metrics for TRAIN dataset
    ##################################################################################
    
    print("Generating TRAIN set VOTE metrics.")
    
    # y_pred = lr_model.predict(X_lr_train)
    y_pred = np.sum(pred2label(X_lr_train), axis=1)/3
    y_pred = y_pred[:, np.newaxis]
    label_pred = pred2label(y_pred)
    
    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(fold["y_train"], label_pred)
    prec = precision_score(fold["y_train"],label_pred)
    mcc = matthews_corrcoef(fold["y_train"], label_pred)

    conf = confusion_matrix(fold["y_train"], label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    
    fpr, tpr, thresholds = roc_curve(fold["y_train"], y_pred)
    auc = roc_auc_score(fold["y_train"], y_pred)
    
    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Train")
    evaluations["Type"].append("Vote")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)
    
    ##################################################################################
    ##### Prediction and metrics for TEST dataset
    ##################################################################################
    
    print("Generating TEST set VOTE metrics.")

    # y_pred = lr_model.predict(X_lr_test)
    y_pred = np.sum(pred2label(X_lr_test), axis=1)/3
    y_pred = y_pred[:, np.newaxis]
    label_pred = pred2label(y_pred)
    
    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(fold["y_test"], label_pred)
    prec = precision_score(fold["y_test"],label_pred)
    mcc = matthews_corrcoef(fold["y_test"], label_pred)

    conf = confusion_matrix(fold["y_test"], label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    
    fpr, tpr, thresholds = roc_curve(fold["y_test"], y_pred)
    auc = roc_auc_score(fold["y_test"], y_pred)
    
    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Test")
    evaluations["Type"].append("Vote")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)
    
    ##################################################################################
    ##### LOGISTIC regression using the scores
    ##################################################################################
    
    print("Training logistic regression.")
    
    # lr_model = LogisticRegression(penalty='elasticnet')
    lr_model = LogisticRegression()
    lr_model.fit(X_lr_train, fold["y_train"])
    
    lr_current_model_path = os.path.join(modelPath, "LogR_bestModel-fold{}.hdf5".format(i))
    lr_model_file_obj = open(lr_current_model_path, 'wb')
    pickle.dump(lr_model, lr_model_file_obj)
    lr_model_file_obj.close()
    
    ##################################################################################
    ##### Prediction and metrics for TRAIN dataset
    ##################################################################################
    
    print("Generating logistic train set metrics.")
    
    y_pred = lr_model.predict(X_lr_train)
    label_pred = pred2label(y_pred)
    
    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(fold["y_train"], label_pred)
    prec = precision_score(fold["y_train"],label_pred)
    mcc = matthews_corrcoef(fold["y_train"], label_pred)

    conf = confusion_matrix(fold["y_train"], label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    
    fpr, tpr, thresholds = roc_curve(fold["y_train"], y_pred)
    auc = roc_auc_score(fold["y_train"], y_pred)
    
    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Train")
    evaluations["Type"].append("LogR")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)
    
    ##################################################################################
    ##### Prediction and metrics for TEST dataset
    ##################################################################################
    
    print("Generating logistic test set metrics.")

    y_pred = lr_model.predict(X_lr_test)
    label_pred = pred2label(y_pred)
    
    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(fold["y_test"], label_pred)
    prec = precision_score(fold["y_test"],label_pred)
    mcc = matthews_corrcoef(fold["y_test"], label_pred)

    conf = confusion_matrix(fold["y_test"], label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    
    fpr, tpr, thresholds = roc_curve(fold["y_test"], y_pred)
    auc = roc_auc_score(fold["y_test"], y_pred)
    
    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Test")
    evaluations["Type"].append("LogR")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)
    
    ##################################################################################
    ##### LINEAR regression using the scores
    ##################################################################################
    
    print("Training linear regression.")
    
    # lr_model = LogisticRegression(penalty='elasticnet')
    lr_model = LinearRegression(positive=True)
    lr_model.fit(X_lr_train, fold["y_train"])
    
    lr_current_model_path = os.path.join(modelPath, "LinR_bestModel-fold{}.hdf5".format(i))
    lr_model_file_obj = open(lr_current_model_path, 'wb')
    pickle.dump(lr_model, lr_model_file_obj)
    lr_model_file_obj.close()
    
    ##################################################################################
    ##### Prediction and metrics for TRAIN dataset
    ##################################################################################
    
    print("Generating linear train set metrics.")
    
    y_pred = lr_model.predict(X_lr_train)
    label_pred = pred2label(y_pred)
    
    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(fold["y_train"], label_pred)
    prec = precision_score(fold["y_train"],label_pred)
    mcc = matthews_corrcoef(fold["y_train"], label_pred)

    conf = confusion_matrix(fold["y_train"], label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    
    fpr, tpr, thresholds = roc_curve(fold["y_train"], y_pred)
    auc = roc_auc_score(fold["y_train"], y_pred)
    
    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Train")
    evaluations["Type"].append("LinR")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)
    
    ##################################################################################
    ##### Prediction and metrics for TEST dataset
    ##################################################################################
    
    print("Generating linear test set metrics.")

    y_pred = lr_model.predict(X_lr_test)
    label_pred = pred2label(y_pred)
    
    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(fold["y_test"], label_pred)
    prec = precision_score(fold["y_test"],label_pred)
    mcc = matthews_corrcoef(fold["y_test"], label_pred)

    conf = confusion_matrix(fold["y_test"], label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    
    fpr, tpr, thresholds = roc_curve(fold["y_test"], y_pred)
    auc = roc_auc_score(fold["y_test"], y_pred)
    
    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Test")
    evaluations["Type"].append("LinR")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)


Train/Test model on Fold #0.
Training DDE model.
Training DPR model.
Training DPC model.
Training TPC model.
Generating TRAIN set SUM metrics.
Generating TEST set SUM metrics.
Generating TRAIN set VOTE metrics.
Generating TEST set VOTE metrics.
Training logistic regression.
Generating logistic train set metrics.
Generating logistic test set metrics.
Training linear regression.
Generating linear train set metrics.
Generating linear test set metrics.

Train/Test model on Fold #1.
Training DDE model.
Training DPR model.
Training DPC model.
Training TPC model.
Generating TRAIN set SUM metrics.
Generating TEST set SUM metrics.
Generating TRAIN set VOTE metrics.
Generating TEST set VOTE metrics.
Training logistic regression.
Generating logistic train set metrics.
Generating logistic test set metrics.
Training linear regression.
Generating linear train set metrics.
Generating linear test set metrics.

Train/Test model on Fold #2.
Training DDE model.
Training DPR model.
Training DPC model.
Tr

## k-fold Training evaluation

In [14]:
evaluations_df = pd.DataFrame.from_dict(evaluations)

evaluations_df_grouped = evaluations_df.groupby(["Train_Test", "Type"]).mean().filter(['Accuracy', 
                                                                               'Precision', 
                                                                               'AUC', 
                                                                               'Sensitivity', 
                                                                               'Specificity', 
                                                                               'MCC'])

evaluations_df_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Train_Test,Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Test,LinR,0.816963,0.818646,0.894565,0.814433,0.8195,0.633981
Test,LogR,0.814443,0.818509,0.81444,0.808533,0.820347,0.629365
Test,Sum,0.76742,0.708952,0.879094,0.907633,0.627225,0.557315
Test,Vote,0.785891,0.760554,0.853871,0.834594,0.737186,0.574727
Train,LinR,0.976595,0.974963,0.997144,0.978379,0.974809,0.953253
Train,LogR,0.981004,0.982335,0.981003,0.979638,0.982369,0.962034
Train,Sum,0.920446,0.869087,0.990755,0.990344,0.850545,0.849333
Train,Vote,0.94364,0.918546,0.983508,0.973761,0.913522,0.889002


# Independent data

# Train on full data

In [15]:
# adding random shuffling of the dataset for training purpose
randomized_index_arr = np.arange(dde_train_features.shape[0])
randomized_index_arr = np.random.permutation(randomized_index_arr)

print("\nTrain/Test model on full training data.")

##################################################################################
##### TRAIN DDE model
##################################################################################

print("Training DDE model.")

dde_model = DDE_DLNN_Classifier(input_vec_shape = dde_train_features_shape)

dde_model_file_path = os.path.join(modelPath, "DDE_bestModel-full.hdf5")
## Define the model callbacks for early stopping and saving the model. Then train model
dde_modelCallbacks = [
    tf.keras.callbacks.ModelCheckpoint(dde_model_file_path,
                                       monitor = monitor, verbose = 0, save_best_only = True, 
                                       save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
]
dde_model.fit(x = dde_train_features[randomized_index_arr], y = train_labels[randomized_index_arr], 
              batch_size = dde_batch_size, epochs = dde_epochs, 
              verbose = 0, callbacks = dde_modelCallbacks, 
              validation_data = (dde_indpe_features, indpe_labels))

del dde_model
tf.keras.backend.clear_session()

##################################################################################
##### TRAIN DPR model
##################################################################################

print("Training DPR model.")

dpr_model = DPR_DLNN_Classifier(input_vec_shape = dpr_train_features_shape)

dpr_model_file_path = os.path.join(modelPath, "DPR_bestModel-full.hdf5")
## Define the model callbacks for early stopping and saving the model. Then train model
dpr_modelCallbacks = [
    tf.keras.callbacks.ModelCheckpoint(dpr_model_file_path,
                                       monitor = monitor, verbose = 0, save_best_only = True, 
                                       save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
]
dpr_model.fit(x = dpr_train_features[randomized_index_arr], y = train_labels[randomized_index_arr], 
              batch_size = dpr_batch_size, epochs = dpr_epochs, 
              verbose = 0, callbacks = dpr_modelCallbacks, 
              validation_data = (dpr_indpe_features, indpe_labels))

del dpr_model
tf.keras.backend.clear_session()

##################################################################################
##### TRAIN DPC model
##################################################################################

print("Training DPC model.")

dpc_model = DPC_DLNN_Classifier(input_vec_shape = dpc_train_features_shape)

dpc_model_file_path = os.path.join(modelPath, "DPC_bestModel-full.hdf5")
## Define the model callbacks for early stopping and saving the model. Then train model
dpc_modelCallbacks = [
    tf.keras.callbacks.ModelCheckpoint(dpc_model_file_path,
                                       monitor = monitor, verbose = 0, save_best_only = True, 
                                       save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
]
dpc_model.fit(x = dpc_train_features[randomized_index_arr], y = train_labels[randomized_index_arr], 
              batch_size = dpc_batch_size, epochs = dpc_epochs, 
              verbose = 0, callbacks = dpc_modelCallbacks, 
              validation_data = (dpc_indpe_features, indpe_labels))

del dpc_model
tf.keras.backend.clear_session()

##################################################################################
##### TRAIN TPC model
##################################################################################

print("Training TPC model.")

tpc_model = TPC_DLNN_Classifier(input_vec_shape = tpc_train_features_shape)

tpc_model_file_path = os.path.join(modelPath, "TPC_bestModel-full.hdf5")
## Define the model callbacks for early stopping and saving the model. Then train model
tpc_modelCallbacks = [
    tf.keras.callbacks.ModelCheckpoint(tpc_model_file_path,
                                       monitor = monitor, verbose = 0, save_best_only = True, 
                                       save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
]
tpc_model.fit(x = tpc_train_features[randomized_index_arr], y = train_labels[randomized_index_arr], 
              batch_size = tpc_batch_size, epochs = tpc_epochs, 
              verbose = 0, callbacks = tpc_modelCallbacks, 
              validation_data = (tpc_indpe_features, indpe_labels))

del tpc_model
tf.keras.backend.clear_session()


Train/Test model on full training data.
Training DDE model.
Training DPR model.
Training DPC model.
Training TPC model.


In [45]:
## create the evaluation data structure for all iterations
evaluations = {
    "Train_Test" : [],
    "Type" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Generate scores for Train/Test dataset
##################################################################################

dde_model = tf.keras.models.load_model(dde_model_file_path)
dde_train_y_pred = dde_model.predict(dde_train_features)
dde_indpe_y_pred = dde_model.predict(dde_indpe_features)
del dde_model
tf.keras.backend.clear_session()

dpr_model = tf.keras.models.load_model(dpr_model_file_path)
dpr_train_y_pred = dpr_model.predict(dpr_train_features)
dpr_indpe_y_pred = dpr_model.predict(dpr_indpe_features)
del dpr_model
tf.keras.backend.clear_session()

dpc_model = tf.keras.models.load_model(dpc_model_file_path)
dpc_train_y_pred = dpc_model.predict(dpc_train_features)
dpc_indpe_y_pred = dpc_model.predict(dpc_indpe_features)
del dpc_model
tf.keras.backend.clear_session()

tpc_model = tf.keras.models.load_model(tpc_model_file_path)
tpc_train_y_pred = tpc_model.predict(tpc_train_features)
tpc_indpe_y_pred = tpc_model.predict(tpc_indpe_features)
del tpc_model
tf.keras.backend.clear_session()

# X_lr_train = np.concatenate((dde_train_y_pred[:, np.newaxis], 
#                              dpr_train_y_pred[:, np.newaxis], 
#                              dpc_train_y_pred[:, np.newaxis], 
#                              tpc_train_y_pred[:, np.newaxis]), 
#                             axis=1)
# X_lr_indpe = np.concatenate((dde_indpe_y_pred[:, np.newaxis], 
#                              dpr_indpe_y_pred[:, np.newaxis], 
#                              dpc_indpe_y_pred[:, np.newaxis], 
#                              tpc_indpe_y_pred[:, np.newaxis]), 
#                             axis=1)

X_lr_train = np.concatenate((dde_train_y_pred, 
                             dpr_train_y_pred, 
                             dpc_train_y_pred, 
                             tpc_train_y_pred), 
                            axis=1)
X_lr_indpe = np.concatenate((dde_indpe_y_pred, 
                             dpr_indpe_y_pred, 
                             dpc_indpe_y_pred, 
                             tpc_indpe_y_pred), 
                            axis=1)

##################################################################################
##### Prediction and metrics for TRAIN dataset
##################################################################################

print("Generating TRAIN set SUM metrics.")

y_pred = np.sum(X_lr_train, axis=1)/3
y_pred = y_pred[:, np.newaxis]
label_pred = pred2label(y_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(train_labels, label_pred)
prec = precision_score(train_labels,label_pred)
mcc = matthews_corrcoef(train_labels, label_pred)

conf = confusion_matrix(train_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(train_labels, y_pred)
auc = roc_auc_score(train_labels, y_pred)

evaluations["Train_Test"].append("Train")
evaluations["Type"].append("Sum")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################
##### Prediction and metrics for TEST dataset
##################################################################################

print("Generating Independent set SUM metrics.")

y_pred = np.sum(X_lr_indpe, axis=1)/3
y_pred = y_pred[:, np.newaxis]
label_pred = pred2label(y_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(indpe_labels, label_pred)
prec = precision_score(indpe_labels,label_pred)
mcc = matthews_corrcoef(indpe_labels, label_pred)

conf = confusion_matrix(indpe_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(indpe_labels, y_pred)
auc = roc_auc_score(indpe_labels, y_pred)

evaluations["Train_Test"].append("Independent")
evaluations["Type"].append("Sum")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################
##### Prediction and metrics for TRAIN dataset
##################################################################################

print("Generating TRAIN set VOTE metrics.")

# y_pred = lr_model.predict(X_lr_train)
y_pred = np.sum(pred2label(X_lr_train), axis=1)/3
y_pred = y_pred[:, np.newaxis]
label_pred = pred2label(y_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(train_labels, label_pred)
prec = precision_score(train_labels,label_pred)
mcc = matthews_corrcoef(train_labels, label_pred)

conf = confusion_matrix(train_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(train_labels, y_pred)
auc = roc_auc_score(train_labels, y_pred)

evaluations["Train_Test"].append("Train")
evaluations["Type"].append("Vote")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################
##### Prediction and metrics for TEST dataset
##################################################################################

print("Generating Independent set VOTE metrics.")

# y_pred = lr_model.predict(X_lr_test)
y_pred = np.sum(pred2label(X_lr_indpe), axis=1)/3
y_pred = y_pred[:, np.newaxis]
label_pred = pred2label(y_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(indpe_labels, label_pred)
prec = precision_score(indpe_labels,label_pred)
mcc = matthews_corrcoef(indpe_labels, label_pred)

conf = confusion_matrix(indpe_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(indpe_labels, y_pred)
auc = roc_auc_score(indpe_labels, y_pred)

evaluations["Train_Test"].append("Independent")
evaluations["Type"].append("Vote")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################
##### LOGISTIC regression using the scores
##################################################################################

print("Training logistic regression.")

# lr_model = LogisticRegression(penalty='elasticnet')
lr_model = LogisticRegression()
lr_model.fit(X_lr_train, train_labels)

lr_current_model_path = os.path.join(modelPath, "LogR_bestModel-fold{}.hdf5".format(i))
lr_model_file_obj = open(lr_current_model_path, 'wb')
pickle.dump(lr_model, lr_model_file_obj)
lr_model_file_obj.close()

##################################################################################
##### Prediction and metrics for TRAIN dataset
##################################################################################

print("Generating train set metrics.")

y_pred = lr_model.predict(X_lr_train)
label_pred = pred2label(y_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(train_labels, label_pred)
prec = precision_score(train_labels,label_pred)
mcc = matthews_corrcoef(train_labels, label_pred)

conf = confusion_matrix(train_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(train_labels, y_pred)
auc = roc_auc_score(train_labels, y_pred)

evaluations["Train_Test"].append("Train")
evaluations["Type"].append("LogR")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################
##### Prediction and metrics for TEST dataset
##################################################################################

print("Generating test set metrics.")

y_pred = lr_model.predict(X_lr_indpe)
label_pred = pred2label(y_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(indpe_labels, label_pred)
prec = precision_score(indpe_labels,label_pred)
mcc = matthews_corrcoef(indpe_labels, label_pred)

conf = confusion_matrix(indpe_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(indpe_labels, y_pred)
auc = roc_auc_score(indpe_labels, y_pred)

evaluations["Train_Test"].append("Independent")
evaluations["Type"].append("LogR")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################
##### LINEAR regression using the scores
##################################################################################

print("Training linear regression.")

lr_model = LinearRegression()
lr_model.fit(X_lr_train, train_labels)

lr_current_model_path = os.path.join(modelPath, "LinR_bestModel-fold{}.hdf5".format(i))
lr_model_file_obj = open(lr_current_model_path, 'wb')
pickle.dump(lr_model, lr_model_file_obj)
lr_model_file_obj.close()

##################################################################################
##### Prediction and metrics for TRAIN dataset
##################################################################################

print("Generating train set metrics.")

y_pred = lr_model.predict(X_lr_train)
y_pred = (y_pred-y_pred.min())/(y_pred.max() - y_pred.min())
label_pred = pred2label(y_pred)
# label_pred = pred2label(np.clip(y_pred, 0, 1))

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(train_labels, label_pred)
prec = precision_score(train_labels,label_pred)
mcc = matthews_corrcoef(train_labels, label_pred)

conf = confusion_matrix(train_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(train_labels, y_pred)
auc = roc_auc_score(train_labels, y_pred)

evaluations["Train_Test"].append("Train")
evaluations["Type"].append("LinR")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################
##### Prediction and metrics for TEST dataset
##################################################################################

print("Generating test set metrics.")

y_pred = lr_model.predict(X_lr_indpe)
y_pred = (y_pred-y_pred.min())/(y_pred.max() - y_pred.min())
label_pred = pred2label(y_pred)
# label_pred = pred2label(np.clip(y_pred, 0, 1))

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(indpe_labels, label_pred)
prec = precision_score(indpe_labels,label_pred)
mcc = matthews_corrcoef(indpe_labels, label_pred)

conf = confusion_matrix(indpe_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(indpe_labels, y_pred)
auc = roc_auc_score(indpe_labels, y_pred)

evaluations["Train_Test"].append("Independent")
evaluations["Type"].append("LinR")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

Generating TRAIN set SUM metrics.
Generating Independent set SUM metrics.
Generating TRAIN set VOTE metrics.
Generating Independent set VOTE metrics.
Training logistic regression.
Generating train set metrics.
Generating test set metrics.
Training linear regression.
Generating train set metrics.
Generating test set metrics.


In [46]:
evaluations_df = pd.DataFrame.from_dict(evaluations)

evaluations_df

Unnamed: 0,Train_Test,Type,Accuracy,Precision,TPR,FPR,TPR_FPR_Thresholds,AUC,Sensitivity,Specificity,MCC
0,Train,Sum,0.665827,0.600305,"[0.0, 0.0008396305625524769, 0.042821158690176...","[0.0, 0.0, 0.0, 0.0008396305625524769, 0.00083...","[1.9715235, 0.9715236, 0.8819272, 0.8816154, 0...",0.919933,0.992443,0.339211,0.438025
1,Independent,Sum,0.267755,0.181066,"[0.0, 0.0, 0.0049261083743842365, 0.0049261083...","[0.0, 0.0009784735812133072, 0.000978473581213...","[1.9562814, 0.95628136, 0.92149633, 0.9106055,...",0.656951,0.970443,0.12818,0.116353
2,Train,Vote,0.809824,0.889241,"[0.0, 0.33837111670864817, 0.5617128463476071,...","[0.0, 0.0025188916876574307, 0.043660789252728...","[2.333333333333333, 1.3333333333333333, 1.0, 0...",0.876778,0.707809,0.911839,0.632962
3,Independent,Vote,0.675918,0.251282,"[0.0, 0.1330049261083744, 0.32019704433497537,...","[0.0, 0.03424657534246575, 0.16536203522504891...","[2.333333333333333, 1.3333333333333333, 1.0, 0...",0.628144,0.482759,0.714286,0.157276
4,Train,LogR,0.862301,0.86786,"[0.0, 0.8547439126784215, 1.0]","[0.0, 0.13014273719563393, 1.0]","[2, 1, 0]",0.862301,0.854744,0.869857,0.724684
5,Independent,LogR,0.609796,0.238095,"[0.0, 0.6157635467980296, 1.0]","[0.0, 0.3913894324853229, 1.0]","[2, 1, 0]",0.612187,0.615764,0.608611,0.168584
6,Train,LinR,0.930311,0.965486,"[0.0, 0.0008396305625524769, 0.209068010075566...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[2.0, 1.0, 0.6702897, 0.67019, 0.66940117, 0.6...",0.984938,0.892527,0.968094,0.863089
7,Independent,LinR,0.693878,0.263736,"[0.0, 0.0049261083743842365, 0.004926108374384...","[0.0, 0.0, 0.005870841487279843, 0.00587084148...","[2.0, 1.0, 0.84921, 0.8389519, 0.83037305, 0.8...",0.64614,0.472906,0.737769,0.17141
