In [1]:
##################################################################################
##### Experiment configuration: every tunable parameter is defined in this cell
##################################################################################

# --- Cross-validation setup ---
n_fold = 5        # number of stratified folds
shuffle = True    # shuffle samples before they are split into folds
seed = None       # random_state given to the fold splitter; None => splits differ between runs

# --- Input data; the "{}" placeholder is filled with a feature-encoding name ---
input_data_folder = "PredNTS_ensemble_data"
train_file_name = "PredNTS_input_train_{}.csv"
indpe_file_name = "PredNTS_input_indpe_{}.csv"

# --- Output locations ---
outPath = "Results"
expName = "NT_Site_PredNTS_Classification_MLDL_ensemble"
foldName = "folds.pickle"  # not referenced by any of the cells visible in this notebook

In [2]:
# Feature encodings used by the ensemble, in processing order.
# For the hand-crafted encodings the value is the number of random-forest trees.
# "SEQ" holds a placeholder string: the cells visible here branch on the key "SEQ"
# itself (deep network path) and never read this value.
enc_dict_rf_treeCount = {
    **dict.fromkeys(("ASDC", "CKSAAP4", "DistancePair", "Kmer"), 1000),
    "SEQ": "dlnn",
}

In [3]:
import os 
import pickle
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, confusion_matrix
from sklearn.metrics import roc_auc_score, matthews_corrcoef

import tensorflow as tf

import math

In [4]:
# List the GPUs TensorFlow can see and switch each one to on-demand memory growth,
# so TensorFlow allocates GPU memory as needed instead of reserving it all up front.
# Iterating over the list (instead of indexing element 0) keeps the cell runnable on
# CPU-only machines and applies the same setting to every GPU.
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


# Utility functions

In [5]:
# Encoding alphabet: the gap symbol '-' followed by 20 upper-case letters.
# A symbol's position in this string is its column index in the one-hot matrix.
all_char_dict = {symbol: column for column, symbol in enumerate('-ACDEFGHIKLMNPQRSTVWY')}

##################################################################################
##### define all CUSTOM functions
##################################################################################

def one_hot_encode_nt(sequence, char_dict=all_char_dict):
    """One-hot encode a sequence string.

    Parameters
    ----------
    sequence : str
        Sequence to encode; each character is upper-cased before lookup.
    char_dict : dict
        Maps each allowed (upper-case) character to its column index.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(sequence), len(char_dict))`` with a single 1
        per row, placed in the column of that position's character.

    Raises
    ------
    ValueError
        On the first character that is not a key of ``char_dict``.
    """
    encoded = np.zeros((len(sequence), len(char_dict)))

    for row, raw_char in enumerate(sequence):
        symbol = raw_char.upper()
        if symbol not in char_dict:
            raise ValueError('Incorrect character in NT sequence: '+sequence+' | Character: '+raw_char)
        encoded[row][char_dict[symbol]] = 1

    return encoded

def pred2label(y_pred):
    """Convert predicted scores to integer labels by rounding to the nearest integer.

    Rounding follows ``numpy.round`` (exact halves go to the nearest even integer,
    so a score of exactly 0.5 becomes 0).
    """
    return np.round(y_pred).astype(int)

# Define models

In [6]:
# Training settings for the one-hot-encoded sequence network
ohe_epochs = 100                 # number of epochs passed to model.fit
ohe_batch_size = 16              # batch size passed to model.fit
train_val_split = 0.2            # passed to model.fit as validation_split
callback_monitor = 'val_loss'    # quantity the ModelCheckpoint callback monitors
    
##################################################################################
##### Function to customize the DLNN architecture with parameters
##################################################################################

def OHE_DLNN(input_seq_shape = (41, 21),
             conv_filters_per_layer_1 = 25, kernel_length_1 = 10, conv_strides_1 = 1, ## 1st Convolutional layer parameters
             max_pool_width_1 = 3, max_pool_stride_1 = 3, ## 1st Maxpool layer parameters
             lstm_decode_units = 25, ## LSTM layer parameters
             conv_filters_per_layer_2 = 25,  kernel_length_2 = 5, conv_strides_2 = 1, ## 2nd Convolutional layer parameters
             max_pool_width_2 = 3, max_pool_stride_2 = 3, ## 2nd Maxpool layer parameters
             dense_decode_units = 256, ## Dense layer parameters
             prob = 0.5, learn_rate = 0.0005, 
             loss = 'binary_crossentropy', metrics = 'accuracy'):
    """Build and compile the two-branch network used for one-hot-encoded sequences.

    Architecture (in layer-creation order):
      * shared trunk: Conv1D -> ReLU -> max-pool -> dropout -> Gaussian noise
      * LSTM branch on the trunk output: LSTM (full sequence) -> dropout -> flatten -> noise
      * Conv branch on the trunk output: Conv1D -> ReLU -> max-pool -> dropout -> flatten -> noise
      * both branches concatenated -> Dense(ReLU) -> noise -> dropout -> Dense(1, sigmoid)

    Every Conv1D/LSTM/Dense kernel uses L2 regularisation with a fixed factor of 0.001,
    and every GaussianNoise layer uses stddev 0.1.

    Parameters
    ----------
    input_seq_shape : tuple
        Shape of one input sample handed to the Input layer.
    conv_filters_per_layer_1, kernel_length_1, conv_strides_1
        Filters, kernel size and stride of the trunk convolution.
    max_pool_width_1, max_pool_stride_1
        Pool size and stride of the trunk max-pool.
    lstm_decode_units
        Number of units of the LSTM branch.
    conv_filters_per_layer_2, kernel_length_2, conv_strides_2
        Filters, kernel size and stride of the convolution branch.
    max_pool_width_2, max_pool_stride_2
        Pool size and stride of the convolution-branch max-pool.
    dense_decode_units
        Units of the hidden dense layer.
    prob
        Rate used by every Dropout layer.
    learn_rate
        Learning rate of the Adam optimizer.
    loss, metrics
        Forwarded to ``model.compile``; ``metrics=None`` compiles without metrics.

    Returns
    -------
    tf.keras.Model
        The compiled model with a single sigmoid output.
    """
    layers = tf.keras.layers
    beta = 0.001  # L2 factor shared by all regularised kernels

    def l2_penalty():
        # a fresh regulariser instance per layer, as each layer gets its own object
        return tf.keras.regularizers.l2(beta)

    ######################################################################################################
    ########  SEQUENCE  ##################################################################################
    ######################################################################################################

    seq_input = layers.Input(shape=input_seq_shape)

    ## Shared trunk
    trunk = layers.Conv1D(conv_filters_per_layer_1, kernel_length_1,
                          strides=conv_strides_1,
                          kernel_regularizer=l2_penalty(),
                          padding="same")(seq_input)
    trunk = layers.Activation('relu')(trunk)
    trunk = layers.MaxPool1D(pool_size=max_pool_width_1, strides=max_pool_stride_1)(trunk)
    trunk = layers.Dropout(prob)(trunk)
    trunk = layers.GaussianNoise(stddev=0.1)(trunk)

    ## LSTM Path
    lstm_branch = layers.LSTM(lstm_decode_units, return_sequences=True,
                              kernel_regularizer=l2_penalty())(trunk)
    lstm_branch = layers.Dropout(prob)(lstm_branch)
    lstm_branch = layers.Flatten()(lstm_branch)
    lstm_branch = layers.GaussianNoise(stddev=0.1)(lstm_branch)

    ## Conv Path
    conv_branch = layers.Conv1D(conv_filters_per_layer_2, kernel_length_2,
                                strides=conv_strides_2,
                                kernel_regularizer=l2_penalty(),
                                padding='same')(trunk)
    conv_branch = layers.Activation('relu')(conv_branch)
    conv_branch = layers.MaxPooling1D(pool_size=max_pool_width_2, strides=max_pool_stride_2)(conv_branch)
    conv_branch = layers.Dropout(prob)(conv_branch)
    conv_branch = layers.Flatten()(conv_branch)
    conv_branch = layers.GaussianNoise(stddev=0.1)(conv_branch)

    merged = layers.Concatenate(1)([lstm_branch, conv_branch])

    ######################################################################################################
    ########  Classifier  ################################################################################
    ######################################################################################################

    head = layers.Dense(dense_decode_units,
                        kernel_regularizer=l2_penalty(),
                        activation='relu')(merged)
    head = layers.GaussianNoise(stddev=0.1)(head)
    head = layers.Dropout(prob)(head)
    head = layers.Dense(1,
                        kernel_regularizer=l2_penalty(),
                        activation='sigmoid')(head)

    ## Generate Model from input and output
    model = tf.keras.models.Model(inputs=seq_input, outputs=head)

    ## Compile model (metrics are only passed when the caller supplied some)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate)
    if metrics is None:
        model.compile(optimizer=optimizer, loss=loss)
    else:
        model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    return model

In [7]:
def get_model(dlnn = False, trees = None, cw = None):
    """Return an untrained model of the requested kind.

    Parameters
    ----------
    dlnn : bool
        When true, return the sequence network built by ``OHE_DLNN()``.
    trees : int or None
        When given, return a bootstrap random forest with that many trees
        (entropy criterion, out-of-bag scoring enabled).
    cw : dict or None
        Class weights, used by the random forest and the logistic regression.

    Returns
    -------
    The Keras model, a ``RandomForestClassifier`` or, when neither ``dlnn`` nor
    ``trees`` is set, a ``LogisticRegression``.

    Raises
    ------
    ValueError
        If both ``dlnn`` and ``trees`` are set.
    """
    wants_forest = trees is not None

    if dlnn and wants_forest:
        raise ValueError('Both dlnn and trees parameters are set. Pass either of the two, not both.')

    if dlnn:
        return OHE_DLNN()

    if wants_forest:
        return RandomForestClassifier(n_estimators=trees,
                                      criterion='entropy',
                                      class_weight=cw,
                                      bootstrap=True,
                                      oob_score=True)

    return LogisticRegression(class_weight=cw)

# Build kFolds as indices

In [8]:
##################################################################################
##### Build k-fold splits (kept as index arrays and reused for every encoding)
##################################################################################

# Labels are taken from the training file of the first configured encoding.
# NOTE(review): this assumes all encoding files list the samples in the same order — confirm.
first_encoding = next(iter(enc_dict_rf_treeCount))
sample_train_input_data_file = os.path.join(input_data_folder,
                                            train_file_name.format(first_encoding))
sample_data = pd.read_csv(sample_train_input_data_file, sep=',', header=None)

# Column 0 holds the class label; the splitter only needs row positions as "features".
sample_labels = np.array(sample_data[0])
sample_features = np.array(range(sample_data.shape[0]))

skf = StratifiedKFold(n_splits=n_fold, shuffle=shuffle, random_state=seed)
kfold_list = [
    {"train_indices": train_index, "test_indices": test_index}
    for train_index, test_index in skf.split(sample_features, sample_labels)
]

# k-fold training (per-fold ensemble fitting and metric collection)

In [9]:
##################################################################################
##### Storage for the k-fold evaluation results
##################################################################################

## One list per column; the fold loop appends one "Train" and one "Test" entry per fold
train_evaluations = {
    column: []
    for column in (
        "Fold",
        "Train_Test",
        "Accuracy",
        "Precision",
        "TPR",
        "FPR",
        "TPR_FPR_Thresholds",
        "AUC",
        "Sensitivity",
        "Specificity",
        "MCC",
    )
}

In [10]:
## Create the directory that will hold every saved model (no-op if it already exists)
modelPath = os.path.join(outPath, expName, "{}fold".format(n_fold), "models")
os.makedirs(modelPath, exist_ok=True)

for i, fold in enumerate(kfold_list):

    print("\n======================================================================")
    print("Train/Test ensemble on Fold #"+str(i)+".")

    ##################################################################################
    ##### Training Ensemble: one base model per encoding
    ##################################################################################

    # Per-encoding predictions of the base models; the probability lists become the
    # feature columns of the logistic-regression meta-model below.
    fold_X_lr_train_proba_list = []
    fold_X_lr_train_label_list = []
    fold_X_lr_test_proba_list = []
    fold_X_lr_test_label_list = []

    for current_dataset_variety in enc_dict_rf_treeCount.keys():

        print("Training variety:", current_dataset_variety)

        is_sequence_input = current_dataset_variety == 'SEQ'

        ##################################################################################
        ##### Fetching training data
        ##################################################################################

        train_input_data_file = os.path.join(input_data_folder, train_file_name.format(current_dataset_variety))

        train_data = pd.read_csv(train_input_data_file, sep=',', header=None)

        # Column 0 is the label; the remaining columns are the features
        # (for 'SEQ': the raw sequence string in the first remaining column, one-hot encoded here).
        train_labels = np.array(train_data[0])
        if is_sequence_input:
            train_sequences = np.array(train_data.drop(0, axis=1))
            train_features = np.array([one_hot_encode_nt(val[0], all_char_dict) for val in train_sequences])
        else:
            train_features = np.array(train_data.drop(0, axis=1))

        fold_train_features = train_features[fold['train_indices'], :]
        fold_train_labels = train_labels[fold['train_indices']]
        fold_test_features = train_features[fold['test_indices'], :]
        fold_test_labels = train_labels[fold['test_indices']]

        ##################################################################################
        ##### Model training on fold
        ##################################################################################

        # adding random shuffling of the dataset for training purpose
        randomized_index_arr = np.arange(fold_train_features.shape[0])
        randomized_index_arr = np.random.permutation(randomized_index_arr)

        # NOTE: the random-forest models below are stored with pickle even though the
        # file name carries the .hdf5 suffix; the name is left unchanged.
        model_file_path = os.path.join(modelPath, "{}_bestModel-fold{}.hdf5".format(current_dataset_variety, i))

        if is_sequence_input:

            model = get_model(dlnn=True)

            ## Checkpoint callback: after each epoch, save the model if the monitored
            ## quantity improved (there is no early stopping; all epochs are run).
            modelCallbacks = [
                tf.keras.callbacks.ModelCheckpoint(model_file_path, monitor = callback_monitor,
                                                   verbose = 0, save_best_only = True,
                                                   save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
            ]

            model.fit(x = fold_train_features[randomized_index_arr], y = fold_train_labels[randomized_index_arr],
                      batch_size = ohe_batch_size, epochs = ohe_epochs,
                      verbose = 0, callbacks = modelCallbacks,
                      validation_split = train_val_split)

            # continue with the best checkpoint rather than the last-epoch weights
            model = tf.keras.models.load_model(model_file_path)

        else:

            # fetch model
            model = get_model(trees = enc_dict_rf_treeCount[current_dataset_variety],
                              cw = {0:1, 1:1})

            # train model
            model.fit(X = fold_train_features[randomized_index_arr], y = fold_train_labels[randomized_index_arr])

            # saving model to file; the context manager closes the handle even if dumping fails
            with open(model_file_path, 'wb') as model_file_obj:
                pickle.dump(model, model_file_obj)

        ##################################################################################
        ##### Base-model predictions for the TRAIN part, then the TEST part, of the fold
        ##################################################################################

        # NOTE(review): the TRAIN predictions are made on the same rows the base model was
        # just fitted on, and they later train the meta-model. They are likely far more
        # confident than the TEST predictions; out-of-fold predictions would avoid this.
        for split_features, proba_list, label_list in (
            (fold_train_features, fold_X_lr_train_proba_list, fold_X_lr_train_label_list),
            (fold_test_features, fold_X_lr_test_proba_list, fold_X_lr_test_label_list),
        ):
            if is_sequence_input:
                # the network emits one sigmoid score per sample in a column vector
                y_pred = model.predict(split_features)
                y_pred = y_pred.reshape((y_pred.shape[0],))
                label_pred = pred2label(y_pred)
            else:
                # probability of class 1
                y_pred = model.predict_proba(split_features)[:, 1]
                label_pred = model.predict(split_features)

            proba_list.append(y_pred)
            label_list.append(label_pred)

    ##################################################################################
    ##### Training logistic regression model
    ##################################################################################

    print("Training Logistic Regression of Ensemble..")

    # generating features from scores: one row per sample, one column per encoding
    X_lr_train_features = np.array(fold_X_lr_train_proba_list).T
    X_lr_test_features = np.array(fold_X_lr_test_proba_list).T

    # fetch model
    lr_model = get_model(trees=None, cw={0:1, 1:1})

    # train model
    # fold_train_labels / fold_test_labels come from the last encoding processed above;
    # this assumes every encoding file carries the same labels in the same row order.
    lr_model.fit(X = X_lr_train_features, y = fold_train_labels)

    # saving model to file (pickle format; file name kept as-is)
    model_file_path = os.path.join(modelPath, "full_LR_Model-fold{}.hdf5".format(i))
    with open(model_file_path, 'wb') as model_file_obj:
        pickle.dump(lr_model, model_file_obj)

    ##################################################################################
    ##### Ensemble prediction and metrics for the TRAIN and TEST parts of the fold
    ##################################################################################

    for split_name, split_features, split_labels in (
        ("Train", X_lr_train_features, fold_train_labels),
        ("Test", X_lr_test_features, fold_test_labels),
    ):
        y_pred = lr_model.predict_proba(split_features)[:, 1]
        label_pred = lr_model.predict(split_features)

        # Compute accuracy, precision, mcc
        acc = accuracy_score(split_labels, label_pred)
        prec = precision_score(split_labels, label_pred)
        mcc = matthews_corrcoef(split_labels, label_pred)

        # Sensitivity / specificity from the 2x2 confusion matrix
        conf = confusion_matrix(split_labels, label_pred)
        tn, fp, fn, tp = conf.ravel()
        sens = tp/(tp+fn)
        spec = tn/(tn+fp)

        # ROC curve and area under it; named auc_value so the imported
        # sklearn.metrics.auc function is not overwritten
        fpr, tpr, thresholds = roc_curve(split_labels, y_pred)
        auc_value = roc_auc_score(split_labels, y_pred)

        train_evaluations["Fold"].append(i)
        train_evaluations["Train_Test"].append(split_name)
        train_evaluations["Accuracy"].append(acc)
        train_evaluations["Precision"].append(prec)
        train_evaluations["TPR"].append(tpr)
        train_evaluations["FPR"].append(fpr)
        train_evaluations["TPR_FPR_Thresholds"].append(thresholds)
        train_evaluations["AUC"].append(auc_value)
        train_evaluations["Sensitivity"].append(sens)
        train_evaluations["Specificity"].append(spec)
        train_evaluations["MCC"].append(mcc)


Train/Test ensemble on Fold #0.
Training variety: ASDC
Training variety: CKSAAP4
Training variety: DistancePair
Training variety: Kmer
Training variety: SEQ
Training Logistic Regression of Ensemble..

Train/Test ensemble on Fold #1.
Training variety: ASDC
Training variety: CKSAAP4
Training variety: DistancePair
Training variety: Kmer
Training variety: SEQ
Training Logistic Regression of Ensemble..

Train/Test ensemble on Fold #2.
Training variety: ASDC
Training variety: CKSAAP4
Training variety: DistancePair
Training variety: Kmer
Training variety: SEQ
Training Logistic Regression of Ensemble..

Train/Test ensemble on Fold #3.
Training variety: ASDC
Training variety: CKSAAP4
Training variety: DistancePair
Training variety: Kmer
Training variety: SEQ
Training Logistic Regression of Ensemble..

Train/Test ensemble on Fold #4.
Training variety: ASDC
Training variety: CKSAAP4
Training variety: DistancePair
Training variety: Kmer
Training variety: SEQ
Training Logistic Regression of Ensemb

## k-fold Training evaluation

In [11]:
# One row per (fold, Train/Test) entry, one column per key of train_evaluations
train_evaluations_df = pd.DataFrame(train_evaluations)

In [12]:
# Mean of each summary metric across folds, separately for the Train and Test rows.
# The metric columns are selected BEFORE aggregating: the TPR/FPR/threshold columns hold
# per-fold arrays (object dtype) that cannot be averaged, and pandas >= 2.0 raises on such
# columns instead of silently dropping them.
train_evaluations_df.groupby(["Train_Test"])[
    ['Sensitivity', 'Specificity', 'Accuracy', 'MCC', 'AUC']
].mean()

Unnamed: 0_level_0,Sensitivity,Specificity,Accuracy,MCC,AUC
Train_Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Test,0.806027,0.816951,0.811491,0.623078,0.895633
Train,0.995382,0.997901,0.996642,0.993289,0.999902


In [13]:
# Standard deviation of each summary metric across folds, separately for Train and Test rows.
# The metric columns are selected BEFORE aggregating: the TPR/FPR/threshold columns hold
# per-fold arrays (object dtype) that cannot be aggregated, and pandas >= 2.0 raises on such
# columns instead of silently dropping them.
train_evaluations_df.groupby(["Train_Test"])[
    ['Sensitivity', 'Specificity', 'Accuracy', 'MCC', 'AUC']
].std()

Unnamed: 0_level_0,Sensitivity,Specificity,Accuracy,MCC,AUC
Train_Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Test,0.016886,0.016015,0.014385,0.028728,0.018728
Train,0.000939,0.001963,0.000953,0.001911,8e-05


# Independent training

In [14]:
##################################################################################
##### Training Ensemble on the full training set; scoring the independent set
##################################################################################

# Per-encoding predictions of the base models; the probability lists are the
# inputs of the meta-model trained in a later cell.
X_lr_train_proba_list = []
X_lr_train_label_list = []
X_lr_indpe_proba_list = []
X_lr_indpe_label_list = []

for current_dataset_variety in enc_dict_rf_treeCount.keys():

    print("Training variety:", current_dataset_variety)

    is_sequence_input = current_dataset_variety == 'SEQ'

    train_input_data_file = os.path.join(input_data_folder, train_file_name.format(current_dataset_variety))
    indpe_input_data_file = os.path.join(input_data_folder, indpe_file_name.format(current_dataset_variety))

    train_data = pd.read_csv(train_input_data_file, sep=',', header=None)
    indpe_data = pd.read_csv(indpe_input_data_file, sep=',', header=None)

    # Column 0 is the label; the remaining columns are the features
    # (for 'SEQ': the raw sequence string in the first remaining column, one-hot encoded here).
    train_labels = np.array(train_data[0])
    indpe_labels = np.array(indpe_data[0])

    if is_sequence_input:
        train_sequences = np.array(train_data.drop(0, axis=1))
        train_features = np.array([one_hot_encode_nt(val[0], all_char_dict) for val in train_sequences])

        indpe_sequences = np.array(indpe_data.drop(0, axis=1))
        indpe_features = np.array([one_hot_encode_nt(val[0], all_char_dict) for val in indpe_sequences])

    else:
        train_features = np.array(train_data.drop(0, axis=1))
        indpe_features = np.array(indpe_data.drop(0, axis=1))

    ##################################################################################
    ##### Model training on the full training set
    ##################################################################################

    # adding random shuffling of the dataset for training purpose
    randomized_index_arr = np.arange(train_features.shape[0])
    randomized_index_arr = np.random.permutation(randomized_index_arr)

    # NOTE: the random-forest models below are stored with pickle even though the
    # file name carries the .hdf5 suffix; the name is left unchanged.
    model_file_path = os.path.join(modelPath, "{}_fullModel.hdf5".format(current_dataset_variety))

    if is_sequence_input:

        model = get_model(dlnn=True)

        ## Checkpoint callback: after each epoch, save the model if the monitored
        ## quantity improved (there is no early stopping; all epochs are run).
        modelCallbacks = [
            tf.keras.callbacks.ModelCheckpoint(model_file_path, monitor = callback_monitor,
                                               verbose = 0, save_best_only = True,
                                               save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
        ]

        model.fit(x = train_features[randomized_index_arr], y = train_labels[randomized_index_arr],
                  batch_size = ohe_batch_size, epochs = ohe_epochs,
                  verbose = 0, callbacks = modelCallbacks,
                  validation_split = train_val_split)

        # continue with the best checkpoint rather than the last-epoch weights
        model = tf.keras.models.load_model(model_file_path)

    else:
        # fetch model
        model = get_model(trees = enc_dict_rf_treeCount[current_dataset_variety],
                          cw = {0:1, 1:1})

        # train model
        model.fit(X = train_features[randomized_index_arr], y = train_labels[randomized_index_arr])

        # saving model to file; the context manager closes the handle even if dumping fails
        with open(model_file_path, 'wb') as model_file_obj:
            pickle.dump(model, model_file_obj)

    ##################################################################################
    ##### Base-model predictions for the TRAIN set, then the INDEPENDENT set
    ##################################################################################

    # NOTE(review): the TRAIN predictions are made on the same rows the base model was
    # fitted on, so they are likely far more confident than the INDEPENDENT predictions.
    for split_features, proba_list, label_list in (
        (train_features, X_lr_train_proba_list, X_lr_train_label_list),
        (indpe_features, X_lr_indpe_proba_list, X_lr_indpe_label_list),
    ):
        if is_sequence_input:
            # the network emits one sigmoid score per sample in a column vector
            y_pred = model.predict(split_features)
            y_pred = y_pred.reshape((y_pred.shape[0],))
            label_pred = pred2label(y_pred)
        else:
            # probability of class 1
            y_pred = model.predict_proba(split_features)[:, 1]
            label_pred = model.predict(split_features)

        proba_list.append(y_pred)
        label_list.append(label_pred)

Training variety: ASDC
Training variety: CKSAAP4
Training variety: DistancePair
Training variety: Kmer
Training variety: SEQ


In [20]:
##################################################################################
##### Storage for the evaluation of the final ensemble (full train / independent)
##################################################################################

indpe_evaluations = {
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Training logistic regression model
##################################################################################

print("Training Logistic Regression of Ensemble..")

# generating features from scores: one row per sample, one column per encoding
X_lr_train_features = np.array(X_lr_train_proba_list).T
X_lr_indpe_features = np.array(X_lr_indpe_proba_list).T

# fetch model
# NOTE(review): class 0 is weighted 100x here, whereas the k-fold cell trains its
# logistic regression with cw={0:1, 1:1}; confirm the difference is intended, since the
# k-fold metrics then describe a differently weighted meta-model.
lr_model = get_model(trees=None,
                     cw={0:100, 1:1})

# train model
# train_labels / indpe_labels are the ones left by the last encoding loaded in the
# base-model cell; this assumes all encoding files share the same row order.
lr_model.fit(X = X_lr_train_features, y = train_labels)

# saving model to file (pickle format; file name kept as-is).
# The context manager closes the handle even if dumping fails.
model_file_path = os.path.join(modelPath, "full_LR_Model.hdf5")
with open(model_file_path, 'wb') as model_file_obj:
    pickle.dump(lr_model, model_file_obj)

##################################################################################
##### Ensemble prediction and metrics for the TRAIN and INDEPENDENT datasets
##################################################################################

for split_name, split_features, split_labels in (
    ("Train", X_lr_train_features, train_labels),
    ("Independent", X_lr_indpe_features, indpe_labels),
):
    y_pred = lr_model.predict_proba(split_features)[:, 1]
    label_pred = lr_model.predict(split_features)

    # Compute accuracy, precision, mcc
    acc = accuracy_score(split_labels, label_pred)
    prec = precision_score(split_labels, label_pred)
    mcc = matthews_corrcoef(split_labels, label_pred)

    # Sensitivity / specificity from the 2x2 confusion matrix
    conf = confusion_matrix(split_labels, label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)

    # ROC curve and area under it; named auc_value so the imported
    # sklearn.metrics.auc function is not overwritten
    fpr, tpr, thresholds = roc_curve(split_labels, y_pred)
    auc_value = roc_auc_score(split_labels, y_pred)

    indpe_evaluations["Train_Test"].append(split_name)
    indpe_evaluations["Accuracy"].append(acc)
    indpe_evaluations["Precision"].append(prec)
    indpe_evaluations["TPR"].append(tpr)
    indpe_evaluations["FPR"].append(fpr)
    indpe_evaluations["TPR_FPR_Thresholds"].append(thresholds)
    indpe_evaluations["AUC"].append(auc_value)
    indpe_evaluations["Sensitivity"].append(sens)
    indpe_evaluations["Specificity"].append(spec)
    indpe_evaluations["MCC"].append(mcc)

Training Logistic Regression of Ensemble..


## Independent evaluation

In [21]:
# One row per evaluated dataset ("Train", "Independent"), one column per key of indpe_evaluations
indpe_evaluations_df = pd.DataFrame(indpe_evaluations)

In [22]:
# Show only the scalar summary metrics (the array-valued ROC columns are left out)
indpe_evaluations_df.filter(
    items=['Train_Test', 'Accuracy', 'AUC', 'Sensitivity', 'Specificity', 'MCC']
)

Unnamed: 0,Train_Test,Accuracy,AUC,Sensitivity,Specificity,MCC
0,Train,0.996222,0.999878,0.992443,1.0,0.992472
1,Independent,0.773878,0.654657,0.428571,0.842466,0.250801


# Feature importances

In [18]:
# Fit a small (10-tree) random forest with equal class weights on the stacked
# base-model scores and labels currently held in X_lr_train_features / train_labels.
rf_model = get_model(trees=10, cw={0:1, 1:1})
rf_model.fit(X_lr_train_features, train_labels)

# One importance value per column of X_lr_train_features
rf_model.feature_importances_

  warn(


array([0.11023905, 0.29932231, 0.1957017 , 0.39411131, 0.00062564])

In [19]:
# Same 10-tree random forest as the equal-weight variant, but with class 0 weighted 100x,
# fitted on the stacked base-model scores in X_lr_train_features / train_labels.
rf_model = get_model(trees=10, cw={0:100, 1:1})
rf_model.fit(X_lr_train_features, train_labels)

# One importance value per column of X_lr_train_features
rf_model.feature_importances_

  warn(


array([2.01284792e-01, 2.99243433e-01, 2.99535754e-01, 1.99724860e-01,
       2.11161450e-04])