In [1]:
##################################################################################
##### Define all parameters for model tuning
##################################################################################

n_fold = 5
expName = "NT_Site_PredNTS_Classification_ML_KgapRFE"
outPath = "Results"
foldName = "folds.pickle"

shuffle = True
seed = None

input_data_folder = "PredNTS_MathFeature_ENC"

monitor = 'val_loss'

sub_feature_count = 200

In [2]:
kgap_max = 4

train_data_filename = 'Training-datasets-PredNTS_kgap_{}.csv'
indpe_data_filename = 'independent-dataset-PredNTS_kgap_{}.csv'

In [3]:
import os 
import pickle
import numpy as np
import pandas as pd

# import tensorflow as tf

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, confusion_matrix
from sklearn.metrics import roc_auc_score, classification_report, matthews_corrcoef

from sklearn.feature_selection import RFE

import math

In [4]:
##################################################################################
##### Build k-fold functions
##################################################################################

## Build the K-fold from dataset
def build_kfold(features, labels, k=10, shuffle=False, seed=None):
    
    skf = StratifiedKFold(n_splits=k, shuffle=shuffle, random_state=seed)
    kfoldList = []
    for train_index, test_index in skf.split(features, labels):
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        kfoldList.append({
            "X_train": X_train,
            "X_test": X_test,
            "y_train":y_train,
            "y_test":y_test
        })
    return kfoldList

In [5]:
##################################################################################
##### define evaluator functions
##################################################################################

def pred2label(y_pred):
    y_pred = np.round(y_pred)
    return y_pred

In [6]:
def get_model(cw = None):
    
    model = RandomForestClassifier(n_estimators=1000,
                                   criterion='entropy', 
                                   class_weight=cw,
                                   bootstrap=True,
                                   oob_score=True, 
                                   max_depth=
                                  )
    
    return model

# Train data preparation

In [7]:
##################################################################################
##### Read CSV data
##################################################################################

for i in range(kgap_max+1):
    
    current_train_data_filepath = os.path.join(input_data_folder, train_data_filename.format(i))
    current_train_data = pd.read_csv(current_train_data_filepath, sep=',', header=0)
    current_train_data = current_train_data.drop('label', axis=1)
    
    if i == 0:
        train_data = current_train_data
    else:
        train_data = pd.merge(
            train_data,
            current_train_data,
            how="inner",
            on='nameseq'
        )

train_data['label'] = pd.Series([int(val.split('_')[-2])
                                 for val in train_data['nameseq']])

train_data = train_data.drop('nameseq', axis=1)

train_features = np.array(train_data.drop('label', axis=1))
train_labels = np.array(train_data['label'])
train_labels = train_labels.reshape((train_labels.shape[0], 1))

In [8]:
##################################################################################
##### Recursive feature selection
##################################################################################

# model = DecisionTreeClassifier(criterion="gini")
model = get_model()

selector = RFE(model, n_features_to_select=sub_feature_count, step=50)
selector = selector.fit(train_features, train_labels.reshape(train_labels.shape[0]))

feature_indices = np.where(selector.ranking_ == 1)[0]

In [9]:
# feature_indices = range(train_features.shape[1])

In [10]:
##################################################################################
##### Extract features and labels, create folds
##################################################################################

train_features = train_features[:, feature_indices]

folds = build_kfold(train_features, train_labels, k=n_fold, shuffle=shuffle, seed=seed)

input_vec_shape = train_features[0].shape

## Write the k-fold dataset to file
foldPath = os.path.join(outPath, expName, "{}fold".format(n_fold))
if(not os.path.isdir(foldPath)):
    os.makedirs(foldPath)
pickle.dump(folds, open(os.path.join(foldPath, foldName), "wb"))

# Independent data

In [11]:
##################################################################################
##### Read CSV data
##################################################################################

for i in range(kgap_max+1):

    current_indpe_data_filepath = os.path.join(input_data_folder, indpe_data_filename.format(i))
    current_indpe_data = pd.read_csv(current_indpe_data_filepath, sep=',', header=0)
    current_indpe_data = current_indpe_data.drop('label', axis=1)
    
    if i == 0:
        indpe_data = current_indpe_data
    else:
        indpe_data = pd.merge(
            indpe_data,
            current_indpe_data,
            how="inner",
            on='nameseq'
        )

indpe_data['label'] = pd.Series([int(val.split('_')[-2])
                                 for val in indpe_data['nameseq']])

indpe_data = indpe_data.drop('nameseq', axis=1)

##################################################################################
##### Extract features and labels, create folds
##################################################################################

indpe_features = np.array(indpe_data.drop('label', axis=1))
indpe_features = indpe_features[:, feature_indices]

indpe_labels = np.array(indpe_data['label'])
indpe_labels = indpe_labels.reshape((indpe_labels.shape[0], 1))

# Training

In [12]:
##################################################################################
##### For each input file, train model and generate different outputs in a structured folder
##################################################################################

## create the evaluation data structure for all iterations
evaluations = {
    "Fold" : [],
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Train/Test model on all folds, generate evaluations
##################################################################################

## Create and set directory to save model
modelPath = os.path.join(outPath, expName, "{}fold".format(n_fold), "models")
if(not os.path.isdir(modelPath)):
    os.makedirs(modelPath)

i = -1
for fold in folds:
    i += 1
    
    print("\nTrain/Test model on Fold #"+str(i)+".")
    
    # adding random shuffling of the dataset for training purpose
    index_arr = np.arange(fold["X_train"].shape[0])
    index_arr = np.random.permutation(index_arr)
    
    model = get_model(cw={0:1, 1:1})
    
    ## Define the model callbacks for early stopping and saving the model. Then train model
    current_model_path = os.path.join(modelPath, "bestModel-fold{}.hdf5".format(i))
    
    model.fit(fold["X_train"][index_arr], fold["y_train"][index_arr].reshape(fold["y_train"].shape[0]))
    
    model_file_obj = open(current_model_path, 'wb')
    pickle.dump(model, model_file_obj)
    model_file_obj.close()
    
    ##################################################################################
    ##### Prediction and metrics for TRAIN dataset
    ##################################################################################

    y_pred = model.predict(fold["X_train"])
    label_pred = pred2label(y_pred)
    
    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(fold["y_train"], label_pred)
    prec = precision_score(fold["y_train"],label_pred)
    mcc = matthews_corrcoef(fold["y_train"], label_pred)

    conf = confusion_matrix(fold["y_train"], label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    
    fpr, tpr, thresholds = roc_curve(fold["y_train"], y_pred)
    auc = roc_auc_score(fold["y_train"], y_pred)
    
    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Train")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)
    
    ##################################################################################
    ##### Prediction and metrics for TEST dataset
    ##################################################################################

    y_pred = model.predict(fold["X_test"])
    label_pred = pred2label(y_pred)
    
    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(fold["y_test"], label_pred)
    prec = precision_score(fold["y_test"],label_pred)
    mcc = matthews_corrcoef(fold["y_test"], label_pred)

    conf = confusion_matrix(fold["y_test"], label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    
    fpr, tpr, thresholds = roc_curve(fold["y_test"], y_pred)
    auc = roc_auc_score(fold["y_test"], y_pred)
    
    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Test")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)


Train/Test model on Fold #0.

Train/Test model on Fold #1.

Train/Test model on Fold #2.

Train/Test model on Fold #3.

Train/Test model on Fold #4.


## k-fold Training evaluation

In [13]:
evaluations_df = pd.DataFrame.from_dict(evaluations)

evaluations_df_grouped = evaluations_df.groupby(["Train_Test"]).mean().filter(['Accuracy', 
                                                                               'Precision', 
                                                                               'AUC', 
                                                                               'Sensitivity', 
                                                                               'Specificity', 
                                                                               'MCC'])

evaluations_df_grouped

Unnamed: 0_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Train_Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Test,0.806474,0.807655,0.806475,0.804395,0.808555,0.613283
Train,0.996746,0.998736,0.996747,0.994753,0.998741,0.993502


In [14]:
# 1000 entropy, no rfe
# 	Accuracy	Precision	AUC	Sensitivity	Specificity	MCC
# Train_Test						
# Test	0.812348	0.811298	0.812355	0.814458	0.810253	0.625033
# Train	0.996746	0.997898	0.996746	0.995592	0.997900	0.993499

In [15]:
# 1000 gini, no rfe
# 	Accuracy	Precision	AUC	Sensitivity	Specificity	MCC
# Train_Test						
# Test	0.814029	0.811770	0.814015	0.817777	0.810253	0.628354
# Train	0.996432	0.998525	0.996432	0.994333	0.998531	0.992872

In [16]:
evaluations_df[evaluations_df["Train_Test"] == 'Test']

Unnamed: 0,Fold,Train_Test,Accuracy,Precision,TPR,FPR,TPR_FPR_Thresholds,AUC,Sensitivity,Specificity,MCC
1,0,Test,0.786164,0.796537,"[0.0, 0.7698744769874477, 1.0]","[0.0, 0.19747899159663865, 1.0]","[2, 1, 0]",0.786198,0.769874,0.802521,0.572677
3,1,Test,0.805031,0.813853,"[0.0, 0.7899159663865546, 1.0]","[0.0, 0.1799163179916318, 1.0]","[2, 1, 0]",0.805,0.789916,0.820084,0.6103
5,2,Test,0.815126,0.804878,"[0.0, 0.8319327731092437, 1.0]","[0.0, 0.20168067226890757, 1.0]","[2, 1, 0]",0.815126,0.831933,0.798319,0.630608
7,3,Test,0.829832,0.817814,"[0.0, 0.8487394957983193, 1.0]","[0.0, 0.18907563025210083, 1.0]","[2, 1, 0]",0.829832,0.848739,0.810924,0.660136
9,4,Test,0.796218,0.805195,"[0.0, 0.7815126050420168, 1.0]","[0.0, 0.18907563025210083, 1.0]","[2, 1, 0]",0.796218,0.781513,0.810924,0.592693


In [17]:
# evaluations_df

# Independent data evaluation

## Using k-fold Models

### Performance of each k-fold model

In [18]:
## create the evaluation data structure for all iterations
evaluations = {
    "Fold" : [],
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Prediction and metrics for Independent dataset
##################################################################################

for i in range(n_fold):
    
    current_model_path = os.path.join(modelPath, "bestModel-fold{}.hdf5".format(i))
    model_file_obj = open(current_model_path, 'rb')
    model = pickle.load(model_file_obj)
    model_file_obj.close()

    y_pred = model.predict(indpe_features)
    label_pred = pred2label(y_pred)

    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(indpe_labels, label_pred)
    prec = precision_score(indpe_labels,label_pred)
    mcc = matthews_corrcoef(indpe_labels, label_pred)

    conf = confusion_matrix(indpe_labels, label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    
    fpr, tpr, thresholds = roc_curve(indpe_labels, y_pred)
    auc = roc_auc_score(indpe_labels, y_pred)

    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Independent")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)

##################################################################################

evaluations_df = pd.DataFrame.from_dict(evaluations)

evaluations_df_grouped = evaluations_df.groupby(["Train_Test"]).mean().filter(['Accuracy', 
                                                                               'Precision', 
                                                                               'AUC', 
                                                                               'Sensitivity', 
                                                                               'Specificity', 
                                                                               'MCC'])

evaluations_df_grouped

Unnamed: 0_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Train_Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Independent,0.633796,0.255557,0.633282,0.632512,0.634051,0.201514


In [19]:
evaluations_df

Unnamed: 0,Fold,Train_Test,Accuracy,Precision,TPR,FPR,TPR_FPR_Thresholds,AUC,Sensitivity,Specificity,MCC
0,0,Independent,0.629388,0.254403,"[0.0, 0.6403940886699507, 1.0]","[0.0, 0.37279843444227007, 1.0]","[2, 1, 0]",0.633798,0.640394,0.627202,0.201787
1,1,Independent,0.634286,0.253521,"[0.0, 0.6206896551724138, 1.0]","[0.0, 0.363013698630137, 1.0]","[2, 1, 0]",0.628838,0.62069,0.636986,0.195121
2,2,Independent,0.642449,0.265469,"[0.0, 0.6551724137931034, 1.0]","[0.0, 0.36007827788649704, 1.0]","[2, 1, 0]",0.647547,0.655172,0.639922,0.223175
3,3,Independent,0.63102,0.255403,"[0.0, 0.6403940886699507, 1.0]","[0.0, 0.37084148727984345, 1.0]","[2, 1, 0]",0.634776,0.640394,0.629159,0.203377
4,4,Independent,0.631837,0.248988,"[0.0, 0.6059113300492611, 1.0]","[0.0, 0.363013698630137, 1.0]","[2, 1, 0]",0.621449,0.605911,0.636986,0.184109


### Mean score with k-fold models

In [20]:
## create the evaluation data structure for all iterations
evaluations = {
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Prediction and metrics for Independent dataset
##################################################################################

total_pred = np.zeros(indpe_labels.shape)
all_preds = []

for i in range(n_fold):
    
    current_model_path = os.path.join(modelPath, "bestModel-fold{}.hdf5".format(i))
    model_file_obj = open(current_model_path, 'rb')
    model = pickle.load(model_file_obj)
    model_file_obj.close()

    y_pred = model.predict(indpe_features)
    total_pred += y_pred[:, np.newaxis]
    all_preds.append(y_pred)
    
total_pred = total_pred / n_fold
label_pred = pred2label(total_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(indpe_labels, label_pred)
prec = precision_score(indpe_labels,label_pred)
mcc = matthews_corrcoef(indpe_labels, label_pred)

conf = confusion_matrix(indpe_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(indpe_labels, total_pred)
auc = roc_auc_score(indpe_labels, total_pred)

evaluations["Train_Test"].append("Independent")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################

evaluations_df = pd.DataFrame.from_dict(evaluations)

evaluations_df_grouped = evaluations_df.groupby(["Train_Test"]).mean().filter(['Accuracy', 
                                                                               'Precision', 
                                                                               'AUC', 
                                                                               'Sensitivity', 
                                                                               'Specificity', 
                                                                               'MCC'])

evaluations_df_grouped

Unnamed: 0_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Train_Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Independent,0.647347,0.26294,0.659674,0.625616,0.651663,0.210967


### Voting score with k-fold models

In [21]:
## create the evaluation data structure for all iterations
evaluations = {
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Prediction and metrics for Independent dataset
##################################################################################

total_pred = np.zeros(indpe_labels.shape)
all_preds = []

for i in range(n_fold):
    
    current_model_path = os.path.join(modelPath, "bestModel-fold{}.hdf5".format(i))
    model_file_obj = open(current_model_path, 'rb')
    model = pickle.load(model_file_obj)
    model_file_obj.close()

    y_pred = model.predict(indpe_features)
    vote_pred = pred2label(y_pred)
    total_pred += vote_pred[:, np.newaxis]
    all_preds.append(vote_pred)
    
total_pred = total_pred / n_fold
label_pred = pred2label(total_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(indpe_labels, label_pred)
prec = precision_score(indpe_labels,label_pred)
mcc = matthews_corrcoef(indpe_labels, label_pred)

conf = confusion_matrix(indpe_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(indpe_labels, total_pred)
auc = roc_auc_score(indpe_labels, total_pred)

evaluations["Train_Test"].append("Independent")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################

evaluations_df = pd.DataFrame.from_dict(evaluations)

evaluations_df_grouped = evaluations_df.groupby(["Train_Test"]).mean().filter(['Accuracy', 
                                                                               'Precision', 
                                                                               'AUC', 
                                                                               'Sensitivity', 
                                                                               'Specificity', 
                                                                               'MCC'])

evaluations_df_grouped

Unnamed: 0_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Train_Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Independent,0.647347,0.26294,0.659674,0.625616,0.651663,0.210967


## Using New Model

Train model on full data from training. Predict and evaluate on Independent data.

In [27]:
model = get_model(cw={0:100, 1:1})
    
## Define the model callbacks for early stopping and saving the model. Then train model
current_model_path = os.path.join(modelPath, "_fullModel.hdf5")

# adding random shuffling of the dataset for training purpose
index_arr = np.arange(train_features.shape[0])
index_arr = np.random.permutation(index_arr)

model.fit(train_features[index_arr], train_labels[index_arr].reshape(train_labels.shape[0]))

RandomForestClassifier(class_weight={0: 100, 1: 1}, criterion='entropy',
                       n_estimators=1000, oob_score=True)

In [28]:
## create the evaluation data structure for all iterations
evaluations = {
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Prediction and metrics for Train dataset
##################################################################################

y_pred = model.predict(train_features)
label_pred = pred2label(y_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(train_labels, label_pred)
prec = precision_score(train_labels,label_pred)
mcc = matthews_corrcoef(train_labels, label_pred)

conf = confusion_matrix(train_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(train_labels, y_pred)
auc = roc_auc_score(train_labels, y_pred)

evaluations["Train_Test"].append("Train")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################
##### Prediction and metrics for Independent dataset
##################################################################################

y_pred = model.predict(indpe_features)
label_pred = pred2label(y_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(indpe_labels, label_pred)
prec = precision_score(indpe_labels,label_pred)
mcc = matthews_corrcoef(indpe_labels, label_pred)

conf = confusion_matrix(indpe_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(indpe_labels, y_pred)
auc = roc_auc_score(indpe_labels, y_pred)

evaluations["Train_Test"].append("Independent")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################

evaluations_df = pd.DataFrame.from_dict(evaluations)

evaluations_df_grouped = evaluations_df.groupby(["Train_Test"]).mean().filter(['Accuracy', 
                                                                               'Precision', 
                                                                               'AUC', 
                                                                               'Sensitivity', 
                                                                               'Specificity', 
                                                                               'MCC'])

evaluations_df_grouped

Unnamed: 0_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Train_Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Independent,0.626939,0.25098,0.628382,0.630542,0.626223,0.193674
Train,0.996222,1.0,0.996222,0.992443,1.0,0.992472


In [29]:
# 1000 entropy, no rfe
# 	Accuracy	Precision	AUC	Sensitivity	Specificity	MCC
# Train_Test						
# Independent	0.635102	0.258893	0.639196	0.645320	0.633072	0.210228
# Train	0.996222	1.000000	0.996222	0.992443	1.000000	0.992472

In [30]:
# 1000 gini, no rfe
# 	Accuracy	Precision	AUC	Sensitivity	Specificity	MCC
# Train_Test						
# Independent	0.608980	0.248175	0.633410	0.669951	0.596869	0.199529
# Train	0.996222	1.000000	0.996222	0.992443	1.000000	0.992472

In [31]:
print(classification_report(indpe_labels, np.round(y_pred).astype(int)))

              precision    recall  f1-score   support

           0       0.90      0.63      0.74      1022
           1       0.25      0.63      0.36       203

    accuracy                           0.63      1225
   macro avg       0.57      0.63      0.55      1225
weighted avg       0.79      0.63      0.67      1225

