In [1]:
##################################################################################
##### Define all parameters for model tuning
##################################################################################

# Number of stratified cross-validation folds.
n_fold = 5
# Experiment name; used as a sub-directory of outPath for saved models.
expName = "NT_Site_PredNTS_Classification_ML_iLearnPlus_ensemble"
# Root directory for everything this notebook writes.
outPath = "Results"
# NOTE(review): not referenced by any cell visible in this notebook - presumably
# the intended file name for persisting the fold indices; confirm or remove.
foldName = "folds.pickle"

# Shuffle the samples before they are assigned to folds.
shuffle = True
# Fixed seed so the shuffled fold assignment is identical on every
# "Restart & Run All" (was None, which produced a different split per run).
seed = 42

# Directory holding one train/test CSV pair per feature encoding.
input_data_folder = "PredNTS_iLearnPlus_ENC"

# File-name templates; "{}" is filled with the encoding name (e.g. "AAC").
train_file_name = "PredNTS_input_train_{}.csv"
indpe_file_name = "PredNTS_input_test_{}.csv"

In [2]:
# enc_dict_rf_treeCount = {
#     "AAC": 100,
#     "ASDC": 1000,
#     "CKSAAGP4": 100,
#     "CKSAAP4": 1000,
#     "CTDC": 100,
#     "CTDD": 100,
#     "CTDT": 100,
#     "Ctriad": 100,
#     "DDE": 100,
#     "DistancePair": 1000,
#     "GAAC": None,
#     "GDPC": 100,
#     "TPC": 1000
# }

# Active base learners: feature encoding -> number of random-forest trees.
# Insertion order matters: the stacked meta-features get one column per
# encoding, in this order.
# Encodings currently switched off (with the tree count they would use):
#   AAC 100, CKSAAGP4 100, CTDC 100, CTDD 100, CTDT 100, Ctriad 100,
#   DDE 100, GAAC None, GDPC 100, TPC 1000
enc_dict_rf_treeCount = dict.fromkeys(("ASDC", "CKSAAP4", "DistancePair"), 1000)

# enc_dict_rf_treeCount = {
# #     "AAC": 100,
#     "ASDC": 2000,
# #     "CKSAAGP4": 100,
#     "CKSAAP4": 2000,
# #     "CTDC": 100,
# #     "CTDD": 100,
# #     "CTDT": 100,
# #     "Ctriad": 100,
# #     "DDE": 100,
#     "DistancePair": 2000,
# #     "GAAC": None,
# #     "GDPC": 100,
# #     "TPC": 1000
# }

In [3]:
import os 
import pickle
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, confusion_matrix
from sklearn.metrics import roc_auc_score, matthews_corrcoef

import math

# Define models

In [4]:
def get_model(trees = None, cw = None, random_state = None):
    """Build a new, unfitted scikit-learn classifier.

    Parameters
    ----------
    trees : int or None
        Number of trees. When not None a RandomForestClassifier with that many
        estimators is returned; when None a LogisticRegression is returned.
    cw : dict or None
        Forwarded unchanged as ``class_weight`` to the chosen estimator.
    random_state : int or None
        Forwarded unchanged as ``random_state`` to the chosen estimator.
        The default (None) keeps the previous unseeded behaviour; pass an int
        to make repeated fits reproducible.

    Returns
    -------
    RandomForestClassifier or LogisticRegression
        The configured, unfitted estimator.
    """
    if trees is None:
        return LogisticRegression(class_weight=cw, random_state=random_state)

    return RandomForestClassifier(n_estimators=trees,
                                  criterion='entropy',
                                  class_weight=cw,
                                  bootstrap=True,
                                  # request out-of-bag scoring on the fitted forest
                                  oob_score=True,
                                  random_state=random_state)

# Build kFolds

In [5]:
##################################################################################
##### Build k-fold splits
##################################################################################

# The split is derived from the "AAC" training file: only its row count and
# its label column (column 0) are used.
sample_train_input_data_file = os.path.join(input_data_folder, train_file_name.format("AAC"))
sample_data = pd.read_csv(sample_train_input_data_file, sep=',', header=None)

sample_labels = np.array(sample_data[0])
# Stand-in "features": simply the row positions 0..n-1.
sample_features = np.arange(sample_data.shape[0])

skf = StratifiedKFold(n_splits=n_fold, shuffle=shuffle, random_state=seed)

# One dict of row-index arrays per fold.
kfold_list = [
    {"train_indices": train_index, "test_indices": test_index}
    for train_index, test_index in skf.split(sample_features, sample_labels)
]

# k-fold training and evaluation

In [6]:
##################################################################################
##### Accumulator for the per-fold evaluation results
##################################################################################

## One independent, initially empty list per result column; the training loop
## appends one value to every list per (fold, Train/Test) evaluation.
train_evaluations = {
    column: []
    for column in (
        "Fold",
        "Train_Test",
        "Accuracy",
        "Precision",
        "TPR",
        "FPR",
        "TPR_FPR_Thresholds",
        "AUC",
        "Sensitivity",
        "Specificity",
        "MCC",
    )
}

In [7]:
## Create and set directory to save model
modelPath = os.path.join(outPath, expName, "{}fold".format(n_fold), "models")
# exist_ok=True replaces the separate isdir() check and is safe on re-runs.
os.makedirs(modelPath, exist_ok=True)

# NOTE: every pass over this cell appends to train_evaluations; re-create that
# dict before re-running, otherwise the folds are recorded twice.
for i, fold in enumerate(kfold_list):

    print("\n======================================================================")
    print("Train/Test ensemble on Fold #"+str(i)+".")

    ##################################################################################
    ##### Training Ensemble
    ##################################################################################

    # Outputs of every base model for this fold, one entry per encoding
    # (in enc_dict_rf_treeCount order).
    fold_X_lr_train_proba_list = []
    fold_X_lr_train_label_list = []
    fold_X_lr_test_proba_list = []
    fold_X_lr_test_label_list = []

    for current_dataset_variety, tree_count in enc_dict_rf_treeCount.items():

        print("Training variety:", current_dataset_variety)

        train_input_data_file = os.path.join(input_data_folder, train_file_name.format(current_dataset_variety))

        train_data = pd.read_csv(train_input_data_file, sep=',', header=None)

        # Column 0 holds the label; all remaining columns are features.
        train_features = np.array(train_data.drop(0, axis=1))
        train_labels = np.array(train_data[0])

        fold_train_features = train_features[fold['train_indices'], :]
        fold_train_labels = train_labels[fold['train_indices']]
        fold_test_features = train_features[fold['test_indices'], :]
        fold_test_labels = train_labels[fold['test_indices']]

        # adding random shuffling of the dataset for training purpose
        randomized_index_arr = np.random.permutation(fold_train_features.shape[0])

        # fetch model
        model = get_model(trees = tree_count,
                          cw = {0:1, 1:1})

        # train model
        model.fit(X = fold_train_features[randomized_index_arr], y = fold_train_labels[randomized_index_arr])

        # saving model to file
        # NOTE: the content is a pickle despite the ".hdf5" extension; the file
        # name is left unchanged.
        model_file_path = os.path.join(modelPath, "{}_bestModel-fold{}.hdf5".format(current_dataset_variety, i))
        with open(model_file_path, 'wb') as model_file_obj:
            pickle.dump(model, model_file_obj)

        ##################################################################################
        ##### Base-model predictions on the TRAIN part of the fold
        ##################################################################################

        # NOTE(review): these are in-sample predictions (the model was fitted on
        # exactly these rows), and they become the meta-model's training
        # features below. The meta-model therefore learns from near-perfect
        # scores that the held-out scores do not resemble; out-of-fold or
        # out-of-bag scores would avoid this - confirm whether it is intended.
        y_pred = model.predict_proba(fold_train_features)[:, 1]
        label_pred = model.predict(fold_train_features)

        fold_X_lr_train_proba_list.append(y_pred)
        fold_X_lr_train_label_list.append(label_pred)

        ##################################################################################
        ##### Base-model predictions on the TEST part of the fold
        ##################################################################################

        y_pred = model.predict_proba(fold_test_features)[:, 1]
        label_pred = model.predict(fold_test_features)

        fold_X_lr_test_proba_list.append(y_pred)
        fold_X_lr_test_label_list.append(label_pred)

    ##################################################################################
    ##### Training logistic regression model
    ##################################################################################

    print("Training Logistic Regression of Ensemble..")

    # generating features from scores: rows = samples, columns = encodings
    X_lr_train_features = np.array(fold_X_lr_train_proba_list).T
    X_lr_test_features = np.array(fold_X_lr_test_proba_list).T

    # fetch model
    lr_model = get_model(trees=None, cw={0:1, 1:1})

    # train model
    # fold_train_labels / fold_test_labels are the ones left over from the last
    # encoding read above; this assumes every encoding file lists the same
    # samples in the same order - TODO confirm.
    lr_model.fit(X = X_lr_train_features, y = fold_train_labels)

    # saving model to file (pickle content, ".hdf5" name left unchanged)
    model_file_path = os.path.join(modelPath, "full_LR_Model-fold{}.hdf5".format(i))
    with open(model_file_path, 'wb') as model_file_obj:
        pickle.dump(lr_model, model_file_obj)

    ##################################################################################
    ##### Prediction and metrics for the TRAIN and TEST parts of the fold
    ##################################################################################

    for split_name, split_features, split_labels in (
        ("Train", X_lr_train_features, fold_train_labels),
        ("Test", X_lr_test_features, fold_test_labels),
    ):
        y_pred = lr_model.predict_proba(split_features)[:, 1]
        label_pred = lr_model.predict(split_features)

        # Compute precision, recall, sensitivity, specifity, mcc
        acc = accuracy_score(split_labels, label_pred)
        prec = precision_score(split_labels, label_pred)
        mcc = matthews_corrcoef(split_labels, label_pred)

        # labels=[0, 1] pins the matrix to 2x2 so the four-way unpack below
        # also works when only one class occurs in this split.
        conf = confusion_matrix(split_labels, label_pred, labels=[0, 1])
        tn, fp, fn, tp = conf.ravel()
        sens = tp/(tp+fn)
        spec = tn/(tn+fp)

        fpr, tpr, thresholds = roc_curve(split_labels, y_pred)
        # Named auc_value so the imported sklearn.metrics.auc is not overwritten.
        auc_value = roc_auc_score(split_labels, y_pred)

        train_evaluations["Fold"].append(i)
        train_evaluations["Train_Test"].append(split_name)
        train_evaluations["Accuracy"].append(acc)
        train_evaluations["Precision"].append(prec)
        train_evaluations["TPR"].append(tpr)
        train_evaluations["FPR"].append(fpr)
        train_evaluations["TPR_FPR_Thresholds"].append(thresholds)
        train_evaluations["AUC"].append(auc_value)
        train_evaluations["Sensitivity"].append(sens)
        train_evaluations["Specificity"].append(spec)
        train_evaluations["MCC"].append(mcc)


Train/Test ensemble on Fold #0.
Training variety: ASDC
Training variety: CKSAAP4
Training variety: DistancePair
Training Logistic Regression of Ensemble..

Train/Test ensemble on Fold #1.
Training variety: ASDC
Training variety: CKSAAP4
Training variety: DistancePair
Training Logistic Regression of Ensemble..

Train/Test ensemble on Fold #2.
Training variety: ASDC
Training variety: CKSAAP4
Training variety: DistancePair
Training Logistic Regression of Ensemble..

Train/Test ensemble on Fold #3.
Training variety: ASDC
Training variety: CKSAAP4
Training variety: DistancePair
Training Logistic Regression of Ensemble..

Train/Test ensemble on Fold #4.
Training variety: ASDC
Training variety: CKSAAP4
Training variety: DistancePair
Training Logistic Regression of Ensemble..


## k-fold Training evaluation

In [8]:
# One row per (fold, Train/Test) evaluation, one column per recorded quantity.
train_evaluations_df = pd.DataFrame(train_evaluations)

In [9]:
# Mean of each scalar metric across folds, separately for Train and Test.
# The metric columns are selected *before* aggregating: the frame also holds
# array-valued columns (TPR, FPR, TPR_FPR_Thresholds) that cannot be averaged,
# and pandas >= 2.0 raises on them instead of silently dropping them.
train_evaluations_df.groupby(["Train_Test"])[['Sensitivity',
                                              'Specificity',
                                              'Accuracy',
                                              'MCC',
                                              'AUC',
                                             ]].mean()

Unnamed: 0_level_0,Sensitivity,Specificity,Accuracy,MCC,AUC
Train_Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Test,0.814416,0.802704,0.80856,0.617586,0.890334
Train,0.996431,0.997061,0.996746,0.993494,0.999942


In [10]:
# "ASDC", "CKSAAP4", "DistancePair", 2000 trees
# Sensitivity	Specificity	Accuracy	MCC	AUC
# Train_Test					
# Test	0.819468	0.815281	0.817375	0.634903	0.891053
# Train	0.996432	0.997061	0.996746	0.993494	0.999925

In [11]:
# "ASDC", "CKSAAP4", "DistancePair", 1000 trees
# 	Sensitivity	Specificity	Accuracy	MCC	AUC
# Train_Test					
# Test	0.806867	0.805211	0.806039	0.612270	0.888691
# Train	0.995802	0.997691	0.996747	0.993496	0.999924

In [12]:
# "ASDC", "CKSAAP4", "DistancePair", 50
# 	Sensitivity	Specificity	Accuracy	MCC	AUC
# Train_Test					
# Test	0.678399	0.878264	0.778333	0.579928	0.888576
# Train	0.994647	0.998846	0.996747	0.993506	0.999922

In [13]:
# "ASDC", "CKSAAP4", "CTDC" "DistancePair"
# 	Sensitivity	Specificity	Accuracy	MCC	AUC
# Train_Test					
# Test	0.801842	0.796804	0.799318	0.599339	0.884759
# Train	0.996432	0.996642	0.996537	0.993075	0.999907

In [14]:
# full
# 	Sensitivity	Specificity	Accuracy	MCC	AUC
# Train_Test					
# Test	0.825330	0.787574	0.806461	0.613847	0.892374
# Train	0.996642	0.996222	0.996432	0.992865	0.999948

In [15]:
# Standard deviation of each scalar metric across folds, per Train/Test group.
# The metric columns are selected *before* aggregating: the frame also holds
# array-valued columns (TPR, FPR, TPR_FPR_Thresholds) that have no standard
# deviation, and pandas >= 2.0 raises on them instead of dropping them.
train_evaluations_df.groupby(["Train_Test"])[['Sensitivity',
                                              'Specificity',
                                              'Accuracy',
                                              'MCC',
                                              'AUC',
                                             ]].std()

Unnamed: 0_level_0,Sensitivity,Specificity,Accuracy,MCC,AUC
Train_Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Test,0.034968,0.017306,0.017886,0.035493,0.011817
Train,0.000576,0.001369,0.000685,0.00137,5.9e-05


In [16]:
# full
# Sensitivity	Specificity	Accuracy	MCC	AUC
# Train_Test					
# Test	0.037895	0.015013	0.019421	0.039813	0.011139
# Train	0.000878	0.001196	0.000234	0.000468	0.000043

# Independent training

In [17]:
##################################################################################
##### Training Ensemble
##################################################################################

# Outputs of every base model on the full training set and on the independent
# set, one entry per encoding (in enc_dict_rf_treeCount order).
X_lr_train_proba_list = []
X_lr_train_label_list = []
X_lr_indpe_proba_list = []
X_lr_indpe_label_list = []

for current_dataset_variety, tree_count in enc_dict_rf_treeCount.items():

    print("Training variety:", current_dataset_variety)

    train_input_data_file = os.path.join(input_data_folder, train_file_name.format(current_dataset_variety))
    indpe_input_data_file = os.path.join(input_data_folder, indpe_file_name.format(current_dataset_variety))

    train_data = pd.read_csv(train_input_data_file, sep=',', header=None)
    indpe_data = pd.read_csv(indpe_input_data_file, sep=',', header=None)

    # Column 0 holds the label; all remaining columns are features.
    train_features = np.array(train_data.drop(0, axis=1))
    train_labels = np.array(train_data[0])

    indpe_features = np.array(indpe_data.drop(0, axis=1))
    indpe_labels = np.array(indpe_data[0])

    # adding random shuffling of the dataset for training purpose
    randomized_index_arr = np.random.permutation(train_features.shape[0])

    # fetch model
    model = get_model(trees = tree_count,
                      cw = {0:1, 1:1})

    # train model
    model.fit(X = train_features[randomized_index_arr], y = train_labels[randomized_index_arr])

    # saving model to file
    # modelPath comes from the k-fold training cell, so the full models land in
    # the same directory as the per-fold ones. The content is a pickle despite
    # the ".hdf5" extension; the file name is left unchanged.
    model_file_path = os.path.join(modelPath, "{}_fullModel.hdf5".format(current_dataset_variety))
    with open(model_file_path, 'wb') as model_file_obj:
        pickle.dump(model, model_file_obj)

    ##################################################################################
    ##### Base-model predictions on the TRAIN dataset
    ##################################################################################

    # NOTE(review): in-sample predictions (the model was fitted on these rows);
    # they are later used as the meta-model's training features, which the
    # independent-set scores will not resemble - confirm this is intended.
    y_pred = model.predict_proba(train_features)[:, 1]
    label_pred = model.predict(train_features)

    X_lr_train_proba_list.append(y_pred)
    X_lr_train_label_list.append(label_pred)

    ##################################################################################
    ##### Base-model predictions on the INDEPENDENT dataset
    ##################################################################################

    y_pred = model.predict_proba(indpe_features)[:, 1]
    label_pred = model.predict(indpe_features)

    X_lr_indpe_proba_list.append(y_pred)
    X_lr_indpe_label_list.append(label_pred)

Training variety: ASDC
Training variety: CKSAAP4
Training variety: DistancePair


In [18]:
##################################################################################
##### Fit the meta-model on the base-model scores and evaluate it on the
##### training set and on the independent set
##################################################################################

# One list per result column; one value is appended per evaluated split.
indpe_evaluations = {
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Training the meta-model
##################################################################################

print("Training Logistic Regression of Ensemble..")

# generating features from scores: rows = samples, columns = encodings
X_lr_train_features = np.array(X_lr_train_proba_list).T
X_lr_indpe_features = np.array(X_lr_indpe_proba_list).T

# fetch model
# NOTE(review): despite the variable name, the log message and the file name,
# trees=10 makes get_model return a RandomForestClassifier here (with class
# weights 50:1 in favour of class 0), whereas the k-fold cell evaluates a
# LogisticRegression meta-model with equal weights. The cross-validation
# figures therefore do not describe this final model - confirm which
# meta-model is intended. Behaviour is kept as-is because a later cell reads
# lr_model.feature_importances_.
lr_model = get_model(trees=10,
                     cw={0:50, 1:1})

# train model
# train_labels / indpe_labels are the ones left over from the last encoding
# read by the base-model cell; this assumes every encoding file lists the same
# samples in the same order - TODO confirm.
lr_model.fit(X = X_lr_train_features, y = train_labels)

# saving model to file (pickle content, ".hdf5" name left unchanged)
model_file_path = os.path.join(modelPath, "full_LR_Model.hdf5")
with open(model_file_path, 'wb') as model_file_obj:
    pickle.dump(lr_model, model_file_obj)

##################################################################################
##### Prediction and metrics for the TRAIN and INDEPENDENT datasets
##################################################################################

for split_name, split_features, split_labels in (
    ("Train", X_lr_train_features, train_labels),
    ("Independent", X_lr_indpe_features, indpe_labels),
):
    y_pred = lr_model.predict_proba(split_features)[:, 1]
    label_pred = lr_model.predict(split_features)

    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(split_labels, label_pred)
    prec = precision_score(split_labels, label_pred)
    mcc = matthews_corrcoef(split_labels, label_pred)

    # labels=[0, 1] pins the matrix to 2x2 so the four-way unpack below also
    # works when only one class occurs in this split.
    conf = confusion_matrix(split_labels, label_pred, labels=[0, 1])
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)

    fpr, tpr, thresholds = roc_curve(split_labels, y_pred)
    # Named auc_value so the imported sklearn.metrics.auc is not overwritten.
    auc_value = roc_auc_score(split_labels, y_pred)

    indpe_evaluations["Train_Test"].append(split_name)
    indpe_evaluations["Accuracy"].append(acc)
    indpe_evaluations["Precision"].append(prec)
    indpe_evaluations["TPR"].append(tpr)
    indpe_evaluations["FPR"].append(fpr)
    indpe_evaluations["TPR_FPR_Thresholds"].append(thresholds)
    indpe_evaluations["AUC"].append(auc_value)
    indpe_evaluations["Sensitivity"].append(sens)
    indpe_evaluations["Specificity"].append(spec)
    indpe_evaluations["MCC"].append(mcc)

Training Logistic Regression of Ensemble..


  warn(


## Independent evaluation

In [19]:
# One row per evaluated split ("Train", "Independent"), one column per quantity.
indpe_evaluations_df = pd.DataFrame(indpe_evaluations)

In [20]:
# Show the scalar metrics only; the array-valued ROC columns are left out.
indpe_evaluations_df.loc[:, ['Train_Test', 'Accuracy', 'AUC', 'Sensitivity', 'Specificity', 'MCC']]

Unnamed: 0,Train_Test,Accuracy,AUC,Sensitivity,Specificity,MCC
0,Train,0.996222,0.999964,0.992443,1.0,0.992472
1,Independent,0.706939,0.664131,0.566502,0.734834,0.241185


In [21]:
# "ASDC", "CKSAAP4", "DistancePair", 2000 trees, 50
# Train_Test	Accuracy	AUC	Sensitivity	Specificity	MCC
# 0	Train	0.996222	0.999879	0.992443	1.000000	0.992472
# 1	Independent	0.756735	0.653654	0.477833	0.812133	0.253941

In [22]:
# "ASDC", "CKSAAP4", "DistancePair", 50
# 	Train_Test	Accuracy	AUC	Sensitivity	Specificity	MCC
# 0	Train	0.996222	0.999868	0.992443	1.000000	0.992472
# 1	Independent	0.772245	0.652864	0.458128	0.834638	0.265479

In [23]:
# "ASDC", "CKSAAP4", "CTDC", "DistancePair", 100
# Train_Test	Accuracy	AUC	Sensitivity	Specificity	MCC
# 0	Train	0.996222	0.999885	0.992443	1.000000	0.992472
# 1	Independent	0.770612	0.649061	0.418719	0.840509	0.239874

In [24]:
# full, 50
# Train_Test	Accuracy	AUC	Sensitivity	Specificity	MCC
# 0	Train	0.996222	0.999873	0.992443	1.000000	0.992472
# 1	Independent	0.766531	0.648299	0.418719	0.835616	0.233608

In [25]:
# full, 100
# Train_Test	Accuracy	AUC	Sensitivity	Specificity	MCC
# 0	Train	0.996222	0.999868	0.992443	1.000000	0.992472
# 1	Independent	0.774694	0.648463	0.389163	0.851272	0.228545

In [26]:
# Display the fitted meta-model; the recorded output shows that it is a
# RandomForestClassifier, not a logistic regression, despite its name.
lr_model

RandomForestClassifier(class_weight={0: 50, 1: 1}, criterion='entropy',
                       n_estimators=10, oob_score=True)

In [27]:
# Importance of each meta-feature, i.e. of each base model's score; the columns
# follow the key order of enc_dict_rf_treeCount.
lr_model.feature_importances_

array([0.39992917, 0.29949548, 0.30057535])