In [1]:
##################################################################################
##### Define all parameters for model tuning
##################################################################################

# --- Cross-validation -----------------------------------------------------------
n_fold = 5      # number of stratified folds
shuffle = True  # shuffle rows before assigning them to folds
# Fixed integer seed (was None) so the fold assignment is reproducible after a
# kernel restart. It is consumed as `random_state` by the fold splitter.
seed = 42

# --- Output layout --------------------------------------------------------------
# Models are written below <outPath>/<expName>/<n_fold>fold/models.
expName = "NT_Site_iNitroY_Classification_ML_ensemble"
outPath = "Results"
foldName = "folds.pickle"  # NOTE(review): not referenced by any visible cell

# --- Input data -----------------------------------------------------------------
# One CSV per encoding; "{}" is replaced by the encoding name.
input_data_folder = "iNitroY_ensemble_data"
current_data_file_name = "iNitroY_full_{}.csv"

In [2]:
# Encodings used by the ensemble, mapped to the number of trees of the random
# forest trained on each of them. The key order is also the column order of
# the stacked probability features built later on.
# Alternative configuration that was tried: "CKSAAP5" in place of "CKSAAP4"
# (same tree counts).
enc_dict_rf_treeCount = dict(
    ASDC=1000,
    CKSAAP4=1000,
    DistancePair=1000,
    Kmer=1000,
)

In [3]:
import os 
import pickle
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, confusion_matrix
from sklearn.metrics import roc_auc_score, matthews_corrcoef

import math

# Define models

In [4]:
def get_model(trees = None, cw = None, random_state = None):
    """Build an unfitted estimator for one stage of the ensemble.

    Parameters
    ----------
    trees : int or None
        When given, a ``RandomForestClassifier`` with that many estimators is
        returned (entropy criterion, bootstrap sampling, OOB scoring enabled).
        When ``None``, a ``LogisticRegression`` is returned instead.
    cw : optional
        Passed through unchanged as the estimator's ``class_weight``.
    random_state : int or None
        Passed through as the estimator's ``random_state``. The default
        ``None`` keeps the previous, unseeded behaviour; pass an integer to
        make model fitting reproducible.

    Returns
    -------
    The freshly constructed, unfitted estimator.
    """
    if trees is None:
        return LogisticRegression(class_weight=cw, random_state=random_state)

    return RandomForestClassifier(n_estimators=trees,
                                  criterion='entropy',
                                  class_weight=cw,
                                  bootstrap=True,
                                  oob_score=True,
                                  random_state=random_state)

# Build kFolds

In [5]:
##################################################################################
##### Build k-fold splits
##################################################################################

# Only the labels (column 0) are needed to build the splits, so the file of the
# first configured encoding is used as the reference.
# NOTE(review): the resulting row indices are later applied to every encoding
# file, which assumes all files list the same samples in the same order - confirm.
first_encoding = next(iter(enc_dict_rf_treeCount))
sample_input_data_file = os.path.join(input_data_folder,
                                      current_data_file_name.format(first_encoding))
sample_data = pd.read_csv(sample_input_data_file, sep=',', header=None)

# What gets split are row indices into the CSV, not the feature values.
sample_features = np.arange(sample_data.shape[0])
sample_labels = np.array(sample_data[0])

# Hold out 30% of the rows as the independent set, stratified by label.
# random_state=seed ties this split to the configured seed (previously it was
# always unseeded); with seed=None the behaviour is unchanged.
X_train, X_indpe = train_test_split(sample_features,
                                    test_size=0.3, shuffle=True, stratify=sample_labels,
                                    random_state=seed)

# Stratified folds over the remaining (non-held-out) rows.
skf = StratifiedKFold(n_splits=n_fold, shuffle=shuffle, random_state=seed)
kfold_list = []
for train_index, test_index in skf.split(X_train, sample_labels[X_train]):
    # The splitter yields positions within X_train; map them back to CSV row indices.
    kfold_list.append({
        "train_indices": X_train[train_index],
        "test_indices": X_train[test_index],
    })

# k-fold training and evaluation

In [6]:
##################################################################################
##### Collector for the k-fold evaluation metrics
##################################################################################

## one list per column; the fold loop appends one entry to every list for each
## (fold, Train/Test) combination
train_evaluations = {
    column_name: []
    for column_name in (
        "Fold",
        "Train_Test",
        "Accuracy",
        "Precision",
        "TPR",
        "FPR",
        "TPR_FPR_Thresholds",
        "AUC",
        "Sensitivity",
        "Specificity",
        "MCC",
    )
}

In [7]:
## Create and set directory to save model
modelPath = os.path.join(outPath, expName, "{}fold".format(n_fold), "models")
os.makedirs(modelPath, exist_ok=True)

# NOTE: this cell appends to `train_evaluations`. Re-create that dict before
# re-running this cell, otherwise every fold is recorded twice.
for i, fold in enumerate(kfold_list):

    print("\n======================================================================")
    print("Train/Test ensemble on Fold #"+str(i)+".")

    ##################################################################################
    ##### Training Ensemble: one random forest per encoding
    ##################################################################################

    # Per-encoding outputs of the base models; each list gets one array per encoding.
    fold_X_lr_train_proba_list = []
    fold_X_lr_train_label_list = []
    fold_X_lr_test_proba_list = []
    fold_X_lr_test_label_list = []

    for current_dataset_variety in enc_dict_rf_treeCount.keys():

        print("Training variety:", current_dataset_variety)

        train_input_data_file = os.path.join(input_data_folder, current_data_file_name.format(current_dataset_variety))

        train_data = pd.read_csv(train_input_data_file, sep=',', header=None)

        # Column 0 holds the label, every other column is a feature.
        train_features = np.array(train_data.drop(0, axis=1))
        train_labels = np.array(train_data[0])

        fold_train_features = train_features[fold['train_indices'], :]
        fold_train_labels = train_labels[fold['train_indices']]
        fold_test_features = train_features[fold['test_indices'], :]
        fold_test_labels = train_labels[fold['test_indices']]

        # adding random shuffling of the dataset for training purpose
        randomized_index_arr = np.random.permutation(fold_train_features.shape[0])

        # fetch model
        model = get_model(trees = enc_dict_rf_treeCount[current_dataset_variety],
                          cw = {0:1, 1:10})

        # train model
        model.fit(X = fold_train_features[randomized_index_arr], y = fold_train_labels[randomized_index_arr])

        # saving model to file (a pickle, despite the ".hdf5" suffix in the name);
        # the context manager closes the handle even if pickling fails
        model_file_path = os.path.join(modelPath, "{}_bestModel-fold{}.hdf5".format(current_dataset_variety, i))
        with open(model_file_path, 'wb') as model_file_obj:
            pickle.dump(model, model_file_obj)

        ##################################################################################
        ##### Base-model predictions for the TRAIN part of the fold
        ##################################################################################

        # NOTE(review): these are in-sample predictions (the forest was fit on exactly
        # these rows), and they become the training features of the logistic
        # regression below. Out-of-bag or out-of-fold probabilities would give the
        # meta-model less optimistic inputs - consider switching.
        y_pred = model.predict_proba(fold_train_features)[:, 1]
        label_pred = model.predict(fold_train_features)

        fold_X_lr_train_proba_list.append(y_pred)
        fold_X_lr_train_label_list.append(label_pred)

        ##################################################################################
        ##### Base-model predictions for the TEST part of the fold
        ##################################################################################

        y_pred = model.predict_proba(fold_test_features)[:, 1]
        label_pred = model.predict(fold_test_features)

        fold_X_lr_test_proba_list.append(y_pred)
        fold_X_lr_test_label_list.append(label_pred)

    ##################################################################################
    ##### Training logistic regression model
    ##################################################################################

    print("Training Logistic Regression of Ensemble..")

    # generating features from scores: one row per sample, one column per encoding
    X_lr_train_features = np.array(fold_X_lr_train_proba_list).T
    X_lr_test_features = np.array(fold_X_lr_test_proba_list).T

    # fetch model
    lr_model = get_model(trees=None,
                         cw={0:1, 1:1}
                        )

    # train model
    # fold_train_labels / fold_test_labels are the ones left over from the last
    # encoding processed above.
    # NOTE(review): assumes every encoding file carries the same labels in the
    # same row order - confirm.
    lr_model.fit(X = X_lr_train_features, y = fold_train_labels)

    # saving model to file (pickle format)
    model_file_path = os.path.join(modelPath, "full_LR_Model-fold{}.hdf5".format(i))
    with open(model_file_path, 'wb') as model_file_obj:
        pickle.dump(lr_model, model_file_obj)

    ##################################################################################
    ##### Prediction and metrics for the TRAIN and TEST parts of the fold
    ##################################################################################

    for split_name, split_features, split_labels in (
        ("Train", X_lr_train_features, fold_train_labels),
        ("Test", X_lr_test_features, fold_test_labels),
    ):
        y_pred = lr_model.predict_proba(split_features)[:, 1]
        label_pred = lr_model.predict(split_features)

        # Compute precision, recall, sensitivity, specifity, mcc
        acc = accuracy_score(split_labels, label_pred)
        prec = precision_score(split_labels, label_pred)
        mcc = matthews_corrcoef(split_labels, label_pred)

        # labels=[0, 1] forces a 2x2 matrix so the unpacking below also works
        # when only one class occurs in a split.
        conf = confusion_matrix(split_labels, label_pred, labels=[0, 1])
        tn, fp, fn, tp = conf.ravel()
        sens = tp/(tp+fn)
        spec = tn/(tn+fp)

        fpr, tpr, thresholds = roc_curve(split_labels, y_pred)
        # Not named `auc`: that would shadow the `auc` function imported from
        # sklearn.metrics.
        auc_value = roc_auc_score(split_labels, y_pred)

        train_evaluations["Fold"].append(i)
        train_evaluations["Train_Test"].append(split_name)
        train_evaluations["Accuracy"].append(acc)
        train_evaluations["Precision"].append(prec)
        train_evaluations["TPR"].append(tpr)
        train_evaluations["FPR"].append(fpr)
        train_evaluations["TPR_FPR_Thresholds"].append(thresholds)
        train_evaluations["AUC"].append(auc_value)
        train_evaluations["Sensitivity"].append(sens)
        train_evaluations["Specificity"].append(spec)
        train_evaluations["MCC"].append(mcc)


Train/Test ensemble on Fold #0.
Training variety: ASDC
Training variety: CKSAAP4
Training variety: DistancePair
Training variety: Kmer
Training Logistic Regression of Ensemble..

Train/Test ensemble on Fold #1.
Training variety: ASDC
Training variety: CKSAAP4
Training variety: DistancePair
Training variety: Kmer
Training Logistic Regression of Ensemble..

Train/Test ensemble on Fold #2.
Training variety: ASDC
Training variety: CKSAAP4
Training variety: DistancePair
Training variety: Kmer
Training Logistic Regression of Ensemble..

Train/Test ensemble on Fold #3.
Training variety: ASDC
Training variety: CKSAAP4
Training variety: DistancePair
Training variety: Kmer
Training Logistic Regression of Ensemble..

Train/Test ensemble on Fold #4.
Training variety: ASDC
Training variety: CKSAAP4
Training variety: DistancePair
Training variety: Kmer
Training Logistic Regression of Ensemble..


## k-fold evaluation summary

In [8]:
# One row per (fold, Train/Test) entry, one column per collected metric.
train_evaluations_df = pd.DataFrame(train_evaluations)

In [9]:
# Mean of every scalar metric across folds, separately for Train and Test.
# The metric columns are selected *before* aggregating: the frame also holds
# array-valued (object) columns such as TPR/FPR, and pandas >= 2.0 raises a
# TypeError when asked to average those instead of silently dropping them.
train_evaluations_df.groupby(["Train_Test"])[['Sensitivity',
                                              'Specificity',
                                              'Accuracy',
                                              'MCC',
                                              'AUC',
                                             ]].mean()

Unnamed: 0_level_0,Sensitivity,Specificity,Accuracy,MCC,AUC
Train_Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Test,0.877585,0.971791,0.934822,0.863393,0.94363
Train,1.0,1.0,1.0,1.0,1.0


In [10]:
# CKSAAP5 1000
# 	Sensitivity	Specificity	Accuracy	MCC	AUC
# Train_Test					
# Test	0.921353	0.980201	0.957088	0.910179	0.970142
# Train	1.000000	0.997176	0.998285	0.996416	0.999906

In [11]:
# CKSAAP4 1000
# 	Sensitivity	Specificity	Accuracy	MCC	AUC
# Train_Test					
# Test	0.895556	0.968934	0.939979	0.874574	0.953844
# Train	1.000000	0.997176	0.998285	0.996416	0.999981

In [12]:
# Standard deviation of every scalar metric across folds, per Train/Test split.
# The metric columns are selected *before* aggregating: the frame also holds
# array-valued (object) columns such as TPR/FPR, and pandas >= 2.0 raises a
# TypeError on those instead of silently dropping them.
train_evaluations_df.groupby(["Train_Test"])[['Sensitivity',
                                              'Specificity',
                                              'Accuracy',
                                              'MCC',
                                              'AUC',
                                             ]].std()

Unnamed: 0_level_0,Sensitivity,Specificity,Accuracy,MCC,AUC
Train_Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Test,0.033774,0.014034,0.017824,0.038018,0.015316
Train,0.0,0.0,0.0,0.0,0.0


# Independent training

In [13]:
##################################################################################
##### Training Ensemble on the full training split, predicting the held-out rows
##################################################################################

# Per-encoding outputs of the base models; each list gets one array per encoding.
X_lr_train_proba_list = []
X_lr_train_label_list = []
X_lr_indpe_proba_list = []
X_lr_indpe_label_list = []

for current_dataset_variety in enc_dict_rf_treeCount.keys():

    print("Training variety:", current_dataset_variety)

    input_data_file = os.path.join(input_data_folder, current_data_file_name.format(current_dataset_variety))

    current_data = pd.read_csv(input_data_file, sep=',', header=None)

    # Column 0 holds the label, every other column is a feature. Build the
    # arrays once and slice them for both splits.
    all_features = np.array(current_data.drop(0, axis=1))
    all_labels = np.array(current_data[0])

    train_features = all_features[X_train, :]
    train_labels = all_labels[X_train]

    indpe_features = all_features[X_indpe, :]
    indpe_labels = all_labels[X_indpe]

    # adding random shuffling of the dataset for training purpose
    randomized_index_arr = np.random.permutation(train_features.shape[0])

    # fetch model
    # NOTE(review): the k-fold cell trains its forests with class weights
    # {0:1, 1:10}, this one with {0:1, 1:1}, so the cross-validation figures
    # describe a differently weighted model - confirm which one is intended.
    model = get_model(trees = enc_dict_rf_treeCount[current_dataset_variety],
                      cw = {0:1, 1:1})

    # train model
    model.fit(X = train_features[randomized_index_arr], y = train_labels[randomized_index_arr])

    # saving model to file (a pickle, despite the ".hdf5" suffix in the name);
    # the context manager closes the handle even if pickling fails
    model_file_path = os.path.join(modelPath, "{}_fullModel.hdf5".format(current_dataset_variety))
    with open(model_file_path, 'wb') as model_file_obj:
        pickle.dump(model, model_file_obj)

    ##################################################################################
    ##### Base-model predictions for the TRAIN split
    ##################################################################################

    # NOTE(review): in-sample predictions - the forest was fit on these rows.
    y_pred = model.predict_proba(train_features)[:, 1]
    label_pred = model.predict(train_features)

    X_lr_train_proba_list.append(y_pred)
    X_lr_train_label_list.append(label_pred)

    ##################################################################################
    ##### Base-model predictions for the INDEPENDENT (held-out) split
    ##################################################################################

    y_pred = model.predict_proba(indpe_features)[:, 1]
    label_pred = model.predict(indpe_features)

    X_lr_indpe_proba_list.append(y_pred)
    X_lr_indpe_label_list.append(label_pred)

Training variety: ASDC
Training variety: CKSAAP4
Training variety: DistancePair
Training variety: Kmer


In [14]:
##################################################################################
##### Fit the logistic-regression meta-model and evaluate it on the training
##### split and on the independent (held-out) split
##################################################################################

# One list per column; one entry is appended per evaluated split.
indpe_evaluations = {
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Training logistic regression model
##################################################################################

print("Training Logistic Regression of Ensemble..")

# generating features from scores: one row per sample, one column per encoding
X_lr_train_features = np.array(X_lr_train_proba_list).T
X_lr_indpe_features = np.array(X_lr_indpe_proba_list).T

# fetch model
lr_model = get_model(trees=None,
                     cw={0:1, 1:1})

# train model
# train_labels / indpe_labels are the ones left over from the last encoding
# processed by the ensemble-training cell.
# NOTE(review): assumes every encoding file carries the same labels in the same
# row order - confirm.
lr_model.fit(X = X_lr_train_features, y = train_labels)

# saving model to file (a pickle, despite the ".hdf5" suffix in the name);
# the context manager closes the handle even if pickling fails
model_file_path = os.path.join(modelPath, "full_LR_Model.hdf5")
with open(model_file_path, 'wb') as model_file_obj:
    pickle.dump(lr_model, model_file_obj)

##################################################################################
##### Prediction and metrics for the TRAIN and INDEPENDENT splits
##################################################################################

for split_name, split_features, split_labels in (
    ("Train", X_lr_train_features, train_labels),
    ("Independent", X_lr_indpe_features, indpe_labels),
):
    y_pred = lr_model.predict_proba(split_features)[:, 1]
    label_pred = lr_model.predict(split_features)

    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(split_labels, label_pred)
    prec = precision_score(split_labels, label_pred)
    mcc = matthews_corrcoef(split_labels, label_pred)

    # labels=[0, 1] forces a 2x2 matrix so the unpacking below also works when
    # only one class occurs in a split.
    conf = confusion_matrix(split_labels, label_pred, labels=[0, 1])
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)

    fpr, tpr, thresholds = roc_curve(split_labels, y_pred)
    # Not named `auc`: that would shadow the `auc` function imported from
    # sklearn.metrics.
    auc_value = roc_auc_score(split_labels, y_pred)

    indpe_evaluations["Train_Test"].append(split_name)
    indpe_evaluations["Accuracy"].append(acc)
    indpe_evaluations["Precision"].append(prec)
    indpe_evaluations["TPR"].append(tpr)
    indpe_evaluations["FPR"].append(fpr)
    indpe_evaluations["TPR_FPR_Thresholds"].append(thresholds)
    indpe_evaluations["AUC"].append(auc_value)
    indpe_evaluations["Sensitivity"].append(sens)
    indpe_evaluations["Specificity"].append(spec)
    indpe_evaluations["MCC"].append(mcc)

Training Logistic Regression of Ensemble..


## Independent evaluation

In [15]:
# One row per evaluated split (Train, Independent), one column per metric.
indpe_evaluations_df = pd.DataFrame(indpe_evaluations)

In [16]:
# Display only the split name and a subset of the scalar metrics.
indpe_evaluations_df.filter(
    items=['Train_Test', 'Accuracy', 'AUC', 'Sensitivity', 'Specificity', 'MCC']
)

Unnamed: 0,Train_Test,Accuracy,AUC,Sensitivity,Specificity,MCC
0,Train,1.0,1.0,1.0,1.0,1.0
1,Independent,0.968127,0.987562,0.94898,0.980392,0.932925


In [17]:
# CKSAAP5 1000
# 	Train_Test	Accuracy	AUC	Sensitivity	Specificity	MCC
# 0	Train	0.998285	0.999821	1.000000	0.997175	0.996413
# 1	Independent	0.928287	0.922369	0.857143	0.973856	0.849619

In [18]:
# CKSAAP4 1000
# Train_Test	Accuracy	AUC	Sensitivity	Specificity	MCC
# 0	Train	0.998285	0.999975	1.000000	0.997175	0.996413
# 1	Independent	0.944223	0.982993	0.877551	0.986928	0.883849

In [19]:
# Side check: fit a small 10-tree random forest on the stacked base-model
# probabilities and look at its feature importances. The columns follow the key
# order of enc_dict_rf_treeCount, so there is one importance per encoding.
# NOTE(review): the recorded output contains a warning - presumably the
# out-of-bag warning caused by oob_score=True with so few trees; confirm.
rf_model = get_model(trees=10, cw={0: 1, 1: 1})

# train model
rf_model.fit(X_lr_train_features, train_labels)

rf_model.feature_importances_

  warn(


array([0.1, 0.5, 0.3, 0.1])