In [1]:
##################################################################################
##### Define all parameters for model tuning
##################################################################################

n_fold = 5
expName = "NT_Site_PredNTS_Classification_ML_iLearnPlus_individual"
outPath = "Results"
foldName = "folds.pickle"

shuffle = True
seed = None

input_data_folder = "PredNTS_iLearnPlus_ENC"

train_file_name = "PredNTS_input_train_{}.csv"
indpe_file_name = "PredNTS_input_test_{}.csv"

In [2]:
enc_dict_rf_treeCount = {
    "AAC": 100,
    "ASDC": 1000,
    "CKSAAGP4": 100,
    "CKSAAP4": 1000,
    "CTDC": 100,
    "CTDD": 100,
    "CTDT": 100,
    "Ctriad": 100,
    "DDE": 100,
    "DistancePair": 1000,
    "GAAC": None,
    "GDPC": 100,
    "TPC": 1000
}

In [3]:
import os 
import pickle
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, confusion_matrix
from sklearn.metrics import roc_auc_score, matthews_corrcoef

import math

In [4]:
##################################################################################
##### Build k-fold splits
##################################################################################

sample_train_input_data_file = os.path.join(input_data_folder, train_file_name.format("AAC"))
sample_data = pd.read_csv(sample_train_input_data_file, sep=',', header=None)

sample_features = np.array(range(sample_data.shape[0]))
sample_labels = np.array(sample_data[0])

skf = StratifiedKFold(n_splits=n_fold, shuffle=shuffle, random_state=seed)
kfold_list = []
for train_index, test_index in skf.split(sample_features, sample_labels):
    kfold_list.append({
        "train_indices": train_index,
        "test_indices": test_index,
    })

In [5]:
def get_model(trees = None, cw = None):
    
    if trees is not None:
        model = RandomForestClassifier(n_estimators=trees, 
                                       criterion='entropy',
                                       class_weight=cw,
                                       bootstrap=True,
                                       oob_score=True
                                      )
    else:
        model = LogisticRegression(class_weight=cw)
    
    return model

# Training

In [6]:
##################################################################################
##### For each input file, train model and generate different outputs in a structured folder
##################################################################################

## create the evaluation data structure for all iterations
train_evaluations = {
    "Dataset" : [],
    "Fold" : [],
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

indpe_evaluations = {
    "Dataset" : [],
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

In [7]:
## Create and set directory to save model
modelPath = os.path.join(outPath, expName, "{}fold".format(n_fold), "models")
if(not os.path.isdir(modelPath)):
    os.makedirs(modelPath)

for current_dataset_variety in enc_dict_rf_treeCount.keys():
    
    print("\n======================================================================")
    print("\nDataset vareity:", current_dataset_variety)
    
    train_input_data_file = os.path.join(input_data_folder, train_file_name.format(current_dataset_variety))
    indpe_input_data_file = os.path.join(input_data_folder, indpe_file_name.format(current_dataset_variety))
    
    train_data = pd.read_csv(train_input_data_file, sep=',', header=None)
    indpe_data = pd.read_csv(indpe_input_data_file, sep=',', header=None)

    train_features = np.array(train_data.drop(0, axis=1))
    train_labels = np.array(train_data[0])
    
    indpe_features = np.array(indpe_data.drop(0, axis=1))
    indpe_labels = np.array(indpe_data[0])
    
    i = -1
    for fold in kfold_list:
        
        i += 1
        print("Train/Test model on Fold #"+str(i)+".")
        
        fold_train_features = train_features[fold['train_indices'], :]
        fold_train_labels = train_labels[fold['train_indices']]
        fold_test_features = train_features[fold['test_indices'], :]
        fold_test_labels = train_labels[fold['test_indices']]
        
        # adding random shuffling of the dataset for training purpose
        randomized_index_arr = np.arange(fold_train_features.shape[0])
        randomized_index_arr = np.random.permutation(randomized_index_arr)
        
        # fetch model
        model = get_model(trees = enc_dict_rf_treeCount[current_dataset_variety], 
                          cw = {0:1, 1:1})
        
        # train model
        model.fit(X = fold_train_features[randomized_index_arr], y = fold_train_labels[randomized_index_arr])
        
        # saving model to file
        model_file_path = os.path.join(modelPath, "{}_bestModel-fold{}.hdf5".format(current_dataset_variety, i))
        model_file_obj = open(model_file_path, 'wb')
        pickle.dump(model, model_file_obj)
        model_file_obj.close()
        
        ##################################################################################
        ##### Prediction and metrics for TRAIN dataset
        ##################################################################################

        y_pred = model.predict_proba(fold_train_features)[:, 1]
        label_pred = model.predict(fold_train_features)
        
        # Compute precision, recall, sensitivity, specifity, mcc
        acc = accuracy_score(fold_train_labels, label_pred)
        prec = precision_score(fold_train_labels,label_pred)
        mcc = matthews_corrcoef(fold_train_labels, label_pred)

        conf = confusion_matrix(fold_train_labels, label_pred)
        tn, fp, fn, tp = conf.ravel()
        sens = tp/(tp+fn)
        spec = tn/(tn+fp)

        fpr, tpr, thresholds = roc_curve(fold_train_labels, y_pred)
        auc = roc_auc_score(fold_train_labels, y_pred)

        train_evaluations["Dataset"].append(current_dataset_variety)
        train_evaluations["Fold"].append(i)
        train_evaluations["Train_Test"].append("Train")
        train_evaluations["Accuracy"].append(acc)
        train_evaluations["Precision"].append(prec)
        train_evaluations["TPR"].append(tpr)
        train_evaluations["FPR"].append(fpr)
        train_evaluations["TPR_FPR_Thresholds"].append(thresholds)
        train_evaluations["AUC"].append(auc)
        train_evaluations["Sensitivity"].append(sens)
        train_evaluations["Specificity"].append(spec)
        train_evaluations["MCC"].append(mcc)

        ##################################################################################
        ##### Prediction and metrics for TEST dataset
        ##################################################################################

        y_pred = model.predict_proba(fold_test_features)[:, 1]
        label_pred = model.predict(fold_test_features)
        
        # Compute precision, recall, sensitivity, specifity, mcc
        acc = accuracy_score(fold_test_labels, label_pred)
        prec = precision_score(fold_test_labels,label_pred)
        mcc = matthews_corrcoef(fold_test_labels, label_pred)

        conf = confusion_matrix(fold_test_labels, label_pred)
        tn, fp, fn, tp = conf.ravel()
        sens = tp/(tp+fn)
        spec = tn/(tn+fp)

        fpr, tpr, thresholds = roc_curve(fold_test_labels, y_pred)
        auc = roc_auc_score(fold_test_labels, y_pred)

        train_evaluations["Dataset"].append(current_dataset_variety)
        train_evaluations["Fold"].append(i)
        train_evaluations["Train_Test"].append("Test")
        train_evaluations["Accuracy"].append(acc)
        train_evaluations["Precision"].append(prec)
        train_evaluations["TPR"].append(tpr)
        train_evaluations["FPR"].append(fpr)
        train_evaluations["TPR_FPR_Thresholds"].append(thresholds)
        train_evaluations["AUC"].append(auc)
        train_evaluations["Sensitivity"].append(sens)
        train_evaluations["Specificity"].append(spec)
        train_evaluations["MCC"].append(mcc)
    
    ##################################################################################
    ##################################################################################
    ##### Independent Data performance
    ##################################################################################
    ##################################################################################

    print("\nIndependent evaluation for model.")
    
    # adding random shuffling of the dataset for training purpose
    randomized_index_arr = np.arange(train_features.shape[0])
    randomized_index_arr = np.random.permutation(randomized_index_arr)

    # fetch model
    model = get_model(trees = enc_dict_rf_treeCount[current_dataset_variety], 
                      cw = {0:10, 1:1})

    # train model
    model.fit(X = train_features[randomized_index_arr], y = train_labels[randomized_index_arr])

    # saving model to file
    model_file_path = os.path.join(modelPath, "{}_fullModel.hdf5".format(current_dataset_variety))
    model_file_obj = open(model_file_path, 'wb')
    pickle.dump(model, model_file_obj)
    model_file_obj.close()
    
    ##################################################################################
    ##### Prediction and metrics for TRAIN dataset
    ##################################################################################

    y_pred = model.predict_proba(train_features)[:, 1]
    label_pred = model.predict(train_features)

    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(train_labels, label_pred)
    prec = precision_score(train_labels,label_pred)
    mcc = matthews_corrcoef(train_labels, label_pred)

    conf = confusion_matrix(train_labels, label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)

    fpr, tpr, thresholds = roc_curve(train_labels, y_pred)
    auc = roc_auc_score(train_labels, y_pred)

    indpe_evaluations["Dataset"].append(current_dataset_variety)
    indpe_evaluations["Train_Test"].append("Train")
    indpe_evaluations["Accuracy"].append(acc)
    indpe_evaluations["Precision"].append(prec)
    indpe_evaluations["TPR"].append(tpr)
    indpe_evaluations["FPR"].append(fpr)
    indpe_evaluations["TPR_FPR_Thresholds"].append(thresholds)
    indpe_evaluations["AUC"].append(auc)
    indpe_evaluations["Sensitivity"].append(sens)
    indpe_evaluations["Specificity"].append(spec)
    indpe_evaluations["MCC"].append(mcc)

    ##################################################################################
    ##### Prediction and metrics for INDEPENDENT dataset
    ##################################################################################

    y_pred = model.predict_proba(indpe_features)[:, 1]
    label_pred = model.predict(indpe_features)

    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(indpe_labels, label_pred)
    prec = precision_score(indpe_labels,label_pred)
    mcc = matthews_corrcoef(indpe_labels, label_pred)

    conf = confusion_matrix(indpe_labels, label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)

    fpr, tpr, thresholds = roc_curve(indpe_labels, y_pred)
    auc = roc_auc_score(indpe_labels, y_pred)

    indpe_evaluations["Dataset"].append(current_dataset_variety)
    indpe_evaluations["Train_Test"].append("Independent")
    indpe_evaluations["Accuracy"].append(acc)
    indpe_evaluations["Precision"].append(prec)
    indpe_evaluations["TPR"].append(tpr)
    indpe_evaluations["FPR"].append(fpr)
    indpe_evaluations["TPR_FPR_Thresholds"].append(thresholds)
    indpe_evaluations["AUC"].append(auc)
    indpe_evaluations["Sensitivity"].append(sens)
    indpe_evaluations["Specificity"].append(spec)
    indpe_evaluations["MCC"].append(mcc)



Dataset vareity: AAC
Train/Test model on Fold #0.
Train/Test model on Fold #1.
Train/Test model on Fold #2.
Train/Test model on Fold #3.
Train/Test model on Fold #4.

Independent evaluation for model.


Dataset vareity: ASDC
Train/Test model on Fold #0.
Train/Test model on Fold #1.
Train/Test model on Fold #2.
Train/Test model on Fold #3.
Train/Test model on Fold #4.

Independent evaluation for model.


Dataset vareity: CKSAAGP4
Train/Test model on Fold #0.
Train/Test model on Fold #1.
Train/Test model on Fold #2.
Train/Test model on Fold #3.
Train/Test model on Fold #4.

Independent evaluation for model.


Dataset vareity: CKSAAP4
Train/Test model on Fold #0.
Train/Test model on Fold #1.
Train/Test model on Fold #2.
Train/Test model on Fold #3.
Train/Test model on Fold #4.

Independent evaluation for model.


Dataset vareity: CTDC
Train/Test model on Fold #0.
Train/Test model on Fold #1.
Train/Test model on Fold #2.
Train/Test model on Fold #3.
Train/Test model on Fold #4.

Independ

## k-fold Training evaluation

In [8]:
train_evaluations_df = pd.DataFrame.from_dict(train_evaluations)

In [9]:
train_evaluations_df.groupby(['Dataset', "Train_Test"]).mean().filter(['Sensitivity', 
                                                                      'Specificity', 
                                                                      'Accuracy',
                                                                      'MCC', 
                                                                      'AUC',
                                                                     ])

Unnamed: 0_level_0,Unnamed: 1_level_0,Sensitivity,Specificity,Accuracy,MCC,AUC
Dataset,Train_Test,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAC,Test,0.735495,0.765747,0.750628,0.501548,0.835465
AAC,Train,0.994962,0.998741,0.996851,0.993711,0.999843
ASDC,Test,0.76235,0.776682,0.769524,0.539213,0.850271
ASDC,Train,0.996222,0.997481,0.996851,0.993705,0.999924
CKSAAGP4,Test,0.746426,0.759042,0.752729,0.505629,0.832554
CKSAAGP4,Train,0.995381,0.995592,0.995487,0.990979,0.999825
CKSAAP4,Test,0.814419,0.819489,0.816964,0.634079,0.897627
CKSAAP4,Train,0.996012,0.997691,0.996851,0.993704,0.999949
CTDC,Test,0.727102,0.764084,0.745599,0.491618,0.820576
CTDC,Train,0.996641,0.997061,0.996851,0.993706,0.999846


In [10]:
train_evaluations_df.groupby(['Dataset', "Train_Test"]).std().filter(['Sensitivity', 
                                                                      'Specificity', 
                                                                      'Accuracy',
                                                                      'MCC', 
                                                                      'AUC',
                                                                     ])

Unnamed: 0_level_0,Unnamed: 1_level_0,Sensitivity,Specificity,Accuracy,MCC,AUC
Dataset,Train_Test,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAC,Test,0.017439,0.012299,0.011576,0.023151,0.009075
AAC,Train,0.001368,0.000878,0.000643,0.001284,9.9e-05
ASDC,Test,0.027792,0.01919,0.020802,0.041644,0.005868
ASDC,Train,0.001197,0.001408,0.000643,0.001287,6.1e-05
CKSAAGP4,Test,0.015429,0.013097,0.007021,0.013971,0.009726
CKSAAGP4,Train,0.00218,0.001726,0.000796,0.001588,0.000149
CKSAAP4,Test,0.030789,0.00863,0.018864,0.03769,0.009652
CKSAAP4,Train,0.00047,0.000879,0.000643,0.001287,4.1e-05
CTDC,Test,0.01977,0.018053,0.015505,0.031091,0.009767
CTDC,Train,0.00137,0.001726,0.000643,0.001286,7.1e-05


In [18]:
train_evaluations_df_grouped = train_evaluations_df.groupby(['Dataset', "Train_Test"]).mean().filter(['Sensitivity', 
                                                                      'Specificity', 
                                                                      'Accuracy',
                                                                      'MCC', 
                                                                      'AUC',
                                                                     ]).reset_index()

In [21]:
train_evaluations_df_grouped[train_evaluations_df_grouped['Train_Test'] == 'Test'].sort_values(by = 'MCC', 
                                                                                      ascending=False).filter(['Dataset',
                                                                                                               'Train_Test',
                                                                                                               'Sensitivity', 
                                                                                                              'Specificity', 
                                                                                                              'Accuracy',
                                                                                                              'MCC', 
                                                                                                              'AUC',
                                                                                                             ])

Unnamed: 0,Dataset,Train_Test,Sensitivity,Specificity,Accuracy,MCC,AUC
6,CKSAAP4,Test,0.814419,0.819489,0.816964,0.634079,0.897627
18,DistancePair,Test,0.804342,0.81277,0.80857,0.617416,0.893665
16,DDE,Test,0.763229,0.81193,0.787576,0.576031,0.871383
24,TPC,Test,0.7968,0.772462,0.784635,0.569525,0.866955
2,ASDC,Test,0.76235,0.776682,0.769524,0.539213,0.850271
4,CKSAAGP4,Test,0.746426,0.759042,0.752729,0.505629,0.832554
0,AAC,Test,0.735495,0.765747,0.750628,0.501548,0.835465
14,Ctriad,Test,0.746415,0.753985,0.750203,0.500555,0.841376
8,CTDC,Test,0.727102,0.764084,0.745599,0.491618,0.820576
12,CTDT,Test,0.712837,0.761577,0.737201,0.475305,0.812816


In [22]:
train_evaluations_df_grouped[train_evaluations_df_grouped['Train_Test'] == 'Test'].sort_values(by = 'Accuracy', 
                                                                                      ascending=False).filter(['Dataset',
                                                                                                               'Train_Test',
                                                                                                               'Sensitivity', 
                                                                                                              'Specificity', 
                                                                                                              'Accuracy',
                                                                                                              'MCC', 
                                                                                                              'AUC',
                                                                                                             ])

Unnamed: 0,Dataset,Train_Test,Sensitivity,Specificity,Accuracy,MCC,AUC
6,CKSAAP4,Test,0.814419,0.819489,0.816964,0.634079,0.897627
18,DistancePair,Test,0.804342,0.81277,0.80857,0.617416,0.893665
16,DDE,Test,0.763229,0.81193,0.787576,0.576031,0.871383
24,TPC,Test,0.7968,0.772462,0.784635,0.569525,0.866955
2,ASDC,Test,0.76235,0.776682,0.769524,0.539213,0.850271
4,CKSAAGP4,Test,0.746426,0.759042,0.752729,0.505629,0.832554
0,AAC,Test,0.735495,0.765747,0.750628,0.501548,0.835465
14,Ctriad,Test,0.746415,0.753985,0.750203,0.500555,0.841376
8,CTDC,Test,0.727102,0.764084,0.745599,0.491618,0.820576
12,CTDT,Test,0.712837,0.761577,0.737201,0.475305,0.812816


## Independent evaluation

In [13]:
indpe_evaluations_df = pd.DataFrame.from_dict(indpe_evaluations)

In [14]:
indpe_evaluations_df.groupby(['Dataset', "Train_Test"]).mean().filter(['Sensitivity', 
                                                                      'Specificity', 
                                                                      'Accuracy',
                                                                      'MCC', 
                                                                      'AUC',
                                                                     ])

Unnamed: 0_level_0,Unnamed: 1_level_0,Sensitivity,Specificity,Accuracy,MCC,AUC
Dataset,Train_Test,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAC,Independent,0.610837,0.633072,0.629388,0.18458,0.62383
AAC,Train,0.993283,0.99916,0.996222,0.99246,0.997881
ASDC,Independent,0.630542,0.625245,0.626122,0.192882,0.636403
ASDC,Train,0.992443,1.0,0.996222,0.992472,0.997989
CKSAAGP4,Independent,0.600985,0.624266,0.620408,0.170099,0.628761
CKSAAGP4,Train,0.989085,1.0,0.994542,0.989144,0.997294
CKSAAP4,Independent,0.630542,0.678082,0.670204,0.237282,0.656127
CKSAAP4,Train,0.992443,1.0,0.996222,0.992472,0.99833
CTDC,Independent,0.605911,0.652642,0.644898,0.197083,0.628276
CTDC,Train,0.992443,1.0,0.996222,0.992472,0.99788


In [15]:
indpe_evaluations_df[indpe_evaluations_df['Train_Test'] == 'Independent'].sort_values(by = 'MCC', 
                                                                                      ascending=False).filter(['Dataset',
                                                                                                               'Train_Test',
                                                                                                               'Sensitivity', 
                                                                                                              'Specificity', 
                                                                                                              'Accuracy',
                                                                                                              'MCC', 
                                                                                                              'AUC',
                                                                                                             ])

Unnamed: 0,Dataset,Train_Test,Sensitivity,Specificity,Accuracy,MCC,AUC
19,DistancePair,Independent,0.635468,0.680039,0.672653,0.242682,0.656674
7,CKSAAP4,Independent,0.630542,0.678082,0.670204,0.237282,0.656127
17,DDE,Independent,0.600985,0.681996,0.668571,0.218569,0.641483
9,CTDC,Independent,0.605911,0.652642,0.644898,0.197083,0.628276
3,ASDC,Independent,0.630542,0.625245,0.626122,0.192882,0.636403
1,AAC,Independent,0.610837,0.633072,0.629388,0.18458,0.62383
13,CTDT,Independent,0.600985,0.637965,0.631837,0.181237,0.619473
15,Ctriad,Independent,0.625616,0.603718,0.607347,0.172036,0.621483
5,CKSAAGP4,Independent,0.600985,0.624266,0.620408,0.170099,0.628761
11,CTDD,Independent,0.596059,0.617417,0.613878,0.160932,0.615872


In [16]:
indpe_evaluations_df[indpe_evaluations_df['Train_Test'] == 'Independent'].sort_values(by = 'Accuracy', 
                                                                                      ascending=False).filter(['Dataset',
                                                                                                               'Train_Test',
                                                                                                               'Sensitivity', 
                                                                                                              'Specificity', 
                                                                                                              'Accuracy',
                                                                                                              'MCC', 
                                                                                                              'AUC',
                                                                                                             ])

Unnamed: 0,Dataset,Train_Test,Sensitivity,Specificity,Accuracy,MCC,AUC
21,GAAC,Independent,0.004926,0.999022,0.834286,0.036357,0.628419
19,DistancePair,Independent,0.635468,0.680039,0.672653,0.242682,0.656674
7,CKSAAP4,Independent,0.630542,0.678082,0.670204,0.237282,0.656127
17,DDE,Independent,0.600985,0.681996,0.668571,0.218569,0.641483
9,CTDC,Independent,0.605911,0.652642,0.644898,0.197083,0.628276
13,CTDT,Independent,0.600985,0.637965,0.631837,0.181237,0.619473
1,AAC,Independent,0.610837,0.633072,0.629388,0.18458,0.62383
3,ASDC,Independent,0.630542,0.625245,0.626122,0.192882,0.636403
5,CKSAAGP4,Independent,0.600985,0.624266,0.620408,0.170099,0.628761
11,CTDD,Independent,0.596059,0.617417,0.613878,0.160932,0.615872
