In [1]:
##################################################################################
##### Define all parameters for model tuning
##################################################################################

n_fold = 5
expName = "NT_Site_PredNTS_Classification_ML_AsimEmbedding_ensemble"
outPath = "Results"
foldName = "folds.pickle"

epochs = 100
batch_size = 32
shuffle = True
seed = None

input_data_folder = "Data_from_Asim"

dde_filename = "Protein_DDE[100, 0, 0, 0]-st-simplesequence.csv"
dpr_filename = "Protein_DistancePair[100, 0, 0, 0]-st-simplesequence.csv"
dpc_filename = "Protein_DPC[100, 0, 0, 0]-st-simplesequence.csv"
tpc_filename = "TPC[100, 0, 0, 0]-st-simplesequence.csv"

In [2]:
import os 
import pickle
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
# from sklearn.ensemble import XGBClassifier as xgb
# from xgboost.sklearn import XGBClassifier as xgb
import xgboost as xgb

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, confusion_matrix
from sklearn.metrics import roc_auc_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression, LinearRegression

import math

  from pandas import MultiIndex, Int64Index


In [3]:
##################################################################################
##### Build k-fold functions
##################################################################################

## Build the K-fold from dataset
def build_kfold(dde_features, dpr_features, dpc_features, tpc_features, labels, k=10, shuffle=False, seed=None):
    
    skf = StratifiedKFold(n_splits=k, shuffle=shuffle, random_state=seed)
    kfoldList = []
    for train_index, test_index in skf.split(dde_features, labels):
        kfoldList.append({
            "X_DDE_train": dde_features[train_index],
            "X_DDE_test": dde_features[test_index],
            "X_DPR_train": dpr_features[train_index],
            "X_DPR_test": dpr_features[test_index],
            "X_DPC_train": dpc_features[train_index],
            "X_DPC_test": dpc_features[test_index],
            "X_TPC_train": tpc_features[train_index],
            "X_TPC_test": tpc_features[test_index],
            "y_train": labels[train_index],
            "y_test": labels[test_index]
        })
    return kfoldList

In [4]:
##################################################################################
##### define evaluator functions
##################################################################################

def pred2label(y_pred):
    y_pred = np.round(y_pred)
    return y_pred

# Model

In [5]:
def get_model():
    
    model = RandomForestClassifier(n_estimators=100, 
                                   criterion='gini', 
                                   bootstrap=True,
                                   oob_score=True)

#     model = xgb.XGBClassifier(objective="binary:logistic", use_label_encoder=False,
#                               learn_rate=0.001, eta=0.1, eval_metric='auc'
#                              )

#     model = xgb.XGBClassifier(objective="binary:logistic", use_label_encoder=False, eval_metric="logloss")
    
    return model

In [6]:
get_model()

RandomForestClassifier(oob_score=True)

# Data preparation

### Read all 4 data files

In [7]:
dde_data_file = os.path.join(input_data_folder, dde_filename)
dpr_data_file = os.path.join(input_data_folder, dpr_filename)
dpc_data_file = os.path.join(input_data_folder, dpc_filename)
tpc_data_file = os.path.join(input_data_folder, tpc_filename)

dde_data = pd.read_csv(dde_data_file, sep=',', header=0)
dpr_data = pd.read_csv(dpr_data_file, sep=',', header=0)
dpc_data = pd.read_csv(dpc_data_file, sep=',', header=0)
tpc_data = pd.read_csv(tpc_data_file, sep=',', header=0)

dde_train_data = dde_data[dde_data['set'] == 'train'].drop('set', axis=1)
dde_indpe_data = dde_data[dde_data['set'] == 'test'].drop('set', axis=1)
dpr_train_data = dpr_data[dpr_data['set'] == 'train'].drop('set', axis=1)
dpr_indpe_data = dpr_data[dpr_data['set'] == 'test'].drop('set', axis=1)
dpc_train_data = dpc_data[dpc_data['set'] == 'train'].drop('set', axis=1)
dpc_indpe_data = dpc_data[dpc_data['set'] == 'test'].drop('set', axis=1)
tpc_train_data = tpc_data[tpc_data['set'] == 'train'].drop('set', axis=1)
tpc_indpe_data = tpc_data[tpc_data['set'] == 'test'].drop('set', axis=1)

### Prepare Training folds

In [8]:
dde_train_features = np.array(dde_train_data.drop('labels', axis=1))
dpr_train_features = np.array(dpr_train_data.drop('labels', axis=1))
dpc_train_features = np.array(dpc_train_data.drop('labels', axis=1))
tpc_train_features = np.array(tpc_train_data.drop('labels', axis=1))

dde_train_features_shape = dde_train_features[0].shape
dpr_train_features_shape = dpr_train_features[0].shape
dpc_train_features_shape = dpc_train_features[0].shape
tpc_train_features_shape = tpc_train_features[0].shape

train_labels = np.array(dde_train_data["labels"])

folds = build_kfold(dde_train_features, dpr_train_features, dpc_train_features, tpc_train_features, train_labels, 
                    k=n_fold, shuffle=shuffle, seed=seed)

## Write the k-fold dataset to file
foldPath = os.path.join(outPath, expName, "{}fold".format(n_fold))
if(not os.path.isdir(foldPath)):
    os.makedirs(foldPath)
pickle.dump(folds, open(os.path.join(foldPath, foldName), "wb"))

### Prepare Independent data

In [9]:
dde_indpe_features = np.array(dde_indpe_data.drop('labels', axis=1))
dpr_indpe_features = np.array(dpr_indpe_data.drop('labels', axis=1))
dpc_indpe_features = np.array(dpc_indpe_data.drop('labels', axis=1))
tpc_indpe_features = np.array(tpc_indpe_data.drop('labels', axis=1))

indpe_labels = np.array(dde_indpe_data["labels"])

# Training

In [10]:
##################################################################################
##### Evaluation structure
##################################################################################

## create the evaluation data structure for all iterations
evaluations = {
    "Fold" : [],
    "Train_Test" : [],
    "Type" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

## Create and set directory to save model
modelPath = os.path.join(outPath, expName, "{}fold".format(n_fold), "models")
if(not os.path.isdir(modelPath)):
    os.makedirs(modelPath)

In [11]:
##################################################################################
##### TRAIN and PREDICT for every Fold, using models
##################################################################################

# fold counter
i = -1

for fold in folds:
    
    i = i+1

    # adding random shuffling of the dataset for training purpose
    randomized_index_arr = np.arange(fold["X_DDE_train"].shape[0])
    randomized_index_arr = np.random.permutation(randomized_index_arr)

    print("\nTrain/Test model on Fold #"+str(i)+".")
    
    ##################################################################################
    ##### TRAIN DDE model
    ##################################################################################
    
    print("Training DDE model.")
    
    dde_model = get_model()
    
    dde_model.fit(X = fold["X_DDE_train"][randomized_index_arr], y = fold["y_train"][randomized_index_arr])
    
    dde_model_file_path = os.path.join(modelPath, "DDE_bestModel-fold{}.hdf5".format(i))
    dde_model_file_obj = open(dde_model_file_path, 'wb')
    pickle.dump(dde_model, dde_model_file_obj)
    dde_model_file_obj.close()
    
    ##################################################################################
    ##### TRAIN DPR model
    ##################################################################################
    
    print("Training DPR model.")
    
    dpr_model = get_model()
    
    dpr_model.fit(X = fold["X_DPR_train"][randomized_index_arr], y = fold["y_train"][randomized_index_arr])
    
    dpr_model_file_path = os.path.join(modelPath, "DPR_bestModel-fold{}.hdf5".format(i))
    dpr_model_file_obj = open(dpr_model_file_path, 'wb')
    pickle.dump(dpr_model, dpr_model_file_obj)
    dpr_model_file_obj.close()
    
    ##################################################################################
    ##### TRAIN DPC model
    ##################################################################################
    
    print("Training DPC model.")
    
    dpc_model = get_model()
    
    dpc_model.fit(X = fold["X_DPC_train"][randomized_index_arr], y = fold["y_train"][randomized_index_arr])
    
    dpc_model_file_path = os.path.join(modelPath, "DPC_bestModel-fold{}.hdf5".format(i))
    dpc_model_file_obj = open(dpc_model_file_path, 'wb')
    pickle.dump(dpc_model, dpc_model_file_obj)
    dpc_model_file_obj.close()
    
    ##################################################################################
    ##### TRAIN TPC model
    ##################################################################################
    
    print("Training TPC model.")
    
    tpc_model = get_model()
    
    tpc_model.fit(X = fold["X_TPC_train"][randomized_index_arr], y = fold["y_train"][randomized_index_arr])
    
    tpc_model_file_path = os.path.join(modelPath, "TPC_bestModel-fold{}.hdf5".format(i))
    tpc_model_file_obj = open(tpc_model_file_path, 'wb')
    pickle.dump(tpc_model, tpc_model_file_obj)
    tpc_model_file_obj.close()
    
    
    ##################################################################################
    ##### Generate scores for Train/Test dataset
    ##################################################################################
    
    dde_train_y_pred = dde_model.predict(fold["X_DDE_train"])
    dde_test_y_pred = dde_model.predict(fold["X_DDE_test"])
    
    dpr_train_y_pred = dpr_model.predict(fold["X_DPR_train"])
    dpr_test_y_pred = dpr_model.predict(fold["X_DPR_test"])
    
    dpc_train_y_pred = dpc_model.predict(fold["X_DPC_train"])
    dpc_test_y_pred = dpc_model.predict(fold["X_DPC_test"])
    
    tpc_train_y_pred = tpc_model.predict(fold["X_TPC_train"])
    tpc_test_y_pred = tpc_model.predict(fold["X_TPC_test"])
    
    X_lr_train = np.concatenate((dde_train_y_pred[:, np.newaxis], 
                                 dpr_train_y_pred[:, np.newaxis], 
                                 dpc_train_y_pred[:, np.newaxis], 
                                 tpc_train_y_pred[:, np.newaxis]), 
                                axis=1)
    X_lr_test = np.concatenate((dde_test_y_pred[:, np.newaxis], 
                                dpr_test_y_pred[:, np.newaxis], 
                                dpc_test_y_pred[:, np.newaxis], 
                                tpc_test_y_pred[:, np.newaxis]), 
                               axis=1)
    
    ##################################################################################
    ##### Prediction and metrics for TRAIN dataset
    ##################################################################################
    
    print("Generating TRAIN set SUM metrics.")
    
    y_pred = np.sum(X_lr_train, axis=1)/3
    y_pred = y_pred[:, np.newaxis]
    label_pred = pred2label(y_pred)
    
    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(fold["y_train"], label_pred)
    prec = precision_score(fold["y_train"],label_pred)
    mcc = matthews_corrcoef(fold["y_train"], label_pred)

    conf = confusion_matrix(fold["y_train"], label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    
    fpr, tpr, thresholds = roc_curve(fold["y_train"], y_pred)
    auc = roc_auc_score(fold["y_train"], y_pred)
    
    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Train")
    evaluations["Type"].append("Sum")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)
    
    ##################################################################################
    ##### Prediction and metrics for TEST dataset
    ##################################################################################
    
    print("Generating TEST set SUM metrics.")

    y_pred = np.sum(X_lr_test, axis=1)/3
    y_pred = y_pred[:, np.newaxis]
    label_pred = pred2label(y_pred)
    
    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(fold["y_test"], label_pred)
    prec = precision_score(fold["y_test"],label_pred)
    mcc = matthews_corrcoef(fold["y_test"], label_pred)

    conf = confusion_matrix(fold["y_test"], label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    
    fpr, tpr, thresholds = roc_curve(fold["y_test"], y_pred)
    auc = roc_auc_score(fold["y_test"], y_pred)
    
    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Test")
    evaluations["Type"].append("Sum")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)
    
    ##################################################################################
    ##### Prediction and metrics for TRAIN dataset
    ##################################################################################
    
    print("Generating TRAIN set VOTE metrics.")
    
    # y_pred = lr_model.predict(X_lr_train)
    y_pred = np.sum(pred2label(X_lr_train), axis=1)/3
    y_pred = y_pred[:, np.newaxis]
    label_pred = pred2label(y_pred)
    
    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(fold["y_train"], label_pred)
    prec = precision_score(fold["y_train"],label_pred)
    mcc = matthews_corrcoef(fold["y_train"], label_pred)

    conf = confusion_matrix(fold["y_train"], label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    
    fpr, tpr, thresholds = roc_curve(fold["y_train"], y_pred)
    auc = roc_auc_score(fold["y_train"], y_pred)
    
    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Train")
    evaluations["Type"].append("Vote")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)
    
    ##################################################################################
    ##### Prediction and metrics for TEST dataset
    ##################################################################################
    
    print("Generating TEST set VOTE metrics.")

    # y_pred = lr_model.predict(X_lr_test)
    y_pred = np.sum(pred2label(X_lr_test), axis=1)/3
    y_pred = y_pred[:, np.newaxis]
    label_pred = pred2label(y_pred)
    
    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(fold["y_test"], label_pred)
    prec = precision_score(fold["y_test"],label_pred)
    mcc = matthews_corrcoef(fold["y_test"], label_pred)

    conf = confusion_matrix(fold["y_test"], label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    
    fpr, tpr, thresholds = roc_curve(fold["y_test"], y_pred)
    auc = roc_auc_score(fold["y_test"], y_pred)
    
    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Test")
    evaluations["Type"].append("Vote")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)
    
    ##################################################################################
    ##### LOGISTIC regression using the scores
    ##################################################################################
    
    print("Training logistic regression.")
    
    # lr_model = LogisticRegression(penalty='elasticnet')
    lr_model = LogisticRegression()
    lr_model.fit(X_lr_train, fold["y_train"])
    
    lr_current_model_path = os.path.join(modelPath, "LogR_bestModel-fold{}.hdf5".format(i))
    lr_model_file_obj = open(lr_current_model_path, 'wb')
    pickle.dump(lr_model, lr_model_file_obj)
    lr_model_file_obj.close()
    
    ##################################################################################
    ##### Prediction and metrics for TRAIN dataset
    ##################################################################################
    
    print("Generating train set metrics.")
    
    y_pred = lr_model.predict(X_lr_train)
    label_pred = pred2label(y_pred)
    
    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(fold["y_train"], label_pred)
    prec = precision_score(fold["y_train"],label_pred)
    mcc = matthews_corrcoef(fold["y_train"], label_pred)

    conf = confusion_matrix(fold["y_train"], label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    
    fpr, tpr, thresholds = roc_curve(fold["y_train"], y_pred)
    auc = roc_auc_score(fold["y_train"], y_pred)
    
    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Train")
    evaluations["Type"].append("LogR")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)
    
    ##################################################################################
    ##### Prediction and metrics for TEST dataset
    ##################################################################################
    
    print("Generating test set metrics.")

    y_pred = lr_model.predict(X_lr_test)
    label_pred = pred2label(y_pred)
    
    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(fold["y_test"], label_pred)
    prec = precision_score(fold["y_test"],label_pred)
    mcc = matthews_corrcoef(fold["y_test"], label_pred)

    conf = confusion_matrix(fold["y_test"], label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    
    fpr, tpr, thresholds = roc_curve(fold["y_test"], y_pred)
    auc = roc_auc_score(fold["y_test"], y_pred)
    
    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Test")
    evaluations["Type"].append("LogR")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)
    
    ##################################################################################
    ##### LINEAR regression using the scores
    ##################################################################################
    
    print("Training linear regression.")
    
    # lr_model = LogisticRegression(penalty='elasticnet')
    lr_model = LinearRegression(positive=True)
    lr_model.fit(X_lr_train, fold["y_train"])
    
    lr_current_model_path = os.path.join(modelPath, "LinR_bestModel-fold{}.hdf5".format(i))
    lr_model_file_obj = open(lr_current_model_path, 'wb')
    pickle.dump(lr_model, lr_model_file_obj)
    lr_model_file_obj.close()
    
    ##################################################################################
    ##### Prediction and metrics for TRAIN dataset
    ##################################################################################
    
    print("Generating train set metrics.")
    
    y_pred = lr_model.predict(X_lr_train)
    label_pred = pred2label(y_pred)
    
    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(fold["y_train"], label_pred)
    prec = precision_score(fold["y_train"],label_pred)
    mcc = matthews_corrcoef(fold["y_train"], label_pred)

    conf = confusion_matrix(fold["y_train"], label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    
    fpr, tpr, thresholds = roc_curve(fold["y_train"], y_pred)
    auc = roc_auc_score(fold["y_train"], y_pred)
    
    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Train")
    evaluations["Type"].append("LinR")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)
    
    ##################################################################################
    ##### Prediction and metrics for TEST dataset
    ##################################################################################
    
    print("Generating test set metrics.")

    y_pred = lr_model.predict(X_lr_test)
    label_pred = pred2label(y_pred)
    
    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(fold["y_test"], label_pred)
    prec = precision_score(fold["y_test"],label_pred)
    mcc = matthews_corrcoef(fold["y_test"], label_pred)

    conf = confusion_matrix(fold["y_test"], label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    
    fpr, tpr, thresholds = roc_curve(fold["y_test"], y_pred)
    auc = roc_auc_score(fold["y_test"], y_pred)
    
    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Test")
    evaluations["Type"].append("LinR")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)


Train/Test model on Fold #0.
Training DDE model.
Training DPR model.
Training DPC model.
Training TPC model.
Generating TRAIN set SUM metrics.
Generating TEST set SUM metrics.
Generating TRAIN set VOTE metrics.
Generating TEST set VOTE metrics.
Training logistic regression.
Generating train set metrics.
Generating test set metrics.
Training linear regression.
Generating train set metrics.
Generating test set metrics.

Train/Test model on Fold #1.
Training DDE model.
Training DPR model.
Training DPC model.
Training TPC model.
Generating TRAIN set SUM metrics.
Generating TEST set SUM metrics.
Generating TRAIN set VOTE metrics.
Generating TEST set VOTE metrics.
Training logistic regression.
Generating train set metrics.
Generating test set metrics.
Training linear regression.
Generating train set metrics.
Generating test set metrics.

Train/Test model on Fold #2.
Training DDE model.
Training DPR model.
Training DPC model.
Training TPC model.
Generating TRAIN set SUM metrics.
Generating T

## k-fold Training evaluation

In [12]:
evaluations_df = pd.DataFrame.from_dict(evaluations)

evaluations_df_grouped = evaluations_df.groupby(["Train_Test", "Type"]).mean().filter(['Accuracy', 
                                                                               'Precision', 
                                                                               'AUC', 
                                                                               'Sensitivity', 
                                                                               'Specificity', 
                                                                               'MCC'])

evaluations_df_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Train_Test,Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Test,LinR,0.796817,0.786283,0.851737,0.819489,0.774122,0.596583
Test,LogR,0.810665,0.795958,0.810685,0.837144,0.784227,0.622881
Test,Sum,0.811505,0.795739,0.86862,0.839658,0.783387,0.624589
Test,Vote,0.811505,0.795739,0.86862,0.839658,0.783387,0.624589
Train,LinR,0.996641,0.997482,0.998517,0.995802,0.997481,0.993289
Train,LogR,0.996641,0.997898,0.996641,0.995382,0.997901,0.99329
Train,Sum,0.996641,0.997898,0.998517,0.995382,0.997901,0.99329
Train,Vote,0.996641,0.997898,0.998517,0.995382,0.997901,0.99329


# Independent data

# Train on full data

In [13]:
# adding random shuffling of the dataset for training purpose
randomized_index_arr = np.arange(fold["X_DDE_train"].shape[0])
randomized_index_arr = np.random.permutation(randomized_index_arr)

print("\nTrain/Test model on Fold #"+str(i)+".")

##################################################################################
##### TRAIN DDE model
##################################################################################

print("Training DDE model.")

dde_model = get_model()

dde_model.fit(X = dde_train_features, y = train_labels)

dde_model_file_path = os.path.join(modelPath, "DDE_bestModel-full.hdf5")
dde_model_file_obj = open(dde_model_file_path, 'wb')
pickle.dump(dde_model, dde_model_file_obj)
dde_model_file_obj.close()

##################################################################################
##### TRAIN DPR model
##################################################################################

print("Training DPR model.")

dpr_model = get_model()

dpr_model.fit(X = dpr_train_features, y = train_labels)

dpr_model_file_path = os.path.join(modelPath, "DPR_bestModel-full.hdf5")
dpr_model_file_obj = open(dpr_model_file_path, 'wb')
pickle.dump(dpr_model, dpr_model_file_obj)
dpr_model_file_obj.close()

##################################################################################
##### TRAIN DPC model
##################################################################################

print("Training DPC model.")

dpc_model = get_model()

dpc_model.fit(X = dpc_train_features, y = train_labels)

dpc_model_file_path = os.path.join(modelPath, "DPC_bestModel-full.hdf5")
dpc_model_file_obj = open(dpc_model_file_path, 'wb')
pickle.dump(dpc_model, dpc_model_file_obj)
dpc_model_file_obj.close()

##################################################################################
##### TRAIN TPC model
##################################################################################

print("Training TPC model.")

tpc_model = get_model()

tpc_model.fit(X = tpc_train_features, y = train_labels)

tpc_model_file_path = os.path.join(modelPath, "TPC_bestModel-full.hdf5")
tpc_model_file_obj = open(tpc_model_file_path, 'wb')
pickle.dump(tpc_model, tpc_model_file_obj)
tpc_model_file_obj.close()


Train/Test model on Fold #4.
Training DDE model.
Training DPR model.
Training DPC model.
Training TPC model.


In [14]:
## create the evaluation data structure for all iterations
evaluations = {
    "Train_Test" : [],
    "Type" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Generate scores for Train/Test dataset
##################################################################################

dde_train_y_pred = dde_model.predict(dde_train_features)
dde_indpe_y_pred = dde_model.predict(dde_indpe_features)

dpr_train_y_pred = dpr_model.predict(dpr_train_features)
dpr_indpe_y_pred = dpr_model.predict(dpr_indpe_features)

dpc_train_y_pred = dpc_model.predict(dpc_train_features)
dpc_indpe_y_pred = dpc_model.predict(dpc_indpe_features)

tpc_train_y_pred = tpc_model.predict(tpc_train_features)
tpc_indpe_y_pred = tpc_model.predict(tpc_indpe_features)

X_lr_train = np.concatenate((dde_train_y_pred[:, np.newaxis], 
                             dpr_train_y_pred[:, np.newaxis], 
                             dpc_train_y_pred[:, np.newaxis], 
                             tpc_train_y_pred[:, np.newaxis]), 
                            axis=1)
X_lr_indpe = np.concatenate((dde_indpe_y_pred[:, np.newaxis], 
                             dpr_indpe_y_pred[:, np.newaxis], 
                             dpc_indpe_y_pred[:, np.newaxis], 
                             tpc_indpe_y_pred[:, np.newaxis]), 
                            axis=1)

##################################################################################
##### Prediction and metrics for TRAIN dataset
##################################################################################

print("Generating TRAIN set SUM metrics.")

y_pred = np.sum(X_lr_train, axis=1)/3
y_pred = y_pred[:, np.newaxis]
label_pred = pred2label(y_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(train_labels, label_pred)
prec = precision_score(train_labels,label_pred)
mcc = matthews_corrcoef(train_labels, label_pred)

conf = confusion_matrix(train_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(train_labels, y_pred)
auc = roc_auc_score(train_labels, y_pred)

evaluations["Train_Test"].append("Train")
evaluations["Type"].append("Sum")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################
##### Prediction and metrics for TEST dataset
##################################################################################

print("Generating Independent set SUM metrics.")

y_pred = np.sum(X_lr_indpe, axis=1)/3
y_pred = y_pred[:, np.newaxis]
label_pred = pred2label(y_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(indpe_labels, label_pred)
prec = precision_score(indpe_labels,label_pred)
mcc = matthews_corrcoef(indpe_labels, label_pred)

conf = confusion_matrix(indpe_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(indpe_labels, y_pred)
auc = roc_auc_score(indpe_labels, y_pred)

evaluations["Train_Test"].append("Independent")
evaluations["Type"].append("Sum")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################
##### Prediction and metrics for TRAIN dataset
##################################################################################

print("Generating TRAIN set VOTE metrics.")

# y_pred = lr_model.predict(X_lr_train)
y_pred = np.sum(pred2label(X_lr_train), axis=1)/3
y_pred = y_pred[:, np.newaxis]
label_pred = pred2label(y_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(train_labels, label_pred)
prec = precision_score(train_labels,label_pred)
mcc = matthews_corrcoef(train_labels, label_pred)

conf = confusion_matrix(train_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(train_labels, y_pred)
auc = roc_auc_score(train_labels, y_pred)

evaluations["Train_Test"].append("Train")
evaluations["Type"].append("Vote")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################
##### Prediction and metrics for TEST dataset
##################################################################################

print("Generating Independent set VOTE metrics.")

# y_pred = lr_model.predict(X_lr_test)
y_pred = np.sum(pred2label(X_lr_indpe), axis=1)/3
y_pred = y_pred[:, np.newaxis]
label_pred = pred2label(y_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(indpe_labels, label_pred)
prec = precision_score(indpe_labels,label_pred)
mcc = matthews_corrcoef(indpe_labels, label_pred)

conf = confusion_matrix(indpe_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(indpe_labels, y_pred)
auc = roc_auc_score(indpe_labels, y_pred)

evaluations["Train_Test"].append("Independent")
evaluations["Type"].append("Vote")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################
##### Logistic regression using the scores
##################################################################################

print("Training logistic regression.")

# lr_model = LogisticRegression(penalty='elasticnet')
lr_model = LogisticRegression()
lr_model.fit(X_lr_train, train_labels)

lr_current_model_path = os.path.join(modelPath, "LogR_bestModel-fold{}.hdf5".format(i))
lr_model_file_obj = open(lr_current_model_path, 'wb')
pickle.dump(lr_model, lr_model_file_obj)
lr_model_file_obj.close()

##################################################################################
##### Prediction and metrics for TRAIN dataset
##################################################################################

print("Generating train set metrics.")

y_pred = lr_model.predict(X_lr_train)
label_pred = pred2label(y_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(train_labels, label_pred)
prec = precision_score(train_labels,label_pred)
mcc = matthews_corrcoef(train_labels, label_pred)

conf = confusion_matrix(train_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(train_labels, y_pred)
auc = roc_auc_score(train_labels, y_pred)

evaluations["Train_Test"].append("Train")
evaluations["Type"].append("LogR")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################
##### Prediction and metrics for TEST dataset
##################################################################################

print("Generating test set metrics.")

y_pred = lr_model.predict(X_lr_indpe)
label_pred = pred2label(y_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(indpe_labels, label_pred)
prec = precision_score(indpe_labels,label_pred)
mcc = matthews_corrcoef(indpe_labels, label_pred)

conf = confusion_matrix(indpe_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(indpe_labels, y_pred)
auc = roc_auc_score(indpe_labels, y_pred)

evaluations["Train_Test"].append("Independent")
evaluations["Type"].append("LogR")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################
##### Linear regression using the scores
##################################################################################

print("Training linear regression.")

lr_model = LinearRegression(positive=True)
lr_model.fit(X_lr_train, train_labels)

lr_current_model_path = os.path.join(modelPath, "LinR_bestModel-fold{}.hdf5".format(i))
lr_model_file_obj = open(lr_current_model_path, 'wb')
pickle.dump(lr_model, lr_model_file_obj)
lr_model_file_obj.close()

##################################################################################
##### Prediction and metrics for TRAIN dataset
##################################################################################

print("Generating train set metrics.")

y_pred = lr_model.predict(X_lr_train)
label_pred = pred2label(y_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(train_labels, label_pred)
prec = precision_score(train_labels,label_pred)
mcc = matthews_corrcoef(train_labels, label_pred)

conf = confusion_matrix(train_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(train_labels, y_pred)
auc = roc_auc_score(train_labels, y_pred)

evaluations["Train_Test"].append("Train")
evaluations["Type"].append("LinR")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################
##### Prediction and metrics for TEST dataset
##################################################################################

print("Generating test set metrics.")

y_pred = lr_model.predict(X_lr_indpe)
label_pred = pred2label(y_pred)

# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(indpe_labels, label_pred)
prec = precision_score(indpe_labels,label_pred)
mcc = matthews_corrcoef(indpe_labels, label_pred)

conf = confusion_matrix(indpe_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(indpe_labels, y_pred)
auc = roc_auc_score(indpe_labels, y_pred)

evaluations["Train_Test"].append("Independent")
evaluations["Type"].append("LinR")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

Generating TRAIN set SUM metrics.
Generating Independent set SUM metrics.
Generating TRAIN set VOTE metrics.
Generating Independent set VOTE metrics.
Training logistic regression.
Generating train set metrics.
Generating test set metrics.
Training linear regression.
Generating train set metrics.
Generating test set metrics.


In [15]:
evaluations_df = pd.DataFrame.from_dict(evaluations)

evaluations_df

Unnamed: 0,Train_Test,Type,Accuracy,Precision,TPR,FPR,TPR_FPR_Thresholds,AUC,Sensitivity,Specificity,MCC
0,Train,Sum,0.996222,0.998314,"[0.0, 0.9924433249370277, 0.9949622166246851, ...","[0.0, 0.0, 0.0025188916876574307, 1.0]","[2.333333333333333, 1.3333333333333333, 0.3333...",0.997472,0.994123,0.998321,0.992452
1,Independent,Sum,0.599184,0.23913,"[0.0, 0.4876847290640394, 0.5862068965517241, ...","[0.0, 0.22504892367906065, 0.312133072407045, ...","[2.333333333333333, 1.3333333333333333, 1.0, 0...",0.655447,0.650246,0.589041,0.17882
2,Train,Vote,0.996222,0.998314,"[0.0, 0.9924433249370277, 0.9949622166246851, ...","[0.0, 0.0, 0.0025188916876574307, 1.0]","[2.333333333333333, 1.3333333333333333, 0.3333...",0.997472,0.994123,0.998321,0.992452
3,Independent,Vote,0.599184,0.23913,"[0.0, 0.4876847290640394, 0.5862068965517241, ...","[0.0, 0.22504892367906065, 0.312133072407045, ...","[2.333333333333333, 1.3333333333333333, 1.0, 0...",0.655447,0.650246,0.589041,0.17882
4,Train,LogR,0.996222,0.998314,"[0.0, 0.9941225860621327, 1.0]","[0.0, 0.0016792611251049538, 1.0]","[2, 1, 0]",0.996222,0.994123,0.998321,0.992452
5,Independent,LogR,0.614694,0.245747,"[0.0, 0.6403940886699507, 1.0]","[0.0, 0.3904109589041096, 1.0]","[2, 1, 0]",0.624992,0.640394,0.609589,0.187651
6,Train,LinR,0.996222,1.0,"[0.0, 0.9924433249370277, 0.9949622166246851, ...","[0.0, 0.0, 0.0025188916876574307, 1.0]","[2.000000000000007, 1.000000000000007, 0.49999...",0.997472,0.992443,1.0,0.992472
7,Independent,LinR,0.635918,0.249485,"[0.0, 0.5763546798029556, 0.5960591133004927, ...","[0.0, 0.3101761252446184, 0.3561643835616438, ...","[2.000000000000007, 1.000000000000007, 0.50502...",0.637919,0.596059,0.643836,0.182393
