In [1]:
##################################################################################
##### Define all parameters for model tuning
##################################################################################

n_fold = 5
expName = "NT_Site_PredNTS_Classification_ML_AsimEmbedding_individual"
outPath = "Results"
foldName = "folds.pickle"

epochs = 100
batch_size = 32
shuffle = True
seed = None

input_data_folder = "Data_from_Asim"

In [2]:
import os 
import pickle
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
# from sklearn.ensemble import XGBClassifier as xgb
# from xgboost.sklearn import XGBClassifier as xgb
import xgboost as xgb

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, confusion_matrix
from sklearn.metrics import roc_auc_score, matthews_corrcoef

import math

  from pandas import MultiIndex, Int64Index


In [3]:
# ##################################################################################
# ##### define all CUSTOM functions
# ##################################################################################

# def one_hot_encode_nt(sequence, char_dict):
    
#     seq_encoded = np.zeros((len(sequence),len(char_dict)))
    
#     i = 0
#     for single_character in sequence:
#         if(single_character.upper() in char_dict.keys()):
#             seq_encoded[i][char_dict[single_character.upper()]] = 1
#             i = i+1
#         else:
#             raise ValueError('Incorrect character in NT sequence: '+sequence)
#     return seq_encoded

In [4]:
##################################################################################
##### Build k-fold functions
##################################################################################

## Build the K-fold from dataset
def build_kfold(features, labels, k=10, shuffle=False, seed=None):
    
    skf = StratifiedKFold(n_splits=k, shuffle=shuffle, random_state=seed)
    kfoldList = []
    for train_index, test_index in skf.split(features, labels):
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        kfoldList.append({
            "X_train": X_train,
            "X_test": X_test,
            "y_train":y_train,
            "y_test":y_test
        })
    return kfoldList

In [5]:
##################################################################################
##### define evaluator functions
##################################################################################

def pred2label(y_pred):
    y_pred = np.round(y_pred)
    return y_pred

In [6]:
# for step in range(10):
#     initial_learning_rate=1e-1
#     decay_steps=10000
#     decay_rate=0.9
#     print(step, ':', initial_learning_rate * decay_rate ** (step / decay_steps))

# Training

In [7]:
##################################################################################
##### For each input file, train model and generate different outputs in a structured folder
##################################################################################

## create the evaluation data structure for all iterations
evaluations = {
    "Dataset" : [],
    "Fold" : [],
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

In [8]:
def get_model(ccp = None):
        
    model = RandomForestClassifier(n_estimators=200, 
                                   criterion='gini', 
                                   bootstrap=True,
                                   # max_depth=# 10,
                                   oob_score=True
                                  )

#     model = xgb.XGBClassifier(objective="binary:logistic",
#                               booster='gbtree',
#                               learning_rate=0.1, eval_metric='auc',
#                               reg_alpha=0.1, reg_lambda=0.1,
#                               subsample=0.8, 
#                               colsample_bytree=0.5,
#                               use_label_encoder=False
#                              )

#     model = xgb.XGBClassifier(objective="binary:logistic", use_label_encoder=False)
    
    return model

In [9]:
# def get_model(ccp = None):
#     if ccp != None:
        
#         model = RandomForestClassifier(n_estimators=100, 
#                                        criterion='gini', 
#                                        bootstrap=True,
#                                        # max_depth=# 10,
#                                        ccp_alpha=ccp,
#                                        oob_score=True,
#                                        random_state=0
#                                       )
#     else:
        
#         model = RandomForestClassifier(n_estimators=500, 
#                                        criterion='gini', 
#                                        bootstrap=True,
#                                        # max_depth=# 10,
#                                        oob_score=True)
    
#     return model

In [10]:
get_model()

RandomForestClassifier(n_estimators=200, oob_score=True)

In [11]:
# ccp_alphas = list(np.arange(0, 0.0055, 0.0005))
# clfs = []

# for ccp_alpha in ccp_alphas:
#     clf = get_model(ccp=ccp_alpha)
#     clf.fit(X = fold["X_train"], y = fold["y_train"])
#     clfs.append(clf)
    
# train_scores = [clf.score(fold["X_train"], fold["y_train"]) for clf in clfs]
# test_scores = [clf.score(fold["X_test"], fold["y_test"]) for clf in clfs]

In [12]:
# import matplotlib.pyplot as plt

In [13]:
# fig, ax = plt.subplots()
# ax.set_xlabel("alpha")
# ax.set_ylabel("accuracy")
# ax.set_title("Accuracy vs alpha for training and testing sets")
# ax.plot(ccp_alphas, train_scores, marker="o", label="train", drawstyle="steps-post")
# ax.plot(ccp_alphas, test_scores, marker="o", label="test", drawstyle="steps-post")
# ax.legend()
# plt.show()

In [14]:
for root, dirs, files in os.walk(input_data_folder):
    for file in files:
        
        input_data_file = os.path.join(root, file)
        
        current_dataset_variety = input_data_file.split("\\")[-1].split("[")[0]

        data = pd.read_csv(input_data_file, sep=',', header=0)

        train_data = data[data['set'] == 'train'].drop('set', axis=1)
        independent_data = data[data['set'] == 'test'].drop('set', axis=1)

        train_labels = np.array(train_data['labels'])
        # train_labels = train_labels.reshape((train_labels.shape[0], 1))

        train_features = np.array(train_data.drop('labels', axis=1))

        indpe_labels = np.array(independent_data['labels'])
        # indpe_labels = indpe_labels.reshape((indpe_labels.shape[0], 1))

        indpe_features = np.array(independent_data.drop('labels', axis=1))
        
        ##################################################################################
        ##### extract data from the current fasta file
        ##################################################################################

        print("\n======================================================================")
        print("\nFile:", file)
        print("Training Positive:", np.sum(train_labels))
        print("Training Negative:", train_labels.shape[0] - np.sum(train_labels))
        print("Independent Positive:", np.sum(indpe_labels))
        print("Independent Negative:", indpe_labels.shape[0] - np.sum(indpe_labels))
        
        ##################################################################################
        ##### Generate Folds from dataset, and store to file
        ##################################################################################

        ## Generate the k-fold dataset
        folds = build_kfold(train_features, train_labels, k=n_fold, shuffle=shuffle, seed=seed)

        ## Write the k-fold dataset to file
        foldPath = os.path.join(outPath, expName, current_dataset_variety, "{}fold".format(n_fold))
        if(not os.path.isdir(foldPath)):
            os.makedirs(foldPath)
        pickle.dump(folds, open(os.path.join(foldPath, foldName), "wb"))

        ## Create and set directory to save model
        modelPath = os.path.join(outPath, expName, current_dataset_variety, "{}fold".format(n_fold), "models")
        if(not os.path.isdir(modelPath)):
            os.makedirs(modelPath)
            
        ##################################################################################
        ##### TRAIN and PREDICT for every Fold, using models
        ##################################################################################

        # fold counter
        i = 0

        for fold in folds:
            
            # adding random shuffling of the dataset for training purpose
            randomized_index_arr = np.arange(fold["X_train"].shape[0])
            randomized_index_arr = np.random.permutation(randomized_index_arr)

            print("\nTrain/Test model "+current_dataset_variety+" on Fold #"+str(i)+".")
            
            input_size = fold["X_train"][0].shape
    
            model_file_path = os.path.join(modelPath, "{}_bestModel-fold{}.hdf5".format(current_dataset_variety, i))
            
            model = get_model()
            
            model.fit(X = fold["X_train"][randomized_index_arr], y = fold["y_train"][randomized_index_arr])

            model_file_obj = open(model_file_path, 'wb')
            pickle.dump(model, model_file_obj)
            model_file_obj.close()
            
            ##################################################################################
            ##### Prediction and metrics for TRAIN dataset
            ##################################################################################

            y_pred = model.predict(fold["X_train"])
            label_pred = pred2label(y_pred)
            # Compute precision, recall, sensitivity, specifity, mcc
            acc = accuracy_score(fold["y_train"], label_pred)
            prec = precision_score(fold["y_train"],label_pred)
            mcc = matthews_corrcoef(fold["y_train"], label_pred)

            conf = confusion_matrix(fold["y_train"], label_pred)
            tn, fp, fn, tp = conf.ravel()
            sens = tp/(tp+fn)
            spec = tn/(tn+fp)

            fpr, tpr, thresholds = roc_curve(fold["y_train"], y_pred)
            auc = roc_auc_score(fold["y_train"], y_pred)
            
            evaluations["Dataset"].append(current_dataset_variety)
            evaluations["Fold"].append(i)
            evaluations["Train_Test"].append("Train")
            evaluations["Accuracy"].append(acc)
            evaluations["Precision"].append(prec)
            evaluations["TPR"].append(tpr)
            evaluations["FPR"].append(fpr)
            evaluations["TPR_FPR_Thresholds"].append(thresholds)
            evaluations["AUC"].append(auc)
            evaluations["Sensitivity"].append(sens)
            evaluations["Specificity"].append(spec)
            evaluations["MCC"].append(mcc)

            ##################################################################################
            ##### Prediction and metrics for TEST dataset
            ##################################################################################

            y_pred = model.predict(fold["X_test"])
            label_pred = pred2label(y_pred)
            # Compute precision, recall, sensitivity, specifity, mcc
            acc = accuracy_score(fold["y_test"], label_pred)
            prec = precision_score(fold["y_test"],label_pred)
            mcc = matthews_corrcoef(fold["y_test"], label_pred)

            conf = confusion_matrix(fold["y_test"], label_pred)
            tn, fp, fn, tp = conf.ravel()
            sens = tp/(tp+fn)
            spec = tn/(tn+fp)

            fpr, tpr, thresholds = roc_curve(fold["y_test"], y_pred)
            auc = roc_auc_score(fold["y_test"], y_pred)
            
            evaluations["Dataset"].append(current_dataset_variety)
            evaluations["Fold"].append(i)
            evaluations["Train_Test"].append("Test")
            evaluations["Accuracy"].append(acc)
            evaluations["Precision"].append(prec)
            evaluations["TPR"].append(tpr)
            evaluations["FPR"].append(fpr)
            evaluations["TPR_FPR_Thresholds"].append(thresholds)
            evaluations["AUC"].append(auc)
            evaluations["Sensitivity"].append(sens)
            evaluations["Specificity"].append(spec)
            evaluations["MCC"].append(mcc)

            i = i+1
            
        ##################################################################################
        ##### Independent Data performance
        ##################################################################################
        
        print("\nIndependent evaluation for model "+current_dataset_variety+".")
        
        # adding random shuffling of the dataset for training purpose
        randomized_index_arr = np.arange(train_features.shape[0])
        randomized_index_arr = np.random.permutation(randomized_index_arr)

        input_size = train_features.shape
        
        model_file_path = os.path.join(modelPath, "{}_bestModel-full.hdf5".format(current_dataset_variety))
            
        model = get_model()

        model.fit(X = train_features[randomized_index_arr], y = train_labels[randomized_index_arr])

        model_file_obj = open(model_file_path, 'wb')
        pickle.dump(model, model_file_obj)
        model_file_obj.close()
        
        ##################################################################################
        ##### Prediction and metrics for TEST dataset
        ##################################################################################

        y_pred = model.predict(indpe_features)
        label_pred = pred2label(y_pred)
        # Compute precision, recall, sensitivity, specifity, mcc
        acc = accuracy_score(indpe_labels, label_pred)
        prec = precision_score(indpe_labels,label_pred)
        mcc = matthews_corrcoef(indpe_labels, label_pred)

        conf = confusion_matrix(indpe_labels, label_pred)
        tn, fp, fn, tp = conf.ravel()
        sens = tp/(tp+fn)
        spec = tn/(tn+fp)

        fpr, tpr, thresholds = roc_curve(indpe_labels, y_pred)
        auc = roc_auc_score(indpe_labels, y_pred)

        evaluations["Dataset"].append(current_dataset_variety)
        evaluations["Fold"].append(i)
        evaluations["Train_Test"].append("Independent")
        evaluations["Accuracy"].append(acc)
        evaluations["Precision"].append(prec)
        evaluations["TPR"].append(tpr)
        evaluations["FPR"].append(fpr)
        evaluations["TPR_FPR_Thresholds"].append(thresholds)
        evaluations["AUC"].append(auc)
        evaluations["Sensitivity"].append(sens)
        evaluations["Specificity"].append(spec)
        evaluations["MCC"].append(mcc)

        ##################################################################################
        ##### Dump evaluations to a file
        ##################################################################################

        evalPath = os.path.join(outPath, expName, "_Evaluation_All_Datasets")
        if(not os.path.isdir(evalPath)):
            os.makedirs(evalPath)

        pickle.dump(evaluations,
                    open(os.path.join(evalPath, "{}fold_evaluations.pickle".format(n_fold)), "wb"))
        



File: Protein_DDE[100, 0, 0, 0]-st-simplesequence.csv
Training Positive: 1191
Training Negative: 1191
Independent Positive: 203
Independent Negative: 1022

Train/Test model Protein_DDE on Fold #0.

Train/Test model Protein_DDE on Fold #1.

Train/Test model Protein_DDE on Fold #2.

Train/Test model Protein_DDE on Fold #3.

Train/Test model Protein_DDE on Fold #4.

Independent evaluation for model Protein_DDE.


File: Protein_DistancePair[100, 0, 0, 0]-st-simplesequence.csv
Training Positive: 1191
Training Negative: 1191
Independent Positive: 203
Independent Negative: 1022

Train/Test model Protein_DistancePair on Fold #0.

Train/Test model Protein_DistancePair on Fold #1.

Train/Test model Protein_DistancePair on Fold #2.

Train/Test model Protein_DistancePair on Fold #3.

Train/Test model Protein_DistancePair on Fold #4.

Independent evaluation for model Protein_DistancePair.


File: Protein_DPC[100, 0, 0, 0]-st-simplesequence.csv
Training Positive: 1191
Training Negative: 1191
Indep

## k-fold Training evaluation

In [15]:
evaluations_df = pd.DataFrame.from_dict(evaluations)

evaluations_df_grouped = evaluations_df.groupby(['Dataset', "Train_Test"]).mean().filter(['Accuracy', 
                                                                               'Precision', 
                                                                               'AUC', 
                                                                               'Sensitivity', 
                                                                               'Specificity', 
                                                                               'MCC'])

evaluations_df_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Dataset,Train_Test,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Protein_DDE,Independent,0.644082,0.267465,0.650499,0.660099,0.6409,0.227641
Protein_DDE,Test,0.79093,0.805843,0.790915,0.766552,0.815277,0.582699
Protein_DDE,Train,0.996641,0.997899,0.996642,0.995382,0.997901,0.99329
Protein_DPC,Independent,0.623673,0.252874,0.634321,0.650246,0.618395,0.201992
Protein_DPC,Test,0.793035,0.800156,0.793021,0.782504,0.803537,0.586858
Protein_DPC,Train,0.996746,0.998107,0.996746,0.995382,0.998111,0.993497
Protein_DistancePair,Independent,0.634286,0.256461,0.634759,0.635468,0.634051,0.203709
Protein_DistancePair,Test,0.80941,0.814654,0.809398,0.801828,0.816968,0.619314
Protein_DistancePair,Train,0.996746,0.998318,0.996747,0.995172,0.998321,0.9935
TPC,Independent,0.593469,0.230347,0.604376,0.62069,0.588063,0.156134


In [16]:
evaluations_df_grouped = evaluations_df_grouped.reset_index()

In [17]:
evaluations_df_grouped[evaluations_df_grouped["Train_Test"] == "Test"]

Unnamed: 0,Dataset,Train_Test,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
1,Protein_DDE,Test,0.79093,0.805843,0.790915,0.766552,0.815277,0.582699
4,Protein_DPC,Test,0.793035,0.800156,0.793021,0.782504,0.803537,0.586858
7,Protein_DistancePair,Test,0.80941,0.814654,0.809398,0.801828,0.816968,0.619314
10,TPC,Test,0.775813,0.777489,0.775831,0.772494,0.779167,0.552032


In [18]:
evaluations_df_grouped[evaluations_df_grouped["Train_Test"] == "Independent"]

Unnamed: 0,Dataset,Train_Test,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
0,Protein_DDE,Independent,0.644082,0.267465,0.650499,0.660099,0.6409,0.227641
3,Protein_DPC,Independent,0.623673,0.252874,0.634321,0.650246,0.618395,0.201992
6,Protein_DistancePair,Independent,0.634286,0.256461,0.634759,0.635468,0.634051,0.203709
9,TPC,Independent,0.593469,0.230347,0.604376,0.62069,0.588063,0.156134


In [23]:
evaluations_df.groupby("Train_Test").mean().filter(['Accuracy', 
                                                       'Precision', 
                                                       'AUC', 
                                                       'Sensitivity', 
                                                       'Specificity', 
                                                       'MCC'])

Unnamed: 0_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Train_Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Independent,0.623878,0.251787,0.630989,0.641626,0.620352,0.197369
Test,0.792297,0.799535,0.792291,0.780845,0.803738,0.585226
Train,0.996668,0.997899,0.996668,0.995435,0.997901,0.993341


In [24]:
# RF 50
# 	Accuracy	Precision	AUC	Sensitivity	Specificity	MCC
# Train_Test						
# Independent	0.633878	0.252278	0.625633	0.613300	0.637965	0.190681
# Test	0.778336	0.792523	0.778336	0.755889	0.800783	0.557999
# Train	0.996668	0.997951	0.996668	0.995382	0.997954	0.993341

In [25]:
# RF 200
# 	Accuracy	Precision	AUC	Sensitivity	Specificity	MCC
# Train_Test						
# Independent	0.630204	0.254883	0.633300	0.637931	0.628669	0.201350
# Test	0.796074	0.801982	0.796070	0.786723	0.805418	0.592736
# Train	0.996694	0.998527	0.996694	0.994857	0.998531	0.993397

In [26]:
# RF 500
# Accuracy	Precision	AUC	Sensitivity	Specificity	MCC
# Train_Test						
# Independent	0.624694	0.252010	0.630491	0.639163	0.621820	0.196845
# Test	0.793974	0.800483	0.793971	0.784633	0.803309	0.588841
# Train	0.996720	0.998580	0.996720	0.994857	0.998583	0.993450