In [1]:
##################################################################################
##### Define all parameters for model tuning
##################################################################################

# --- Experiment identity and output locations ------------------------------
# expName / outPath / foldName are only referenced by the (currently
# commented-out) code that pickles the k-fold split to disk.
expName = "NT_Site_PredNTS_Classification_ML_KgapRFE_GridSearchCV"
outPath = "Results"
foldName = "folds.pickle"

# --- Input data -------------------------------------------------------------
# Folder that holds the per-k-gap feature CSV files read further down.
input_data_folder = "PredNTS_MathFeature_ENC"

# --- Cross-validation settings ----------------------------------------------
# n_fold / shuffle / seed are the arguments of the (commented-out)
# build_kfold(...) call. NOTE(review): seed=None together with shuffle=True
# means a shuffled split would differ from run to run.
n_fold = 5
shuffle = True
seed = None

# --- Feature selection ------------------------------------------------------
# Number of features kept by recursive feature elimination (n_features_to_select).
sub_feature_count = 200

# --- Misc -------------------------------------------------------------------
# Not referenced by any visible cell of this notebook.
monitor = 'val_loss'

In [2]:
# File-name templates for the per-k-gap feature tables; the "{}" placeholder is
# filled with the k-gap index via str.format() when the files are read.
train_data_filename = 'Training-datasets-PredNTS_kgap_{}.csv'
indpe_data_filename = 'independent-dataset-PredNTS_kgap_{}.csv'

# Largest k-gap index: the loading loops iterate over range(kgap_max + 1),
# i.e. indices 0 .. kgap_max inclusive.
kgap_max = 4

In [3]:
import os 
import pickle
import numpy as np
import pandas as pd

# import tensorflow as tf

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, confusion_matrix
from sklearn.metrics import roc_auc_score, classification_report, matthews_corrcoef

from sklearn.feature_selection import RFE

import math

In [4]:
##################################################################################
##### Build k-fold functions
##################################################################################

## Build the K-fold from dataset
def build_kfold(features, labels, k=10, shuffle=False, seed=None):
    """Split a dataset into ``k`` stratified train/test folds.

    Parameters
    ----------
    features : array
        Sample matrix; must support indexing with an integer index array
        along its first axis (``features[index_array]``).
    labels : array
        Class labels aligned with ``features``; indexed the same way.
    k : int, default 10
        Number of folds (``n_splits`` of ``StratifiedKFold``).
    shuffle : bool, default False
        Whether the splitter shuffles samples before splitting.
    seed : int or None, default None
        Random state for the shuffling. It is forwarded to the splitter only
        when ``shuffle`` is true: recent scikit-learn releases raise a
        ``ValueError`` if ``random_state`` is set while ``shuffle=False``,
        and the seed has no effect in that case anyway.

    Returns
    -------
    list of dict
        One dict per fold with the keys ``"X_train"``, ``"X_test"``,
        ``"y_train"`` and ``"y_test"``.
    """
    # A seed without shuffling is meaningless; passing it through would make
    # StratifiedKFold reject the arguments.
    random_state = seed if shuffle else None
    skf = StratifiedKFold(n_splits=k, shuffle=shuffle, random_state=random_state)

    kfoldList = []
    for train_index, test_index in skf.split(features, labels):
        kfoldList.append({
            "X_train": features[train_index],
            "X_test": features[test_index],
            "y_train": labels[train_index],
            "y_test": labels[test_index],
        })
    return kfoldList

In [5]:
##################################################################################
##### define evaluator functions
##################################################################################

def pred2label(y_pred):
    """Turn predicted scores into labels by rounding them with ``np.round``.

    The rounding rule and the returned type are whatever ``np.round``
    produces for ``y_pred`` (rounding to zero decimals).
    """
    return np.round(y_pred, decimals=0)

In [6]:
def get_model(cw = None, random_state = None):
    """Build the (unfitted) random-forest classifier used in this notebook.

    Parameters
    ----------
    cw : dict, str or None, default None
        Passed through as ``class_weight`` of ``RandomForestClassifier``.
    random_state : int or None, default None
        Passed through as ``random_state``. The default keeps the previous
        unseeded behaviour; pass an integer to make feature selection and
        fitting repeatable across runs.

    Returns
    -------
    RandomForestClassifier
        Forest with 100 trees, Gini criterion, bootstrap sampling and
        out-of-bag scoring enabled.
    """
    model = RandomForestClassifier(n_estimators=100,
                                   criterion='gini',
                                   class_weight=cw,
                                   bootstrap=True,
                                   oob_score=True,
                                   random_state=random_state,
                                  )

    return model

# Train data preparation

In [7]:
##################################################################################
##### Read CSV data
##################################################################################

# Build the training table from one CSV per k-gap index (0 .. kgap_max).
# Each file's own 'label' column is discarded; the tables are inner-joined
# on the sequence identifier column 'nameseq'.
train_data = None
for gap in range(kgap_max + 1):
    gap_csv_path = os.path.join(input_data_folder, train_data_filename.format(gap))
    gap_frame = pd.read_csv(gap_csv_path, sep=',', header=0).drop('label', axis=1)

    if train_data is None:
        train_data = gap_frame
    else:
        train_data = pd.merge(train_data, gap_frame, how="inner", on='nameseq')

# The label is re-derived from 'nameseq': the second-to-last '_'-separated
# token, converted to int.
train_name_labels = [int(name.split('_')[-2]) for name in train_data['nameseq']]
train_data['label'] = pd.Series(train_name_labels)

# The identifier is no longer needed once the label has been extracted.
train_data = train_data.drop('nameseq', axis=1)

# Feature matrix = every column except 'label'; labels as an (n, 1) column.
train_features = np.array(train_data.drop('label', axis=1))
train_labels = np.array(train_data['label']).reshape((-1, 1))

In [8]:
##################################################################################
##### Recursive feature selection
##################################################################################

# Base estimator for the elimination (alternative that was tried:
# DecisionTreeClassifier(criterion="gini")).
model = get_model()

# RFE wants a 1-D target, so the (n, 1) label column is flattened.
flat_train_labels = train_labels.reshape(train_labels.shape[0])

selector = RFE(estimator=model,
               n_features_to_select=sub_feature_count,
               step=50)
selector = selector.fit(train_features, flat_train_labels)

# Column positions of the features that RFE ranked 1 (i.e. kept).
is_selected = (selector.ranking_ == 1)
feature_indices = np.where(is_selected)[0]

NameError: name 'get_model' is not defined

In [None]:
##################################################################################
##### Keep only the RFE-selected feature columns (fold creation below is commented out)
##################################################################################

# Recompute the reduced matrix from `train_data` instead of slicing
# `train_features` in place: re-indexing an already reduced array on a second
# run of this cell would pick the wrong columns (or raise IndexError). This
# also mirrors how `indpe_features` is built for the independent set.
train_features = np.array(train_data.drop('label', axis=1))[:, feature_indices]

# folds = build_kfold(train_features, train_labels, k=n_fold, shuffle=shuffle, seed=seed)

# input_vec_shape = train_features[0].shape

# ## Write the k-fold dataset to file
# foldPath = os.path.join(outPath, expName, "{}fold".format(n_fold))
# if(not os.path.isdir(foldPath)):
#     os.makedirs(foldPath)
# pickle.dump(folds, open(os.path.join(foldPath, foldName), "wb"))

# Independent data

In [None]:
##################################################################################
##### Read CSV data
##################################################################################

# Assemble the independent table the same way as the training table: one CSV
# per k-gap index (0 .. kgap_max), per-file 'label' column dropped, tables
# inner-joined on the sequence identifier column 'nameseq'.
indpe_data = None
for gap in range(kgap_max + 1):
    gap_csv_path = os.path.join(input_data_folder, indpe_data_filename.format(gap))
    gap_frame = pd.read_csv(gap_csv_path, sep=',', header=0).drop('label', axis=1)

    if indpe_data is None:
        indpe_data = gap_frame
    else:
        indpe_data = pd.merge(indpe_data, gap_frame, how="inner", on='nameseq')

# Label = second-to-last '_'-separated token of 'nameseq', converted to int.
indpe_name_labels = [int(name.split('_')[-2]) for name in indpe_data['nameseq']]
indpe_data['label'] = pd.Series(indpe_name_labels)

indpe_data = indpe_data.drop('nameseq', axis=1)

##################################################################################
##### Extract features (RFE-selected columns only) and labels
##################################################################################

# All non-label columns, restricted to the columns chosen on the training set.
indpe_features = np.array(indpe_data.drop('label', axis=1))[:, feature_indices]

# Labels as an (n, 1) column.
indpe_labels = np.array(indpe_data['label']).reshape((-1, 1))

## GridSearch using full Model

Train the model on the full training data, then predict and evaluate on the independent data.

In [None]:
# model = get_model(cw={0:1, 1:0.1})
# Base estimator for the search; GridSearchCV overrides its hyper-parameters
# with each combination from `param_dict`.
model = RandomForestClassifier()

# Hyper-parameter grid explored exhaustively by GridSearchCV.
param_dict = {
    "n_estimators": [100, 200, 300],
    "criterion": ["gini", "entropy"],
    "max_depth": [None, 10, 50, 100],
    "min_samples_split": [2, 10, 50, 100],
    "min_samples_leaf": [1, 10, 25, 50],
    # "auto" was removed from this list: for RandomForestClassifier it was an
    # alias of "sqrt" (so it only duplicated fits), it is deprecated in
    # scikit-learn 1.1 and no longer accepted from 1.3 on.
    "max_features": ["sqrt", "log2"],
    "bootstrap": [True],
    "max_samples": [0.25, 0.5, 0.75],
    "oob_score": [True],
    # Fixed seed so every candidate forest is built deterministically.
    "random_state": [0],
    "class_weight": [{0:1, 1:1},
                   {0:1, 1:0.1}],
    "ccp_alpha": [0, 1, 0.1, 10]
}

# NOTE(review): the grid is very large and runs on a single worker
# (n_jobs = 1); expect a long runtime.
clf = GridSearchCV(estimator = model,   # previously a second, identical estimator was built here
                   param_grid = param_dict,
                   cv = n_fold,         # fold count from the configuration cell (5)
                   verbose = 2,
                   scoring = "accuracy",
                   n_jobs = 1,
                  )

# The estimator expects a 1-D target, so the (n, 1) label column is flattened.
clf.fit(train_features,
        train_labels.reshape(train_labels.shape[0]))

In [None]:
# # model = get_model(cw={0:1, 1:0.1})
# model = get_model()
    
# ## Define the model callbacks for early stopping and saving the model. Then train model
# current_model_path = os.path.join(modelPath, "_fullModel.hdf5")

# # adding random shuffling of the dataset for training purpose
# index_arr = np.arange(train_features.shape[0])
# index_arr = np.random.permutation(index_arr)

# model.fit(train_features[index_arr], train_labels[index_arr].reshape(train_labels.shape[0]))

In [None]:
# ## create the evaluation data structure for all iterations
# evaluations = {
#     "Train_Test" : [],
#     "Accuracy" : [],
#     "Precision": [],
#     "TPR": [],
#     "FPR": [],
#     "TPR_FPR_Thresholds": [],
#     "AUC": [],
#     "Sensitivity": [],
#     "Specificity": [],
#     "MCC":[]
# }

# ##################################################################################
# ##### Prediction and metrics for Train dataset
# ##################################################################################

# y_pred = model.predict(train_features)
# label_pred = pred2label(y_pred)

# # Compute precision, recall, sensitivity, specifity, mcc
# acc = accuracy_score(train_labels, label_pred)
# prec = precision_score(train_labels,label_pred)
# mcc = matthews_corrcoef(train_labels, label_pred)

# conf = confusion_matrix(train_labels, label_pred)
# tn, fp, fn, tp = conf.ravel()
# sens = tp/(tp+fn)
# spec = tn/(tn+fp)

# fpr, tpr, thresholds = roc_curve(train_labels, y_pred)
# auc = roc_auc_score(train_labels, y_pred)

# evaluations["Train_Test"].append("Train")
# evaluations["Accuracy"].append(acc)
# evaluations["Precision"].append(prec)
# evaluations["TPR"].append(tpr)
# evaluations["FPR"].append(fpr)
# evaluations["TPR_FPR_Thresholds"].append(thresholds)
# evaluations["AUC"].append(auc)
# evaluations["Sensitivity"].append(sens)
# evaluations["Specificity"].append(spec)
# evaluations["MCC"].append(mcc)

# ##################################################################################
# ##### Prediction and metrics for Independent dataset
# ##################################################################################

# y_pred = model.predict(indpe_features)
# label_pred = pred2label(y_pred)

# # Compute precision, recall, sensitivity, specifity, mcc
# acc = accuracy_score(indpe_labels, label_pred)
# prec = precision_score(indpe_labels,label_pred)
# mcc = matthews_corrcoef(indpe_labels, label_pred)

# conf = confusion_matrix(indpe_labels, label_pred)
# tn, fp, fn, tp = conf.ravel()
# sens = tp/(tp+fn)
# spec = tn/(tn+fp)

# fpr, tpr, thresholds = roc_curve(indpe_labels, y_pred)
# auc = roc_auc_score(indpe_labels, y_pred)

# evaluations["Train_Test"].append("Independent")
# evaluations["Accuracy"].append(acc)
# evaluations["Precision"].append(prec)
# evaluations["TPR"].append(tpr)
# evaluations["FPR"].append(fpr)
# evaluations["TPR_FPR_Thresholds"].append(thresholds)
# evaluations["AUC"].append(auc)
# evaluations["Sensitivity"].append(sens)
# evaluations["Specificity"].append(spec)
# evaluations["MCC"].append(mcc)

# ##################################################################################

# evaluations_df = pd.DataFrame.from_dict(evaluations)

# evaluations_df_grouped = evaluations_df.groupby(["Train_Test"]).mean().filter(['Accuracy', 
#                                                                                'Precision', 
#                                                                                'AUC', 
#                                                                                'Sensitivity', 
#                                                                                'Specificity', 
#                                                                                'MCC'])

# evaluations_df_grouped

In [None]:
# print(classification_report(indpe_labels, np.round(y_pred).astype(int)))