In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
##################################################################################
##### Define all parameters for model tuning
##################################################################################

n_fold = 5
expName = "NT_Site_PredNTS_Classification_Domain_Adversarial_AE_INT"
outPath = "Results"
foldName = "folds.pickle"

epochs = 100
batch_size = 64
shuffle = True
seed = None
latent_dim_size = 10

input_data_folder = "Data"
training_data_file = "Training-datasets-PredNTS.txt"
independent_data_file = "independent dataset-PredNTS.txt"

In [3]:
import os 
import pickle
import numpy as np
import pandas as pd

import tensorflow as tf

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, confusion_matrix
from sklearn.metrics import roc_auc_score, matthews_corrcoef

from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.svm import SVC

import math

In [4]:
# print(tf.test.is_gpu_available(cuda_only=True))
# physical_devices = tf.config.experimental.list_physical_devices('GPU')
print(tf.__version__)
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)
tf.config.experimental.set_memory_growth(physical_devices[0], True)

2.8.0
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


# Utility Functions

In [5]:
##################################################################################
##### define all CUSTOM functions
##################################################################################

def integer_encode_nt(sequence, char_dict):
    
    seq_encoded = np.ones((len(sequence))) * -1
    
    i = 0
    for single_character in sequence:
        if(single_character.upper() in char_dict.keys()):
            seq_encoded[i] = char_dict[single_character.upper()]
            i = i+1
        else:
            raise ValueError('Incorrect character in NT sequence: '+sequence)
    return seq_encoded

In [6]:
##################################################################################
##### Build k-fold functions
##################################################################################

## Build the K-fold from dataset
def build_kfold(features, labels, k=10, shuffle=False, seed=None):
    
    skf = StratifiedKFold(n_splits=k, shuffle=shuffle, random_state=seed)
    kfoldList = []
    for train_index, test_index in skf.split(features, labels):
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        kfoldList.append({
            "X_train": X_train,
            "X_test": X_test,
            "y_train":y_train,
            "y_test":y_test
        })
        
    return kfoldList

In [7]:
##################################################################################
##### define evaluator functions
##################################################################################

def pred2label(y_pred):
    y_pred = np.round(y_pred)
    return y_pred

# Model

In [8]:
def get_model():
    
    model = RandomForestClassifier(n_estimators=100, 
                                   criterion='gini',
                                   max_depth = 10,
                                   max_features = 'sqrt',
                                   bootstrap=True,
                                   oob_score=True)

#     model = xgb.XGBClassifier(objective="binary:logistic", use_label_encoder=False,
#                               learn_rate=0.001, eta=0.1, eval_metric='auc'
#                              )

#     model = xgb.XGBClassifier(objective="binary:logistic", 
#                               eval_metric='logloss',
#                               use_label_encoder=False, 
#                               max_depth=5)
    
#     model = SVC(kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False,
#                 tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=100, decision_function_shape='ovr', 
#                 break_ties=False, random_state=None)

#     model = tf.keras.models.Sequential()
#     model.add(tf.keras.layers.Input(shape=(latent_dim_size,)))
#     model.add(tf.keras.layers.Dense(32,
#                                     kernel_regularizer=tf.keras.regularizers.l2(0.001)))
#     model.add(tf.keras.layers.Activation('relu'))
#     model.add(tf.keras.layers.Dropout(0.5))
#     model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
#     model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.0001), 
#                   loss='binary_crossentropy', 
#                   metrics=None)
    
    return model

In [9]:
class GradientReversal(tf.keras.layers.Layer):
    """Flip the sign of gradient during training.
    based on https://github.com/michetonu/gradient_reversal_keras_tf
    ported to tf 2.x
    """

    def __init__(self, λ=1, **kwargs):
        super(GradientReversal, self).__init__(**kwargs)
        self.λ = λ

    @staticmethod
    @tf.custom_gradient
    def reverse_gradient(x, λ):
        # @tf.custom_gradient suggested by Hoa's comment at
        # https://stackoverflow.com/questions/60234725/how-to-use-gradient-override-map-with-tf-gradienttape-in-tf2-0
        return tf.identity(x), lambda dy: (-dy, None)

    def call(self, x):
        return self.reverse_gradient(x, self.λ)

    def compute_mask(self, inputs, mask=None):
        return mask

    def compute_output_shape(self, input_shape):
        return input_shape

    def get_config(self):
        return super(GradientReversal, self).get_config() | {'λ': self.λ}

In [10]:
# def mape(y_true, y_pred):
#     loss_val = tf.keras.losses.MeanAbsolutePercentageError(y_true, y_pred) / 100
#     return loss_val  # Note the `axis=-1`
import tensorflow.keras.backend as K
K.set_epsilon(0.01)
def mape(y_true, y_pred):
    diff = K.abs((y_true - y_pred) / K.clip(K.abs(y_true),
                                            K.epsilon(),
                                            None))
    return K.mean(diff, axis=-1)

In [11]:
##################################################################################
##### Function to customize the DLNN architecture with parameters
##################################################################################

def DANN(input_seq_shape=(41,)):
    
    hp_beta = 0.001
    
    hp_conv_activation = 'relu'
    hp_conv_filters_1 = 50
    hp_conv_kernel_length_1 = 10
    hp_conv_filters_2 = 50
    hp_conv_kernel_length_2 = 5
    hp_conv_stride = 1

    hp_latent_dim = latent_dim_size
    
    hp_dense_units = 32
    hp_dropout_prob = 0.5
    hp_dense_activation = 'relu'
    
    hp_learning_rate = 0.001
    hp_opt_func = 'adam'
    
    metric = None

    ###########################################################################
    ##### Encoder
    ###########################################################################
    
    ae_input = tf.keras.layers.Input(shape=input_seq_shape)

    xe = tf.keras.layers.Dense(hp_latent_dim*10)(ae_input)
    xe = tf.keras.layers.Dense(hp_latent_dim)(xe)

    encoder = tf.keras.models.Model(inputs=ae_input, outputs=xe)
    
    ###########################################################################
    ##### Decoder
    ###########################################################################
    
    dec_input = tf.keras.layers.Input(shape=(hp_latent_dim,))
    
    # xd = tf.keras.layers.RepeatVector(input_seq_shape[0]-hp_conv_kernel_length_1-hp_conv_kernel_length_2+2)(dec_input)

    xd = tf.keras.layers.Dense(hp_latent_dim*10)(dec_input)
    
    xd = tf.keras.layers.Dense(input_seq_shape[0])(xd)

    decoder = tf.keras.models.Model(inputs=dec_input, outputs=xd)
    
    ###########################################################################
    ##### Domain Classifier
    ###########################################################################
    
    classifier = tf.keras.models.Sequential()
    
    classifier.add(GradientReversal())

    classifier.add(tf.keras.layers.Dense(hp_dense_units,
                                         kernel_regularizer=tf.keras.regularizers.l2(hp_beta)))
    classifier.add(tf.keras.layers.Activation(hp_dense_activation))
    classifier.add(tf.keras.layers.Dropout(hp_dropout_prob))
    
    classifier.add(tf.keras.layers.Dense(1, activation='sigmoid'))

    #########################
    ##### Generate Model from input and output
    #########################
    
    autoencoder = tf.keras.models.Model(ae_input, [decoder(encoder(ae_input)), classifier(encoder(ae_input))])

    if hp_opt_func == 'adam':
        optimizer_function = tf.keras.optimizers.Adam(learning_rate=hp_learning_rate)
    elif hp_opt_func == 'adagrad':
        optimizer_function = tf.keras.optimizers.Adagrad(learning_rate=hp_learning_rate)
    elif hp_opt_func == 'rmsprop':
        optimizer_function = tf.keras.optimizers.RMSprop(learning_rate=hp_learning_rate)
        
    dec_op_name = autoencoder.output[0].name.split('/')[0]
    cls_op_name = autoencoder.output[1].name.split('/')[0]
    
#     autoencoder.compile(optimizer=optimizer_function, 
#                         loss={dec_op_name: mape, 
#                               cls_op_name: 'binary_crossentropy'}, 
#                         metrics=metric)

    autoencoder.compile(optimizer=optimizer_function, 
                        loss={dec_op_name: 'mean_absolute_percentage_error',
                              cls_op_name: 'binary_crossentropy'}, 
                        metrics=metric)
    
    return autoencoder, encoder, decoder, classifier, dec_op_name, cls_op_name

In [12]:
ae, enc, dec, cls, _, _ = DANN()

In [13]:
ae.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 41)]         0           []                               
                                                                                                  
 model (Functional)             (None, 10)           5210        ['input_1[0][0]',                
                                                                  'input_1[0][0]']                
                                                                                                  
 model_1 (Functional)           (None, 41)           5241        ['model[0][0]']                  
                                                                                                  
 sequential (Sequential)        (None, 1)            385         ['model[1][0]']            

In [14]:
enc.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 41)]              0         
                                                                 
 dense (Dense)               (None, 100)               4200      
                                                                 
 dense_1 (Dense)             (None, 10)                1010      
                                                                 
Total params: 5,210
Trainable params: 5,210
Non-trainable params: 0
_________________________________________________________________


In [15]:
dec.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 10)]              0         
                                                                 
 dense_2 (Dense)             (None, 100)               1100      
                                                                 
 dense_3 (Dense)             (None, 41)                4141      
                                                                 
Total params: 5,241
Trainable params: 5,241
Non-trainable params: 0
_________________________________________________________________


In [16]:
cls.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gradient_reversal (Gradient  (None, 10)               0         
 Reversal)                                                       
                                                                 
 dense_4 (Dense)             (None, 32)                352       
                                                                 
 activation (Activation)     (None, 32)                0         
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                                 
 dense_5 (Dense)             (None, 1)                 33        
                                                                 
Total params: 385
Trainable params: 385
Non-trainable params: 0
__________________________________________________________

In [17]:
# tf.keras.utils.plot_model(ae)

# Train Dataset

In [18]:
##################################################################################
##### read training file
##################################################################################
train_file_path = os.path.join(input_data_folder, training_data_file)
train_data = pd.read_csv(train_file_path, sep='\t', header=None)
train_data.columns = ['Sequence', 'name', 'id', 'flag', 'label_original', 'type']
train_data.head()

##################################################################################
##### Create dictionary of all characters in the NT sequence 
##################################################################################
all_char_set = set({})
for val in [set(val) for val in train_data['Sequence']]:
    all_char_set = all_char_set.union(val)
all_char_list = list(all_char_set)
all_char_list.sort()
all_char_dict = {}
for i in range(len(all_char_list)):
    all_char_dict[all_char_list[i]] = i
    
##################################################################################
##### Create OHE of sequence
##################################################################################
train_data['INT_Sequence'] = pd.Series([integer_encode_nt(val, all_char_dict) 
                                        for val in train_data["Sequence"]])

##################################################################################
##### Fix the labels
##################################################################################
train_data['label'] = pd.Series([1 if val == 1 else 0 
                                 for val in train_data["label_original"]])

##################################################################################
##### Extract features and labels, create folds
##################################################################################

train_features = np.array(list(train_data['INT_Sequence']))
train_labels = np.array(list(train_data['label']))
train_labels = train_labels.reshape((train_features.shape[0], 1))

# Independent data

In [19]:
##################################################################################
##### read independent data file
##################################################################################
indpe_file_path = os.path.join(input_data_folder, independent_data_file)
indpe_data = pd.read_csv(indpe_file_path, sep='\t', header=None)
indpe_data.columns = ['Sequence', 'name', 'id', 'flag', 'label_original', 'type']
indpe_data.head()
    
##################################################################################
##### Create OHE of sequence
##################################################################################
indpe_data['INT_Sequence'] = pd.Series([integer_encode_nt(val, all_char_dict) 
                                        for val in indpe_data["Sequence"]])

##################################################################################
##### Fix the labels
##################################################################################
indpe_data['label'] = pd.Series([1 if val == 1 else 0 
                                 for val in indpe_data["label_original"]])

##################################################################################
##### Extract features and labels, create folds
##################################################################################

indpe_features = np.array(list(indpe_data['INT_Sequence']))
indpe_labels = np.array(list(indpe_data['label']))
indpe_labels = indpe_labels.reshape((indpe_labels.shape[0], 1))

input_seq_shape = indpe_features[0].shape

# Creating DANN data

In [20]:
dann_features = np.concatenate((train_features, indpe_features))
dann_labels = np.concatenate((np.zeros((train_features.shape[0], 1)),
                              np.ones((indpe_features.shape[0], 1))
                             )
                            )

# Training the DANN

In [21]:
##################################################################################
##### Train the DANN
##################################################################################

## Create and set directory to save model
modelPath = os.path.join(outPath, expName, "{}fold".format(n_fold), "models")
if(not os.path.isdir(modelPath)):
    os.makedirs(modelPath)
    
ae, enc, dec, cls, dec_name, cls_name = DANN(input_seq_shape = input_seq_shape)
    
## Define the model callbacks for early stopping and saving the model. Then train model
current_model_path = os.path.join(modelPath, "DANN.hdf5")
modelCallbacks = [
    tf.keras.callbacks.ModelCheckpoint(current_model_path,
                                       monitor = 'val_'+dec_name+'_loss', verbose = 1, save_best_only = True, 
                                       save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
]

# adding random shuffling of the dataset for training purpose
index_arr = np.arange(dann_features.shape[0])
index_arr = np.random.permutation(index_arr)

ae.fit(x = dann_features, y = [dann_features, dann_labels], 
       batch_size = batch_size, epochs = epochs, 
       verbose = 1, 
       callbacks = modelCallbacks, validation_split = 0.2)

custom_objects = {"GradientReversal": GradientReversal}
with tf.keras.utils.custom_object_scope(custom_objects):
    ae = tf.keras.models.load_model(current_model_path)
    
enc = ae.layers[1]

Epoch 1/100
Epoch 1: val_model_4_loss improved from inf to 205.20392, saving model to Results\NT_Site_PredNTS_Classification_Domain_Adversarial_AE_INT\5fold\models\DANN.hdf5
Epoch 2/100
Epoch 2: val_model_4_loss improved from 205.20392 to 130.75258, saving model to Results\NT_Site_PredNTS_Classification_Domain_Adversarial_AE_INT\5fold\models\DANN.hdf5
Epoch 3/100
Epoch 3: val_model_4_loss improved from 130.75258 to 115.19942, saving model to Results\NT_Site_PredNTS_Classification_Domain_Adversarial_AE_INT\5fold\models\DANN.hdf5
Epoch 4/100
Epoch 4: val_model_4_loss improved from 115.19942 to 105.60751, saving model to Results\NT_Site_PredNTS_Classification_Domain_Adversarial_AE_INT\5fold\models\DANN.hdf5
Epoch 5/100
Epoch 5: val_model_4_loss improved from 105.60751 to 102.91545, saving model to Results\NT_Site_PredNTS_Classification_Domain_Adversarial_AE_INT\5fold\models\DANN.hdf5
Epoch 6/100
Epoch 6: val_model_4_loss improved from 102.91545 to 102.69746, saving model to Results\NT_Sit

Epoch 19/100
Epoch 19: val_model_4_loss did not improve from 92.46063
Epoch 20/100
Epoch 20: val_model_4_loss did not improve from 92.46063
Epoch 21/100
Epoch 21: val_model_4_loss did not improve from 92.46063
Epoch 22/100
Epoch 22: val_model_4_loss did not improve from 92.46063
Epoch 23/100
Epoch 23: val_model_4_loss did not improve from 92.46063
Epoch 24/100
Epoch 24: val_model_4_loss did not improve from 92.46063
Epoch 25/100
Epoch 25: val_model_4_loss did not improve from 92.46063
Epoch 26/100
Epoch 26: val_model_4_loss did not improve from 92.46063
Epoch 27/100
Epoch 27: val_model_4_loss did not improve from 92.46063
Epoch 28/100
Epoch 28: val_model_4_loss did not improve from 92.46063
Epoch 29/100
Epoch 29: val_model_4_loss improved from 92.46063 to 91.95369, saving model to Results\NT_Site_PredNTS_Classification_Domain_Adversarial_AE_INT\5fold\models\DANN.hdf5
Epoch 30/100
Epoch 30: val_model_4_loss did not improve from 91.95369
Epoch 31/100
Epoch 31: val_model_4_loss did not im

Epoch 40/100
Epoch 40: val_model_4_loss did not improve from 91.95369
Epoch 41/100
Epoch 41: val_model_4_loss did not improve from 91.95369
Epoch 42/100
Epoch 42: val_model_4_loss did not improve from 91.95369
Epoch 43/100
Epoch 43: val_model_4_loss did not improve from 91.95369
Epoch 44/100
Epoch 44: val_model_4_loss did not improve from 91.95369
Epoch 45/100
Epoch 45: val_model_4_loss improved from 91.95369 to 90.82394, saving model to Results\NT_Site_PredNTS_Classification_Domain_Adversarial_AE_INT\5fold\models\DANN.hdf5
Epoch 46/100
Epoch 46: val_model_4_loss did not improve from 90.82394
Epoch 47/100
Epoch 47: val_model_4_loss did not improve from 90.82394
Epoch 48/100
Epoch 48: val_model_4_loss did not improve from 90.82394
Epoch 49/100
Epoch 49: val_model_4_loss did not improve from 90.82394
Epoch 50/100
Epoch 50: val_model_4_loss did not improve from 90.82394
Epoch 51/100
Epoch 51: val_model_4_loss did not improve from 90.82394
Epoch 52/100
Epoch 52: val_model_4_loss improved f

Epoch 61/100
Epoch 61: val_model_4_loss improved from 89.83844 to 89.31949, saving model to Results\NT_Site_PredNTS_Classification_Domain_Adversarial_AE_INT\5fold\models\DANN.hdf5
Epoch 62/100
Epoch 62: val_model_4_loss did not improve from 89.31949
Epoch 63/100
Epoch 63: val_model_4_loss did not improve from 89.31949
Epoch 64/100
Epoch 64: val_model_4_loss did not improve from 89.31949
Epoch 65/100
Epoch 65: val_model_4_loss did not improve from 89.31949
Epoch 66/100
Epoch 66: val_model_4_loss improved from 89.31949 to 88.43924, saving model to Results\NT_Site_PredNTS_Classification_Domain_Adversarial_AE_INT\5fold\models\DANN.hdf5
Epoch 67/100
Epoch 67: val_model_4_loss improved from 88.43924 to 88.05430, saving model to Results\NT_Site_PredNTS_Classification_Domain_Adversarial_AE_INT\5fold\models\DANN.hdf5
Epoch 68/100
Epoch 68: val_model_4_loss did not improve from 88.05430
Epoch 69/100
Epoch 69: val_model_4_loss did not improve from 88.05430
Epoch 70/100
Epoch 70: val_model_4_loss 

Epoch 81: val_model_4_loss did not improve from 88.05430
Epoch 82/100
Epoch 82: val_model_4_loss did not improve from 88.05430
Epoch 83/100
Epoch 83: val_model_4_loss did not improve from 88.05430
Epoch 84/100
Epoch 84: val_model_4_loss did not improve from 88.05430
Epoch 85/100
Epoch 85: val_model_4_loss did not improve from 88.05430
Epoch 86/100
Epoch 86: val_model_4_loss improved from 88.05430 to 85.95656, saving model to Results\NT_Site_PredNTS_Classification_Domain_Adversarial_AE_INT\5fold\models\DANN.hdf5
Epoch 87/100
Epoch 87: val_model_4_loss did not improve from 85.95656
Epoch 88/100
Epoch 88: val_model_4_loss did not improve from 85.95656
Epoch 89/100
Epoch 89: val_model_4_loss did not improve from 85.95656
Epoch 90/100
Epoch 90: val_model_4_loss did not improve from 85.95656
Epoch 91/100
Epoch 91: val_model_4_loss did not improve from 85.95656
Epoch 92/100
Epoch 92: val_model_4_loss did not improve from 85.95656
Epoch 93/100
Epoch 93: val_model_4_loss did not improve from 85

# Encoded features using DANN

In [22]:
train_enc_features = enc.predict(train_features)
indpe_enc_features = enc.predict(indpe_features)

# Build folds using encoded training features

In [23]:
folds = build_kfold(train_enc_features, train_labels, k=n_fold, shuffle=shuffle, seed=seed)

## Write the k-fold dataset to file
foldPath = os.path.join(outPath, expName, "{}fold".format(n_fold))
if(not os.path.isdir(foldPath)):
    os.makedirs(foldPath)
pickle.dump(folds, open(os.path.join(foldPath, foldName), "wb"))

# Training Evaluation

In [24]:
##################################################################################
##### For each input file, train model and generate different outputs in a structured folder
##################################################################################

## create the evaluation data structure for all iterations
evaluations = {
    "Fold" : [],
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}
        
##################################################################################
##### extract data from the current fasta file
##################################################################################

print("\n======================================================================")
print("Training Positive:", np.sum(train_labels))
print("Training Negative:", train_labels.shape[0] - np.sum(train_labels))
print("Independent Positive:", np.sum(indpe_labels))
print("Independent Negative:", indpe_labels.shape[0] - np.sum(indpe_labels))

##################################################################################
##### TRAIN and PREDICT for every Fold, using models
##################################################################################

# fold counter
i = 0

for fold in folds:

    # adding random shuffling of the dataset for training purpose
    randomized_index_arr = np.arange(fold["X_train"].shape[0])
    randomized_index_arr = np.random.permutation(randomized_index_arr)

    print("\nTrain/Test model on Fold #"+str(i)+".")

    model_file_path = os.path.join(modelPath, "bestModel-fold{}.hdf5".format(i))

    model = get_model()
    
#     ## Define the model callbacks for early stopping and saving the model. Then train model.
#     modelCallbacks = [
#         tf.keras.callbacks.ModelCheckpoint(model_file_path,
#                                            monitor = 'val_loss', verbose = 1, save_best_only = True, 
#                                            save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
#     ]
#     model.fit(x = fold["X_train"][randomized_index_arr], y = fold["y_train"][randomized_index_arr], 
#               batch_size = batch_size, epochs = epochs, verbose = 1, 
#               callbacks = modelCallbacks, validation_data = (fold["X_test"], fold["y_test"]))
    
#     model = tf.keras.models.load_model(current_model_path)

    model.fit(X = fold["X_train"][randomized_index_arr], y = fold["y_train"][randomized_index_arr])

    model_file_obj = open(model_file_path, 'wb')
    pickle.dump(model, model_file_obj)
    model_file_obj.close()

    ##################################################################################
    ##### Prediction and metrics for TRAIN dataset
    ##################################################################################

    y_pred = model.predict(fold["X_train"])
    label_pred = pred2label(y_pred)
    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(fold["y_train"], label_pred)
    prec = precision_score(fold["y_train"],label_pred)
    mcc = matthews_corrcoef(fold["y_train"], label_pred)

    conf = confusion_matrix(fold["y_train"], label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    
    fpr, tpr, thresholds = roc_curve(fold["y_train"], y_pred)
    auc = roc_auc_score(fold["y_train"], y_pred)

    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Train")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)

    ##################################################################################
    ##### Prediction and metrics for TEST dataset
    ##################################################################################

    y_pred = model.predict(fold["X_test"])
    label_pred = pred2label(y_pred)
    # Compute precision, recall, sensitivity, specifity, mcc
    acc = accuracy_score(fold["y_test"], label_pred)
    prec = precision_score(fold["y_test"],label_pred)
    mcc = matthews_corrcoef(fold["y_test"], label_pred)

    conf = confusion_matrix(fold["y_test"], label_pred)
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)
    
    fpr, tpr, thresholds = roc_curve(fold["y_test"], y_pred)
    auc = roc_auc_score(fold["y_test"], y_pred)

    evaluations["Fold"].append(i)
    evaluations["Train_Test"].append("Test")
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)

    i = i+1

##################################################################################
##### Independent Data performance
##################################################################################

print("\nIndependent evaluation for model.")

# adding random shuffling of the dataset for training purpose
randomized_index_arr = np.arange(train_enc_features.shape[0])
randomized_index_arr = np.random.permutation(randomized_index_arr)

model_file_path = os.path.join(modelPath, "bestModel-full.hdf5")

# ## Define the model callbacks for early stopping and saving the model. Then train model.
# modelCallbacks = [
#     tf.keras.callbacks.ModelCheckpoint(model_file_path,
#                                        monitor = 'val_loss', verbose = 1, save_best_only = True, 
#                                        save_weights_only = False, mode = 'auto', save_freq = 'epoch'),
# ]
# model.fit(x = train_enc_features[randomized_index_arr], y = train_labels[randomized_index_arr], 
#           batch_size = batch_size, epochs = epochs, verbose = 1, 
#           callbacks = modelCallbacks, validation_data = (indpe_enc_features, indpe_labels))

# model = tf.keras.models.load_model(current_model_path)

model = get_model()

model.fit(X = train_enc_features[randomized_index_arr], y = train_labels[randomized_index_arr])

model_file_obj = open(model_file_path, 'wb')
pickle.dump(model, model_file_obj)
model_file_obj.close()

##################################################################################
##### Prediction and metrics for TEST dataset
##################################################################################

y_pred = model.predict(indpe_enc_features)
label_pred = pred2label(y_pred)
# Compute precision, recall, sensitivity, specifity, mcc
acc = accuracy_score(indpe_labels, label_pred)
prec = precision_score(indpe_labels,label_pred)
mcc = matthews_corrcoef(indpe_labels, label_pred)

conf = confusion_matrix(indpe_labels, label_pred)
tn, fp, fn, tp = conf.ravel()
sens = tp/(tp+fn)
spec = tn/(tn+fp)

fpr, tpr, thresholds = roc_curve(indpe_labels, y_pred)
auc = roc_auc_score(indpe_labels, y_pred)

evaluations["Fold"].append(i)
evaluations["Train_Test"].append("Independent")
evaluations["Accuracy"].append(acc)
evaluations["Precision"].append(prec)
evaluations["TPR"].append(tpr)
evaluations["FPR"].append(fpr)
evaluations["TPR_FPR_Thresholds"].append(thresholds)
evaluations["AUC"].append(auc)
evaluations["Sensitivity"].append(sens)
evaluations["Specificity"].append(spec)
evaluations["MCC"].append(mcc)

##################################################################################
##### Dump evaluations to a file
##################################################################################

evalPath = os.path.join(outPath, expName, "_Evaluation_All_Datasets")
if(not os.path.isdir(evalPath)):
    os.makedirs(evalPath)

pickle.dump(evaluations,
            open(os.path.join(evalPath, "{}fold_evaluations.pickle".format(n_fold)), "wb"))


Training Positive: 1191
Training Negative: 1191
Independent Positive: 203
Independent Negative: 1022

Train/Test model on Fold #0.

Train/Test model on Fold #1.

Train/Test model on Fold #2.

Train/Test model on Fold #3.

Train/Test model on Fold #4.

Independent evaluation for model.


In [25]:
evaluations_df = pd.DataFrame.from_dict(evaluations)

evaluations_df_grouped = evaluations_df.groupby(["Train_Test"]).mean().filter(['Accuracy', 
                                                                               'Precision', 
                                                                               'AUC', 
                                                                               'Sensitivity', 
                                                                               'Specificity', 
                                                                               'MCC'])

evaluations_df_grouped

Unnamed: 0_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Train_Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Independent,0.63102,0.179949,0.516347,0.344828,0.687867,0.026114
Test,0.562136,0.572912,0.562141,0.486987,0.637295,0.125688
Train,0.934508,0.990316,0.934508,0.877622,0.991394,0.87476


In [26]:
# def get_model():
    
# #     model = RandomForestClassifier(n_estimators=10, 
# #                                    criterion='gini', 
# #                                    bootstrap=True,
# #                                    oob_score=True)

# #     model = xgb.XGBClassifier(objective="binary:logistic", use_label_encoder=False,
# #                               learn_rate=0.001, eta=0.1, eval_metric='auc'
# #                              )

#     model = xgb.XGBClassifier(objective="binary:logistic", 
#                               eval_metric='logloss',
#                               use_label_encoder=False, 
#                               max_depth=10,
#                               reg_lambda=0.001,
#                               learning_rate=0.001
#                              )
    
# #     model = SVC(kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False,
# #                 tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=100, decision_function_shape='ovr', 
# #                 break_ties=False, random_state=None)

# #     model = tf.keras.models.Sequential()
# #     model.add(tf.keras.layers.Input(shape=(latent_dim_size,)))
# #     model.add(tf.keras.layers.Dense(32,
# #                                     kernel_regularizer=tf.keras.regularizers.l2(0.001)))
# #     model.add(tf.keras.layers.Activation('relu'))
# #     model.add(tf.keras.layers.Dropout(0.5))
# #     model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
# #     model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.0001), 
# #                   loss='binary_crossentropy', 
# #                   metrics=None)
    
#     return model

In [27]:
from sklearn.decomposition import PCA, KernelPCA

In [28]:
pca = PCA(n_components=3)
pca.fit(np.concatenate((train_enc_features, indpe_enc_features)))
# pca.fit(indpe_enc_features)
print(sum(pca.explained_variance_ratio_), ' : ', pca.explained_variance_ratio_)

0.8622940182685852  :  [0.6662198  0.14358738 0.05248687]


In [29]:
pca.components_.shape

(3, 10)

In [30]:
pca.components_[0]

array([ 0.23529403, -0.04270929, -0.06038254,  0.05727866,  0.35982838,
       -0.32488906, -0.3668896 ,  0.3212422 , -0.35213375, -0.58228284],
      dtype=float32)

In [31]:
transformer = KernelPCA(n_components=7, kernel='rbf')
transformer.fit(np.concatenate((train_enc_features, indpe_enc_features)))
print(sum(transformer.explained_variance_ratio_), ' : ', transformer.explained_variance_ratio_)

AttributeError: 'KernelPCA' object has no attribute 'explained_variance_ratio_'