In [1]:
##################################################################################
##### Define all parameters for model tuning
##################################################################################

# Number of stratified cross-validation folds.
n_fold = 5
# Experiment name and root output folder; together they form the results path.
expName = "NT_Site_PredNTS_PredNitro_Corrected"
outPath = "Results"
# File name intended for persisted folds.
foldName = "folds.pickle"

# Samples are shuffled before being split into folds. The seed is fixed so the
# split -- and therefore every metric computed from it -- is the same on each
# "Restart Kernel -> Run All". Set `seed = None` for a fresh random split.
shuffle = True
seed = 42

# Location and names of the tab-separated input files.
input_data_folder = "Data"
training_data_file = "Training-datasets-PredNTS.txt"
independent_data_file = "independent dataset-PredNTS.txt"

In [2]:
import os 
import pickle
import numpy as np
import pandas as pd

import math

import itertools

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, confusion_matrix
from sklearn.metrics import roc_auc_score, matthews_corrcoef

from sklearn.svm import SVC

# Utility functions

In [3]:
##################################################################################
##### Build k-fold functions
##################################################################################

## Build the K-fold from dataset
def build_kfold(features, labels, k=10, shuffle=False, seed=None):
    """Split a dataset into ``k`` stratified train/test folds.

    Parameters
    ----------
    features : array-like
        Samples (or sample indices), indexed along the first axis.
    labels : array-like
        One class label per sample; used to stratify the folds.
    k : int, default 10
        Number of folds.
    shuffle : bool, default False
        Whether samples are shuffled before splitting.
    seed : int or None, default None
        Seed for the shuffling. It is ignored when ``shuffle`` is False,
        because ``StratifiedKFold`` raises a ``ValueError`` if it receives a
        ``random_state`` without shuffling.

    Returns
    -------
    list of dict
        One dict per fold with the keys ``"X_train"``, ``"X_test"``,
        ``"y_train"`` and ``"y_test"``.
    """
    # Plain Python sequences cannot be indexed with an index array.
    if isinstance(features, (list, tuple)):
        features = np.asarray(features)
    if isinstance(labels, (list, tuple)):
        labels = np.asarray(labels)

    skf = StratifiedKFold(
        n_splits=k,
        shuffle=shuffle,
        random_state=seed if shuffle else None,
    )

    kfoldList = []
    for train_index, test_index in skf.split(features, labels):
        kfoldList.append({
            "X_train": features[train_index],
            "X_test": features[test_index],
            "y_train": labels[train_index],
            "y_test": labels[test_index]
        })

    return kfoldList

# PredNitro conditional probability embedding

In [4]:
def _conditional_probability(char_matrix, charA, posA, charB, posB):
    """Estimate P(charA at posA | charB at posB) from a character matrix.

    ``char_matrix`` holds one row per sequence and one column per position.
    The estimate is 0 when the matrix has no rows or when ``charB`` never
    occurs at ``posB``.
    """
    if char_matrix.shape[0] == 0:
        return 0

    rows_with_charB = char_matrix[:, posB] == charB
    n_charB = np.count_nonzero(rows_with_charB)
    if n_charB == 0:
        return 0

    n_charA_given_charB = np.count_nonzero(char_matrix[rows_with_charB, posA] == charA)
    return n_charA_given_charB / n_charB


def generate_conditional_probabilities(data, pos_mid = 20):
    """Build the position-specific conditional probability table.

    For every ordered character pair ``(A, B)`` and every pair of adjacent
    positions ``(posA, posB)``, the table stores the difference between
    P(A at posA | B at posB) estimated on the positive sequences and the same
    quantity estimated on the negative sequences.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``'Sequence'`` and ``'label_original'``.
        Rows with ``label_original == 1`` are positive; all others negative.
        Assumes every sequence has the same length, at least
        ``2 * pos_mid + 1`` characters.
    pos_mid : int, default 20
        Index of the central position. Positions left of it are paired with
        their right neighbour ``(p, p + 1)``; positions right of it (up to
        ``2 * pos_mid``) are paired with their left neighbour ``(p, p - 1)``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'A'``, ``'B'``, ``'pos'`` (a ``(posA, posB)`` tuple) and
        ``'prob'``. A class with no sequences contributes a probability of 0
        instead of raising.
    """
    sequences = data['Sequence']

    # Sorted alphabet of every character that occurs in any sequence.
    all_char_list = sorted(set().union(*(set(seq) for seq in sequences)))

    # Adjacent-position pairs on each side of the central position.
    left_pairs = [(pos, pos + 1) for pos in range(pos_mid)]
    right_pairs = [(pos, pos - 1) for pos in range(pos_mid + 1, (pos_mid * 2) + 1)]

    # All left-side rows first, then all right-side rows; within each side the
    # order is A (slowest), B, position pair (fastest).
    all_combinations = (
        list(itertools.product(all_char_list, all_char_list, left_pairs))
        + list(itertools.product(all_char_list, all_char_list, right_pairs))
    )
    df_position_specific_conditional_probabilities = pd.DataFrame(
        all_combinations, columns=['A', 'B', 'pos']
    )

    # One row per sequence, one column per position.
    positive_data = np.array([list(seq) for seq in sequences[data['label_original'] == 1]])
    negative_data = np.array([list(seq) for seq in sequences[data['label_original'] != 1]])

    # Compute all scores in one pass and assign the column once, rather than
    # writing into the frame row by row.
    prob_differences = [
        _conditional_probability(positive_data, charA, posA, charB, posB)
        - _conditional_probability(negative_data, charA, posA, charB, posB)
        for charA, charB, (posA, posB) in zip(
            df_position_specific_conditional_probabilities['A'],
            df_position_specific_conditional_probabilities['B'],
            df_position_specific_conditional_probabilities['pos'],
        )
    ]
    df_position_specific_conditional_probabilities['prob'] = np.array(prob_differences, dtype=float)

    return df_position_specific_conditional_probabilities

In [5]:
def embed_data_with_generated_probabilities(data, df_cond_probs, pos_mid = 20):
    """Embed each sequence as a vector of conditional-probability scores.

    Every position except the central one yields one feature: the ``'prob'``
    value stored in ``df_cond_probs`` for the character at that position, the
    character at its neighbour towards the centre, and the position pair.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the column ``'Sequence'``.
    df_cond_probs : pandas.DataFrame
        Table with the columns ``'A'``, ``'B'``, ``'pos'`` and ``'prob'``.
        If a key occurs more than once, the first row wins.
    pos_mid : int, default 20
        Index of the central position, which is skipped.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_sequences, 2 * pos_mid)``. A character pair
        that is missing from ``df_cond_probs`` (for example a residue never
        seen when the table was built) gets the neutral score 0.0 instead of
        raising.
    """
    # Build the lookup once; filtering the DataFrame for every residue of
    # every sequence is far slower.
    prob_lookup = {}
    for charA, charB, pos, prob in zip(df_cond_probs['A'],
                                       df_cond_probs['B'],
                                       df_cond_probs['pos'],
                                       df_cond_probs['prob']):
        prob_lookup.setdefault((charA, charB, tuple(pos)), prob)

    sequences = list(data["Sequence"])

    # Pre-allocate the result instead of concatenating one row at a time.
    prob_features = np.zeros((len(sequences), pos_mid*2))

    for row, seq in enumerate(sequences):
        for posA in range(len(seq)):
            if posA < pos_mid:
                # Left of the centre: conditioned on the right neighbour.
                feature_index = posA
                posB = posA + 1
            elif posA > pos_mid:
                # Right of the centre: conditioned on the left neighbour;
                # the feature index shifts by one to skip the centre.
                feature_index = posA - 1
                posB = posA - 1
            else:
                continue

            key = (seq[posA], seq[posB], (posA, posB))
            prob_features[row, feature_index] = prob_lookup.get(key, 0.0)

    return prob_features

# PredNTS Training data preparation

In [6]:
##################################################################################
##### read training file
##################################################################################
# The training file is tab-separated and has no header row; name its six columns.
train_file_path = os.path.join(input_data_folder, training_data_file)
train_data = pd.read_csv(train_file_path, sep='\t', header=None)
train_data.columns = ['Sequence', 'name', 'id', 'flag', 'label_original', 'type']

# Binary target: 1 where the original label equals 1, 0 for every other value.
train_data['label'] = (train_data["label_original"] == 1).astype("int64")

train_labels = train_data['label'].to_numpy(copy=True)

# Build training data folds

In [7]:
# Folds are built over the row labels of `train_data`, not over feature rows.
feature_indices = train_data.index.to_numpy(copy=True)

folds = build_kfold(
    features=feature_indices,
    labels=train_labels,
    k=n_fold,
    shuffle=shuffle,
    seed=seed,
)

# PredNTS Independent data preparation

In [8]:
##################################################################################
##### read independent data file
##################################################################################
# The independent file is tab-separated and has no header row; name its six columns.
indpe_file_path = os.path.join(input_data_folder, independent_data_file)
indpe_data = pd.read_csv(indpe_file_path, sep='\t', header=None)
indpe_data.columns = ['Sequence', 'name', 'id', 'flag', 'label_original', 'type']

# Binary target: 1 where the original label equals 1, 0 for every other value.
indpe_data['label'] = (indpe_data["label_original"] == 1).astype("int64")

# k-fold Training

In [9]:
##################################################################################
##### For each fold: embed the data, train a model and record train/test metrics
##################################################################################

## create the evaluation data structure for all iterations
evaluations = {
    "Fold" : [],
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

## Create the directory for saved models (no-op when it already exists)
modelPath = os.path.join(outPath, expName, "{}fold".format(n_fold), "models")
os.makedirs(modelPath, exist_ok=True)

##################################################################################
##### Train/Test model on all folds, generate evaluations
##################################################################################

for i, fold in enumerate(folds):

    print("\nTrain/Test model on Fold #"+str(i)+".")

    ##################################################################################
    ##### Generate the embeddings for the specific fold
    ##################################################################################

    fold_train_data = train_data[train_data.index.isin(fold['X_train'])]
    fold_test_data = train_data[train_data.index.isin(fold['X_test'])]

    # The probability table is estimated from the training part of the fold
    # only and then reused to embed the held-out part.
    df_fold_train_data_cond_probs = generate_conditional_probabilities(fold_train_data)

    fold_train_features = embed_data_with_generated_probabilities(fold_train_data, df_fold_train_data_cond_probs)
    fold_train_labels = np.array(list(fold_train_data['label']))

    fold_test_features = embed_data_with_generated_probabilities(fold_test_data, df_fold_train_data_cond_probs)
    fold_test_labels = np.array(list(fold_test_data['label']))

    ##################################################################################
    ##### Train model
    ##################################################################################

    model = SVC(gamma='auto')

    model.fit(fold_train_features, fold_train_labels)

    ##################################################################################
    ##### Prediction and metrics for the TRAIN and then the TEST part of the fold
    ##################################################################################

    for split_name, split_features, split_labels in (
        ("Train", fold_train_features, fold_train_labels),
        ("Test", fold_test_features, fold_test_labels),
    ):
        y_pred = model.decision_function(split_features)
        label_pred = model.predict(split_features)

        # Compute accuracy, precision, sensitivity, specificity, mcc
        acc = accuracy_score(split_labels, label_pred)
        prec = precision_score(split_labels, label_pred)
        mcc = matthews_corrcoef(split_labels, label_pred)

        # labels=[0, 1] forces a 2x2 matrix even if a split (or the
        # predictions) contain a single class, so the unpacking cannot fail.
        conf = confusion_matrix(split_labels, label_pred, labels=[0, 1])
        tn, fp, fn, tp = conf.ravel()
        sens = tp/(tp+fn)
        spec = tn/(tn+fp)

        fpr, tpr, thresholds = roc_curve(split_labels, y_pred)
        # Named `auc_score` so the imported sklearn.metrics.auc is not shadowed.
        auc_score = roc_auc_score(split_labels, y_pred)

        evaluations["Fold"].append(i)
        evaluations["Train_Test"].append(split_name)
        evaluations["Accuracy"].append(acc)
        evaluations["Precision"].append(prec)
        evaluations["TPR"].append(tpr)
        evaluations["FPR"].append(fpr)
        evaluations["TPR_FPR_Thresholds"].append(thresholds)
        evaluations["AUC"].append(auc_score)
        evaluations["Sensitivity"].append(sens)
        evaluations["Specificity"].append(spec)
        evaluations["MCC"].append(mcc)


Train/Test model on Fold #0.

Train/Test model on Fold #1.

Train/Test model on Fold #2.

Train/Test model on Fold #3.

Train/Test model on Fold #4.


In [10]:
# Scalar metrics to average per split. They are selected *before* aggregating:
# the TPR / FPR / threshold columns hold arrays, and averaging such object
# columns can raise a TypeError in pandas >= 2.0.
metric_columns = ['Accuracy',
                  'Precision',
                  'AUC',
                  'Sensitivity',
                  'Specificity',
                  'MCC']

evaluations_df = pd.DataFrame.from_dict(evaluations)

evaluations_df_grouped = evaluations_df.groupby(["Train_Test"])[metric_columns].mean()

evaluations_df_grouped

Unnamed: 0_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Train_Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Test,0.594893,0.59779,0.645924,0.585243,0.604595,0.190268
Train,0.986461,0.989647,0.997166,0.983207,0.989714,0.972943


# Independent Testing

In [11]:
# The probability table is estimated from the full training set only and then
# reused to embed both the training and the independent sequences.
df_train_data_cond_probs = generate_conditional_probabilities(train_data)

train_features = embed_data_with_generated_probabilities(
    train_data, df_train_data_cond_probs
)
indpe_features = embed_data_with_generated_probabilities(
    indpe_data, df_train_data_cond_probs
)

train_labels = train_data['label'].to_numpy(copy=True)
indpe_labels = indpe_data['label'].to_numpy(copy=True)

In [12]:
# Final classifier, fitted on the embedding of the full training set.
# `fit` is left as the last expression so the cell displays the fitted estimator.
model = SVC(gamma='auto')
    
model.fit(train_features, train_labels)

SVC(gamma='auto')

In [13]:
## create the evaluation data structure for both datasets
evaluations = {
    "Train_Test" : [],
    "Accuracy" : [],
    "Precision": [],
    "TPR": [],
    "FPR": [],
    "TPR_FPR_Thresholds": [],
    "AUC": [],
    "Sensitivity": [],
    "Specificity": [],
    "MCC":[]
}

##################################################################################
##### Prediction and metrics for the Training and then the Independent dataset
##################################################################################

for split_name, split_features, split_labels in (
    ("Train", train_features, train_labels),
    ("Indpe", indpe_features, indpe_labels),
):
    y_pred = model.decision_function(split_features)
    label_pred = model.predict(split_features)

    # Compute accuracy, precision, sensitivity, specificity, mcc
    acc = accuracy_score(split_labels, label_pred)
    prec = precision_score(split_labels, label_pred)
    mcc = matthews_corrcoef(split_labels, label_pred)

    # labels=[0, 1] forces a 2x2 matrix even if a dataset (or the predictions)
    # contain a single class, so the unpacking cannot fail.
    conf = confusion_matrix(split_labels, label_pred, labels=[0, 1])
    tn, fp, fn, tp = conf.ravel()
    sens = tp/(tp+fn)
    spec = tn/(tn+fp)

    fpr, tpr, thresholds = roc_curve(split_labels, y_pred)
    # Named `auc_score` so the imported sklearn.metrics.auc is not shadowed.
    auc_score = roc_auc_score(split_labels, y_pred)

    evaluations["Train_Test"].append(split_name)
    evaluations["Accuracy"].append(acc)
    evaluations["Precision"].append(prec)
    evaluations["TPR"].append(tpr)
    evaluations["FPR"].append(fpr)
    evaluations["TPR_FPR_Thresholds"].append(thresholds)
    evaluations["AUC"].append(auc_score)
    evaluations["Sensitivity"].append(sens)
    evaluations["Specificity"].append(spec)
    evaluations["MCC"].append(mcc)

In [14]:
# Scalar metrics to report per dataset. They are selected *before* aggregating:
# the TPR / FPR / threshold columns hold arrays, and averaging such object
# columns can raise a TypeError in pandas >= 2.0.
metric_columns = ['Accuracy',
                  'Precision',
                  'AUC',
                  'Sensitivity',
                  'Specificity',
                  'MCC']

evaluations_df = pd.DataFrame.from_dict(evaluations)

evaluations_df_grouped = evaluations_df.groupby(["Train_Test"])[metric_columns].mean()

evaluations_df_grouped

Unnamed: 0_level_0,Accuracy,Precision,AUC,Sensitivity,Specificity,MCC
Train_Test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Indpe,0.608163,0.220202,0.597428,0.536946,0.622309,0.120671
Train,0.981108,0.984772,0.994848,0.97733,0.984887,0.962244


In [17]:
# Wrap the training embedding in a DataFrame (one integer-named column per feature).
train_d = pd.DataFrame(train_features)

In [19]:
# Append the binary training labels as an extra column next to the features.
train_d['label'] = train_labels

In [22]:
# Pairwise correlation matrix over all feature columns and the label column.
a = train_d.corr()

In [24]:
# Correlation of every feature column with the label.
a['label']

0        0.352211
1        0.349592
2        0.379839
3        0.340795
4        0.348639
5        0.351957
6        0.338883
7        0.358604
8        0.379461
9        0.366089
10       0.256825
11       0.272116
12       0.273379
13       0.275612
14       0.274003
15       0.281191
16       0.326355
17       0.331553
18       0.370885
19       0.123199
20       0.138376
21       0.353915
22       0.361890
23       0.353280
24       0.347158
25       0.340283
26       0.362160
27       0.359948
28       0.332383
29       0.344526
30       0.341981
31       0.352265
32       0.348190
33       0.377917
34       0.342668
35       0.337039
36       0.341035
37       0.346236
38       0.348597
39       0.350187
label    1.000000
Name: label, dtype: float64