In [1]:
##################################################################################
##### Define all parameters for model tuning
##################################################################################

n_fold = 5
expName = "PredNitro_Embedding_PredNTS"
outPath = "Results"
foldName = "folds.pickle"

epochs = 100
batch_size = 64
shuffle = True
seed = None

input_data_folder = "Data"
training_data_file = "Training-datasets-PredNTS.txt"
independent_data_file = "independent dataset-PredNTS.txt"

In [2]:
import os 
import pickle
import numpy as np
import pandas as pd

import math

import itertools

# Read PredNTS Training data

In [3]:
##################################################################################
##### read training file
##################################################################################
train_file_path = os.path.join(input_data_folder, training_data_file)
train_data = pd.read_csv(train_file_path, sep='\t', header=None)
train_data.columns = ['Sequence', 'name', 'id', 'flag', 'label_original', 'type']

# Create PredNitro probability embedding

In [4]:
train_data.head(5)

Unnamed: 0,Sequence,name,id,flag,label_original,type
0,EVPQLCSFILKTSQCTLKEVYGFNPEGKALLKKTKNSEEFA,sp|P04114|,103,Y,1,pssm1
1,AAMSRYELKLAIPEGKQVFLYPEKDEPTYILNIKRGIISAL,sp|P04114|,144,Y,1,pssm1
2,VAEAICKEQHLFLPFSYKNKYGMVAQVTQTLKLEDTPKINS,sp|P04114|,276,Y,1,pssm1
3,HILQWLKRVHANPLLIDVVTYLVALIPEPSAQQLREIFNMA,sp|P04114|,413,Y,1,pssm1
4,PSLDPASAKIEGNLIFDPNNYLPKESMLKTTLTAFGFASAD,sp|P04114|,666,Y,1,pssm1


In [9]:
all_char_set = set({})
for val in [set(val) for val in train_data['Sequence']]:
    all_char_set = all_char_set.union(val)
all_char_list = list(all_char_set)
all_char_list.sort()

pos_mid = 20
position_indexes_A_left = list(range(0,pos_mid))
position_indexes_B_left = [val+1 for val in position_indexes_A_left]

all_left_lists = [
   all_char_list,
   all_char_list,
   list(zip(position_indexes_A_left, position_indexes_B_left))
]
all_left_combinations = list(itertools.product(*all_left_lists))

df_left = pd.DataFrame(all_left_combinations, columns=['A', 'B', 'pos'])

position_indexes_A_right = list(range(pos_mid+1,(pos_mid*2)+1))
position_indexes_B_right = [val-1 for val in position_indexes_A_right]

all_right_lists = [
   all_char_list,
   all_char_list,
   list(zip(position_indexes_A_right, position_indexes_B_right))
]
all_right_combinations = list(itertools.product(*all_right_lists))

df_right = pd.DataFrame(all_right_combinations, columns=['A', 'B', 'pos'])

df_position_specific_conditional_probabilities = pd.concat((df_left, df_right), ignore_index=True)

df_position_specific_conditional_probabilities['prob'] = 0.0

In [10]:
df_position_specific_conditional_probabilities

Unnamed: 0,A,B,pos,prob
0,-,-,"(0, 1)",0.0
1,-,-,"(1, 2)",0.0
2,-,-,"(2, 3)",0.0
3,-,-,"(3, 4)",0.0
4,-,-,"(4, 5)",0.0
...,...,...,...,...
17635,Y,Y,"(36, 35)",0.0
17636,Y,Y,"(37, 36)",0.0
17637,Y,Y,"(38, 37)",0.0
17638,Y,Y,"(39, 38)",0.0


In [30]:
positive_train_data = np.array([list(val) for val in list(train_data['Sequence'][train_data['label_original'] == 1])])
negative_train_data = np.array([list(val) for val in list(train_data['Sequence'][train_data['label_original'] != 1])])

for i in range(df_position_specific_conditional_probabilities.shape[0]):
    charA = df_position_specific_conditional_probabilities['A'][i]
    charB = df_position_specific_conditional_probabilities['B'][i]
    posA, posB = df_position_specific_conditional_probabilities['pos'][i]
    
    df_filteredB_by_pos = df_position_specific_conditional_probabilities
    
    ## Generating the probability scores from positive data
    
    positive_prob = 0
    
    positive_d_charA = positive_train_data[:, posA]
    positive_d_charB = positive_train_data[:, posB]

    charB_indexes_in_posB = np.where(positive_d_charB == charB)[0]
    charA_indexes_in_posA_when_charB_in_posB = np.where(positive_d_charA[charB_indexes_in_posB] == charA)[0]

    if charB_indexes_in_posB.shape[0] > 0:
        positive_prob = charA_indexes_in_posA_when_charB_in_posB.shape[0] / charB_indexes_in_posB.shape[0]
    
    ## Generating the probability scores from negative data
    
    negative_prob = 0
    
    negative_d_charA = negative_train_data[:, posA]
    negative_d_charB = negative_train_data[:, posB]

    charB_indexes_in_posB = np.where(negative_d_charB == charB)[0]
    charA_indexes_in_posA_when_charB_in_posB = np.where(negative_d_charA[charB_indexes_in_posB] == charA)[0]

    if charB_indexes_in_posB.shape[0] > 0:
        negative_prob = charA_indexes_in_posA_when_charB_in_posB.shape[0] / charB_indexes_in_posB.shape[0]
    
    ## Appending the final probability difference
    
    df_position_specific_conditional_probabilities.loc[i, 'prob'] = positive_prob - negative_prob

In [31]:
df_position_specific_conditional_probabilities

Unnamed: 0,A,B,pos,prob
0,-,-,"(0, 1)",0.000000
1,-,-,"(1, 2)",0.000000
2,-,-,"(2, 3)",0.000000
3,-,-,"(3, 4)",0.000000
4,-,-,"(4, 5)",0.000000
...,...,...,...,...
17635,Y,Y,"(36, 35)",-0.039216
17636,Y,Y,"(37, 36)",0.078125
17637,Y,Y,"(38, 37)",-0.020054
17638,Y,Y,"(39, 38)",-0.019444


In [63]:
pos_mid = 20
prob_features = np.empty((0, 40))
for seq in train_data["Sequence"]:
    seq_prob_features = np.zeros((1, 40))
    for i in range(0, len(seq)):
        if i < pos_mid:
            feature_index = i
            posB = i+1
        elif i > pos_mid:
            feature_index = i-1
            posB = i-1
        else:
            continue
        posA = i
        
        charA = seq[posA]
        charB = seq[posB]
        
        prob = df_position_specific_conditional_probabilities[(df_position_specific_conditional_probabilities['A'] == charA) & 
                                                              (df_position_specific_conditional_probabilities['B'] == charB) & 
                                                              (df_position_specific_conditional_probabilities['pos'] == (posA, 
                                                                                                                         posB))
                                                             ]['prob'].values[0]
        
        seq_prob_features[0, feature_index] = prob
    
    prob_features = np.concatenate((prob_features, seq_prob_features))

In [65]:
prob_features[0]

array([ 0.00238095, -0.03769559, -0.06839623,  0.00361601, -0.03304348,
        0.025     ,  0.05767077, -0.01585082,  0.01832707, -0.10678325,
        0.07848673, -0.02621475,  0.04424277,  0.17647059,  0.01818182,
        0.02498298,  0.09863182,  0.02534113,  0.06492637,  0.00923594,
       -0.00419815, -0.05789474, -0.01072125, -0.02609971,  0.10901001,
       -0.06070334,  0.01607143,  0.0008547 , -0.01428571,  0.00080808,
        0.11644355,  0.04898447,  0.05445344,  0.06372549,  0.02272727,
        0.03225806,  0.02731183, -0.0582716 ,  0.02881563,  0.12727273])

In [67]:
np.savetxt("foo.csv", prob_features, delimiter=",")

In [51]:
charA, charB, posA, posB

('Y', 'Y', 40, 39)

In [60]:
df_position_specific_conditional_probabilities[(df_position_specific_conditional_probabilities['A'] == charA) & 
                                                              (df_position_specific_conditional_probabilities['B'] == charB) & 
                                                              (df_position_specific_conditional_probabilities['pos'] == (posA, 
                                                                                                                         posB))
                                                             ]['prob'].values[0]

0.01317829457364341

In [33]:
features.shape

(2382, 40)

In [34]:
len(seq)

41

In [45]:
a = np.empty((0,4))
b = np.ones((5,4))

In [46]:
a.shape

(0, 4)

In [47]:
b.shape

(5, 4)

In [49]:
np.concatenate((a,b)).shape

(5, 4)

In [27]:
charB_indexes_in_posB.shape[0]

55

In [23]:
pos_d_charA

array(['E', 'A', 'V', ..., 'S', 'D', 'S'], dtype='<U1')

In [15]:
pos_d_charB

array(['V', 'A', 'A', ..., 'E', 'C', 'V'], dtype='<U1')

In [20]:
charB_indexes_in_posB

array([  18,   21,   48,   88,   97,  122,  128,  129,  130,  136,  160,
        172,  176,  207,  222,  250,  353,  367,  377,  417,  436,  454,
        487,  492,  525,  540,  557,  562,  565,  580,  617,  636,  654,
        670,  671,  695,  789,  791,  818,  835,  888,  906,  917,  951,
        976,  985, 1010, 1022, 1057, 1064, 1069, 1104, 1126, 1155, 1162],
      dtype=int64)

In [22]:
np.where(pos_d_charA[charB_indexes_in_posB] == charA)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
        51, 52, 53, 54], dtype=int64),)

In [None]:
pos_train_data = np.array([list(val) for val in list(train_data['Sequence'][train_data['label_original'] == 1])])
neg_train_data = np.array([list(val) for val in list(train_data['Sequence'][train_data['label_original'] != 1])])

In [None]:
pos_train_data

In [16]:
pos_train_data[:, 0:2]

array([['E', 'V'],
       ['A', 'A'],
       ['V', 'A'],
       ...,
       ['S', 'E'],
       ['D', 'C'],
       ['S', 'V']], dtype='<U1')

# Read PredNTS Independent data

In [None]:
##################################################################################
##### read independent data file
##################################################################################
indpe_file_path = os.path.join(input_data_folder, independent_data_file)
indpe_data = pd.read_csv(indpe_file_path, sep='\t', header=None)
indpe_data.columns = ['Sequence', 'name', 'id', 'flag', 'label_original', 'type']
indpe_data.head()
    
##################################################################################
##### Create OHE of sequence
##################################################################################
indpe_data['OHE_Sequence'] = pd.Series([one_hot_encode_nt(val, all_char_dict) 
                                        for val in indpe_data["Sequence"]])

##################################################################################
##### Fix the labels
##################################################################################
indpe_data['label'] = pd.Series([1 if val == 1 else 0 
                                 for val in indpe_data["label_original"]])

##################################################################################
##### Extract features and labels, create folds
##################################################################################

indpe_features = np.array(list(indpe_data['OHE_Sequence']))
indpe_labels = np.array(list(indpe_data['label']))
indpe_labels = indpe_labels.reshape((indpe_labels.shape[0], 1))

input_seq_shape = indpe_features[0].shape