**Step 1: Importing the Dataset**

The following code cells import the necessary libraries and load the dataset from the repository as a pandas DataFrame.

In [None]:
!pip install keras-self-attention
import pandas as pd
import numpy as np
import os

import joblib

from keras import utils
from sklearn.model_selection import train_test_split, GridSearchCV
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv1D, MaxPooling1D, MaxPooling2D, Conv2D, LSTM, GRU, Bidirectional, Attention
from keras_self_attention import SeqSelfAttention
from keras import regularizers
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.wrappers.scikit_learn import KerasClassifier
import keras

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Mount Google Drive so the project folder is reachable from this runtime.
from google.colab import drive
drive.mount('/content/gdrive')
# Switch into the project folder; the relative './...' paths used by later cells resolve against it.
%cd /content/gdrive/MyDrive/nlp_project
# List the folder contents as a quick check of which files are present.
!ls

Mounted at /content/gdrive
/content/gdrive/MyDrive/nlp_project
gru_model_def		       PromoterSequence_test.txt   test_predictions.txt
NonPromoterSequence_test.txt   PromoterSequence_train.txt  TestSequence.txt
NonPromoterSequence_train.txt  PromoterSequence.txt
NonPromoterSequence.txt        test_predictions.csv


**Step 2: Preprocessing the Dataset**

The data is not in a usable form; as a result, we will need to process it before using it to train our algorithms.

In [None]:
# Load the non-promoter training sequences (label 0).
# The file is split on '>', which yields two columns: 'Unnamed: 0' and one named after the
# first header line. Rows whose 'Unnamed: 0' is missing are discarded (presumably the
# header lines of the file -- TODO confirm), and the header-text column is removed
# (data cleaning after error found).
df = (
    pd.read_csv('./NonPromoterSequence_train.txt', sep='>')
    .dropna(subset=['Unnamed: 0'], how='all')
    .reset_index(drop=True)
    .drop(columns=['EP 1 (+) mt:CoI_1; range -400 to -100.'])
    .rename(columns={'Unnamed: 0': 'sequence'})
    .assign(label=0)
)
display(df)
display(df.shape)

Unnamed: 0,sequence,label
0,TAATTACATTATTTTTTTATTTACGAATTTGTTATTCCGCTTTTAT...,0
1,ATTTTTACAAGAACAAGACATTTAACTTTAACTTTATCTTTAGCTT...,0
2,AGAGATAGGTGGGTCTGTAACACTCGAATCAAAAACAATATTAAGA...,0
3,TATGTATATAGAGATAGGCGTTGCCAATAACTTTTGCGTTTTTTGC...,0
4,AGAAATAATAGCTAGAGCAAAAAACAGCTTAGAACGGCTGATGCTC...,0
...,...,...
10995,ATTGCTCTGGTTCCATATGAAATTTGTGTAAAAGTTTATGTTATTT...,0
10996,GTCACTGATCACGAGCATCTGTCTTTAGTGGCGACTGTCTGCCAGT...,0
10997,TTTTCAAGGATGTTGGGAGCTGCCATCCTGACTGTTGGGGATGAGT...,0
10998,ACGTGGTTTGGATCACACGCAATATGTTCTATTCAACCGAACAAAT...,0


(11000, 2)

In [None]:
# Load the promoter training sequences (label 1) with the same clean-up steps as the
# non-promoter file: split on '>', drop rows without a value in 'Unnamed: 0', remove the
# column named after the first header line, and rename the remaining column to 'sequence'.
df2 = (
    pd.read_csv('./PromoterSequence_train.txt', sep='>')
    .dropna(subset=['Unnamed: 0'], how='all')
    .reset_index(drop=True)
    .drop(columns=['EP 1 (+) mt:CoI_1; range -100 to 200.'])
    .rename(columns={'Unnamed: 0': 'sequence'})
    .assign(label=1)
)

display(df2)
display(df2.shape)

Unnamed: 0,sequence,label
0,TTAATTTGTCCTTATTTGATTAAGAAGAATAAATCTTATATATAGA...,1
1,ATAGCTCAAATTGCTTTATTAGTATTAGAATCAGCTGTAGCTATAA...,1
2,AAGCTTCCCTTTAATGTGCTCCTTGTGAATACAGCATTACAATGCC...,1
3,TATGTAGAATCTGTACAAGTATCTGTGTTTGGACAATGGCATGTGT...,1
4,ACATATTACTGCATACAGGTCTCAAATTATAAAATGACACTCGTGG...,1
...,...,...
10995,TCAAATGAGCTATCTTAACGGGAGTTTGAATGATTCTCCTCTGTTG...,1
10996,TCCCATCCCATCCGATCCGATCCGTTCCGATTTTGCCTTATCAATC...,1
10997,GAATAGCTTTACCGTTTGAAAGCGTCGCCATTCCTGTGCAGCTTTT...,1
10998,AGAGAGTCGAAGAAAATGGTAAAAGCAAACAGTGGTAAGCCGCCGC...,1


(11000, 2)

In [None]:
# Stack the non-promoter and promoter frames into a single training frame.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it both halves keep
# their own 0..10999 labels, so every label occurs twice and any later label-based
# operation (e.g. dropping a row by its index label) silently hits two rows.
df = pd.concat([df, df2], axis=0, ignore_index=True)
df.shape

(22000, 2)

In [None]:
# Look for sequences containing the character 'N' (the one-hot encoder has no entry
# for it) and show every row that holds such a sequence.
for candidate in df['sequence']:
    if 'N' not in candidate:
        continue
    display(df[df['sequence'] == candidate])

Unnamed: 0,sequence,label
1822,NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGAATTC...,0


In [None]:
# Remove every row whose sequence contains 'N'.
# Rows are selected by content rather than by the hard-coded index label 1822: a label-based
# drop removes *all* rows carrying that label, which discards an unrelated sample whenever
# index labels are not unique, and it breaks as soon as the input files change.
# The index is rebuilt afterwards so that labels are unique and contiguous.
df = df[~df['sequence'].str.contains('N', regex=False)].reset_index(drop=True)

In [None]:
# Repeat the 'N' check; the cell produces no output when no such sequence is left.
remaining_with_n = [seq for seq in df['sequence'] if 'N' in seq]
for seq in remaining_with_n:
    display(df.loc[df['sequence'] == seq])

In [None]:
# Pull the raw sequence strings out of the frame and start an empty container
# for their encoded counterparts.
sequence = df['sequence'].tolist()
encoded_list = list()

In [None]:
# One-hot code for each supported nucleotide, in the column order A, T, C, G.
# Built once at definition time instead of on every call.
_ONE_HOT = {'A': [1, 0, 0, 0], 'T': [0, 1, 0, 0], 'C': [0, 0, 1, 0], 'G': [0, 0, 0, 1]}


def encode_seq(s):
    """One-hot encode a nucleotide sequence.

    Args:
        s: iterable of single characters, each one of 'A', 'T', 'C', 'G'.

    Returns:
        A list with one 4-element list per character of ``s`` (empty for an
        empty input). The inner lists are shared lookup entries and must be
        treated as read-only.

    Raises:
        KeyError: if ``s`` contains a character other than A, T, C or G
            (for example 'N' or a lower-case letter).
    """
    try:
        return [_ONE_HOT[x] for x in s]
    except KeyError as err:
        # Same exception type as a plain dict lookup, but with an actionable message.
        raise KeyError(
            f"unsupported nucleotide {err.args[0]!r}; expected one of A, T, C, G"
        ) from None

# Encode every training sequence.
# The list is rebuilt from scratch here rather than appended to a list created in
# another cell, so running this cell more than once cannot duplicate the samples.
encoded_list = [encode_seq(seq) for seq in sequence]

# Stack into a single array; sequences of equal length give a 3-D array
# (samples, sequence length, 4).
X = np.array(encoded_list)
X.shape

(21998, 301, 4)

In [None]:
# Target vector: the 0/1 label column of the training frame.
y = df.loc[:, 'label']
y.shape

(21998,)

In [None]:
# Re-display the shape of the encoded feature array before it is split.
X.shape

(21998, 301, 4)

**Step 3: Training and Testing Neural Networks**

Now that we have preprocessed the data and built our training and testing datasets, we can start to deploy different convolutional neural network architectures. It's relatively easy to test multiple models using grid search; as a result, we will compare and contrast the performance using GridSearchCV over many values.

In [None]:
# Stratified split with a fixed seed; no test_size is given, so scikit-learn's
# default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [None]:
# One-hot encode the label vectors for the two-unit output layer.
# The ndim guard makes the cell safe to re-run: applying to_categorical to labels that
# are already one-hot would add another axis and no longer match the model output.
if np.ndim(y_train) == 1:
    y_train = utils.to_categorical(y_train)
if np.ndim(y_test) == 1:
    y_test = utils.to_categorical(y_test)

In [None]:
# Hyper-parameter grid used for GridSearchCV.
params = dict(
    first_node=[128, 64],
    second_node=[32, 64],
    alpha=[0.001, 0.01],
    first_filter=[9, 16, 32],
    dropout=[0.1, 0.2, 0.5],
)

In [None]:
# Conv1D feature extractor -> bidirectional GRU -> dense classifier head.
gru_model = Sequential()

# First convolution block; the input is a (301, 4) one-hot encoded sequence.
gru_model.add(Conv1D(filters = 27, kernel_size = (4), activation = 'relu', input_shape = (301, 4)))
gru_model.add(MaxPooling1D(pool_size= (3)))
gru_model.add(Dropout(0.2))

# Second convolution; 'same' padding keeps the sequence length unchanged.
gru_model.add(Conv1D(filters = 14, kernel_size = (2), activation = 'relu', padding = 'same'))

# Recurrent layer followed by the fully connected head.
gru_model.add(Bidirectional(GRU(128, activation = 'relu')))
gru_model.add(Dropout(0.2))
gru_model.add(Dense(128, activation = 'relu'))
gru_model.add(Dense(64, activation = 'relu'))
gru_model.add(Dense(64, activation = 'relu'))
gru_model.add(Dense(16, activation = 'relu', kernel_regularizer = regularizers.l2(0.01)))
# Two output units, matching the one-hot encoded labels.
gru_model.add(Dense(2, activation = 'sigmoid'))

gru_model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Stop once validation accuracy has not improved by at least 0.0005 for 8 epochs and
# roll back to the best weights seen.
early_stop = keras.callbacks.EarlyStopping(monitor = 'val_accuracy', min_delta = 0.0005, patience=8,
                                           restore_best_weights=True )
# The callback only takes effect when it is handed to fit(); without it training always
# runs the full 115 epochs and the best weights are never restored.
history = gru_model.fit(X_train, y_train, batch_size = 128, validation_data=(X_test, y_test),
                        epochs=115, callbacks=[early_stop])
gru_model.save('./gru_model')

ValueError: ignored

In [None]:
# Load the model stored at './gru_model' (the path the training cell saves to) and
# rebind gru_model to the loaded instance.
gru_model = keras.models.load_model('./gru_model')

In [None]:
# NOTE(review): this binds the `predict` method itself to `pred` without calling it, so no
# predictions are computed here; `pred` is not used by any visible cell -- confirm whether
# a call such as gru_model.predict(...) was intended.
pred = gru_model.predict

In [None]:
# Load the non-promoter test sequences (label 0): split on '>', discard rows without a
# value in 'Unnamed: 0', remove the column named after the first header line
# (data cleaning after error found) and rename the remaining column to 'sequence'.
df1 = (
    pd.read_csv('./NonPromoterSequence_test.txt', sep='>')
    .dropna(subset=['Unnamed: 0'], how='all')
    .reset_index(drop=True)
    .drop(columns=['EP 1 (+) mt:CoI_1; range -400 to -100.'])
    .rename(columns={'Unnamed: 0': 'sequence'})
    .assign(label=0)
)
display(df1)
display(df1.shape)

Unnamed: 0,sequence,label
0,CGGATGGTAAATATCGAACAAAAGTCGGCTTGCATTTATTTCGATA...,0
1,GATATTTACCATCCGAATCGTTTAGAAAAAACGTTGATATTTTCCA...,0
2,TAAATCTCTACCTGTGCCTTTGTGGTCAAAGGGTCTACATTGTGTC...,0
3,TAGCCAGAATAACAACGGCCTGTCAATAAAATGCGATGAGTAAATA...,0
4,AGAACGACGACAGCCTGTGAACAAATTAACTTGATTAGTTTAATCC...,0
...,...,...
295,TGGTAAAAAATTGTACACCTAACTAGTGCCTTCATGTATACCACCA...,0
296,AGTGCAACTGGAGCCGTGCCGTGACCCACAGAGATCGCCCACTCGA...,0
297,GCATGGATTTCATATTATCTTAATCGACTTGCTTTTATAAAATAGG...,0
298,GTGACCAGGTTTTGCTCTAATGCGAAGTACGGATTGGGTAGAGATA...,0


(300, 2)

In [None]:
# Load the promoter test sequences (label 1) with the same clean-up steps as the
# non-promoter test file (data cleaning after error found).
# Note that this rebinds df2, which earlier held the promoter training frame.
df2 = (
    pd.read_csv('./PromoterSequence_test.txt', sep='>')
    .dropna(subset=['Unnamed: 0'], how='all')
    .reset_index(drop=True)
    .drop(columns=['EP 1 (+) mt:CoI_1; range -400 to -100.'])
    .rename(columns={'Unnamed: 0': 'sequence'})
    .assign(label=1)
)
display(df2)
display(df2.shape)

Unnamed: 0,sequence,label
0,AGTTTAGACTTCACATTTTGTAGAAGTGTTTAAACATTAGCGTTAA...,1
1,CATCAGACCAATTTATTTCGGAAGATCAGTGTGAATGCTTCTTTGC...,1
2,TAATAATGGCCCAATGGGTTGCGGCGCACTTGGAATATTAAGTAAC...,1
3,GAGACTGTCACCGCATTTAAATGAACTGAATGAGTGCGATCATGAG...,1
4,TGTGGAATAACTCTTGGTGGCCACGGAATGCGATGGCCTGAGGTCA...,1
...,...,...
295,CGACAAAGTTTGATCCATGTGCATTCTTGGCGCCTTATCGATAGCT...,1
296,CATATCTACATCTCGCTTGCTCCTTCCCTTTCGCTGCGTGTGTGTG...,1
297,ATACCGCGGAAGCGCAAAAGTACCAGAATTTCCCTGGTATCGCGCT...,1
298,ATTATTCCGAATTCTTTTATCAGATTTAAATATGGGAAACACTTTA...,1


(300, 2)

In [None]:
# Combine both test frames into one frame with a fresh 0..n-1 index
# (non-promoter rows first, promoter rows second). This rebinds df.
df = pd.concat([df1, df2], ignore_index=True)
display(df)
display(df.shape)

Unnamed: 0,sequence,label
0,CGGATGGTAAATATCGAACAAAAGTCGGCTTGCATTTATTTCGATA...,0
1,GATATTTACCATCCGAATCGTTTAGAAAAAACGTTGATATTTTCCA...,0
2,TAAATCTCTACCTGTGCCTTTGTGGTCAAAGGGTCTACATTGTGTC...,0
3,TAGCCAGAATAACAACGGCCTGTCAATAAAATGCGATGAGTAAATA...,0
4,AGAACGACGACAGCCTGTGAACAAATTAACTTGATTAGTTTAATCC...,0
...,...,...
595,CGACAAAGTTTGATCCATGTGCATTCTTGGCGCCTTATCGATAGCT...,1
596,CATATCTACATCTCGCTTGCTCCTTCCCTTTCGCTGCGTGTGTGTG...,1
597,ATACCGCGGAAGCGCAAAAGTACCAGAATTTCCCTGGTATCGCGCT...,1
598,ATTATTCCGAATTCTTTTATCAGATTTAAATATGGGAAACACTTTA...,1


(600, 2)

In [None]:
# Pull the test sequence strings out of the frame and reset the container
# that receives their encoded form.
sequence = df['sequence'].tolist()
encoded_list = list()

In [None]:
# Encode every test sequence.
# The list is rebuilt from scratch here rather than appended to a list reset in a
# different cell, so running this cell more than once cannot duplicate the samples.
encoded_list = [encode_seq(seq) for seq in sequence]

X_test_test = np.array(encoded_list)
X_test_test.shape

(600, 301, 4)

In [None]:
# Run the loaded model on the encoded sequences from the separate test files.
preds = gru_model.predict(X_test_test)



In [None]:
# Show the raw predictions: one row per test sequence with two scores. The evaluation
# cell treats column 0 as the non-promoter score and column 1 as the promoter score.
preds

array([[9.8194546e-01, 1.6660649e-02],
       [8.6175746e-01, 1.3427529e-01],
       [9.5504349e-01, 4.2450391e-02],
       ...,
       [9.8398160e-03, 9.8858863e-01],
       [6.1660488e-05, 9.9991739e-01],
       [9.1858763e-01, 7.8081697e-02]], dtype=float32)

In [None]:
# Number of prediction rows; should equal the number of test sequences.
len(preds)

600

In [None]:
# Write the raw prediction matrix to a comma-separated text file in the working directory.
np.savetxt("test_predictions.txt", preds, delimiter=",")

In [None]:
# get accuracy
tp = 0
tn = 0
fn = 0
fp = 0

for i in range(len(preds)):
  if(preds[i][0] > preds[i][1]): # non-promoter
    if df.loc[i]['label'] == 0:
      tn += 1
    else:
      fn += 1
  else:
    if df.loc[i]['label'] == 0:
      fp += 1
    else:
      tp += 1

precision = tp / (tp + fp)
recall = tp / (tp + fn)
accuracy = (tp + tn) / (tp + fn + fp + tn)
print(precision, recall, accuracy)
print(tp, fn, fp, tn)

0.936026936026936 0.9266666666666666 0.9316666666666666
278 22 19 281


**Initial Gridsearch on CNN model as backup.**

The model is saved in this file.

Best parameters are an L2 alpha of 0.01, dropout of 0.2, first filter: 32, first node: 64, second node: 64.

In [None]:
# Disabled grid search over a Conv2D model: the whole cell is a bare string literal, so
# none of the code inside it runs.
# NOTE(review): before re-enabling, note that the last line calls `cnn_model.save(...)`
# but no `cnn_model` is defined in the visible notebook, and that the Conv2D input shape
# (301, 4, 1) has one more axis than the (301, 4) samples built earlier.
'''
def model_func(first_node, second_node, alpha, first_filter, dropout):

    model = Sequential()
    model.add(Conv2D(filters = first_filter,       
                    kernel_size = (3,3),    
                    activation = 'relu',    
                  input_shape = (301, 4, 1)))
    model.add(MaxPooling2D(pool_size= (2,2)))
    model.add(Dropout(dropout))

    model.add(Conv2D(filters = 15,  
                     kernel_size = (2,2),
                    activation = 'relu',
                    padding = 'same'))
    model.add(MaxPooling2D(pool_size= (1,1)))
    model.add(Dropout(dropout))


    model.add(Flatten())
    model.add(Dense(first_node, activation = 'relu', kernel_regularizer = regularizers.l2(alpha)))
    model.add(Dense(second_node, activation = 'relu'))
    model.add(Dense(32, activation = 'relu'))
    model.add(Dense(16, activation = 'relu'))
    model.add(Dense(2, activation = 'sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    
    return model


early_stop = keras.callbacks.EarlyStopping(monitor = 'loss', min_delta = 0, patience=5, 
                                           restore_best_weights=True )


nn = KerasClassifier(build_fn = model_func, batch_size = 512, epochs = 1)

gs = GridSearchCV(nn, param_grid = params, cv = 2)

gs.fit(X_train, y_train, callbacks = [early_stop])

cnn_model_saved = cnn_model.save('CNN_best_model.h5')
'''