**Step 1: Importing the Dataset**

The following code cells will import necessary libraries and import the dataset from the repository as a Pandas DataFrame

In [2]:
import sys
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
import tensorflow as tf

# device_name = tf.test.gpu_device_name()
# if device_name != '/device:GPU:0':
#   raise SystemError('GPU device not found')
# print('Found GPU at: {}'.format(device_name))

In [7]:
import pandas as pd
import numpy as np
import os

import joblib

from keras import utils
from sklearn.model_selection import train_test_split, GridSearchCV
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv1D, MaxPooling1D, MaxPooling2D, Conv2D, LSTM, GRU, Bidirectional, Attention
from keras import regularizers
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.wrappers.scikit_learn import KerasClassifier
import keras

**Step 2: Preprocessing the Dataset**

The data is not in a usable form; as a result, we will need to process it before using it to train our algorithms.

In [None]:
df = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/NonPromoterSequence.txt', sep = '>', )
df.dropna(subset=['Unnamed: 0'], how='all', inplace=True)
df.reset_index(inplace = True)
df.drop(['EP 1 (+) mt:CoI_1; range -400 to -100.', 'index'], axis = 1, inplace=True) #data cleaning after error found
df.rename(columns={'Unnamed: 0': "sequence"}, inplace = True)
df['label'] = 0

df2 = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/PromoterSequence.txt', sep = '>', )
df2.dropna(subset=['Unnamed: 0'], how='all', inplace=True)
df2.reset_index(inplace = True)
df2.drop(['EP 1 (+) mt:CoI_1; range -100 to 200.', 'index'], axis = 1, inplace=True)
df2.rename(columns={'Unnamed: 0': "sequence"}, inplace = True)
df2['label'] = 1

df = pd.concat([df, df2], axis = 0 )
df.shape

(22600, 2)

In [None]:
df.drop([1822], inplace = True)

for seq in df['sequence']:
    if 'N' in seq:
        display(df.loc[df['sequence'] == seq])

sequence = list(df.loc[:, 'sequence'])
encoded_list = []

In [None]:
def encode_seq(s):
    Encode = {'A':[1,0,0,0],'T':[0,1,0,0],'C':[0,0,1,0],'G':[0,0,0,1]}
    return [Encode[x] for x in s]

for i in sequence:
    x = encode_seq(i)
    encoded_list.append(x)

X = np.array(encoded_list)
X.shape

(22598, 301, 4)

In [None]:
y = df['label']
y.shape
X.shape

(22598, 301, 4)

**Step 3: Training and Testing Neural Networks**

Now that we have preprocessed the data and built our training and testing datasets, we can start to deploy different convultional neural network architectures. It's relatively easy to test multiple models using gridsearch; as a result, we will compare and contrast the perforance using GridSearchCV over many values.

In [None]:
X_train, X_not_train, y_train, y_not_train = train_test_split(X, y, test_size = 0.1, random_state = 42, stratify = y)
y_train = utils.to_categorical(y_train)

X_val, X_test, y_val, y_test = train_test_split(X_not_train, y_not_train, random_state = 42, stratify = y_not_train, test_size=0.5)

y_val = utils.to_categorical(y_val)
y_test = utils.to_categorical(y_test)

X_train = tf.cast(X_train, dtype='float32')
X_val = tf.cast(X_val, dtype='float32')
X_test = tf.cast(X_test, dtype='float32')

In [None]:
params = {
    'first_node': [128, 64],
    'second_node': [32, 64],
    'alpha': [0.001, 0.01],
    'first_filter': [9, 16, 32], 
    'dropout': [0.1, 0.2, 0.5]
}
#used for GridSearchCV

In [None]:
gru_model = Sequential()

gru_model.add(Conv1D(filters = 27, kernel_size = (4), activation = 'relu', input_shape = (301, 4)))
gru_model.add(MaxPooling1D(pool_size= (3)))
gru_model.add(Dropout(0.2))

gru_model.add(Conv1D(filters = 14, kernel_size = (2), activation = 'relu', padding = 'same'))
#cnn_model.add(MaxPooling1D(pool_size= (1)))
#cnn_model.add(Dropout(0.2))



gru_model.add(Bidirectional(GRU(128, activation = 'relu')))
gru_model.add(Dropout(0.2))
gru_model.add(Dense(128, activation = 'relu'))
gru_model.add(Dense(64, activation = 'relu'))
gru_model.add(Dense(64, activation = 'relu'))
gru_model.add(Dense(16, activation = 'relu', kernel_regularizer = regularizers.l2(0.01)))
gru_model.add(Dense(2, activation = 'sigmoid'))

gru_model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

early_stop = keras.callbacks.EarlyStopping(monitor = 'val_accuracy', min_delta = 0.0005, patience=8, 
                                           restore_best_weights=True )
history = gru_model.fit(X_train, y_train, batch_size = 128, validation_data=(X_val, y_val), 
                        epochs=115)

Epoch 1/115
Epoch 2/115
Epoch 3/115
Epoch 4/115
Epoch 5/115
Epoch 6/115
Epoch 7/115
Epoch 8/115
Epoch 9/115
Epoch 10/115
Epoch 11/115
Epoch 12/115
Epoch 13/115
Epoch 14/115
Epoch 15/115
Epoch 16/115
Epoch 17/115
Epoch 18/115
Epoch 19/115
Epoch 20/115
Epoch 21/115
Epoch 22/115
Epoch 23/115
Epoch 24/115
Epoch 25/115
Epoch 26/115
Epoch 27/115
Epoch 28/115
Epoch 29/115
Epoch 30/115
Epoch 31/115
Epoch 32/115
Epoch 33/115
Epoch 34/115
Epoch 35/115
Epoch 36/115
Epoch 37/115
Epoch 38/115
Epoch 39/115
Epoch 40/115
Epoch 41/115
Epoch 42/115
Epoch 43/115
Epoch 44/115
Epoch 45/115
Epoch 46/115
Epoch 47/115
Epoch 48/115
Epoch 49/115
Epoch 50/115
Epoch 51/115
Epoch 52/115
Epoch 53/115
Epoch 54/115
Epoch 55/115
Epoch 56/115
Epoch 57/115
Epoch 58/115
Epoch 59/115
Epoch 60/115
Epoch 61/115
Epoch 62/115
Epoch 63/115
Epoch 64/115
Epoch 65/115
Epoch 66/115
Epoch 67/115
Epoch 68/115
Epoch 69/115
Epoch 70/115
Epoch 71/115
Epoch 72/115
Epoch 73/115
Epoch 74/115
Epoch 75/115
Epoch 76/115
Epoch 77/115
Epoch 78

In [None]:
# save model
gru_model.save('/content/gdrive/MyDrive/Colab Notebooks/CNN_GRU_model')

In [8]:
# load model
gru_model = keras.models.load_model('/content/gdrive/MyDrive/Colab Notebooks/CNN_GRU_model')

In [None]:
pred = gru_model.predict

In [None]:
preds = gru_model.predict(X_test)



In [None]:
preds
len(preds)

1130

In [None]:
np.savetxt("test_predictions.txt", preds, delimiter=",")

In [None]:
# get accuracy
# positive: is promoter
tn = 0
fn = 0
fp = 0
tp = 0

for i in range(len(preds)):
  if preds[i][0] >= preds[i][1]: # non-promoter
    preds[i][0] = 1
    preds[i][1] = 0
  else:                          # promoter
    preds[i][0] = 0
    preds[i][1] = 1

for i in range(len(preds)):
  if(preds[i][0] > preds[i][1]): # non-promoter
    if y_test[i][0] == 1:
      tn += 1
    else:
      fn += 1
  else:                          # promoter
    if y_test[i][1] == 1:
      tp += 1
    else:
      fp += 1

precision = tp / (tp + fp)
recall = tp / (tp + fn)
accuracy = (tp + tn) / (tp + fn + fp + tn)
print(precision, recall, accuracy)
print(tp, fn, fp, tn)

0.8817005545286506 0.8442477876106195 0.8654867256637168
477 88 64 501


In [9]:
print(gru_model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 298, 27)           459       
                                                                 
 max_pooling1d (MaxPooling1D  (None, 99, 27)           0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 99, 27)            0         
                                                                 
 conv1d_1 (Conv1D)           (None, 99, 14)            770       
                                                                 
 bidirectional (Bidirectiona  (None, 256)              110592    
 l)                                                              
                                                                 
 dropout_1 (Dropout)         (None, 256)               0

**Initial Gridsearch on CNN model as backup.**

The models is saved in this file.

Best Parameters are L2 alpha of 0.01, Dropout of 0.02, First filter: 32, First node: 64, Second node: 64.

In [None]:
'''
def model_func(first_node, second_node, alpha, first_filter, dropout):

    model = Sequential()
    model.add(Conv2D(filters = first_filter,       
                    kernel_size = (3,3),    
                    activation = 'relu',    
                  input_shape = (301, 4, 1)))
    model.add(MaxPooling2D(pool_size= (2,2)))
    model.add(Dropout(dropout))

    model.add(Conv2D(filters = 15,  
                     kernel_size = (2,2),
                    activation = 'relu',
                    padding = 'same'))
    model.add(MaxPooling2D(pool_size= (1,1)))
    model.add(Dropout(dropout))


    model.add(Flatten())
    model.add(Dense(first_node, activation = 'relu', kernel_regularizer = regularizers.l2(alpha)))
    model.add(Dense(second_node, activation = 'relu'))
    model.add(Dense(32, activation = 'relu'))
    model.add(Dense(16, activation = 'relu'))
    model.add(Dense(2, activation = 'sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    
    return model


early_stop = keras.callbacks.EarlyStopping(monitor = 'loss', min_delta = 0, patience=5, 
                                           restore_best_weights=True )


nn = KerasClassifier(build_fn = model_func, batch_size = 512, epochs = 1)

gs = GridSearchCV(nn, param_grid = params, cv = 2)

gs.fit(X_train, y_train, callbacks = [early_stop])

cnn_model_saved = cnn_model.save('CNN_best_model.h5')
'''