In [1]:
import numpy as np
import pandas as pd
import sklearn

from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
import keras

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold, cross_val_score

# Parameters

In [2]:
train_split_size = 0.8

# Data

* Download Data
* Pre-process data for feeding into ML models



In [3]:
######################################################################################
##### DATA fetching #####
######################################################################################

DNA_data_link = 'https://archive.ics.uci.edu/ml/machine-learning-databases/molecular-biology/promoter-gene-sequences/promoters.data'
ecoli_dna_data_original = pd.read_csv(DNA_data_link)
ecoli_dna_data_original.columns = ['Class', 'id', 'Sequence']

In [4]:
######################################################################################
##### Preparing data for ML models by breaking sequence and using dummy variables
######################################################################################

ecoli_dna_data_original["Sequence_edited"] = ecoli_dna_data_original["Sequence"].replace({"\t":""}, regex = True)

ecoli_dna_data_broken_sequence_class = pd.DataFrame()
for char in range(np.unique(ecoli_dna_data_original["Sequence_edited"].apply(len))[0]):
  ecoli_dna_data_broken_sequence_class["Sequence_"+str(char)] = ecoli_dna_data_original['Sequence_edited'].astype(str).str[char]
ecoli_dna_data_broken_sequence_class["Class"] = ecoli_dna_data_original["Class"]

ecoli_dna_data_broken_sequence_class_dummified = pd.get_dummies(ecoli_dna_data_broken_sequence_class)

In [5]:
######################################################################################
##### Splitting the data into Test/Train sets
######################################################################################

x_train, x_test, y_train, y_test = train_test_split(ecoli_dna_data_broken_sequence_class_dummified.iloc[:, 0:ecoli_dna_data_broken_sequence_class_dummified.shape[1]-2], 
                                                    ecoli_dna_data_broken_sequence_class_dummified.iloc[:, ecoli_dna_data_broken_sequence_class_dummified.shape[1]-2:ecoli_dna_data_broken_sequence_class_dummified.shape[1]], 
                                                    train_size = train_split_size)

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(84, 228)
(84, 2)
(21, 228)
(21, 2)


# Machine Learning Models

### Support Vector Machines

In [6]:
######################################################################################
##### Check for generalization of model with 5-fold cross-validation
######################################################################################

cv_results = cross_val_score(svm.LinearSVC(random_state=1), x_train, y_train.iloc[:,0], cv = KFold(n_splits = 5), scoring = 'accuracy')
print(np.mean(cv_results))
cv_results

## After multiple different combinations of parameters, found that there was no significanct improvement in training accuracy

0.9286764705882353


array([1.        , 0.88235294, 0.88235294, 0.94117647, 0.9375    ])

In [7]:
######################################################################################
##### Training SVM model
######################################################################################

svm_model = svm.LinearSVC(random_state=1)
svm_model.fit(x_train, y_train.iloc[:,0])

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=1, tol=0.0001,
          verbose=0)

In [8]:
######################################################################################
##### Prediction performance evaluation for SVM model - TRAIN data
######################################################################################

svm_train_prediction =  svm_model.predict(x_train)
svm_train_pred_eval = pd.DataFrame(classification_report(y_train.iloc[:,0], svm_train_prediction, output_dict=True)).transpose()
svm_train_pred_eval["Model"] = "Linear Support Vector Machine"
svm_train_pred_eval["Data"] = "Train"
svm_train_pred_eval.index.name = "Details"
svm_train_pred_eval.reset_index(inplace=True)

In [9]:
######################################################################################
##### Prediction performance evaluation for SVM model - TEST data
######################################################################################

svm_test_prediction =  svm_model.predict(x_test)
svm_test_pred_eval = pd.DataFrame(classification_report(y_test.iloc[:,0], svm_test_prediction, output_dict=True)).transpose()
svm_test_pred_eval["Model"] = "Linear Support Vector Machine"
svm_test_pred_eval["Data"] = "Test"
svm_test_pred_eval.index.name = "Details"
svm_test_pred_eval.reset_index(inplace=True)

### Random Forest Classifier

In [10]:
######################################################################################
##### Check for generalization of model with 5-fold cross-validation
######################################################################################

cv_results = cross_val_score(RandomForestClassifier(random_state=1), x_train, y_train.iloc[:,0], cv = KFold(n_splits = 5), scoring = 'accuracy')
print(np.mean(cv_results))
cv_results

## After multiple different combinations of parameters, found that there was no significanct improvement in training accuracy

0.8227941176470587


array([0.94117647, 0.82352941, 0.76470588, 0.64705882, 0.9375    ])

In [11]:
######################################################################################
##### Training SVM model
######################################################################################

rf_model = RandomForestClassifier(random_state=1)
rf_model.fit(x_train, y_train.iloc[:,0])

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [12]:
######################################################################################
##### Prediction performance evaluation for Random Forest model - TRAIN data
######################################################################################

rf_train_prediction =  rf_model.predict(x_train)
rf_train_pred_eval = pd.DataFrame(classification_report(y_train.iloc[:,0], rf_train_prediction, output_dict=True)).transpose()
rf_train_pred_eval["Model"] = "Random Forest"
rf_train_pred_eval["Data"] = "Train"
rf_train_pred_eval.index.name = "Details"
rf_train_pred_eval.reset_index(inplace=True)

In [13]:
######################################################################################
##### Prediction performance evaluation for Random Forest model - TEST data
######################################################################################

rf_test_prediction =  rf_model.predict(x_test)
rf_test_pred_eval = pd.DataFrame(classification_report(y_test.iloc[:,0], rf_test_prediction, output_dict=True)).transpose()
rf_test_pred_eval["Model"] = "Random Forest"
rf_test_pred_eval["Data"] = "Test"
rf_test_pred_eval.index.name = "Details"
rf_test_pred_eval.reset_index(inplace=True)

# Deep Learning Models

### Convolutional Neural Network

In [14]:
######################################################################################
##### CNN Model Parameters
######################################################################################

input_shape = (x_train.shape[1],1)
no_of_labels = len(np.unique(y_train))

loss = "binary_crossentropy"
learning_rate = 0.001
epochs = 10
batch_size = 10
validation_split = 0.2

In [15]:
######################################################################################
##### Data Preparation for CNN
######################################################################################

x_train_reshaped_cnn = x_train.values.reshape(x_train.shape[0],x_train.shape[1],1)
x_test_reshaped_cnn = x_test.values.reshape(x_test.shape[0],x_test.shape[1],1)

y_train_reshaped_cnn = y_train.values
y_test_reshaped_cnn = y_test.values

In [16]:
######################################################################################
##### Initializing MODEL #####
######################################################################################

cnn_model = keras.models.Sequential()

cnn_model.add(keras.layers.Conv1D(64, 12, strides = 1, activation = 'relu', input_shape = input_shape))

cnn_model.add(keras.layers.Flatten())

cnn_model.add(keras.layers.Dense(912, activation = 'relu'))
cnn_model.add(keras.layers.Dense(no_of_labels, activation = 'softmax'))

cnn_model.compile(optimizer = keras.optimizers.Adam(learning_rate = learning_rate), loss = loss, metrics = ['BinaryAccuracy'])

cnn_model.build()
cnn_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 217, 64)           832       
_________________________________________________________________
flatten (Flatten)            (None, 13888)             0         
_________________________________________________________________
dense (Dense)                (None, 912)               12666768  
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 1826      
Total params: 12,669,426
Trainable params: 12,669,426
Non-trainable params: 0
_________________________________________________________________


In [17]:
######################################################################################
##### Training CNN MODEL #####
######################################################################################

cnn_model.fit(x_train_reshaped_cnn, y_train_reshaped_cnn, epochs = epochs, validation_split = validation_split, batch_size = batch_size, verbose = 1, callbacks=keras.callbacks.EarlyStopping(monitor='val_loss', patience=3))
# cnn_model.fit(x_train_reshaped_cnn, y_train_reshaped_cnn, epochs = epochs, validation_split = validation_split, batch_size = batch_size, verbose = 2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


<tensorflow.python.keras.callbacks.History at 0x7f8f2f262f98>

In [18]:
######################################################################################
##### Generate CNN prediction and performance metrics using TRAIN Data
######################################################################################

cnn_train_prediction =  cnn_model.predict(x_train_reshaped_cnn)
cnn_train_pred_eval = pd.DataFrame(classification_report(pd.DataFrame(y_train_reshaped_cnn).iloc[:,0], pd.DataFrame(np.round(cnn_train_prediction)).iloc[:,0].astype(int), output_dict=True)).transpose()
cnn_train_pred_eval["Model"] = "Convolutional Neural Network"
cnn_train_pred_eval["Data"] = "Train"
cnn_train_pred_eval.index.name = "Details"
cnn_train_pred_eval.reset_index(inplace=True)


In [19]:
######################################################################################
##### Generate CNN prediction and performance metrics using Test Data (UNSEEN Data)
######################################################################################

cnn_test_prediction =  cnn_model.predict(x_test_reshaped_cnn)
cnn_test_pred_eval = pd.DataFrame(classification_report(pd.DataFrame(y_test_reshaped_cnn).iloc[:,0], pd.DataFrame(np.round(cnn_test_prediction)).iloc[:,0].astype(int), output_dict=True)).transpose()
cnn_test_pred_eval["Model"] = "Convolutional Neural Network"
cnn_test_pred_eval["Data"] = "Test"
cnn_test_pred_eval.index.name = "Details"
cnn_test_pred_eval.reset_index(inplace=True)


### Recurrent Neural Network

In [20]:
######################################################################################
##### RNN Model Parameters
######################################################################################

input_shape = (1,x_train.shape[1])
no_of_labels = len(np.unique(y_train))

loss = "binary_crossentropy"
learning_rate = 0.001
epochs = 10
batch_size = 10
validation_split = 0.2
decode_units = 912

In [21]:
######################################################################################
##### Data Preparation for CNN
######################################################################################

x_train_reshaped_rnn = x_train.values.reshape(x_train.shape[0],1,x_train.shape[1])
x_test_reshaped_rnn = x_test.values.reshape(x_test.shape[0],1,x_test.shape[1])

y_train_reshaped_rnn = y_train.iloc[:,0].values
y_test_reshaped_rnn = y_test.iloc[:,0].values

In [22]:
######################################################################################
##### Initializing MODEL #####
######################################################################################

rnn_model = keras.models.Sequential()

rnn_model.add(keras.layers.LSTM(units = decode_units, activation='tanh', return_sequences = True, trainable = True, input_shape = input_shape))
# model.add(keras.layers.BatchNormalization(trainable = True))

rnn_model.add(keras.layers.LSTM(units = decode_units, activation='tanh', return_sequences = True, trainable = True))
# rnn_model.add(keras.layers.BatchNormalization(trainable = True))

# model.add(keras.layers.Dense(per_keyword_bit_length, activation='sigmoid'))
rnn_model.add(keras.layers.Dense(units = 1, activation='sigmoid'))

rnn_model.compile(optimizer = keras.optimizers.Adam(learning_rate = learning_rate), loss = loss)

# model.summary()

rnn_model.build()
rnn_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 1, 912)            4162368   
_________________________________________________________________
lstm_1 (LSTM)                (None, 1, 912)            6657600   
_________________________________________________________________
dense_2 (Dense)              (None, 1, 1)              913       
Total params: 10,820,881
Trainable params: 10,820,881
Non-trainable params: 0
_________________________________________________________________


In [23]:
######################################################################################
##### Training RNN MODEL #####
######################################################################################

rnn_model.fit(x_train_reshaped_rnn, y_train_reshaped_rnn, epochs = epochs, validation_split = validation_split, batch_size = batch_size, verbose = 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f8f2b968860>

In [24]:
######################################################################################
##### Generate RNN prediction and performance metrics using Train Data
######################################################################################

rnn_train_prediction =  rnn_model.predict(x_train_reshaped_rnn).reshape(x_train_reshaped_rnn.shape[0],1)
rnn_train_pred_eval = pd.DataFrame(classification_report(pd.DataFrame(y_train_reshaped_cnn).iloc[:,0], pd.DataFrame(np.round(rnn_train_prediction)).iloc[:,0].astype(int), output_dict=True)).transpose()
rnn_train_pred_eval["Model"] = "Recurrent Neural Network"
rnn_train_pred_eval["Data"] = "Train"
rnn_train_pred_eval.index.name = "Details"
rnn_train_pred_eval.reset_index(inplace=True)


In [25]:
######################################################################################
##### Generate RNN prediction and performance metrics using Test Data (UNSEEN Data)
######################################################################################

rnn_test_prediction =  rnn_model.predict(x_test_reshaped_rnn).reshape(x_test_reshaped_rnn.shape[0],1)
rnn_test_pred_eval = pd.DataFrame(classification_report(pd.DataFrame(y_test_reshaped_cnn).iloc[:,0], pd.DataFrame(np.round(rnn_test_prediction)).iloc[:,0].astype(int), output_dict=True)).transpose()
rnn_test_pred_eval["Model"] = "Recurrent Neural Network"
rnn_test_pred_eval["Data"] = "Test"
rnn_test_pred_eval.index.name = "Details"
rnn_test_pred_eval.reset_index(inplace=True)

# Comparison

### Collating all predictions together into 1 Dataframe for easy visualization

In [26]:
prediction_eval_all = pd.concat([svm_train_pred_eval, svm_test_pred_eval,
                                rf_train_pred_eval, rf_test_pred_eval,
                                cnn_train_pred_eval, cnn_test_pred_eval,
                                rnn_train_pred_eval, rnn_test_pred_eval])

In [27]:
print("Accuracy Details:")
prediction_eval_all[prediction_eval_all["Details"] == "accuracy"].iloc[:,[5,6,4]]

Accuracy Details:


Unnamed: 0,Model,Data,support
2,Linear Support Vector Machine,Train,1.0
2,Linear Support Vector Machine,Test,0.952381
2,Random Forest,Train,1.0
2,Random Forest,Test,0.857143
2,Convolutional Neural Network,Train,0.988095
2,Convolutional Neural Network,Test,0.904762
2,Recurrent Neural Network,Train,0.988095
2,Recurrent Neural Network,Test,0.904762


In [28]:
print("Classwise Details:")
prediction_eval_all[(prediction_eval_all["Details"] == "0") | (prediction_eval_all["Details"] == "1")].iloc[:,[5,6,0,1,2,3]]
## Here, details column means class

Classwise Details:


Unnamed: 0,Model,Data,Details,precision,recall,f1-score
0,Linear Support Vector Machine,Train,0,1.0,1.0,1.0
1,Linear Support Vector Machine,Train,1,1.0,1.0,1.0
0,Linear Support Vector Machine,Test,0,0.923077,1.0,0.96
1,Linear Support Vector Machine,Test,1,1.0,0.888889,0.941176
0,Random Forest,Train,0,1.0,1.0,1.0
1,Random Forest,Train,1,1.0,1.0,1.0
0,Random Forest,Test,0,0.909091,0.833333,0.869565
1,Random Forest,Test,1,0.8,0.888889,0.842105
0,Convolutional Neural Network,Train,0,0.97619,1.0,0.987952
1,Convolutional Neural Network,Train,1,1.0,0.976744,0.988235
