## Classifying DNA sequences

### Importing required packages and functions

In [30]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import KFold, train_test_split, cross_val_score

### Loading the dataset

In [6]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/molecular-biology/promoter-gene-sequences/promoters.data'
names = ['class', 'id', 'seq']
data = pd.read_csv(url, names = names)

In [10]:
print(data.head())

  class                                                seq
0     +  \t\ttactagcaatacgcttgcgttcggtggttaagtatgtataat...
1     +  \t\ttgctatcctgacagttgtcacgctgattggtgtcgttacaat...
2     +  \t\tgtactagagaactagtgcattagcttatttttttgttatcat...
3     +  \taattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaa...
4     +  \ttcgataattaactattgacgaaaagctgaaaaccactagaatgc...


## Data Preprocessing

In [9]:
data.drop(columns=['id'], inplace=True)

In [19]:
classes = data['class']
data = data['seq'].apply(lambda x: pd.Series([c for c in list(x) if c != "\t"]))
data['class'] = classes
print(data.head())

   0  1  2  3  4  5  6  7  8  9  ...  48 49 50 51 52 53 54 55 56 class
0  t  a  c  t  a  g  c  a  a  t  ...   g  c  t  t  g  t  c  g  t     +
1  t  g  c  t  a  t  c  c  t  g  ...   c  a  t  c  g  c  c  a  a     +
2  g  t  a  c  t  a  g  a  g  a  ...   c  a  c  c  c  g  g  c  g     +
3  a  a  t  t  g  t  g  a  t  g  ...   a  a  c  a  a  a  c  t  c     +
4  t  c  g  a  t  a  a  t  t  a  ...   c  c  g  t  g  g  t  a  g     +

[5 rows x 58 columns]


In [23]:
# Printing summary of nucleotides in each column
data.apply(pd.Series.value_counts)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,class
+,,,,,,,,,,,...,,,,,,,,,,53.0
-,,,,,,,,,,,...,,,,,,,,,,53.0
a,26.0,34.0,30.0,22.0,36.0,42.0,38.0,34.0,33.0,36.0,...,23.0,24.0,28.0,27.0,25.0,22.0,26.0,24.0,27.0,
c,27.0,22.0,21.0,30.0,19.0,18.0,21.0,20.0,22.0,22.0,...,36.0,42.0,31.0,32.0,21.0,32.0,29.0,29.0,17.0,
g,15.0,24.0,28.0,28.0,29.0,22.0,17.0,20.0,19.0,20.0,...,26.0,18.0,24.0,14.0,25.0,22.0,28.0,24.0,28.0,
t,38.0,26.0,27.0,26.0,22.0,24.0,30.0,32.0,32.0,28.0,...,21.0,22.0,23.0,33.0,35.0,30.0,23.0,29.0,34.0,


### One-hot encoding of data

In [26]:
data = pd.get_dummies(data)
data.drop(columns=['class_-'], inplace=True)
data.rename(columns={'class_+':'class'}, inplace=True)
print(data.head())

   0_a  0_c  0_g  0_t  1_a  1_c  1_g  1_t  2_a  2_c  ...    54_t  55_a  55_c  \
0    0    0    0    1    1    0    0    0    0    1  ...       0     0     0   
1    0    0    0    1    0    0    1    0    0    1  ...       0     1     0   
2    0    0    1    0    0    0    0    1    1    0  ...       0     0     1   
3    1    0    0    0    1    0    0    0    0    0  ...       0     0     0   
4    0    0    0    1    0    1    0    0    0    0  ...       1     1     0   

   55_g  55_t  56_a  56_c  56_g  56_t  class  
0     1     0     0     0     0     1      1  
1     0     0     1     0     0     0      1  
2     0     0     0     0     1     0      1  
3     0     1     0     1     0     0      1  
4     0     0     0     0     1     0      1  

[5 rows x 229 columns]


### Splitting the data into train and test sets

In [28]:
X = np.array(data.drop(columns=['class']))
y = np.array(data['class'])
seed = 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

## K-fold cross validation for model selection of various classifiers

In [31]:
scoring = 'accuracy'
names = ["Nearest Neighbors", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "SVM Linear", "SVM RBF", "SVM Sigmoid"]
classifiers = [
    KNeighborsClassifier(n_neighbors=5),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1),
    AdaBoostClassifier(),
    GaussianNB(),
    SVC(kernel='linear'),
    SVC(kernel='rbf'),
    SVC(kernel='sigmoid')
]
models = zip(names, classifiers)
results = []
names = []
kfold = KFold(n_splits=10, random_state=seed)
for name, model in models:
    cv_res = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_res)
    names.append(name)
    print("{0}: {1} ({2})".format(name, cv_res.mean(), cv_res.std()))

Nearest Neighbors: 0.773214285714 (0.164837969298)
Gaussian Process: 0.873214285714 (0.0561578042626)
Decision Tree: 0.7875 (0.209538182678)
Random Forest: 0.671428571429 (0.125813678241)




Neural Net: 0.8875 (0.0875)
AdaBoost: 0.9125 (0.1125)
Naive Bayes: 0.8375 (0.1375)
SVM Linear: 0.85 (0.108972473589)
SVM RBF: 0.7375 (0.117924764151)
SVM Sigmoid: 0.569642857143 (0.159209222505)




### Comparision of all the classifiers

In [32]:
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(name)
    print(accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))

Nearest Neighbors
0.7407407407407407
              precision    recall  f1-score   support

           0       1.00      0.59      0.74        17
           1       0.59      1.00      0.74        10

   micro avg       0.74      0.74      0.74        27
   macro avg       0.79      0.79      0.74        27
weighted avg       0.85      0.74      0.74        27

Gaussian Process
0.8888888888888888
              precision    recall  f1-score   support

           0       1.00      0.82      0.90        17
           1       0.77      1.00      0.87        10

   micro avg       0.89      0.89      0.89        27
   macro avg       0.88      0.91      0.89        27
weighted avg       0.91      0.89      0.89        27

Decision Tree
0.7407407407407407
              precision    recall  f1-score   support

           0       1.00      0.59      0.74        17
           1       0.59      1.00      0.74        10

   micro avg       0.74      0.74      0.74        27
   macro avg       0.7

### Conclusion:
Linear SVM performed the best out of all the classifiers with and average f1-score of 0.96