In [5]:
import numpy as np
import pandas as pd

In [6]:
names = ['Class', 'id', 'Sequence']
data = pd.read_csv('promoters.data', names = names)

In [7]:
print(data.head())

  Class         id                                           Sequence
0     +        S10  \t\ttactagcaatacgcttgcgttcggtggttaagtatgtataat...
1     +       AMPC  \t\ttgctatcctgacagttgtcacgctgattggtgtcgttacaat...
2     +       AROH  \t\tgtactagagaactagtgcattagcttatttttttgttatcat...
3     +      DEOP2  \taattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaa...
4     +  LEU1_TRNA  \ttcgataattaactattgacgaaaagctgaaaaccactagaatgc...


In [8]:
print(data.iloc[0])

Class                                                       +
id                                                        S10
Sequence    \t\ttactagcaatacgcttgcgttcggtggttaagtatgtataat...
Name: 0, dtype: object


In [9]:
classes = data.loc[:, 'Class']
print(classes[:5])

0    +
1    +
2    +
3    +
4    +
Name: Class, dtype: object


### Preprocessing the dataset

In [10]:
# generate list of DNA sequences
sequences = list(data.loc[:, 'Sequence'])
dataset = {}

# loop through sequences and split into individual nucleotides

for i, seq in enumerate(sequences):
    
    # split into nucleotides, removing tab characters
    nucleotides = list(seq)
    nucleotides = [x for x in nucleotides if x != '\t']
    
    # append class assignment
    nucleotides.append(classes[i])
    
    # add to dataset
    dataset[i] = nucleotides
    
print(dataset[0])

['t', 'a', 'c', 't', 'a', 'g', 'c', 'a', 'a', 't', 'a', 'c', 'g', 'c', 't', 't', 'g', 'c', 'g', 't', 't', 'c', 'g', 'g', 't', 'g', 'g', 't', 't', 'a', 'a', 'g', 't', 'a', 't', 'g', 't', 'a', 't', 'a', 'a', 't', 'g', 'c', 'g', 'c', 'g', 'g', 'g', 'c', 't', 't', 'g', 't', 'c', 'g', 't', '+']


In [11]:
# turn dataset into pandas dataframe
df = pd.DataFrame(dataset)
print(df)

   0   1   2   3   4   5   6   7   8   9   ... 96  97  98  99  100 101 102  \
0    t   t   g   a   t   a   c   t   c   t ...   c   c   t   a   g   c   g   
1    a   g   t   a   c   g   a   t   g   t ...   c   g   a   g   a   c   t   
2    c   c   a   t   g   g   g   t   a   t ...   g   c   t   a   g   t   a   
3    t   t   c   t   a   g   g   c   c   t ...   a   t   g   g   a   c   t   
4    a   a   t   g   t   g   g   t   t   a ...   g   a   a   g   g   a   t   
5    g   t   a   t   a   c   g   a   t   a ...   t   g   c   g   c   a   c   
6    c   c   g   g   a   a   g   c   a   a ...   a   g   c   t   a   t   t   
7    a   c   a   a   t   a   t   a   a   t ...   g   a   g   g   t   g   c   
8    a   t   g   t   t   g   g   a   t   t ...   a   c   a   t   g   g   a   
9    t   g   a   g   a   g   g   a   a   t ...   c   t   a   a   t   c   a   
10   a   a   a   t   a   a   a   a   t   c ...   c   t   c   c   c   c   c   
11   c   c   c   g   c   g   g   c   a   c ...   c   t   g   t  

In [12]:
# transpose the df
df = df.transpose()
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,t,a,c,t,a,g,c,a,a,t,...,g,c,t,t,g,t,c,g,t,+
1,t,g,c,t,a,t,c,c,t,g,...,c,a,t,c,g,c,c,a,a,+
2,g,t,a,c,t,a,g,a,g,a,...,c,a,c,c,c,g,g,c,g,+
3,a,a,t,t,g,t,g,a,t,g,...,a,a,c,a,a,a,c,t,c,+
4,t,c,g,a,t,a,a,t,t,a,...,c,c,g,t,g,g,t,a,g,+


In [13]:
df.rename(columns = {57: 'Class'}, inplace = True)
print(df.shape)
df.head()

(106, 58)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,Class
0,t,a,c,t,a,g,c,a,a,t,...,g,c,t,t,g,t,c,g,t,+
1,t,g,c,t,a,t,c,c,t,g,...,c,a,t,c,g,c,c,a,a,+
2,g,t,a,c,t,a,g,a,g,a,...,c,a,c,c,c,g,g,c,g,+
3,a,a,t,t,g,t,g,a,t,g,...,a,a,c,a,a,a,c,t,c,+
4,t,c,g,a,t,a,a,t,t,a,...,c,c,g,t,g,g,t,a,g,+


In [14]:
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,Class
count,106,106,106,106,106,106,106,106,106,106,...,106,106,106,106,106,106,106,106,106,106
unique,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,2
top,t,a,a,c,a,a,a,a,a,a,...,c,c,c,t,t,c,c,t,t,-
freq,38,34,30,30,36,42,38,34,33,36,...,36,42,31,33,35,32,29,29,34,53


In [15]:
# desribe does not tell us enough information since the attributes are text.

series = []
for name in df.columns:
    series.append(df[name].value_counts())
    
info = pd.DataFrame(series)
print(info)
details = info.transpose()
print(details)

          t     c     a     g     -     +
0      38.0  27.0  26.0  15.0   NaN   NaN
1      26.0  22.0  34.0  24.0   NaN   NaN
2      27.0  21.0  30.0  28.0   NaN   NaN
3      26.0  30.0  22.0  28.0   NaN   NaN
4      22.0  19.0  36.0  29.0   NaN   NaN
5      24.0  18.0  42.0  22.0   NaN   NaN
6      30.0  21.0  38.0  17.0   NaN   NaN
7      32.0  20.0  34.0  20.0   NaN   NaN
8      32.0  22.0  33.0  19.0   NaN   NaN
9      28.0  22.0  36.0  20.0   NaN   NaN
10     31.0  22.0  38.0  15.0   NaN   NaN
11     29.0  31.0  21.0  25.0   NaN   NaN
12     34.0  14.0  29.0  29.0   NaN   NaN
13     21.0  38.0  24.0  23.0   NaN   NaN
14     54.0  13.0  23.0  16.0   NaN   NaN
15     54.0  24.0  17.0  11.0   NaN   NaN
16     24.0  14.0  15.0  53.0   NaN   NaN
17     18.0  29.0  40.0  19.0   NaN   NaN
18     23.0  44.0  27.0  12.0   NaN   NaN
19     24.0  24.0  31.0  27.0   NaN   NaN
20     34.0  22.0  28.0  22.0   NaN   NaN
21     26.0  20.0  31.0  29.0   NaN   NaN
22     27.0  27.0  22.0  30.0   Na

In [17]:
df = n_df.drop(columns = ['Class_-'])
df.rename(columns = {'Class_+': 'Class'}, inplace = True)
print(df.iloc[:5])


   0_a  0_c  0_g  0_t  1_a  1_c  1_g  1_t  2_a  2_c  ...    54_t  55_a  55_c  \
0    0    0    0    1    1    0    0    0    0    1  ...       0     0     0   
1    0    0    0    1    0    0    1    0    0    1  ...       0     1     0   
2    0    0    1    0    0    0    0    1    1    0  ...       0     0     1   
3    1    0    0    0    1    0    0    0    0    0  ...       0     0     0   
4    0    0    0    1    0    1    0    0    0    0  ...       1     1     0   

   55_g  55_t  56_a  56_c  56_g  56_t  Class  
0     1     0     0     0     0     1      1  
1     0     0     1     0     0     0      1  
2     0     0     0     0     1     0      1  
3     0     1     0     1     0     0      1  
4     0     0     0     0     1     0      1  

[5 rows x 229 columns]


In [18]:
# Using the model_selection module to separate training and testing datasets

from sklearn import model_selection

x = np.array(df.drop(['Class'], 1))
y = np.array(df['Class'])

# split data into training and testing datasets

X_train, X_test, y_train, y_test = model_selection.train_test_split(x, y, test_size = 0.25, random_state = 47)


###  Training and Testing the classification algorithms

In [19]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score



# defining scoring method
scoring = 'accuracy'

# defining models to train

names = ["Nearest Neighbors", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", 
         "Naive Bayes", "SVM Linear", "SVM RBF", "SVM Sigmoid"]

classifiers = [
    KNeighborsClassifier(n_neighbors = 3),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth = 15),
    RandomForestClassifier(max_depth = 15, n_estimators = 15, max_features = 3),
    MLPClassifier(alpha = 1),
    GaussianNB(),
    SVC(kernel = 'linear', gamma = 'auto'),
    SVC(kernel = 'rbf',  gamma = 'auto'),
    SVC(kernel = 'sigmoid',  gamma = 'auto')
]

models = zip(names, classifiers)

# evaluate each model 
results = []
names = []

for name, model in models:
    kfold = model_selection.KFold(n_splits = 7, random_state = 47 )
    cv_rslt = model_selection.cross_val_score(model, X_train, y_train, 
                                             cv = kfold, scoring = scoring)
    results.append(cv_rslt)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_rslt.mean(), cv_rslt.std())
    print(msg)
    # test the algorithms on the validation datasets
    print("Testing:")
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(name)
    print(accuracy_score(y_test, predictions))
    print(classification_report(y_test, predictions))
    


Nearest Neighbors: 0.733766 (0.069484)
Testing:
Nearest Neighbors
0.8518518518518519
              precision    recall  f1-score   support

           0       0.89      0.73      0.80        11
           1       0.83      0.94      0.88        16

   micro avg       0.85      0.85      0.85        27
   macro avg       0.86      0.83      0.84        27
weighted avg       0.86      0.85      0.85        27

Gaussian Process: 0.796537 (0.134574)
Testing:
Gaussian Process
0.8518518518518519
              precision    recall  f1-score   support

           0       0.89      0.73      0.80        11
           1       0.83      0.94      0.88        16

   micro avg       0.85      0.85      0.85        27
   macro avg       0.86      0.83      0.84        27
weighted avg       0.86      0.85      0.85        27

Decision Tree: 0.667749 (0.139245)
Testing:
Decision Tree
0.7037037037037037
              precision    recall  f1-score   support

           0       0.62      0.73      0.67   



Neural Net: 0.859307 (0.081277)
Testing:
Neural Net
0.9629629629629629
              precision    recall  f1-score   support

           0       0.92      1.00      0.96        11
           1       1.00      0.94      0.97        16

   micro avg       0.96      0.96      0.96        27
   macro avg       0.96      0.97      0.96        27
weighted avg       0.97      0.96      0.96        27

Naive Bayes: 0.746753 (0.126369)
Testing:
Naive Bayes
0.7777777777777778
              precision    recall  f1-score   support

           0       0.67      0.91      0.77        11
           1       0.92      0.69      0.79        16

   micro avg       0.78      0.78      0.78        27
   macro avg       0.79      0.80      0.78        27
weighted avg       0.81      0.78      0.78        27

SVM Linear: 0.897186 (0.102432)
Testing:
SVM Linear
0.9629629629629629
              precision    recall  f1-score   support

           0       0.92      1.00      0.96        11
           1       1.0



In [33]:
p =  SVC(kernel = 'linear', gamma = 'auto')
p.fit(X_train, y_train)
prediction = p.predict(x[98].reshape(1, -1))
print(prediction)


[0]
