In [1]:
import numpy as np
import pandas as pd

In [2]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/molecular-biology/promoter-gene-sequences/promoters.data'
names = ['Class', 'id', 'Sequence']
data = pd.read_csv(url, names = names)

In [3]:
classes = data.loc[:, 'Class']
print(classes[:])

0      +
1      +
2      +
3      +
4      +
      ..
101    -
102    -
103    -
104    -
105    -
Name: Class, Length: 106, dtype: object


In [4]:
print(data.iloc[0])

Class                                                       +
id                                                        S10
Sequence    \t\ttactagcaatacgcttgcgttcggtggttaagtatgtataat...
Name: 0, dtype: object


In [5]:
classes = data.loc[:, 'Class']
print(classes[:5])

0    +
1    +
2    +
3    +
4    +
Name: Class, dtype: object


In [6]:
sequences = list(data.loc[:, 'Sequence'])
dataset = {}
for i, seq in enumerate(sequences):
   nucleotides = list(seq)
   nucleotides = [x for x in nucleotides if x != '\t']
   nucleotides.append(classes[i])
   dataset[i] = nucleotides
   print(dataset[0])
  

['t', 'a', 'c', 't', 'a', 'g', 'c', 'a', 'a', 't', 'a', 'c', 'g', 'c', 't', 't', 'g', 'c', 'g', 't', 't', 'c', 'g', 'g', 't', 'g', 'g', 't', 't', 'a', 'a', 'g', 't', 'a', 't', 'g', 't', 'a', 't', 'a', 'a', 't', 'g', 'c', 'g', 'c', 'g', 'g', 'g', 'c', 't', 't', 'g', 't', 'c', 'g', 't', '+']
['t', 'a', 'c', 't', 'a', 'g', 'c', 'a', 'a', 't', 'a', 'c', 'g', 'c', 't', 't', 'g', 'c', 'g', 't', 't', 'c', 'g', 'g', 't', 'g', 'g', 't', 't', 'a', 'a', 'g', 't', 'a', 't', 'g', 't', 'a', 't', 'a', 'a', 't', 'g', 'c', 'g', 'c', 'g', 'g', 'g', 'c', 't', 't', 'g', 't', 'c', 'g', 't', '+']
['t', 'a', 'c', 't', 'a', 'g', 'c', 'a', 'a', 't', 'a', 'c', 'g', 'c', 't', 't', 'g', 'c', 'g', 't', 't', 'c', 'g', 'g', 't', 'g', 'g', 't', 't', 'a', 'a', 'g', 't', 'a', 't', 'g', 't', 'a', 't', 'a', 'a', 't', 'g', 'c', 'g', 'c', 'g', 'g', 'g', 'c', 't', 't', 'g', 't', 'c', 'g', 't', '+']
['t', 'a', 'c', 't', 'a', 'g', 'c', 'a', 'a', 't', 'a', 'c', 'g', 'c', 't', 't', 'g', 'c', 'g', 't', 't', 'c', 'g', 'g', 't', '

In [7]:
dframe = pd.DataFrame(dataset)
print(dframe)

   0   1   2   3   4   5   6   7   8   9    ... 96  97  98  99  100 101 102  \
0    t   t   g   a   t   a   c   t   c   t  ...   c   c   t   a   g   c   g   
1    a   g   t   a   c   g   a   t   g   t  ...   c   g   a   g   a   c   t   
2    c   c   a   t   g   g   g   t   a   t  ...   g   c   t   a   g   t   a   
3    t   t   c   t   a   g   g   c   c   t  ...   a   t   g   g   a   c   t   
4    a   a   t   g   t   g   g   t   t   a  ...   g   a   a   g   g   a   t   
5    g   t   a   t   a   c   g   a   t   a  ...   t   g   c   g   c   a   c   
6    c   c   g   g   a   a   g   c   a   a  ...   a   g   c   t   a   t   t   
7    a   c   a   a   t   a   t   a   a   t  ...   g   a   g   g   t   g   c   
8    a   t   g   t   t   g   g   a   t   t  ...   a   c   a   t   g   g   a   
9    t   g   a   g   a   g   g   a   a   t  ...   c   t   a   a   t   c   a   
10   a   a   a   t   a   a   a   a   t   c  ...   c   t   c   c   c   c   c   
11   c   c   c   g   c   g   g   c   a   c  ...   c 

In [8]:
df = dframe.transpose()
print(df.iloc[:5])

  0  1  2  3  4  5  6  7  8  9   ... 48 49 50 51 52 53 54 55 56 57
0  t  a  c  t  a  g  c  a  a  t  ...  g  c  t  t  g  t  c  g  t  +
1  t  g  c  t  a  t  c  c  t  g  ...  c  a  t  c  g  c  c  a  a  +
2  g  t  a  c  t  a  g  a  g  a  ...  c  a  c  c  c  g  g  c  g  +
3  a  a  t  t  g  t  g  a  t  g  ...  a  a  c  a  a  a  c  t  c  +
4  t  c  g  a  t  a  a  t  t  a  ...  c  c  g  t  g  g  t  a  g  +

[5 rows x 58 columns]


In [13]:
df.rename(columns = {57: 'Class'}, inplace= True) 
print(df.iloc[:5])
df

   0  1  2  3  4  5  6  7  8  9  ... 48 49 50 51 52 53 54 55 56 Class
0  t  a  c  t  a  g  c  a  a  t  ...  g  c  t  t  g  t  c  g  t     +
1  t  g  c  t  a  t  c  c  t  g  ...  c  a  t  c  g  c  c  a  a     +
2  g  t  a  c  t  a  g  a  g  a  ...  c  a  c  c  c  g  g  c  g     +
3  a  a  t  t  g  t  g  a  t  g  ...  a  a  c  a  a  a  c  t  c     +
4  t  c  g  a  t  a  a  t  t  a  ...  c  c  g  t  g  g  t  a  g     +

[5 rows x 58 columns]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,Class
0,t,a,c,t,a,g,c,a,a,t,...,g,c,t,t,g,t,c,g,t,+
1,t,g,c,t,a,t,c,c,t,g,...,c,a,t,c,g,c,c,a,a,+
2,g,t,a,c,t,a,g,a,g,a,...,c,a,c,c,c,g,g,c,g,+
3,a,a,t,t,g,t,g,a,t,g,...,a,a,c,a,a,a,c,t,c,+
4,t,c,g,a,t,a,a,t,t,a,...,c,c,g,t,g,g,t,a,g,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101,c,c,t,c,a,a,t,g,g,c,...,g,a,a,c,t,a,t,a,t,-
102,g,t,a,t,t,c,t,c,a,a,...,t,c,a,a,c,a,t,t,g,-
103,c,g,c,g,a,c,t,a,c,g,...,a,a,g,g,c,t,t,c,c,-
104,c,t,c,g,t,c,c,t,c,a,...,a,g,g,a,g,g,a,a,c,-


In [16]:
p=df.transpose()

In [18]:
series = []
for name in p.columns:
    series.append(p[name].value_counts())
print(series)



[t    19
g    17
a    11
c    10
+     1
Name: 0, dtype: int64, t    17
c    15
a    13
g    12
+     1
Name: 1, dtype: int64, t    20
a    14
c    12
g    11
+     1
Name: 2, dtype: int64, a    18
t    18
g    15
c     6
+     1
Name: 3, dtype: int64, a    20
t    13
c    12
g    12
+     1
Name: 4, dtype: int64, g    21
a    19
t    11
c     6
+     1
Name: 5, dtype: int64, g    16
a    15
c    13
t    13
+     1
Name: 6, dtype: int64, a    22
t    17
c    11
g     7
+     1
Name: 7, dtype: int64, t    17
a    16
g    13
c    11
+     1
Name: 8, dtype: int64, t    17
c    17
a    15
g     8
+     1
Name: 9, dtype: int64, a    16
g    15
c    15
t    11
+     1
Name: 10, dtype: int64, a    18
c    14
g    14
t    11
+     1
Name: 11, dtype: int64, a    21
t    14
g    11
c    11
+     1
Name: 12, dtype: int64, c    18
t    16
a    12
g    11
+     1
Name: 13, dtype: int64, t    18
c    16
a    14
g     9
+     1
Name: 14, dtype: int64, a    16
g    16
c    15
t    10
+     1
Name: 15,

In [19]:
info = pd.DataFrame(series)
details = info.transpose()
print(details)

    0     1     2     3     4     5     6     7     8     9    ...   96   \
t  19.0  17.0  20.0  18.0  13.0  11.0  13.0  17.0  17.0  17.0  ...  15.0   
g  17.0  12.0  11.0  15.0  12.0  21.0  16.0   7.0  13.0   8.0  ...  12.0   
a  11.0  13.0  14.0  18.0  20.0  19.0  15.0  22.0  16.0  15.0  ...  15.0   
c  10.0  15.0  12.0   6.0  12.0   6.0  13.0  11.0  11.0  17.0  ...  15.0   
+   1.0   1.0   1.0   1.0   1.0   1.0   1.0   1.0   1.0   1.0  ...   NaN   
-   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   1.0   

    97    98    99    100   101   102   103   104   105  
t  25.0  18.0  11.0  15.0  17.0  16.0  15.0  16.0  14.0  
g  13.0  11.0  20.0  13.0  16.0  11.0  15.0  17.0   8.0  
a   8.0  16.0  18.0  14.0  14.0  18.0  11.0  12.0  24.0  
c  11.0  12.0   8.0  15.0  10.0  12.0  16.0  12.0  11.0  
+   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
-   1.0   1.0   1.0   1.0   1.0   1.0   1.0   1.0   1.0  

[6 rows x 106 columns]


In [20]:
numerical_df = pd.get_dummies(df)
numerical_df.iloc[:5]

Unnamed: 0,0_a,0_c,0_g,0_t,1_a,1_c,1_g,1_t,2_a,2_c,...,55_a,55_c,55_g,55_t,56_a,56_c,56_g,56_t,Class_+,Class_-
0,0,0,0,1,1,0,0,0,0,1,...,0,0,1,0,0,0,0,1,1,0
1,0,0,0,1,0,0,1,0,0,1,...,1,0,0,0,1,0,0,0,1,0
2,0,0,1,0,0,0,0,1,1,0,...,0,1,0,0,0,0,1,0,1,0
3,1,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,1,0,0,1,0
4,0,0,0,1,0,1,0,0,0,0,...,1,0,0,0,0,0,1,0,1,0


In [21]:
df = numerical_df.drop(columns=['Class_-'])
df.rename(columns = {'Class_+': 'Class'}, inplace = True)
print(df.iloc[:5])

   0_a  0_c  0_g  0_t  1_a  1_c  1_g  1_t  2_a  2_c  ...  54_t  55_a  55_c  \
0    0    0    0    1    1    0    0    0    0    1  ...     0     0     0   
1    0    0    0    1    0    0    1    0    0    1  ...     0     1     0   
2    0    0    1    0    0    0    0    1    1    0  ...     0     0     1   
3    1    0    0    0    1    0    0    0    0    0  ...     0     0     0   
4    0    0    0    1    0    1    0    0    0    0  ...     1     1     0   

   55_g  55_t  56_a  56_c  56_g  56_t  Class  
0     1     0     0     0     0     1      1  
1     0     0     1     0     0     0      1  
2     0     0     0     0     1     0      1  
3     0     1     0     1     0     0      1  
4     0     0     0     0     1     0      1  

[5 rows x 229 columns]


In [22]:
import pandas as pd   
import numpy as np    
import matplotlib.pyplot as plt 
%matplotlib inline
%config InlineBackend.figure_formats = ['retina']
import seaborn as sns
import time
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss
from sklearn.metrics import auc, roc_curve, roc_auc_score, precision_recall_curve
from sklearn.metrics import fbeta_score, cohen_kappa_score
SEED = 42

In [293]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [294]:
X = np.array(df.drop(['Class'], 1))
y = np.array(df['Class'])

In [295]:
def evaluate(model, X_test , y_test,modelname):
  y_pred=model.predict(X_test)
  precision    = precision_score(y_test, y_pred)
  recall       = recall_score(y_test, y_pred)
  f1score      = f1_score(y_test, y_pred) 
  rocauc       = roc_auc_score(y_test, y_pred)
  logloss      = log_loss(y_test, y_pred)
  accuracy     = accuracy_score(y_test, y_pred)
  confusionmatrix = confusion_matrix(y_test,y_pred)

  df_model = pd.DataFrame({  'model'        : [modelname],
                            'confusionmatrix'     :[confusionmatrix],
                             'accuracy'     : [accuracy],
                             'precision'    : [precision],
                             'recall'       : [recall],
                             'f1score'      : [f1score],
                             'rocauc'       : [rocauc],
                             'logloss'      : [logloss],
                             'timetaken'    : [time2]   
                                }) 
  return df_model

### BASE MODEL EVALUATION

In [296]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3, random_state=0)
time2 = time.time()
pipe = Pipeline([('scaler', StandardScaler()), ('randomforest', RandomForestClassifier())])
pipe.fit(X_train, y_train)
print(pipe.score(X_test, y_test))
model1=evaluate(pipe, X_test,  y_test, 'RandomForest')
model1.timetaken[0] = time.time() - time2
print(model1)

0.96875
          model     confusionmatrix  accuracy  precision  recall   f1score  \
0  RandomForest  [[16, 0], [1, 15]]   0.96875        1.0  0.9375  0.967742   

    rocauc   logloss  timetaken  
0  0.96875  1.079337   0.341472  


In [297]:
%%time
from sklearn.ensemble import ExtraTreesClassifier
time2 = time.time()
pipe = Pipeline([('scaler', StandardScaler()), ('extratree', ExtraTreesClassifier())])
pipe.fit(X_train, y_train)
print(pipe.score(X_test, y_test))
model2=evaluate(pipe, X_test,  y_test, 'Extra Tree')
model2.timetaken[0] = time.time() - time2
print(model2)

0.90625
        model     confusionmatrix  accuracy  precision  recall   f1score  \
0  Extra Tree  [[16, 0], [3, 13]]   0.90625        1.0  0.8125  0.896552   

    rocauc  logloss  timetaken  
0  0.90625  3.23801   0.220162  
Wall time: 236 ms


In [298]:
%%time
time2 = time.time()
pipe = Pipeline([('scaler', StandardScaler()), ('LogisticRegression', LogisticRegression())])
pipe.fit(X_train, y_train)
print(pipe.score(X_test, y_test))
model3=evaluate(pipe, X_test,  y_test, 'Logistic Regression')
model3.timetaken[0] = time.time() - time2
print(model3)

0.9375
                 model     confusionmatrix  accuracy  precision  recall  \
0  Logistic Regression  [[15, 1], [1, 15]]    0.9375     0.9375  0.9375   

   f1score  rocauc   logloss  timetaken  
0   0.9375  0.9375  2.158699   0.031095  
Wall time: 39.9 ms


In [299]:
%%time
time2 = time.time()
pipe = Pipeline([('scaler', StandardScaler()), ('KNN', KNeighborsClassifier())])
pipe.fit(X_train, y_train)
print(pipe.score(X_test, y_test))
model4=evaluate(pipe, X_test,  y_test, 'KNN')
model4.timetaken[0] = time.time() - time2
print(model4)

0.78125
  model    confusionmatrix  accuracy  precision  recall   f1score   rocauc  \
0   KNN  [[9, 7], [0, 16]]   0.78125   0.695652     1.0  0.820513  0.78125   

    logloss  timetaken  
0  7.555532   0.025066  
Wall time: 40.6 ms


In [300]:
%%time
time2 = time.time()
pipe = Pipeline([('scaler', StandardScaler()), ('LinearSVC', LinearSVC())])
pipe.fit(X_train, y_train)
print(pipe.score(X_test, y_test))
model5=evaluate(pipe, X_test,  y_test, 'LinearSVC')
model5.timetaken[0] = time.time() - time2
print(model5)

0.9375
       model     confusionmatrix  accuracy  precision  recall  f1score  \
0  LinearSVC  [[15, 1], [1, 15]]    0.9375     0.9375  0.9375   0.9375   

   rocauc   logloss  timetaken  
0  0.9375  2.158699   0.015406  
Wall time: 20 ms


In [301]:
%%time
time2 = time.time()
pipe = Pipeline([('scaler', StandardScaler()), ('GaussianNB',GaussianNB())])
pipe.fit(X_train, y_train)
print(pipe.score(X_test, y_test))
model6=evaluate(pipe, X_test,  y_test, 'GaussianNB')
model6.timetaken[0] = time.time() - time2
print(model6)

0.96875
        model     confusionmatrix  accuracy  precision  recall   f1score  \
0  GaussianNB  [[16, 0], [1, 15]]   0.96875        1.0  0.9375  0.967742   

    rocauc   logloss  timetaken  
0  0.96875  1.079337   0.016031  
Wall time: 22.1 ms


In [302]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3, random_state=0)
time2 = time.time()
pipe = Pipeline([('scaler', StandardScaler()), ('svc', SVC())])
pipe.fit(X_train, y_train)
print(pipe.score(X_test, y_test))
model7=evaluate(pipe, X_test,  y_test, 'SVC')
model7.timetaken[0] = time.time() - time2
print(model7)

0.96875
  model     confusionmatrix  accuracy  precision  recall   f1score   rocauc  \
0   SVC  [[16, 0], [1, 15]]   0.96875        1.0  0.9375  0.967742  0.96875   

    logloss  timetaken  
0  1.079337   0.024621  


In [303]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3, random_state=0)
time2 = time.time()
pipe = Pipeline([('scaler', StandardScaler()), ('Decision Tree', DecisionTreeClassifier())])
pipe.fit(X_train, y_train)
print(pipe.score(X_test, y_test))
model8=evaluate(pipe, X_test,  y_test, 'Decision Tree')
model8.timetaken[0] = time.time() - time2
print(model8)

0.90625
           model     confusionmatrix  accuracy  precision  recall   f1score  \
0  Decision Tree  [[14, 2], [1, 15]]   0.90625   0.882353  0.9375  0.909091   

    rocauc  logloss  timetaken  
0  0.90625  3.23806   0.012211  


In [304]:
df_base= pd.concat([model1,model2,model3,model4,model5,model6,model7,model8],axis = 0).reset_index()
df_base.drop('index', axis=1, inplace=True)
df_base

Unnamed: 0,model,confusionmatrix,accuracy,precision,recall,f1score,rocauc,logloss,timetaken
0,RandomForest,"[[16, 0], [1, 15]]",0.96875,1.0,0.9375,0.967742,0.96875,1.079337,0.341472
1,Extra Tree,"[[16, 0], [3, 13]]",0.90625,1.0,0.8125,0.896552,0.90625,3.23801,0.220162
2,Logistic Regression,"[[15, 1], [1, 15]]",0.9375,0.9375,0.9375,0.9375,0.9375,2.158699,0.031095
3,KNN,"[[9, 7], [0, 16]]",0.78125,0.695652,1.0,0.820513,0.78125,7.555532,0.025066
4,LinearSVC,"[[15, 1], [1, 15]]",0.9375,0.9375,0.9375,0.9375,0.9375,2.158699,0.015406
5,GaussianNB,"[[16, 0], [1, 15]]",0.96875,1.0,0.9375,0.967742,0.96875,1.079337,0.016031
6,SVC,"[[16, 0], [1, 15]]",0.96875,1.0,0.9375,0.967742,0.96875,1.079337,0.024621
7,Decision Tree,"[[14, 2], [1, 15]]",0.90625,0.882353,0.9375,0.909091,0.90625,3.23806,0.012211
