In [1]:
import sys
import pandas
import numpy
import sklearn


print('python : {}'.format(sys.version))
print('pandas : {}'.format(pandas.__version__))
print('numpy  : {}'.format(numpy.__version__))
print('sklearn: {}'.format(sklearn.__version__))

import pandas as pd
import numpy as np

python : 3.6.4 |Anaconda, Inc.| (default, Jan 16 2018, 10:22:32) [MSC v.1900 64 bit (AMD64)]
pandas : 0.22.0
numpy  : 1.14.2
sklearn: 0.19.1


In [2]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/molecular-biology/promoter-gene-sequences/promoters.data'
columns=['Class', 'Id', 'Sequence']

df = pd.read_csv(url, names=columns)
df.head()

Unnamed: 0,Class,Id,Sequence
0,+,S10,\t\ttactagcaatacgcttgcgttcggtggttaagtatgtataat...
1,+,AMPC,\t\ttgctatcctgacagttgtcacgctgattggtgtcgttacaat...
2,+,AROH,\t\tgtactagagaactagtgcattagcttatttttttgttatcat...
3,+,DEOP2,\taattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaa...
4,+,LEU1_TRNA,\ttcgataattaactattgacgaaaagctgaaaaccactagaatgc...


In [3]:
df.isnull().sum()

Class       0
Id          0
Sequence    0
dtype: int64

## Data Transformation

As from above we can see that, there are no null values present in the dataset. Yet the data set is not ready for processing, we will have to transform it. Below are the points that needs to be addressed:

- The DNA sequences are stored as a string in a single column, we will have to store it in sepearte columns.
- We need to remove the unwanted charactres ('\t').
- The column values are in string format, in order to run the ML methods, we will have to trnsform it in numeric values.

In [4]:
# filter out all the values of column class
classes = df.loc[:, 'Class']
print(classes[0 : 5])

0    +
1    +
2    +
3    +
4    +
Name: Class, dtype: object


In [5]:
# filter out all the values of column Sequence
sequences = df.loc[ : , 'Sequence']
sequences[0 : 5]

0    \t\ttactagcaatacgcttgcgttcggtggttaagtatgtataat...
1    \t\ttgctatcctgacagttgtcacgctgattggtgtcgttacaat...
2    \t\tgtactagagaactagtgcattagcttatttttttgttatcat...
3    \taattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaa...
4    \ttcgataattaactattgacgaaaagctgaaaaccactagaatgc...
Name: Sequence, dtype: object

In [6]:
# loop through each value of sequences and and store them in a dictionary

dataset = {}

for i,seq in enumerate(sequences):
    nucleotides = sequences[i]
    nucleotides = [x for x in nucleotides if x != '\t']
    
    nucleotides.append(classes[i])
    
    dataset[i]=nucleotides

print(dataset[0])

['t', 'a', 'c', 't', 'a', 'g', 'c', 'a', 'a', 't', 'a', 'c', 'g', 'c', 't', 't', 'g', 'c', 'g', 't', 't', 'c', 'g', 'g', 't', 'g', 'g', 't', 't', 'a', 'a', 'g', 't', 'a', 't', 'g', 't', 'a', 't', 'a', 'a', 't', 'g', 'c', 'g', 'c', 'g', 'g', 'g', 'c', 't', 't', 'g', 't', 'c', 'g', 't', '+']


In [7]:
dframe=pd.DataFrame(dataset)
dframe.shape

(58, 106)

In [8]:
dframe.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,96,97,98,99,100,101,102,103,104,105
0,t,t,g,a,t,a,c,t,c,t,...,c,c,t,a,g,c,g,c,c,t


In [9]:
# Transpose the dframe to reshape the dataframe 
data=dframe.T
data.shape

(106, 58)

In [10]:
data.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,t,a,c,t,a,g,c,a,a,t,...,g,c,t,t,g,t,c,g,t,+


In [11]:
data=data.rename(columns={57: 'class'})
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,class
0,t,a,c,t,a,g,c,a,a,t,...,g,c,t,t,g,t,c,g,t,+
1,t,g,c,t,a,t,c,c,t,g,...,c,a,t,c,g,c,c,a,a,+
2,g,t,a,c,t,a,g,a,g,a,...,c,a,c,c,c,g,g,c,g,+
3,a,a,t,t,g,t,g,a,t,g,...,a,a,c,a,a,a,c,t,c,+
4,t,c,g,a,t,a,a,t,t,a,...,c,c,g,t,g,g,t,a,g,+


In [12]:
data.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,class
count,106,106,106,106,106,106,106,106,106,106,...,106,106,106,106,106,106,106,106,106,106
unique,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,2
top,t,a,a,c,a,a,a,a,a,a,...,c,c,c,t,t,c,c,c,t,-
freq,38,34,30,30,36,42,38,34,33,36,...,36,42,31,33,35,32,29,29,34,53


In [13]:
# desribe does not tell us enough information since the attributes are text. Lets record value counts for each sequence

series = []
for col in data.columns:
    series.append(data[col].value_counts())

info = pd.DataFrame(series).T
print(info)

      0     1     2     3     4     5     6     7     8     9  ...      48  \
+   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...     NaN   
-   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...     NaN   
a  26.0  34.0  30.0  22.0  36.0  42.0  38.0  34.0  33.0  36.0  ...    23.0   
c  27.0  22.0  21.0  30.0  19.0  18.0  21.0  20.0  22.0  22.0  ...    36.0   
g  15.0  24.0  28.0  28.0  29.0  22.0  17.0  20.0  19.0  20.0  ...    26.0   
t  38.0  26.0  27.0  26.0  22.0  24.0  30.0  32.0  32.0  28.0  ...    21.0   

     49    50    51    52    53    54    55    56  class  
+   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   53.0  
-   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   53.0  
a  24.0  28.0  27.0  25.0  22.0  26.0  24.0  27.0    NaN  
c  42.0  31.0  32.0  21.0  32.0  29.0  29.0  17.0    NaN  
g  18.0  24.0  14.0  25.0  22.0  28.0  24.0  28.0    NaN  
t  22.0  23.0  33.0  35.0  30.0  23.0  29.0  34.0    NaN  

[6 rows x 58 columns]


In [14]:
# since the data above is in string format, we will have to convert it into numeric values, 
# since we can't run ML algo's on string values. Pandas get_dummies() function will help us to achieve that
data_numeric = pd.get_dummies(data)
data_numeric[0:5]

Unnamed: 0,0_a,0_c,0_g,0_t,1_a,1_c,1_g,1_t,2_a,2_c,...,55_a,55_c,55_g,55_t,56_a,56_c,56_g,56_t,class_+,class_-
0,0,0,0,1,1,0,0,0,0,1,...,0,0,1,0,0,0,0,1,1,0
1,0,0,0,1,0,0,1,0,0,1,...,1,0,0,0,1,0,0,0,1,0
2,0,0,1,0,0,0,0,1,1,0,...,0,1,0,0,0,0,1,0,1,0
3,1,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,1,0,0,1,0
4,0,0,0,1,0,1,0,0,0,0,...,1,0,0,0,0,0,1,0,1,0


In [15]:
# We don't need both class columns.  Lets drop one then rename the other to simply 'Class'.

data_numeric.drop(columns=['class_-'], inplace=True)

data_numeric.rename(columns = {'class_+' : 'class'}, inplace=True)

data_numeric[0:5]

Unnamed: 0,0_a,0_c,0_g,0_t,1_a,1_c,1_g,1_t,2_a,2_c,...,54_t,55_a,55_c,55_g,55_t,56_a,56_c,56_g,56_t,class
0,0,0,0,1,1,0,0,0,0,1,...,0,0,0,1,0,0,0,0,1,1
1,0,0,0,1,0,0,1,0,0,1,...,0,1,0,0,0,1,0,0,0,1
2,0,0,1,0,0,0,0,1,1,0,...,0,0,1,0,0,0,0,1,0,1
3,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1
4,0,0,0,1,0,1,0,0,0,0,...,1,1,0,0,0,0,0,1,0,1


In [16]:
# Use the model_selection module to separate training and testing datasets
from sklearn import model_selection

X = np.array(data_numeric.drop(columns = ['class']))
y = np.array(data_numeric['class'])

# define seed for reproducibility
seed = 1

X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=seed)

In [17]:
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(84, 228)
(22, 228)
(84,)
(22,)


# Training and Testing the Classification Algorithms

In [49]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report,accuracy_score

score='accuracy'

names = ["Nearest Neighbors", "Gaussian Process", "Decision Tree", "Random Forest", "Neural Net", "AdaBoost", "Naive Bayes",
        "SVM Linear", "SVM RBF", "SVM Sigmoid"]

classifiers= [
    KNeighborsClassifier(n_neighbors = 3),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth = 5),
    RandomForestClassifier(max_depth = 5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1),
    AdaBoostClassifier(),
    GaussianNB(),
    SVC(kernel='linear'),
    SVC(kernel='rbf'),
    SVC(kernel='sigmoid')
]

models = zip(names, classifiers)
a=1
names = []
results = []

for name, model in models:
    kfold = model_selection.KFold(n_splits = 10, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=score)
    names.append(name)
    results.append(cv_results)
    print('classifier : {}, mean={}, std={}'.format(name, cv_results.mean(), cv_results.std()))
    


classifier : Nearest Neighbors, mean=0.7916666666666667, std=0.1359409367026906
classifier : Gaussian Process, mean=0.8680555555555556, std=0.10472074858969059
classifier : Decision Tree, mean=0.7819444444444444, std=0.1095462724261722
classifier : Random Forest, mean=0.6194444444444445, std=0.10322724542256692




classifier : Neural Net, mean=0.8805555555555555, std=0.09706336227586748
classifier : AdaBoost, mean=0.9069444444444444, std=0.06833841444523636
classifier : Naive Bayes, mean=0.8486111111111111, std=0.10119845128859373
classifier : SVM Linear, mean=0.8805555555555555, std=0.07510281019219875
classifier : SVM RBF, mean=0.7763888888888889, std=0.10233576122104802
classifier : SVM Sigmoid, mean=0.575, std=0.1868765354617129


In [48]:
#print(type(models))
models = zip(names, classifiers)
for name,model in models:
    model.fit(X_train, Y_train)
    predictions = model.predict(X_test)
    print(name)
    print(accuracy_score(Y_test, predictions))
    print(classification_report(Y_test, predictions))
   


Nearest Neighbors
0.7727272727272727
             precision    recall  f1-score   support

          0       1.00      0.67      0.80        15
          1       0.58      1.00      0.74         7

avg / total       0.87      0.77      0.78        22

Gaussian Process
0.9545454545454546
             precision    recall  f1-score   support

          0       1.00      0.93      0.97        15
          1       0.88      1.00      0.93         7

avg / total       0.96      0.95      0.96        22

Decision Tree
0.7727272727272727
             precision    recall  f1-score   support

          0       0.92      0.73      0.81        15
          1       0.60      0.86      0.71         7

avg / total       0.82      0.77      0.78        22

Random Forest
0.7272727272727273
             precision    recall  f1-score   support

          0       0.91      0.67      0.77        15
          1       0.55      0.86      0.67         7

avg / total       0.79      0.73      0.74        22

N



# ----------------------------------------Thank You----------------------------------------------------- 