# DNA Classification
### Predict whether a short sequence of E. coli DNA is a promoter or a
### non-promoter, using the following algorithms:
    1. TensorFlow Decision Forests
    2. Nearest Neighbors
    3. Decision Tree
    4. Random Forest
    5. Neural Net
    6. SVM Linear
    7. SVM RBF
    8. SVM Sigmoid

Dataset : https://archive.ics.uci.edu/ml/machine-learning-databases/molecular-biology/promoter-gene-sequences/

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import sys
import sklearn

# Record the interpreter version alongside the results.
print(f'Python: {sys.version}')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

#import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Column labels applied to the CSV; the file's own first row is skipped.
names = ['Class', 'id', 'Sequence']
data = pd.read_csv(
    '../input/promoters/promoters.csv',
    skiprows=1,
    names=names,
)
# Last expression of the cell, so the notebook renders the summary table.
data.describe()

In [None]:
# Pull the label and raw sequence columns out of the loaded table.
classes = data['Class']
sequences = data['Sequence'].tolist()

# For every sequence, keep one entry per character (tab characters are
# dropped) and append the matching label as the final entry.  The label is
# looked up by position, which works because `data` has the default index.
dataset = {
    idx: [base for base in seq if base != '\t'] + [classes[idx]]
    for idx, seq in enumerate(sequences)
}

# Each dict key becomes a column here, i.e. one column per sequence.
dframe = pd.DataFrame(dataset)
print(dframe)

In [None]:
# The table was built with one column per sequence; flip it so that each row
# is a sequence.  Column 57 holds the label that was appended after the
# nucleotides, so give it a readable name.
dframe = dframe.T.rename(columns={57: 'Class'})
print(dframe.head())
# Last expression of the cell, so the notebook renders the summary table.
dframe.describe()

In [None]:
# Display the column labels of the per-nucleotide table.
dframe.columns

In [None]:
# One-hot encode every column: each position becomes one indicator column per
# observed value, and the label becomes the pair 'Class_+' / 'Class_-'.
# Those two indicators are complementary, so keep only 'Class_+' (set for
# rows labelled '+') and call it 'Class'.
# The discarded `model_data.iloc[:4]` expression was removed: its result was
# never displayed or used.
model_data = (
    pd.get_dummies(dframe)
    .drop(columns=['Class_-'])
    .rename(columns={'Class_+': 'Class'})
)

In [None]:
# Display the one-hot encoded table that the features and labels are taken from.
model_data

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

from sklearn import model_selection

In [None]:
# Separate the one-hot encoded features from the label column.
# `columns=` replaces the positional axis argument (`drop(['Class'], 1)`),
# which pandas deprecated and pandas 2.0 rejects with a TypeError.
X = np.array(model_data.drop(columns=['Class']))
Y = np.array(model_data['Class'])
# Hold out 25% of the samples for testing; the fixed seed makes the split
# repeatable.
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X, Y, test_size=0.25, random_state=1)

In [None]:
# Metric reported for every cross-validation run.
scoring = 'accuracy'

# Candidate classifiers and their display names, in matching order.
classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1),
    SVC(kernel='linear'),
    SVC(kernel='rbf'),
    SVC(kernel='sigmoid'),
]
names = [
    "Nearest Neighbors",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "SVM Linear",
    "SVM RBF",
    "SVM Sigmoid",
]

# 10 consecutive folds without shuffling: every fold serves once as the
# validation set while the other nine form the training set.  The splitter
# has no per-model state, so one instance is shared by all models.
kfold = model_selection.KFold(n_splits=10)

# Per-model arrays of fold accuracies, in the same order as `names`.
results = []
for name, model in zip(names, classifiers):
    fold_scores = model_selection.cross_val_score(
        model, X_train, Y_train, scoring=scoring, cv=kfold)
    results.append(fold_scores)
    # Mean accuracy across folds, with the standard deviation in brackets.
    print(f"{name}: {fold_scores.mean():f} ({fold_scores.std():f})")

In [None]:
# Fit every classifier on the whole training split, then report its accuracy
# and per-class metrics on the held-out test split.
for name, model in zip(names, classifiers):
    predictions = model.fit(X_train, Y_train).predict(X_test)
    print(name, accuracy_score(Y_test, predictions), sep='\n')
    print(classification_report(Y_test, predictions))

In [None]:
# Refit the sigmoid-kernel SVM on its own and report its test-set performance.
model = SVC(kernel='sigmoid')
model.fit(X_train, Y_train)
predictions = model.predict(X_test)
# Label the report explicitly.  This used to print the leftover loop variable
# `name`, which is only "SVM Sigmoid" when the classifier loop was the last
# code to bind it; any other cell that rebinds `name` mislabels the report.
print("SVM Sigmoid")
print(accuracy_score(Y_test, predictions))
print(classification_report(Y_test, predictions))

## Classification using TensorFlow Decision Forests

In [None]:
! pip install tensorflow_decision_forests

In [None]:
import tensorflow_decision_forests as tfdf
# Report which TensorFlow Decision Forests release was imported.
print(f"Found TensorFlow Decision Forests v{tfdf.__version__}")

In [None]:
# Split the dataset into a training and a testing dataset.

def split_dataset(dataset, test_ratio=0.30, seed=None):
  """Randomly split a pandas DataFrame into a training and a testing part.

  Every row is assigned to the testing part independently with probability
  ``test_ratio``, so the resulting sizes only approximate the ratio.

  Args:
    dataset: DataFrame to split; rows are selected with a boolean mask.
    test_ratio: Probability that a row lands in the testing part.
    seed: Optional seed for a reproducible split. When ``None`` (the
      default) the global NumPy random state is used, exactly as before.

  Returns:
    A ``(train, test)`` tuple of DataFrames that together contain every row
    of ``dataset`` exactly once.
  """
  if seed is None:
    draws = np.random.rand(len(dataset))
  else:
    # A private generator keeps the split repeatable without touching the
    # global random state.
    draws = np.random.RandomState(seed).rand(len(dataset))
  test_indices = draws < test_ratio
  return dataset[~test_indices], dataset[test_indices]


# Name of the label column.
label = "Class"

# Turn every column label into a string.  This replaces a hard-coded list of
# 58 names ('0'..'56' plus 'Class'): the result is the same for the current
# table, and the cell no longer fails when the sequence length differs.
# NOTE(review): presumably TF-DF needs string feature names -- confirm.
dframe.columns = [str(column) for column in dframe.columns]

# Encode the categorical label as an integer: its position in the list of
# distinct label values, in order of first appearance.
classes = dframe[label].unique().tolist()
print(f"Label classes: {classes}")

dframe[label] = dframe[label].map(classes.index)

# Random train/test split; the sizes vary from run to run.
train_ds_pd, test_ds_pd = split_dataset(dframe)
print("{} examples in training, {} examples for testing.".format(
    len(train_ds_pd), len(test_ds_pd)))

# Convert the dataset into a TensorFlow dataset.
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_ds_pd, label=label)
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_ds_pd, label=label)

# Train a Random Forest model, tracking accuracy as the evaluation metric.
model = tfdf.keras.RandomForestModel()
model.compile(metrics=["accuracy"])

model.fit(train_ds)

# Summary of the model structure.
model.summary()


In [None]:
# Score the trained model on the test dataset; with return_dict=True the
# metrics come back as a mapping from metric name to value.
evaluation = model.evaluate(test_ds, return_dict=True)
print()

# One line per metric, rounded to four decimal places.
for metric, score in evaluation.items():
  print(f"{metric}: {score:.4f}")

# Export the model to a SavedModel.
#model.save("project/model")