<a href="https://colab.research.google.com/github/sauravkb94/DNA-Sequencing-Classifier-/blob/main/DNA_Sequencing_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install biopython

In [None]:
import Bio

In [None]:
from Bio import SeqIO
# Print the id, the sequence and the length of every record in the FASTA file,
# one value per line.
for record in SeqIO.parse('/content/example_dna.fa', "fasta"):
    print(record.id, record.seq, len(record), sep="\n")

**One-hot encoding DNA Sequence**


Another approach is to use one-hot encoding to represent the DNA sequence. This is widely used in deep learning methods and lends itself well to algorithms like convolutional neural networks. In this example, “ATGC” would become [0,0,0,1], [0,0,1,0], [0,1,0,0], [1,0,0,0]. And these one-hot encoded vectors can either be concatenated or turned into 2-dimensional arrays.

In [None]:
import numpy as np
import re
def string_to_array(seq_string):
    """Turn a DNA string into a NumPy array of single lowercase characters.

    The input is lowercased and every character other than a, c, g or t is
    replaced with 'n' before the string is split into its characters.
    """
    cleaned = re.sub('[^acgt]', 'n', seq_string.lower())
    return np.array(list(cleaned))
# create a label encoder with 'acgtn' alphabet
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
# string_to_array() replaces every non-ACGT character with 'n', so 'n' has to be
# a fitted label; fitting on a different placeholder makes transform() raise
# ValueError for any sequence that contains an ambiguous base.
# LabelEncoder sorts its classes, so the codes are a=0, c=1, g=2, n=3, t=4.
label_encoder.fit(np.array(['a','c','g','t','n']))

In [None]:
from sklearn.preprocessing import OneHotEncoder
def one_hot_encoder(seq_string):
    """One-hot encode an array of single-character bases.

    Args:
        seq_string: array of labels known to the module-level
            ``label_encoder`` (for example the output of ``string_to_array``).

    Returns:
        An integer array with one row per input character and one column for
        each of a, c, g and t, in the order they appear in
        ``label_encoder.classes_``. Any other fitted label (the placeholder
        for ambiguous bases) is encoded as an all-zero row.

    Raises:
        ValueError: propagated from ``label_encoder.transform`` when the input
            contains a label the encoder was not fitted on.
    """
    classes = label_encoder.classes_
    # transform() returns a float array for empty input, so force an integer
    # dtype before using the codes as indices.
    int_encoded = np.asarray(label_encoder.transform(seq_string), dtype=int)
    # Index rows of an identity matrix: this yields one column per *fitted*
    # class, so the layout no longer depends on which bases happen to occur in
    # this particular sequence (fitting a fresh encoder per call did).
    onehot_all = np.eye(len(classes), dtype=int)[int_encoded]
    # Keep only the real bases; selecting by label instead of "last column"
    # stays correct whatever position the placeholder class sorts to.
    base_columns = [idx for idx, symbol in enumerate(classes)
                    if symbol in ('a', 'c', 'g', 't')]
    return onehot_all[:, base_columns]

In [None]:
#So let’s try it out with a simple short sequence:
seq_test = 'GAATTCTCGAA'
one_hot_encoder(string_to_array(seq_test))

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline  
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
human_data = pd.read_table('/content/human.txt')

In [None]:
human_data

In [None]:
import matplotlib.pyplot as plt 
%matplotlib inline
import seaborn as sns
import plotly.express as ex
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.offline as pyo

In [None]:
import cv2
from sklearn.model_selection import train_test_split

In [None]:
# cv2.imread returns None (it does not raise) when the file cannot be read, and
# it decodes colour images in BGR channel order while plt.imshow expects RGB.
geneFamily = cv2.imread("/content/genefamily.PNG")
if geneFamily is None:
    raise FileNotFoundError("Could not read image: /content/genefamily.PNG")
# Convert to RGB so the colours are not swapped when displayed.
geneFamily = cv2.cvtColor(geneFamily, cv2.COLOR_BGR2RGB)
plt.imshow(geneFamily)

In [None]:
# Bar chart of how many rows carry each value of the 'class' column.
count = human_data['class'].value_counts()
plt.figure(figsize= (8,4))
sns.barplot(x=count.index, y=count.values, alpha=0.8)
plt.title("Human Class ",fontsize=10)
# x carries the class labels and y the counts, so label the axes accordingly.
plt.xlabel('Category')
plt.ylabel('No of Occurrences')
plt.show()

In [None]:
# Donut chart of the share of rows per 'class' value.
ex.pie(human_data,names='class',title='Proportion Of Human Classes',hole=0.5)

In [None]:
human_data['length'] = human_data['sequence'].apply(len)

In [None]:
human_data.hist(column= 'length', by = 'class', bins = 20, figsize=(12,12) );

**Let's define a function to collect all possible overlapping n-grams of a specified length from any sequence string. We will basically apply the n-grams to the complete sequences.**

In [None]:
allData=[human_data]

In [None]:
def getKmers(gene, size=4):
    """Return all overlapping k-mers of ``gene`` joined by single spaces.

    Args:
        gene: sequence string to slide the window over.
        size: window (k-mer) length; defaults to 4.

    Returns:
        A string such as ``"ATGC TGCA GCAT"``. The result is empty when
        ``gene`` is shorter than ``size``.
    """
    # str.join assembles the result in one pass instead of growing a string
    # inside the loop and trimming the trailing separator afterwards.
    return " ".join(gene[i:i + size] for i in range(len(gene) - size + 1))

In [None]:
# Split a sequence string into its overlapping lowercase n-gram "words";
# the default size of 6 yields hexamers.
def create_ngrams(sequence, size=6):
    """Return the list of all overlapping ``size``-long windows, lowercased."""
    ngrams = []
    last_start = len(sequence) - size
    for start in range(last_start + 1):
        window = sequence[start:start + size]
        ngrams.append(window.lower())
    return ngrams

In [None]:
human_data['n-grams'] = human_data['sequence'].apply(create_ngrams)


In [None]:
human_data.head(10)

**Since we are going to use scikit-learn natural language processing tools to do the k-mer counting, we need to now convert the lists of k-mers for each gene into string sentences of words that the count vectorizer can use. We can also make a y variable to hold the class labels. Let's do that now.**

In [None]:
human_text = list(human_data['n-grams'])

In [None]:
# Collapse each list of n-grams into one space-separated string, updating the
# existing list object in place.
human_text[:] = [' '.join(ngrams) for ngrams in human_text]

In [None]:
human_text[1]

In [None]:
human_data['human_text'] = human_text

In [None]:
human_data.head(5)

In [None]:
# Replace the raw 'sequence' column of every frame with its space-separated
# k-mer "sentence" (stored in 'human_text').
for frame in allData:
    # The drop below is in place, so a second run of this cell would otherwise
    # fail with KeyError('sequence'); skip frames that were already converted.
    if 'sequence' in frame.columns:
        frame["human_text"]=frame['sequence'].apply(getKmers)
        frame.drop(columns='sequence',inplace=True)

In [None]:
allData[0]

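**Now we will build a bag-of-words style representation of the k-mer sentences using scikit-learn's NLP tooling (here a `TfidfVectorizer`, which weights the k-mer counts by TF-IDF).**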

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Turn every dataset in allData into a [feature matrix, labels] pair.
for i in range(len(allData)):
    # ngram_range=(5,5) with the default word analyzer: each feature is a run
    # of five consecutive whitespace-separated tokens of 'human_text'.
    cv = TfidfVectorizer(ngram_range=(5,5))
    X=allData[i]['human_text']
    Y=allData[i]['class']
    # Learn the vocabulary and rebind X to the resulting TF-IDF matrix.
    X = cv.fit_transform(X)
    # NOTE: this overwrites the DataFrame stored in allData[i], so the cell
    # cannot be re-run without rebuilding allData first.
    allData[i]=[X,Y]

In [None]:
from tensorflow import keras
from tensorflow.keras.datasets import mnist
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense,LSTM
from sklearn import metrics 
from sklearn import metrics

In [None]:
import tensorflow as tf
# Training accuracy above which fitting is stopped early.
ACCURACY_THRESHOLD = 0.95
class endRun(tf.keras.callbacks.Callback):
    """Keras callback that stops training once accuracy exceeds the threshold."""

    def on_epoch_end(self, epoch, logs=None):
        """Request a stop when this epoch's 'accuracy' is above ACCURACY_THRESHOLD.

        Args:
            epoch: index of the epoch that just finished (unused).
            logs: metrics dict for the epoch; may be None or lack 'accuracy'.
        """
        # logs=None avoids a shared mutable default; a missing metric is
        # ignored instead of raising TypeError on `None > float`.
        accuracy = (logs or {}).get('accuracy')
        if accuracy is not None and accuracy > ACCURACY_THRESHOLD:
            self.model.stop_training = True
callbacks = endRun()

In [None]:
# Train one dense network per dataset in allData and count correct / incorrect
# test predictions. One counter slot per dataset, instead of a fixed three.
match=[0]*len(allData)
mismatch=[0]*len(allData)
for i in range(len(allData)):
    X=allData[i][0]
    y=allData[i][1]
    _,input_sp=X.shape
    # Dense layers need a dense array; X is the matrix produced by the vectorizer.
    x=X.toarray()
    # One-hot targets with an explicit integer dtype, so they are numeric
    # regardless of the pandas version's get_dummies default.
    y=pd.get_dummies(y, dtype=int).values
    # Size the output layer from the data instead of hard-coding it.
    num_classes=y.shape[1]
    X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=0, train_size = .75)
    model = Sequential()
    model.add(Dense(64,input_shape = (input_sp,), activation = 'relu'))
    model.add(Dense(32, activation = 'relu'))
    model.add(Dense(16, activation = 'relu'))
    model.add(Dense(32, activation = 'relu'))
    model.add(Dense(64, activation = 'relu'))
    model.add(Dense(16, activation = 'relu'))
    model.add(Dense(num_classes,activation='softmax'))
    # summary() prints by itself and returns None; wrapping it in print() only
    # adds a stray "None" line.
    model.summary()

    model.compile(loss='categorical_crossentropy',metrics=['accuracy',])
    his = model.fit(X_train, y_train, epochs=5000, batch_size=128,verbose=1,callbacks=[callbacks])
    yPred=model.predict(X_test)
    # Vectorised comparison of predicted vs. true class index per test row
    # (argmax returns the first maximum, like list.index(max(...)) did).
    pred=np.argmax(yPred, axis=1)
    actual=np.argmax(y_test, axis=1)
    hits=int(np.sum(pred==actual))
    match[i]+=hits
    mismatch[i]+=len(pred)-hits

In [None]:
dataof=["human"]
# Print the test accuracy, in percent, next to each dataset name.
for i, name in enumerate(dataof):
    accuracy=match[i]/(match[i]+mismatch[i])
    print(name,accuracy*100)

In [None]:
h = his.history
h.keys()

In [None]:
# Training loss and accuracy per epoch; labelled so the two curves can be told
# apart without reading the code.
plt.plot(h['loss'], c= "red", label="loss")
plt.plot(h['accuracy'], c = "blue", label="accuracy")
plt.xlabel("epoch")
plt.title("loss vs accuracy")
plt.legend()
plt.show()
     

In [None]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import classification_report,accuracy_score

In [None]:
# Fit a random forest on the most recent train/test split and score it.
# (A string previously assigned to `rf` was overwritten on the next line and
# never used, so it is dropped.) Seeded like train_test_split so the reported
# accuracy is reproducible across runs.
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)
rf_predicted = rf.predict(X_test)
rf_acc_score = accuracy_score(y_test, rf_predicted) 

In [None]:
print("\n")
print("Accuracy of Random Forest:",rf_acc_score,'\n')
print(classification_report(y_test,rf_predicted))

In [None]:
dtc= DecisionTreeClassifier()
dtc.fit(X_train,y_train)
y_pred_dtc = dtc.predict(X_test)

In [None]:
dtc_acc_score = accuracy_score(y_test, y_pred_dtc)

In [None]:
print("\n")
print("Accuracy of Decision Tree :",dtc_acc_score,'\n')
# Report on the decision tree's own predictions, not the random forest's.
print(classification_report(y_test,y_pred_dtc))

In [None]:
knn= KNeighborsClassifier()
knn.fit(X_train,y_train)
y_pred_knn = knn.predict(X_test)

In [None]:
knn_acc_score = accuracy_score(y_test, y_pred_knn)

In [None]:
print("\n")
print("Accuracy of Kneighbors :",knn_acc_score,'\n')
# Report on the k-nearest-neighbours predictions, not the random forest's.
print(classification_report(y_test,y_pred_knn))

In [None]:
model_ev = pd.DataFrame({'Model': ['Random Forest Classfier','Decision Tree Classifier','K Neighbors Classifier'],'Accuracy': [rf_acc_score*100,dtc_acc_score*100,knn_acc_score*100]})
model_ev