<a href="https://colab.research.google.com/github/sauravkb94/DNA-Sequencing-Classifier-/blob/main/DNA_Sequencing_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline  
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
# Read the tab-delimited file human.txt into a DataFrame.
df = pd.read_csv('/content/human.txt', sep='\t')

In [None]:
# Show the loaded DataFrame as the cell output.
df

In [None]:
# Number of rows per class label.
count = df['class'].value_counts()
plt.figure(figsize=(8, 4))
# Recent seaborn releases accept the data vectors only as keywords (x=, y=).
sns.barplot(x=count.index, y=count.values)
# Class labels are on the x-axis and their counts on the y-axis.
plt.xlabel('Category')
plt.ylabel('No of Occurances');

In [None]:
# Length of each sequence string, via the vectorised string accessor rather than a per-row apply.
df['length'] = df['sequence'].str.len()

In [None]:
# One 20-bin histogram of sequence length per class; the trailing ';' hides the axes repr.
df.hist(column= 'length', by = 'class', bins = 20, figsize=(12,12) );

**Let's define a function to collect all possible overlapping n-grams of a specified length from any sequence string. We will basically apply the n-grams to the complete sequences.**

In [None]:
def create_ngrams(sequence, size=6):
    """Split a sequence string into its overlapping, lower-cased n-grams.

    Args:
        sequence: The sequence string to break up.
        size: Length of each n-gram; defaults to 6 (hexamer words).

    Returns:
        A list of every length-``size`` window of the lower-cased sequence,
        in left-to-right order. Empty when the sequence is shorter than
        ``size``.

    Raises:
        ValueError: If ``size`` is less than 1.
    """
    if size < 1:
        raise ValueError("size must be a positive integer")
    # Lower-case once up front instead of once per window.
    lowered = sequence.lower()
    return [lowered[start:start + size] for start in range(len(lowered) - size + 1)]

In [None]:
# Store each sequence's list of n-grams (default size 6) in a new column.
df['n-grams'] = df['sequence'].apply(create_ngrams)

In [None]:
# Preview the first ten rows, including the new n-grams column.
df.head(10)

**Since we are going to use scikit-learn natural language processing tools to do the k-mer counting, we need to now convert the lists of k-mers for each gene into string sentences of words that the count vectorizer can use. We can also make a y variable to hold the class labels. Let's do that now.**

In [None]:
# Pull the per-row n-gram lists out of the DataFrame into a plain Python list.
human_text = df['n-grams'].tolist()

In [None]:
# Build one space-separated "sentence" of n-grams per sequence.
# Deriving it from df['n-grams'] instead of rewriting human_text in place keeps the
# cell safe to re-run: joining an already-joined string would split it into characters.
human_text = [' '.join(kmers) for kmers in df['n-grams']]

In [None]:
# Inspect the second entry of human_text.
human_text[1]

In [None]:
# Attach the human_text list to the DataFrame as a new column.
df['human_text'] = human_text

In [None]:
# Preview the first five rows with the new human_text column.
df.head(5)

**Now we will build a bag-of-words model with CountVectorizer, one of scikit-learn's NLP tools.**

In [None]:
# Creating the Bag of Words model using CountVectorizer()
# ngram_range=(4,4) makes every feature a run of four consecutive tokens, so each
# column of X counts four adjacent words from df['human_text'] rather than a single word.
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range=(4,4))
X = vectorizer.fit_transform(df['human_text'])

In [None]:
# (rows, columns) of the DataFrame.
df.shape

In [None]:
# Shape of the count matrix returned by the vectorizer.
X.shape

In [None]:
# Re-weight the raw counts in X with TF-IDF.
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_weighter = TfidfTransformer()
tfidf_data = tfidf_weighter.fit_transform(X)

In [None]:
# Shape of the TF-IDF weighted matrix.
tfidf_data.shape

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
from sklearn.model_selection import train_test_split
split_parts = train_test_split(tfidf_data, df['class'], test_size=0.20, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Shape of the training feature matrix.
X_train.shape

In [None]:
# Shape of the held-out test feature matrix.
X_test.shape

**A multinomial naive Bayes classifier will be created.**

In [None]:
### Multinomial Naive Bayes Classifier ###
# The alpha parameter was determined by grid search previously
from sklearn.naive_bayes import MultinomialNB


In [None]:
# Multinomial naive Bayes with alpha=0.1, fitted on the training split.
classifier = MultinomialNB(alpha=0.1)
classifier.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test features.
y_pred = classifier.predict(X_test)

**Model performance metrics like the confusion matrix, accuracy, precision, recall and f1 score**

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
print("Confusion matrix\n")
# y_test still carries the shuffled row labels from the split, while y_pred has none.
# Building both Series from bare arrays gives them the same 0..n-1 index, so crosstab
# pairs each prediction with its true label instead of aligning on mismatched indexes.
actual = pd.Series(np.asarray(y_test), name='Actual')
predicted = pd.Series(np.asarray(y_pred), name='Predicted')
print(pd.crosstab(actual, predicted))


In [None]:

def get_metrics(y_test, y_predicted):
    """Score predictions against the true labels.

    Returns a 4-tuple ``(accuracy, precision, recall, f1)``; precision,
    recall and f1 are computed with ``average='weighted'``.
    """
    weighted = {'average': 'weighted'}
    return (
        accuracy_score(y_test, y_predicted),
        precision_score(y_test, y_predicted, **weighted),
        recall_score(y_test, y_predicted, **weighted),
        f1_score(y_test, y_predicted, **weighted),
    )
# Score the naive Bayes predictions and print each metric to three decimals.
accuracy, precision, recall, f1 = get_metrics(y_test, y_pred)
report_template = "accuracy = %.3f \nprecision = %.3f \nrecall = %.3f \nf1 = %.3f"
print(report_template % (accuracy, precision, recall, f1))

In [None]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import classification_report,accuracy_score

In [None]:
# Random forest trained on the same features as the naive Bayes model.
# random_state pins the forest's internal randomness so reruns give the same score.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
rf_predicted = rf.predict(X_test)
rf_acc_score = accuracy_score(y_test, rf_predicted)

In [None]:
# Print the random forest's test accuracy followed by its classification report.
print("\n")
print("Accuracy of Random Forest:",rf_acc_score,'\n')
print(classification_report(y_test,rf_predicted))

In [None]:
# Decision tree trained on the same split; random_state makes the fitted tree repeatable.
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, y_train)
y_pred_dtc = dtc.predict(X_test)

In [None]:
# Test-set accuracy of the decision tree predictions.
dtc_acc_score = accuracy_score(y_test, y_pred_dtc)

In [None]:
print("\n")
print("Accuracy of Decision Tree :",dtc_acc_score,'\n')
# The report must use the decision tree's own predictions, not rf_predicted.
print(classification_report(y_test, y_pred_dtc))

In [None]:
# Side-by-side test accuracy of the three fitted models.
# `accuracy` is the multinomial naive Bayes score; it stays numeric so the
# Accuracy column has a single float dtype and can be sorted or compared.
model_ev = pd.DataFrame({
    'Model': ['Multinomial Naive Bayes Classifier', 'Decision Tree Classifier', 'Random Forest Classfier'],
    'Accuracy': [accuracy, dtc_acc_score, rf_acc_score],
})
model_ev