## Implementing machine learning models to classify whether a drug is more efficacious in the ABC-16 strain than in the parental strain, using the presence of MACCS fingerprints

In [None]:
# importing libraries
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt1
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

try:
    # plot_precision_recall_curve was removed in scikit-learn 1.2; the guard keeps
    # this cell importable on both older and current releases
    from sklearn.metrics import plot_precision_recall_curve
except ImportError:
    pass

In [None]:
# loading the excel file
# NOTE(review): the path previously read '..data/...' (no separator after '..');
# this assumes the data folder sits one level above the notebook - confirm.
dataset = pd.read_excel(r'../data/MACCSbinary.xlsx', sheet_name='MACCSbinary')
# dropna() removes every row that contains an NA value; per the original note
# these are the test-set rows whose label is missing
df_train = dataset.dropna()
# features: columns 3..151 by position (presumably the MACCS fingerprint bits)
X = df_train.iloc[:, 3:152]
# target: the first column, used as the class label by every model below
Y = df_train.iloc[:, 0]

In [None]:
# splitting the data: a quarter of the rows is held out for evaluation and the
# fixed seed makes the partition repeatable between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Standardizing the data: the scaler is fitted on the training rows only, and the
# same fitted transformation is then applied to both partitions
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

### Implementing KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 20-neighbour classifier using the Minkowski metric with p = 2; fit() returns the
# fitted estimator, so construction and training are chained
classifier_knn = KNeighborsClassifier(
    n_neighbors=20,
    metric='minkowski',
    p=2,
).fit(X_train, y_train)

# true labels of the held-out rows as a plain array
y_test_knn = y_test.values
# per-class probabilities for the test set
y_knn_probs = classifier_knn.predict_proba(X_test)
# hard class predictions for the test set
y_pred_knn = classifier_knn.predict(X_test)

In [None]:
# calculating the fpr, tpr and area under the curve values for the knn implementation
# predict_proba orders its columns like classifier_knn.classes_, so the "Sensitive"
# column is looked up rather than assumed to be column 1
sensitive_col_knn = list(classifier_knn.classes_).index("Sensitive")
fpr_knn, tpr_knn, thresholds_knn = roc_curve(
    y_test_knn, y_knn_probs[:, sensitive_col_knn], pos_label="Sensitive"
)
roc_auc_knn = auc(fpr_knn, tpr_knn)

### Implementing Logistic Regression

In [None]:
# logistic regression
from sklearn.linear_model import LogisticRegression

# L1-penalised logistic regression trained with the liblinear solver; the seed is
# fixed so repeated runs give the same model
classifier_log = LogisticRegression(
    penalty="l1",
    solver="liblinear",
    random_state=0,
).fit(X_train, y_train)

# hard class predictions for the held-out rows
y_pred_log = classifier_log.predict(X_test)

In [None]:
# true labels and per-class probabilities for the held-out rows
y_test_log = y_test.values
y_probs_log = classifier_log.predict_proba(X_test)
# predict_proba orders its columns like classifier_log.classes_, so the "Sensitive"
# column is looked up rather than assumed to be column 1
sensitive_col_log = list(classifier_log.classes_).index("Sensitive")
fpr_log, tpr_log, thresholds_log = roc_curve(
    y_test_log, y_probs_log[:, sensitive_col_log], pos_label="Sensitive"
)
roc_auc_log = auc(fpr_log, tpr_log)

### Implementing GBM

In [None]:
# GBM methods
from sklearn.datasets import make_hastie_10_2  # NOTE(review): not used in the visible cells
from sklearn.ensemble import GradientBoostingClassifier

# 50 boosting stages of depth-1 trees with a learning rate of 1.0, fixed seed
classifier_GBM = GradientBoostingClassifier(
    n_estimators=50, learning_rate=1.0, max_depth=1, random_state=0
).fit(X_train, y_train)
# No stand-alone score() call here: a bare expression in the middle of a cell is
# never displayed, and test accuracy is printed for every model in the accuracy cell.
y_pred_GBM = classifier_GBM.predict(X_test)

In [None]:
# true labels and per-class probabilities for the held-out rows
y_test_GBM = y_test.values
y_GBM_probs = classifier_GBM.predict_proba(X_test)
# predict_proba orders its columns like classifier_GBM.classes_, so the "Sensitive"
# column is looked up rather than assumed to be column 1
sensitive_col_GBM = list(classifier_GBM.classes_).index("Sensitive")
fpr_GBM, tpr_GBM, thresholds_GBM = roc_curve(
    y_test_GBM, y_GBM_probs[:, sensitive_col_GBM], pos_label="Sensitive"
)
roc_auc_GBM = auc(fpr_GBM, tpr_GBM)

### Implementing SVM

In [None]:
# SVM model
from sklearn import svm  # NOTE(review): only SVC is used in the visible cells
from sklearn.svm import SVC

# sigmoid-kernel SVM; probability=True is required so predict_proba is available
# for the ROC and precision-recall curves
classifier_svm = SVC(kernel="sigmoid", random_state=0, probability=True)
classifier_svm.fit(X_train, y_train)
# No stand-alone score() call here: a bare expression in the middle of a cell is
# never displayed, and test accuracy is printed for every model in the accuracy cell.
y_pred_svm = classifier_svm.predict(X_test)

In [None]:
# true labels and per-class probabilities for the held-out rows
y_test_SVM = y_test.values
y_SVM_probs = classifier_svm.predict_proba(X_test)
# predict_proba orders its columns like classifier_svm.classes_, so the "Sensitive"
# column is looked up rather than assumed to be column 1
sensitive_col_svm = list(classifier_svm.classes_).index("Sensitive")
fpr_svm, tpr_svm, thresholds_svm = roc_curve(
    y_test_SVM, y_SVM_probs[:, sensitive_col_svm], pos_label="Sensitive"
)
roc_auc_svm = auc(fpr_svm, tpr_svm)

### Implementing Naive Bayes

In [None]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings; fit() returns the fitted estimator
classifier_NB = GaussianNB().fit(X_train, y_train)
# hard class predictions for the held-out rows
y_pred_NB = classifier_NB.predict(X_test)

In [None]:
# true labels and per-class probabilities for the held-out rows
y_test_nb = y_test.values
y_nb_probs = classifier_NB.predict_proba(X_test)
# predict_proba orders its columns like classifier_NB.classes_, so the "Sensitive"
# column is looked up rather than assumed to be column 1
sensitive_col_nb = list(classifier_NB.classes_).index("Sensitive")
fpr_nb, tpr_nb, thresholds_nb = roc_curve(
    y_test_nb, y_nb_probs[:, sensitive_col_nb], pos_label="Sensitive"
)
roc_auc_nb = auc(fpr_nb, tpr_nb)

### Implementing Decision Trees

In [None]:
# Decision trees
from sklearn.tree import DecisionTreeClassifier

# entropy is the split criterion; the fixed seed makes the fitted tree repeatable
classifier_dt = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)
# hard class predictions for the held-out rows
y_pred_dt = classifier_dt.predict(X_test)

In [None]:
# true labels and per-class probabilities for the held-out rows
y_test_dt = y_test.values
y_dt_probs = classifier_dt.predict_proba(X_test)
# predict_proba orders its columns like classifier_dt.classes_, so the "Sensitive"
# column is looked up rather than assumed to be column 1
sensitive_col_dt = list(classifier_dt.classes_).index("Sensitive")
fpr_dt, tpr_dt, thresholds_dt = roc_curve(
    y_test_dt, y_dt_probs[:, sensitive_col_dt], pos_label="Sensitive"
)
roc_auc_dt = auc(fpr_dt, tpr_dt)

## ROC Curve of the models

In [None]:
# One (fpr, tpr, legend label, line colour) entry per fitted model.
roc_curves = [
    (fpr_knn, tpr_knn, 'AUC of KNN = %0.2f' % roc_auc_knn, 'yellow'),
    (fpr_log, tpr_log, 'AUC of Logistic = %0.2f' % roc_auc_log, 'red'),
    (fpr_GBM, tpr_GBM, 'AUC of GBM = %0.2f' % roc_auc_GBM, 'pink'),
    (fpr_svm, tpr_svm, 'AUC of SVM = %0.2f' % roc_auc_svm, 'blue'),
    (fpr_nb, tpr_nb, 'AUC of NB = %0.2f' % roc_auc_nb, 'orange'),
    (fpr_dt, tpr_dt, 'AUC of Decision tree = %0.2f' % roc_auc_dt, 'maroon'),
]

fig, ax = plt.subplots(figsize=(8, 6), dpi=80)
for curve_fpr, curve_tpr, curve_label, curve_color in roc_curves:
    # The colour is passed only through `color`: combining it with a 'b' format
    # string makes matplotlib warn that the colour is defined twice.
    ax.plot(curve_fpr, curve_tpr, label=curve_label, color=curve_color)

# dashed diagonal = performance of a random classifier (unlabelled, so not in the legend)
ax.plot([0, 1], [0, 1], 'r--')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
# a single title: the earlier 'Receiver Operating Characteristic' title was always
# overwritten by this one before the figure was shown
ax.set_title('ROC Curve')
ax.legend(loc='lower right')
plt.show()

Based on the AUC of the 6 models, KNN with 20 nearest neighbors and GBM with 50 estimators and a depth of 1 give the best classification for the data.

## Precision-Recall Curve

In [None]:

# Precision-recall curves of all six fitted models on one shared axes.
# PrecisionRecallDisplay.from_estimator replaces plot_precision_recall_curve, which
# was removed in scikit-learn 1.2 (from_estimator requires scikit-learn >= 1.0).
fig, ax = plt.subplots(figsize=(8, 6), dpi=80)

fitted_models = (
    classifier_knn,
    classifier_log,
    classifier_GBM,
    classifier_svm,
    classifier_NB,
    classifier_dt,
)
for fitted_model in fitted_models:
    # pos_label matches the positive class used for the ROC curves
    PrecisionRecallDisplay.from_estimator(
        fitted_model, X_test, y_test, pos_label="Sensitive", ax=ax
    )

ax.legend(loc='lower right')
ax.set_title('Precision-Recall curve')
plt.show()

Here again, the KNN and GBM models give the best performance based on the average precision (AP) of the Precision-Recall curve.

In [None]:
# calculating the accuracy of the model
y_pred = [y_pred_knn, y_pred_log, y_pred_GBM, y_pred_svm, y_pred_NB, y_pred_dt]
method = ["KNN","Logistic","GBM", "SVM", "Naive Bayes","Decision Trees"]
acc = []
zip_object = zip(y_pred,method)
#y_test = y_test.values

for i,j in zip_object:
    print("Accuracy of " + str(j) + " is " + str(accuracy_score(y_test, i)))
    

The KNN classifier gives the highest accuracy of 0.73, followed by GBM with an accuracy of 0.70.