# Classification

In this notebook I split the given data into training and test sets, trained the following classification models, and compared their accuracy (computed from the confusion matrix) in predicting the test set:
<ul>
<li>Logistic Regression 
<li>Support Vector Machine
<li>K-Neighbor Classifier
<li>Decision Tree Classifier
<li>Random Forest classifier
<li>Gradient Boost classifier
<li>Gaussian Naive Bayes
<li>Ada Boost Classifier
    </ul>

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
%matplotlib inline

# Read the diabetes CSV from its relative input path and preview the first rows.
csv_path = '../input/pima-indians-diabetes-database/diabetes.csv'
dataset = pd.read_csv(csv_path)
dataset.head()

In [None]:
# Plot the distribution of each of the first 8 columns (histogram + KDE curve)
# on a 2x4 grid to inspect how the data is distributed.
# sns.histplot(..., kde=True, stat="density") replaces the deprecated
# sns.distplot; kde_kws=dict(cut=3) keeps the density curve extending past the
# data range the way distplot drew it.
features = dataset.iloc[:, 0:8]
fig, axs = plt.subplots(nrows=2, ncols=4, figsize=(20, 10))
axs = axs.flatten()
for ax, column in zip(axs, features.columns):
    sns.histplot(features[column], kde=True, stat="density", kde_kws=dict(cut=3), ax=ax)
# Render the figure (the previous bare plt.plot() drew nothing and only echoed "[]").
plt.show()

**We can draw some major conclusions from the above plots and domain knowledge:
<br>Glucose, BloodPressure, SkinThickness, Insulin and BMI cannot be zero for any person.
<br>The zeros in these columns are missing values that were recorded as 0, as mentioned in section 3.7 of this paper:**<br>https://www.sciencedirect.com/science/article/pii/S2352914816300016?via%3Dihub#s0050

In [None]:
# Removing the missing values.
# A zero in these columns is a placeholder for a missing measurement, so turn
# the zeros into NaN and drop every row that has at least one of them missing.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
zero_means_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
dataset[zero_means_missing] = dataset[zero_means_missing].replace(0, np.nan)
dataset = dataset.dropna()

# Splitting dataset into features (X: the first 8 columns) and target (y: Outcome).
X = dataset.iloc[:, 0:8].values
y = dataset[['Outcome']].values  # 2-D column vector; the model cells flatten it with .ravel()

# Feature Scaling: standardise every feature column.
# NOTE(review): the scaler is fitted on all rows before any train/test split, so
# test-set statistics influence the scaled training features. Consider fitting it
# on the training rows only (e.g. inside a Pipeline) if unbiased scores matter.
scalerX = StandardScaler().fit(X)
X = scalerX.transform(X)

## Logistic Regression

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size=0.2, random_state=89)

# Fit logistic regression on the training rows and predict the held-out rows.
logreg = LogisticRegression()
logreg.fit(X_train1, y_train1.ravel())
y_pred1 = logreg.predict(X_test1)

# Accuracy = correct predictions (diagonal of the confusion matrix) / all test samples.
# Dividing by cm1.sum() instead of a hard-coded 79 keeps the value correct if the
# dataset, the cleaning step or test_size changes.
cm1 = confusion_matrix(y_test1, y_pred1)
acc1 = np.trace(cm1) / cm1.sum()
print("confusion matrix:\n", cm1)
print(classification_report(y_test1, y_pred1))
print("Accuracy:", acc1)

## SVM

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, test_size=0.2, random_state=89)

# Fit a support vector classifier (C=5) and predict the held-out rows.
svm = SVC(C=5)
svm.fit(X_train2, y_train2.ravel())
y_pred2 = svm.predict(X_test2)

# Accuracy = correct predictions (diagonal of the confusion matrix) / all test samples.
# Dividing by cm2.sum() instead of a hard-coded 79 keeps the value correct if the
# dataset, the cleaning step or test_size changes.
cm2 = confusion_matrix(y_test2, y_pred2)
acc2 = np.trace(cm2) / cm2.sum()
print("confusion matrix:\n", cm2)
print(classification_report(y_test2, y_pred2))
print("Accuracy:", acc2)

## KNN

In [None]:
# Finding the k value: sweep n_neighbors from 1 to 10 and record the accuracy on
# both the training rows and the held-out rows for every setting.
# Note that the k picked from this graph is selected using test-set scores.
X_train3, X_test3, y_train3, y_test3 = train_test_split(X, y, test_size=0.2, random_state=89)
neighbors_settings = range(1, 11)
training_accuracy, test_accuracy = [], []
for n_neighbors in neighbors_settings:
    knn = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train3, y_train3.ravel())
    training_accuracy.append(knn.score(X_train3, y_train3))
    test_accuracy.append(knn.score(X_test3, y_test3))

# One curve per score list, drawn against the tried k values.
for scores, label in ((training_accuracy, "training accuracy"), (test_accuracy, "test accuracy")):
    plt.plot(neighbors_settings, scores, label=label)
plt.xlabel("n_neighbors")
plt.ylabel("Accuracy")
plt.legend()

In [None]:
# Refit KNN with the k chosen from the accuracy graph of the previous cell,
# reusing that cell's train/test split (X_train3 / X_test3).
knn = KNeighborsClassifier(n_neighbors=9) #Kvalue 9 from above graph
knn.fit(X_train3, y_train3.ravel())
y_pred3 = knn.predict(X_test3)

# Accuracy = correct predictions (diagonal of the confusion matrix) / all test samples.
# Dividing by cm3.sum() instead of a hard-coded 79 keeps the value correct if the
# dataset, the cleaning step or test_size changes.
cm3 = confusion_matrix(y_test3, y_pred3)
acc3 = np.trace(cm3) / cm3.sum()
print("confusion matrix:\n", cm3)
print(classification_report(y_test3, y_pred3))
print("Accuracy:", acc3)

## Decision Tree

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train4, X_test4, y_train4, y_test4 = train_test_split(X, y, test_size=0.2, random_state=89)

# Fit a depth-1 decision tree (a single split). random_state pins the tree's
# internal feature shuffling so repeated runs give the same model.
tree = DecisionTreeClassifier(max_depth=1, random_state=15)
tree.fit(X_train4, y_train4.ravel())
y_pred4 = tree.predict(X_test4)

# Accuracy = correct predictions (diagonal of the confusion matrix) / all test samples.
# Dividing by cm4.sum() instead of a hard-coded 79 keeps the value correct if the
# dataset, the cleaning step or test_size changes.
cm4 = confusion_matrix(y_test4, y_pred4)
acc4 = np.trace(cm4) / cm4.sum()
print("confusion matrix:\n", cm4)
print(classification_report(y_test4, y_pred4))
print("Accuracy:", acc4)

## Random Forest

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train5, X_test5, y_train5, y_test5 = train_test_split(X, y, test_size=0.2, random_state=89)

# Fit a random forest of depth-3 trees (seeded) and predict the held-out rows.
rf = RandomForestClassifier(max_depth=3, random_state=15)
rf.fit(X_train5, y_train5.ravel())
y_pred5 = rf.predict(X_test5)

# Accuracy = correct predictions (diagonal of the confusion matrix) / all test samples.
# Dividing by cm5.sum() instead of a hard-coded 79 keeps the value correct if the
# dataset, the cleaning step or test_size changes.
cm5 = confusion_matrix(y_test5, y_pred5)
acc5 = np.trace(cm5) / cm5.sum()
print("confusion matrix:\n", cm5)
print(classification_report(y_test5, y_pred5))
print("Accuracy:", acc5)

## Gradient Boosting

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train6, X_test6, y_train6, y_test6 = train_test_split(X, y, test_size=0.2, random_state=89)

# Fit gradient boosting (small learning rate, depth-4 trees). random_state pins
# the estimator's internal randomness so repeated runs give the same model.
gb = GradientBoostingClassifier(learning_rate=0.01, max_depth=4, random_state=15)
gb.fit(X_train6, y_train6.ravel())
y_pred6 = gb.predict(X_test6)

# Accuracy = correct predictions (diagonal of the confusion matrix) / all test samples.
# Dividing by cm6.sum() instead of a hard-coded 79 keeps the value correct if the
# dataset, the cleaning step or test_size changes.
cm6 = confusion_matrix(y_test6, y_pred6)
acc6 = np.trace(cm6) / cm6.sum()
print("confusion matrix:\n", cm6)
print(classification_report(y_test6, y_pred6))
print("Accuracy:", acc6)

## Naive Bayes

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train7, X_test7, y_train7, y_test7 = train_test_split(X, y, test_size=0.2, random_state=89)

# Fit Gaussian naive Bayes on the training rows and predict the held-out rows.
gnb = GaussianNB()
gnb.fit(X_train7, y_train7.ravel())
y_pred7 = gnb.predict(X_test7)

# Accuracy = correct predictions (diagonal of the confusion matrix) / all test samples.
# Dividing by cm7.sum() instead of a hard-coded 79 keeps the value correct if the
# dataset, the cleaning step or test_size changes.
cm7 = confusion_matrix(y_test7, y_pred7)
acc7 = np.trace(cm7) / cm7.sum()
print("confusion matrix:\n", cm7)
print(classification_report(y_test7, y_pred7))
print("Accuracy:", acc7)

## AdaBoost

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train8, X_test8, y_train8, y_test8 = train_test_split(X, y, test_size=0.2, random_state=89)

# Fit AdaBoost with 30 estimators and a small learning rate. random_state pins
# the ensemble's internal randomness so repeated runs give the same model.
ada = AdaBoostClassifier(learning_rate=0.01, n_estimators=30, random_state=15)
ada.fit(X_train8, y_train8.ravel())
y_pred8 = ada.predict(X_test8)

# Accuracy = correct predictions (diagonal of the confusion matrix) / all test samples.
# Dividing by cm8.sum() instead of a hard-coded 79 keeps the value correct if the
# dataset, the cleaning step or test_size changes.
cm8 = confusion_matrix(y_test8, y_pred8)
acc8 = np.trace(cm8) / cm8.sum()
print("confusion matrix:\n", cm8)
print(classification_report(y_test8, y_pred8))
print("Accuracy:", acc8)

In [None]:
# Compare the classifiers: one coloured marker per model, accuracy on the y-axis.
x = np.array([
    'Logistic Regression',
    'Support Vector Machine',
    'K-Neighbor Classifier',
    'Decision Tree Classifier',
    'Random Forest classifier',
    'Gradient Boost classifier',
    'Gaussian Naive Bayes',
    'Ada Boost Classifier',
])
# Accuracies in the same order as the model names above.
acc = np.array([acc1, acc2, acc3, acc4, acc5, acc6, acc7, acc8])
marker_colors = ['cyan', 'green', 'red', 'yellow', 'black', 'magenta', 'blue', 'orange']
plt.scatter(x, acc, s=200, color=marker_colors)
# Model names are long, so rotate the tick labels to keep them readable.
plt.xticks(rotation=90)
plt.ylabel('Accuracy')