# Import important libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Read CSV file

In [None]:
# Load the diabetes CSV into the DataFrame used by every later cell.
csv_path = '../input/diabetes-data-set/diabetes.csv'
df = pd.read_csv(csv_path)

# Data Analysis

In [None]:
df.head(10)

In [None]:
df.tail(5)

In [None]:
# Summary statistics for the columns of df.
# describe is a method: without the call parentheses the cell only displays
# the bound-method repr instead of the statistics table.
df.describe()

In [None]:
df.isnull().values.any()

In [None]:
df.isnull().sum()

In [None]:
# The 15 most frequent ages and the number of rows that have each of them.
age_counts = df['Age'].value_counts()
top_age = age_counts.head(15)
top_age

# Data Visualization

In [None]:
# Bar chart of the most frequent ages (x: age, y: number of rows with that age).
# NOTE(review): top_age is counted over every row of df, not only rows with a
# positive Outcome -- confirm the title's wording matches the intent.
plt.figure(figsize=(12, 6))
plt.xticks(rotation=75)
# Title spelling corrected ("diabates diagonised" -> "diabetes diagnosed").
plt.title('Top ages in diabetes diagnosed people')
sns.barplot(x=top_age.index, y=top_age)

In [None]:
# Histogram of the BloodPressure column in bins of width 10.
# NOTE(review): the bin edges run from 10 to 90, so readings outside that
# range are left out of the plot -- confirm this is intended.
bp_bin_edges = np.arange(10, 100, 10)

plt.figure(figsize=(12, 6))
plt.title("BloodPressure Ratio")
plt.xlabel('BloodPressure')
plt.ylabel('Number of respondents')
plt.hist(df['BloodPressure'], bins=bp_bin_edges, color='purple')

In [None]:
outcome_counts = df.Outcome.value_counts()
outcome_counts

In [None]:
plt.figure(figsize=(12,6))
plt.title('Outcome pie chart')
plt.pie(outcome_counts, labels=outcome_counts.index, autopct='%1.1f%%', startangle=180)

# Split X and Y data for training

In [None]:
y=df.drop(df.iloc[:,0:-1],axis=1)

In [None]:
y

In [None]:
x=df.iloc[:,:-1]

# Split data for training and testing

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=7,
)

# Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score
from sklearn.metrics import roc_curve
from sklearn.metrics import average_precision_score, precision_recall_curve
from sklearn.metrics import auc, plot_precision_recall_curve
from sklearn import metrics

In [None]:
dtc=DecisionTreeClassifier()

In [None]:
dtc=dtc.fit(xtrain,ytrain)

In [None]:
pred1=dtc.predict(xtest)

In [None]:
pred1

In [None]:
print("Accuracy:",metrics.accuracy_score(ytest, pred1))
print("Precision:",metrics.precision_score(ytest, pred1))
print("Recall:",metrics.recall_score(ytest, pred1))

In [None]:
# Confusion matrix of the decision-tree predictions, drawn as an annotated heatmap.
cnf_matrix = metrics.confusion_matrix(ytest, pred1)
class_names = [0, 1]  # the class labels used for the tick marks
tick_marks = np.arange(len(class_names))

fig, ax = plt.subplots()
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)

# fmt='g' replaces seaborn's default '.2g' so the counts are not rounded.
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu", fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')

In [None]:
y_pred_proba = dtc.predict_proba(xtest)[::,1]
fpr, tpr, _ = metrics.roc_curve(ytest,  y_pred_proba)
auc = metrics.roc_auc_score(ytest, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.title('Receiver Operating Characteristic Curve(ROC AUC)', y=1.1)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc=4)
plt.show()

In [None]:
average_precision =average_precision_score(ytest, pred1)
disp = plot_precision_recall_curve(dtc, xtest, ytest)
disp.ax_.set_title('Binary class Precision-Recall curve: '
                   'AP={0:0.2f}'.format(average_precision))

# Logistic Regression Classifier

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
logreg=LogisticRegression()

In [None]:
logreg.fit(xtrain,ytrain)

In [None]:
pred2=logreg.predict(xtest)

In [None]:
pred2

In [None]:
print("Accuracy:",metrics.accuracy_score(ytest, pred2))
print("Precision:",metrics.precision_score(ytest, pred2))
print("Recall:",metrics.recall_score(ytest, pred2))

In [None]:
# Confusion matrix of the logistic-regression predictions as an annotated heatmap.
cnf_matrix = metrics.confusion_matrix(ytest, pred2)
class_names = [0, 1]  # the class labels used for the tick marks
tick_marks = np.arange(len(class_names))

fig, ax = plt.subplots()
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)

# fmt='g' replaces seaborn's default '.2g' so the counts are not rounded.
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu", fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')

In [None]:
y_pred_proba = logreg.predict_proba(xtest)[::,1]
fpr, tpr, _ = metrics.roc_curve(ytest,  y_pred_proba)
auc = metrics.roc_auc_score(ytest, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.title('Receiver Operating Characteristic Curve(ROC AUC)', y=1.1)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc=4)
plt.show()

In [None]:
average_precision = average_precision_score(ytest, pred2)
print(average_precision)
disp = plot_precision_recall_curve(logreg, xtest, ytest)
disp.ax_.set_title('Binary class Precision-Recall curve: '
                   'AP={0:0.2f}'.format(average_precision))

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
ran=RandomForestClassifier(n_estimators=100)

In [None]:
ran.fit(xtrain,ytrain)

In [None]:
pred3=ran.predict(xtest)

In [None]:
pred3

In [None]:
print("Accuracy:",metrics.accuracy_score(ytest, pred3))
print("Precision:",metrics.precision_score(ytest, pred3))
print("Recall:",metrics.recall_score(ytest, pred3))

In [None]:
# Confusion matrix of the random-forest predictions as an annotated heatmap.
cnf_matrix = metrics.confusion_matrix(ytest, pred3)
class_names = [0, 1]  # the class labels used for the tick marks
tick_marks = np.arange(len(class_names))

fig, ax = plt.subplots()
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)

# fmt='g' replaces seaborn's default '.2g' so the counts are not rounded.
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu", fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')

In [None]:
y_pred_proba = ran.predict_proba(xtest)[::,1]
fpr, tpr, _ = metrics.roc_curve(ytest,  y_pred_proba)
auc = metrics.roc_auc_score(ytest, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.title('Receiver Operating Characteristic Curve(ROC AUC)', y=1.1)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc=4)
plt.show()

In [None]:
average_precision = average_precision_score(ytest, pred2)
print(average_precision)
disp = plot_precision_recall_curve(ran, xtest, ytest)
disp.ax_.set_title('Binary class Precision-Recall curve: '
                   'AP={0:0.2f}'.format(average_precision))

# Support vector classifier

In [None]:
from sklearn.svm import SVC

In [None]:
sv=SVC(probability=True)

In [None]:
sv.fit(xtrain,ytrain)

In [None]:
pred4=sv.predict(xtest)

In [None]:
pred4

In [None]:
print("Accuracy:",metrics.accuracy_score(ytest, pred4))
print("Precision:",metrics.precision_score(ytest, pred4))
print("Recall:",metrics.recall_score(ytest, pred4))

In [None]:
# Confusion matrix of the SVC predictions as an annotated heatmap.
cnf_matrix = metrics.confusion_matrix(ytest, pred4)
class_names = [0, 1]  # the class labels used for the tick marks
tick_marks = np.arange(len(class_names))

fig, ax = plt.subplots()
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)

# fmt='g' replaces seaborn's default '.2g' so the counts are not rounded.
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu", fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')

In [None]:
y_pred_proba = sv.predict_proba(xtest)[::,1]
fpr, tpr, _ = metrics.roc_curve(ytest,  y_pred_proba)
auc = metrics.roc_auc_score(ytest, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.title('Receiver Operating Characteristic Curve(ROC AUC)', y=1.1)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc=4)
plt.show()

In [None]:
average_precision = average_precision_score(ytest, pred4)
print(average_precision)
disp = plot_precision_recall_curve(sv, xtest, ytest)
disp.ax_.set_title('Binary class Precision-Recall curve: '
                   'AP={0:0.2f}'.format(average_precision))

# KNN classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn=KNeighborsClassifier(n_neighbors=23)

In [None]:
knn.fit(xtrain,ytrain)

In [None]:
pred5=knn.predict(xtest)

In [None]:
print("Accuracy:",metrics.accuracy_score(ytest, pred5))
print("Precision:",metrics.precision_score(ytest, pred5))
print("Recall:",metrics.recall_score(ytest, pred5))

In [None]:
# Confusion matrix of the KNN predictions as an annotated heatmap.
cnf_matrix = metrics.confusion_matrix(ytest, pred5)
class_names = [0, 1]  # the class labels used for the tick marks
tick_marks = np.arange(len(class_names))

fig, ax = plt.subplots()
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)

# fmt='g' replaces seaborn's default '.2g' so the counts are not rounded.
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu", fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')

In [None]:
y_pred_proba = knn.predict_proba(xtest)[::,1]
fpr, tpr, _ = metrics.roc_curve(ytest,  y_pred_proba)
auc = metrics.roc_auc_score(ytest, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.title('Receiver Operating Characteristic Curve(ROC AUC)', y=1.1)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc=4)
plt.show()

In [None]:
average_precision = average_precision_score(ytest, pred5)
print(average_precision)
disp = plot_precision_recall_curve(knn, xtest, ytest)
disp.ax_.set_title('Binary class Precision-Recall curve: '
                   'AP={0:0.2f}'.format(average_precision))

# Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import GaussianNB
#from sklearn.naive_bayes import BernoulliNB
#from sklearn.naive_bayes import MultinomialNB

In [None]:
nbg=GaussianNB()

In [None]:
nbg.fit(xtrain,ytrain)

In [None]:
pred6=nbg.predict(xtest)

In [None]:
pred6

In [None]:
print("Accuracy:",metrics.accuracy_score(ytest, pred6))
print("Precision:",metrics.precision_score(ytest, pred6))
print("Recall:",metrics.recall_score(ytest, pred6))

In [None]:
# Confusion matrix of the naive-Bayes predictions as an annotated heatmap.
cnf_matrix = metrics.confusion_matrix(ytest, pred6)
class_names = [0, 1]  # the class labels used for the tick marks
tick_marks = np.arange(len(class_names))

fig, ax = plt.subplots()
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)

# fmt='g' replaces seaborn's default '.2g' so the counts are not rounded.
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu", fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')

In [None]:
y_pred_proba = nbg.predict_proba(xtest)[::,1]
fpr, tpr, _ = metrics.roc_curve(ytest,  y_pred_proba)
auc = metrics.roc_auc_score(ytest, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.title('Receiver Operating Characteristic Curve(ROC AUC)', y=1.1)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc=4)
plt.show()

In [None]:
average_precision = average_precision_score(ytest, pred6)
print(average_precision)
disp = plot_precision_recall_curve(nbg, xtest, ytest)
disp.ax_.set_title('Binary class Precision-Recall curve: '
                   'AP={0:0.2f}'.format(average_precision))

# AdaBoost Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
abc = AdaBoostClassifier(n_estimators=50,learning_rate=1)

model = abc.fit(xtrain, ytrain)

pred7 = model.predict(xtest)

In [None]:
pred7

In [None]:
print("Accuracy:",metrics.accuracy_score(ytest, pred7))
print("Precision:",metrics.precision_score(ytest, pred7))
print("Recall:",metrics.recall_score(ytest, pred7))

In [None]:
# Confusion matrix of the AdaBoost predictions as an annotated heatmap.
cnf_matrix = metrics.confusion_matrix(ytest, pred7)
class_names = [0, 1]  # the class labels used for the tick marks
tick_marks = np.arange(len(class_names))

fig, ax = plt.subplots()
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)

# fmt='g' replaces seaborn's default '.2g' so the counts are not rounded.
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu", fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')

In [None]:
y_pred_proba = nbg.predict_proba(xtest)[::,1]
fpr, tpr, _ = metrics.roc_curve(ytest,  y_pred_proba)
auc = metrics.roc_auc_score(ytest, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.title('Receiver Operating Characteristic Curve(ROC AUC)', y=1.1)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc=4)
plt.show()

In [None]:
average_precision = average_precision_score(ytest, pred7)
print(average_precision)
disp = plot_precision_recall_curve(nbg, xtest, ytest)
disp.ax_.set_title('Binary class Precision-Recall curve: '
                   'AP={0:0.2f}'.format(average_precision))

# AdaBoost and SVM classifier using an ensemble

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC

In [None]:
svc=SVC(probability=True, kernel='linear')
abc =AdaBoostClassifier(n_estimators=50, base_estimator=svc,learning_rate=1)

In [None]:
model = abc.fit(xtrain, ytrain)

In [None]:
pred8 = model.predict(xtest)

In [None]:
pred8

In [None]:
print("Accuracy:",metrics.accuracy_score(ytest, pred8))
print("Precision:",metrics.precision_score(ytest, pred8))
print("Recall:",metrics.recall_score(ytest, pred8))

In [None]:
# Confusion matrix of the AdaBoost+SVC predictions as an annotated heatmap.
cnf_matrix = metrics.confusion_matrix(ytest, pred8)
class_names = [0, 1]  # the class labels used for the tick marks
tick_marks = np.arange(len(class_names))

fig, ax = plt.subplots()
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)

# fmt='g' replaces seaborn's default '.2g' so the counts are not rounded.
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu", fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')

In [None]:
y_pred_proba = nbg.predict_proba(xtest)[::,1]
fpr, tpr, _ = metrics.roc_curve(ytest,  y_pred_proba)
auc = metrics.roc_auc_score(ytest, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.title('Receiver Operating Characteristic Curve(ROC AUC)', y=1.1)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc=4)
plt.show()

In [None]:
average_precision = average_precision_score(ytest, pred8)
print(average_precision)
disp = plot_precision_recall_curve(nbg, xtest, ytest)
disp.ax_.set_title('Binary class Precision-Recall curve: '
                   'AP={0:0.2f}'.format(average_precision))

In [None]:
from lightgbm import LGBMClassifier

In [None]:
model_lgb= LGBMClassifier()
model_lgb.fit(xtrain,ytrain)

In [None]:
pred9=model_lgb.predict(xtest)

In [None]:
print("Accuracy:",metrics.accuracy_score(ytest, pred9))
print("Precision:",metrics.precision_score(ytest, pred9))
print("Recall:",metrics.recall_score(ytest, pred9))