# 1. About Dataset
## 1.1. Context
This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.
## 1.2. Content
The dataset consists of several medical predictor variables and one target variable, Outcome. Predictor variables include the number of pregnancies the patient has had, their BMI, insulin level, age, and so on.
## 1.3. Variables
**Pregnancies:** Number of times pregnant

**Glucose:** Plasma glucose concentration at 2 hours in an oral glucose tolerance test

**BloodPressure:** Diastolic blood pressure (mm Hg)

**SkinThickness:** Triceps skin fold thickness (mm)

**Insulin:** 2-Hour serum insulin (mu U/ml)

**BMI:** Body mass index (weight in kg/(height in m)^2)

**DiabetesPedigreeFunction:** Diabetes pedigree function

**Age:** Age (years)

**Outcome:** Class variable (0 or 1); 268 of the 768 records are 1, the others are 0

# 2. Prerequisites
## 2.1. Importing libraries

In [None]:
import os
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings("ignore")
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB , MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import precision_score,accuracy_score,confusion_matrix,f1_score,recall_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from mlxtend.classifier import EnsembleVoteClassifier
from mlxtend.classifier import StackingCVClassifier
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression

## 2.2. Defining the functions

In [None]:
def dist(i):
    """Draw the histogram of the i-th column of the global ``df``.

    The plot is placed in slot ``i + 1`` of a 4x2 subplot grid on the
    current matplotlib figure. Bars are split by ``df.Outcome`` and a
    kernel density estimate curve is overlaid (``kde=True``).

    Args:
        i: Zero-based position of the column in ``df.columns``.
    """
    column_name = df.columns[i]
    plt.subplot(4, 2, i + 1)
    sns.histplot(df, x=column_name, hue=df.Outcome, bins=17, kde=True)

# kde: kernel density estimate

In [None]:
def dist_box(df,col):
    """Plot one column's distribution with a box plot underneath it.

    Every call creates its own new two-row figure: the upper axes show a
    density histogram with a KDE curve, the lower axes a horizontal box plot
    of the same values.

    Args:
        df: DataFrame that contains the column.
        col: Name of the column to plot.
    """
    fig, (ax_hist, ax_box) = plt.subplots(2, 1)
    # sns.distplot is deprecated; histplot with a density scale and a KDE
    # overlay is the closest supported equivalent (bin choice may differ).
    sns.histplot(x=df[col], kde=True, stat="density", ax=ax_hist)
    # Pass the values by keyword: newer seaborn releases interpret the first
    # positional argument as `data`, which changes how the series is drawn.
    sns.boxplot(x=df[col], ax=ax_box)

## 2.3. Importing the data

In [None]:
# Print the full path of every file found below the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

In [None]:
# Load the diabetes CSV from the Kaggle input directory into the working DataFrame `df`.
df = pd.read_csv("/kaggle/input/pima-indians-diabetes-database/diabetes.csv")

# 3. Data understanding
## 3.1 Details

In [None]:
# Report the (rows, columns) shape, then preview the first rows of df.
print (">>> Data frame shape: " , df.shape , "<<<\n")
df.head()

In [None]:
# Print the full per-column summary of df (verbose output requested explicitly).
df.info (verbose = True)

### ***!!!*** There are no explicit nulls, because missing values have been entered as zeros!

In [None]:
# Descriptive statistics, transposed so that each column of df becomes one row.
df.describe().T

In [None]:
# Number of rows for each distinct Outcome value.
df.Outcome.value_counts()

## 3.2. Preliminary Visualizations

In [None]:
# One large, high-resolution canvas shared by all histograms drawn by dist().
plt.figure(figsize=(20, 20), dpi=300)
# Every column except the last one gets a panel
# (the last column is presumably the Outcome target - confirm against the CSV).
for i in range(len(df.columns) - 1):
    dist(i)

In [None]:
# Grid of pairwise scatter plots for all columns of df; the returned axes
# array is bound to `p` so the notebook does not echo it.
from pandas.plotting import scatter_matrix
p=scatter_matrix(df,figsize=(15,15))

In [None]:
# 3x3 figure: the first panel only carries the title text, the remaining
# eight panels each show the boxen plot of one feature.
fig = plt.figure(figsize=(18,15))
gs = fig.add_gridspec(3,3)
gs.update(wspace=0.5, hspace=0.25)

# Axes are created row by row, so ax0..ax8 map to grid cells (0,0)..(2,2).
ax0, ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8 = (
    fig.add_subplot(gs[row, col]) for row in range(3) for col in range(3)
)

background_color = "#c9c9ee"
color_palette = ["#f56476","#ff8811","#ff0040","#ff7f6c","#f0f66e","#990000"]

# Same background colour for the figure and for every panel.
fig.patch.set_facecolor(background_color)
for axis in (ax0, ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8):
    axis.set_facecolor(background_color)

# Title panel: strip all spines, ticks and tick labels, then centre the text.
for side in ("bottom", "left", "top", "right"):
    ax0.spines[side].set_visible(False)
ax0.tick_params(left=False, bottom=False)
ax0.set_xticklabels([])
ax0.set_yticklabels([])
ax0.text(0.5,0.5,
         'Boxenplot plot \n features\n',
         horizontalalignment='center',
         verticalalignment='center',
         fontsize=18, fontweight='bold',
         fontfamily='serif',
         color="#000000")

# One entry per feature panel:
# (axes, df column, caption, caption x, caption y, box colour).
feature_panels = [
    (ax1, 'Pregnancies', 'Pregnancies', -0.18, 19, "#f56476"),
    (ax2, 'Glucose', 'Glucose', -0.1, 217, "#ff8811"),
    (ax3, 'BloodPressure', 'BloodPressure', -0.20, 132, "#ff0040"),
    (ax4, 'SkinThickness', 'SkinThickness', -.2, 110, "#ff7f6c"),
    (ax5, 'Insulin', 'Insulin', -0.10, 900, "#f0f66e"),
    (ax6, 'BMI', 'BMI', -0.08, 77, "#990000"),
    (ax7, 'DiabetesPedigreeFunction', 'DPF', -0.065, 2.8, "#3339FF"),
    (ax8, 'Age', 'Age', -0.08, 86, "#34495E"),
]

for axis, column, caption, caption_x, caption_y, box_colour in feature_panels:
    axis.text(caption_x, caption_y, caption, fontsize=14, fontweight='bold',
              fontfamily='serif', color="#000000")
    axis.grid(color='#000000', linestyle=':', axis='y', zorder=0,  dashes=(1,5))
    sns.boxenplot(ax=axis, y=df[column], palette=[box_colour], width=0.6)
    axis.set_xlabel("")
    axis.set_ylabel("")
    # Only the bottom spine stays visible on the feature panels.
    for side in ("top", "right", "left"):
        axis.spines[side].set_visible(False)

In [None]:
# Pairwise relationship plots for the columns of df, coloured by the Outcome column.
p=sns.pairplot(df, hue = 'Outcome')

In [None]:
# Annotated heatmap of the pairwise correlation matrix of df.
plt.figure(figsize=(15,15))
p=sns.heatmap(df.corr(), annot=True,cmap ='RdYlGn')

In [None]:
# Print the number of rows per Outcome value and show the same counts as a bar chart.
print(df.Outcome.value_counts())
p=df.Outcome.value_counts().plot(kind='bar',figsize=(10,10)) 

### ***!!!*** Our data is imbalanced

# 4. Data manipulation
## 4.1. Null Managing

In [None]:
# Count, per column, how many values would be missing if every 0 were treated
# as a null. df itself is not modified: replace() returns a copy.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
print(df.replace(0, np.nan).isnull().sum())

### ***!!!*** Pregnancies' zeros should not be considered nulls

In [None]:
# Columns in which a 0 is treated as a missing measurement.
zero_as_missing_cols = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df[zero_as_missing_cols] = df[zero_as_missing_cols].replace(0, np.nan)
print("\nCount of discovered nulls:\n" , df.isnull().sum())

In [None]:
# Impute the missing values column by column. The results are assigned back
# to df instead of calling fillna(inplace=True) on a column selection, which
# is not guaranteed to write through to df under pandas copy-on-write.
df['Glucose'] = df['Glucose'].fillna(df['Glucose'].mean())
df['BloodPressure'] = df['BloodPressure'].fillna(df['BloodPressure'].mean())
df['SkinThickness'] = df['SkinThickness'].fillna(df['SkinThickness'].median())

# Insulin: give every missing row its own draw from the observed values.
# np.random.choice without `size` returns a single scalar, which would put the
# same value into all missing rows; the generator is seeded so that reruns
# impute identical values.
insulin_missing = df['Insulin'].isna()
if insulin_missing.any():
    rng = np.random.default_rng(7)
    observed_insulin = df.loc[~insulin_missing, 'Insulin'].to_numpy()
    df.loc[insulin_missing, 'Insulin'] = rng.choice(
        observed_insulin, size=int(insulin_missing.sum())
    )

df['BMI'] = df['BMI'].fillna(df['BMI'].mean())

print("\nCount of nulls in the secondary date frame:\n" , df.isnull().sum())

## 4.2. Outlier Managing

In [None]:
# dist_box() opens its own figure on every call, so no figure is created here
# (a plt.figure() call at this point would only leave an empty canvas behind).
# Every column except the last one is plotted.
for column in df.columns[:-1]:
    dist_box(df, column)

In [None]:
# Three-sigma bounds for Insulin: u is the upper limit, l the lower limit.
insulin_values = df['Insulin']
u = insulin_values.mean() + (3 * insulin_values.std())
l = insulin_values.mean() - (3 * insulin_values.std())

# Rows whose Insulin value lies strictly outside [l, u].
insulin_outside = (insulin_values > u) | (insulin_values < l)
df_out_in = df[insulin_outside]

print("Number of Outliers:" , len(df_out_in))
df_out_in

In [None]:
# Overwrite Insulin values of 415 or more with the column's most frequent value.
# NOTE(review): 415 looks hand-picked from the three-sigma bound - confirm.
insulin_mode = df['Insulin'].mode()[0]
insulin_too_high = df['Insulin'] >= 415
df['Insulin'] = np.where(insulin_too_high, insulin_mode, df['Insulin'])

In [None]:
# Three-sigma bounds for BloodPressure: u is the upper limit, l the lower limit.
bp_values = df['BloodPressure']
u = bp_values.mean() + (3 * bp_values.std())
l = bp_values.mean() - (3 * bp_values.std())

# Rows whose BloodPressure value lies strictly outside [l, u].
bp_outside = (bp_values > u) | (bp_values < l)
df_out_bp = df[bp_outside]

print("Number of Outliers:" , len(df_out_bp))
df_out_bp

In [None]:
# Overwrite BloodPressure values >= 110 or <= 30 with the column's most frequent value.
# NOTE(review): both cut-offs look hand-picked from the three-sigma bounds - confirm.
bp_mode = df['BloodPressure'].mode()[0]
bp_extreme = (df['BloodPressure'] >= 110) | (df['BloodPressure'] <= 30)
df['BloodPressure'] = np.where(bp_extreme, bp_mode, df['BloodPressure'])

In [None]:
# Three-sigma bounds for SkinThickness: u is the upper limit, l the lower limit.
skin_values = df['SkinThickness']
u = skin_values.mean() + (3 * skin_values.std())
l = skin_values.mean() - (3 * skin_values.std())

# Rows whose SkinThickness value lies strictly outside [l, u].
skin_outside = (skin_values > u) | (skin_values < l)
df_out_st = df[skin_outside]

print("Number of Outliers:" , len(df_out_st))
df_out_st

In [None]:
# Overwrite SkinThickness values of 56 or more with the column's most frequent value.
# NOTE(review): 56 looks hand-picked from the three-sigma bound - confirm.
skin_mode = df['SkinThickness'].mode()[0]
skin_too_high = df['SkinThickness'] >= 56
df['SkinThickness'] = np.where(skin_too_high, skin_mode, df['SkinThickness'])

In [None]:
# Three-sigma bounds for Pregnancies: u is the upper limit, l the lower limit.
pregnancy_values = df['Pregnancies']
u = pregnancy_values.mean() + (3 * pregnancy_values.std())
l = pregnancy_values.mean() - (3 * pregnancy_values.std())

# Rows whose Pregnancies value lies strictly outside [l, u].
pregnancy_outside = (pregnancy_values > u) | (pregnancy_values < l)
df_out_pr = df[pregnancy_outside]

print("Number of Outliers:" , len(df_out_pr))
df_out_pr

In [None]:
# Overwrite Pregnancies values above 13 with the column's most frequent value.
# NOTE(review): 13 looks hand-picked from the three-sigma bound - confirm.
pregnancy_mode = df['Pregnancies'].mode()[0]
pregnancy_too_high = df['Pregnancies'] > 13
df['Pregnancies'] = np.where(pregnancy_too_high, pregnancy_mode, df['Pregnancies'])

## 4.3. Train / Test Splitting

In [None]:
# Feature matrix: every column except the target. Target vector: Outcome.
x = df.drop(columns='Outcome')
y = df['Outcome']

# Hold out 30 % of the rows as the test split; the seed keeps the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.3, random_state=7
)
for split in (xtrain, xtest):
    print(split.shape)

In [None]:
# Stratified split: both parts keep the class proportions of y
# (no test_size is given, so train_test_split's default applies).
xtrain, xtest, ytrain, ytest = train_test_split(x, y, random_state=7, stratify=y)

# Oversample the training part only; the test part is left untouched.
# The sampler is seeded so the synthetic rows are identical on every run.
smt = SMOTE(random_state=7)
xtrain, ytrain = smt.fit_resample(xtrain, ytrain)

# Class counts of the resampled training target.
np.bincount(ytrain)

## 4.4. Data Scaling

In [None]:
# Fit the scaler on the training features and standardise them; the result is
# wrapped back into a DataFrame that keeps the original index and column names.
sc = preprocessing.StandardScaler()
scaled_train = sc.fit_transform(xtrain, ytrain)
xtrain = pd.DataFrame(scaled_train, index=xtrain.index, columns=xtrain.columns)

xtrain.head()

In [None]:
# Apply the already-fitted scaler to the test features (transform only, no
# refit) and keep the original index and column names.
scaled_test = sc.transform(xtest)
xtest = pd.DataFrame(scaled_test, index=xtest.index, columns=xtest.columns)
xtest.head()

In [None]:
# Disabled alternative kept as a bare string literal: it would standardise all
# feature columns of df at once instead of fitting on the training split only.
# Nothing inside the string is executed.
"""sc_x = preprocessing.StandardScaler()
x =  pd.DataFrame(sc_x.fit_transform(df.drop(["Outcome"],axis = 1),),
        columns=['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'])

x.describe().T"""

# 5. ML Algorithms

In [None]:
# Sweep k = 1..29 and record the accuracy of a k-NN classifier on both splits.
# NOTE(review): if the xtest scores recorded here are later used to choose k,
# xtest no longer gives an unbiased performance estimate - confirm.
k_range = list(range(1,30))
train_scores = []
test_scores = []

for k in k_range:
    knn = KNeighborsClassifier(k).fit(xtrain, ytrain)
    train_scores.append(knn.score(xtrain, ytrain))
    test_scores.append(knn.score(xtest, ytest))

# Only the test-split curve is drawn here.
plt.plot(k_range, test_scores)
plt.xlabel('Value of k for KNN')
plt.ylabel('Accuracy Score')
plt.title('Accuracy Scores for Values of k of k-Nearest-Neighbors')

In [None]:
def _fit_and_score(model):
    """Fit ``model`` on the training split and evaluate it on the test split.

    Uses the notebook-level ``xtrain``/``ytrain`` for fitting and
    ``xtest``/``ytest`` for evaluation.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.

    Returns:
        Tuple ``(predictions, confusion matrix, accuracy)`` for the test
        split. Both metrics are called as ``(y_true, y_pred)``.
    """
    model.fit(xtrain, ytrain)
    predictions = model.predict(xtest)
    return (
        predictions,
        confusion_matrix(ytest, predictions),
        accuracy_score(ytest, predictions),
    )


# Every model below goes through the same fit / predict / score sequence; the
# variable names are unchanged so later cells can keep using them.

#LogisticRegression
lr_c = LogisticRegression(random_state=7)
lr_pred, lr_cm, lr_ac = _fit_and_score(lr_c)

#MLP
MLP = MLPClassifier(random_state=7)
MLP_pred, MLP_cm, MLP_ac = _fit_and_score(MLP)

#Bayes
gaussian = GaussianNB()
bayes_pred, bayes_cm, bayes_ac = _fit_and_score(gaussian)

#SVM
svc_r = SVC(random_state=7)
svr_pred, svr_cm, svr_ac = _fit_and_score(svc_r)

#RandomForest
rdf_c = RandomForestClassifier(random_state=7)
rdf_pred, rdf_cm, rdf_ac = _fit_and_score(rdf_c)

# DecisionTree Classifier
dtree_c = DecisionTreeClassifier(random_state=7)
dtree_pred, dtree_cm, dtree_ac = _fit_and_score(dtree_c)

#KNN
knn = KNeighborsClassifier(n_neighbors=28)
knn_pred, knn_cm, knn_ac = _fit_and_score(knn)

In [None]:
# Draw the confusion matrix of every baseline model into a 2x4 subplot grid
# (seven panels are used, the eighth slot stays empty).
plt.figure(figsize=(20,10))

# (panel title, confusion matrix, colour map) in plotting order.
cm_panels = [
    ("LogisticRegression_cm", lr_cm, "Blues"),
    ("MLP", MLP_cm, "Blues"),
    ("bayes_cm", bayes_cm, "Oranges"),
    ("RandomForest", rdf_cm, "Blues"),
    ("SVM", svr_cm, "Reds"),
    ("DecisionTree_cm", dtree_cm, "Blues"),
    ("kNN_cm", knn_cm, "Blues"),
]

for slot, (panel_title, matrix, colour_map) in enumerate(cm_panels, start=1):
    plt.subplot(2, 4, slot)
    plt.title(panel_title)
    sns.heatmap(matrix, annot=True, cmap=colour_map, fmt="d", cbar=False)

In [None]:
# Accuracy of each baseline model, keyed by display name (insertion order is kept).
# NOTE(review): the column is labelled 'Train & test accuracy', but the values
# are presumably test-split accuracies only - confirm against where they are computed.
accuracy_by_model = {
    'LogisticRegression': lr_ac,
    'MLP': MLP_ac,
    'Bayes': bayes_ac,
    'SVM': svr_ac,
    'RandomForest': rdf_ac,
    'DecisionTree_Classifier': dtree_ac,
    'KNN': knn_ac,
}
models = pd.DataFrame({
    'Model': list(accuracy_by_model.keys()),
    'Train & test accuracy': list(accuracy_by_model.values()),
})

# Best model first, with a fresh 0..n-1 index.
models.sort_values(by = 'Train & test accuracy', ascending = False).reset_index(drop=True)

## 5.1. KNN

In [None]:
# Highest training accuracy of the sweep and every list position that reaches it.
max_train_score = max(train_scores)
train_scores_ind = [
    pos for pos, value in enumerate(train_scores) if value == max_train_score
]
# Positions are zero-based; +1 turns them into k values
# (assumes the sweep started at k = 1).
best_train_k = [pos + 1 for pos in train_scores_ind]
print('Max train score {} % and k = {}'.format(max_train_score*100, best_train_k))

In [None]:
# Highest test accuracy of the sweep and every list position that reaches it.
max_test_score = max(test_scores)
test_scores_ind = [
    pos for pos, value in enumerate(test_scores) if value == max_test_score
]
# Positions are zero-based; +1 turns them into k values
# (assumes the sweep started at k = 1).
best_test_k = [pos + 1 for pos in test_scores_ind]
print('Max test score {} % and k = {}'.format(max_test_score*100, best_test_k))

In [None]:
# Train and test accuracy of the k sweep, drawn on one wide figure.
plt.figure(figsize=(20,5))
# x and y are passed by keyword: newer seaborn releases do not accept them
# as positional arguments.
p = sns.lineplot(x=k_range, y=train_scores, marker='*', label='Train Score')
p = sns.lineplot(x=k_range, y=test_scores, marker='o', label='Test Score')

In [None]:
# Refit the k-NN classifier with 28 neighbours and report its accuracy on the test split.
# NOTE(review): 28 is hard-coded; presumably taken from the k sweep - confirm.
knn = KNeighborsClassifier(28)

knn.fit(xtrain,ytrain)
knn.score(xtest,ytest)

In [None]:
from sklearn.metrics import confusion_matrix
# Predictions of the k-NN classifier held in `knn` for the test split.
ypred = knn.predict(xtest)
# Print the raw matrix: as a bare expression in the middle of the cell its
# value would be discarded without ever being shown.
print(confusion_matrix(ytest, ypred))
# Same counts as a labelled table including row and column totals.
pd.crosstab(ytest, ypred, rownames=['True'], colnames=['Predicted'], margins=True)

In [None]:
# Recompute the k-NN test predictions and draw their confusion matrix as an
# annotated heatmap with labelled axes.
ypred = knn.predict(xtest)
from sklearn import metrics
cnf_matrix = metrics.confusion_matrix(ytest, ypred)
# fmt='g' prints the cell counts in general number format.
p = sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu" ,fmt='g')
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')

In [None]:
# Text report of the classification metrics for the predictions in `ypred`.
from sklearn.metrics import classification_report
print(classification_report(ytest,ypred))

In [None]:
# ROC curve inputs for the k-NN model: take the second probability column
# (index 1) of predict_proba as the score and compute the curve points.
from sklearn.metrics import roc_curve
ypred_proba = knn.predict_proba(xtest)[:,1]
fpr, tpr, thresholds = roc_curve(ytest, ypred_proba)

In [None]:
# Dashed diagonal as the chance-level reference line.
plt.plot([0,1],[0,1],'k--')
plt.plot(fpr,tpr, label='Knn')
plt.xlabel('fpr')
plt.ylabel('tpr')
# Read the neighbour count from the fitted model so the title cannot drift
# away from the classifier that is actually plotted.
plt.title(f'Knn(n_neighbors={knn.n_neighbors}) ROC curve')
plt.show()

In [None]:
# Area under the ROC curve for the k-NN probability scores on the test split.
from sklearn.metrics import roc_auc_score
roc_auc_score(ytest,ypred_proba)

## 5.2. Tree

In [None]:
# Decision tree with explicit depth and leaf/split limits; seeded for repeatable fits.
DT = DecisionTreeClassifier(
    criterion='gini',
    max_depth=5,
    min_samples_leaf=2,
    min_samples_split=3,
    splitter='best',
    random_state=22,
)
DT.fit(xtrain, ytrain)

pred_dt = DT.predict(xtest)

# accuracy_score is documented as (y_true, y_pred); the arguments are passed in that order.
acc_DT = accuracy_score(ytest, pred_dt)
acc_DT

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the decision tree. 'entropy' is the valid criterion
# name; the misspelling 'antropy' is rejected by scikit-learn.
param = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 4, 5],
    'min_samples_split': [3, 4, 5, 6],
    'min_samples_leaf': [2, 3, 4],
}

# 5-fold cross-validated search scored by F1, run on the full x / y.
GS = GridSearchCV(DT, param, cv=5, scoring='f1')
GS.fit(x, y)

print('best score :' , GS.best_score_)
print('best param :', GS.best_params_)

## 5.3. Naive Bayes

In [None]:
# Gaussian naive Bayes: fit on the training split, predict the test split,
# then print the confusion matrix and the accuracy of those predictions.
GNB=GaussianNB()
GNB.fit(xtrain,ytrain)
pred=GNB.predict(xtest)

print(confusion_matrix(ytest,pred))

acc_GNB=accuracy_score(ytest,pred)
print(acc_GNB)

## 5.4. Neural Network

In [None]:
# Multi-layer perceptron with default settings; seeded for repeatable fits.
MLP = MLPClassifier(random_state=7)
MLP.fit(xtrain, ytrain)
MLP_pred = MLP.predict(xtest)
# MLP_cm is the name used by the other cells; MLPcm is kept as an alias for
# any code that already refers to it.
MLP_cm = MLPcm = confusion_matrix(ytest, MLP_pred)
MLP_ac = accuracy_score(ytest, MLP_pred)

MLP_ac

## 5.5. SVM

In [None]:
# Linear-kernel SVM with C=10; seeded for repeatable fits.
svm = SVC(C=10, kernel='linear', random_state=7)
svm.fit(xtrain, ytrain)
svm_pred = svm.predict(xtest)
svm_cm = confusion_matrix(ytest, svm_pred)
# accuracy_score is documented as (y_true, y_pred); the arguments are passed in that order.
acc_svm = accuracy_score(ytest, svm_pred)

acc_svm

## 5.6. Logistic Regression

In [None]:
# Logistic regression with an L2 penalty and C=10: fit on the training split,
# then compute the confusion matrix and accuracy on the test split.
lr_c=LogisticRegression(C= 10, penalty= 'l2',random_state=22)
lr_c.fit(xtrain,ytrain)
lr_pred=lr_c.predict(xtest)
lr_cm=confusion_matrix(ytest,lr_pred)
lr_ac=accuracy_score(ytest, lr_pred)

lr_ac

## 5.7. Random Forest Classifier

In [None]:
# Random forest evaluated with 10-fold cross-validated predictions over the
# full x / y instead of the train/test split.
model=RandomForestClassifier(n_estimators=100,max_features=0.9,criterion='gini',max_depth=3,random_state=7)
# No separate fit call: cross_val_predict fits the model on each fold itself.
#model.fit(x,y)
# Despite its name, `score` holds the per-row predictions returned by cross_val_predict.
score=cross_val_predict(model,x,y,cv=10)
print(classification_report(y,score))
acc_rf=accuracy_score(y, score)
print(acc_rf)