In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, filename) for filename in filenames):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

![](https://st1.thehealthsite.com/wp-content/uploads/2020/11/Diabetes-2-768x427.jpg)

<p style="background-color:Tomato;">Diabetes is a disease that occurs when your blood glucose, also called blood sugar, is too high. Blood glucose is your main source of energy and comes from the food you eat. Insulin, a hormone made by the pancreas, helps glucose from food get into your cells to be used for energy. Sometimes your body doesn’t make enough—or any—insulin or doesn’t use insulin well. Glucose then stays in your blood and doesn’t reach your cells.</p>

features: 

Pregnancies:  Number of times pregnant

Glucose: Plasma glucose concentration at 2 hours in an oral glucose tolerance test

BloodPressure: Diastolic blood pressure (mm Hg)

SkinThickness: Triceps skin fold thickness (mm)

Insulin: 2-Hour serum insulin (mu U/ml)

BMI: Body mass index (weight in kg/(height in m)^2)

DiabetesPedigreeFunction: Diabetes pedigree function

Age: Age (years)

<p style="background-color:Tomato;">importing essential libraries</p>


In [None]:
#importing essential libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import warnings
import plotly.express as px
import plotly.io as pio
warnings.filterwarnings("ignore")

In [None]:
import plotly.graph_objects as go

<p style="background-color:Tomato;">importing dataset</p>


In [None]:
# Read the Pima Indians diabetes CSV from the Kaggle input directory into `df`.
df = pd.read_csv(
    filepath_or_buffer="../input/pima-indians-diabetes-database/diabetes.csv",
)

In [None]:
# Show the loaded DataFrame as the cell output.
df

In [None]:
# Show the dtype of every column.
df.dtypes

In [None]:
# Count null entries per column (zeros used as placeholders are NOT counted here).
df.isnull().sum()

<p style="background-color:Tomato;">No missing values present in the dataset</p>


In [None]:
# Summary statistics per column; the `min` row is what exposes the suspicious zero values.
df.describe()

<p style="background-color:Tomato;">However, some features (e.g. Insulin, BMI, Glucose) have a minimum value of 0, which is not physiologically plausible; these zeros can be treated as missing values.</p>


In [None]:
# For each measurement, report how many rows hold a literal 0 (treated as "missing").
# Each pair is (label used in the printed message, DataFrame column name).
for label, column in [
    ("glucose", "Glucose"),
    ("insulin", "Insulin"),
    ("bloodpressure", "BloodPressure"),
    ("SkinThickness", "SkinThickness"),
    ("BMI", "BMI"),
]:
    zero_rows = df[df[column] == 0]
    print(f"the missing values for {label} are : ", len(zero_rows))


<p style="background-color:Tomato;">we will be imputing these missing values with the mean with help of sklearn imputer</p>




In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Imputer that treats the value 0 as the missing-value marker and fills it with the column mean.
impute = SimpleImputer(
    missing_values=0,
    strategy="mean",
)

In [None]:
# Columns in which a literal 0 stands for "not measured".
# NOTE(review): assumed to be exactly the five columns the original positional slice
# df.iloc[:, 1:6] selected — confirm against the CSV's column order.
zero_as_missing_cols = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]

# Assign by column label rather than df.iloc[:, 1:6] = ...: label assignment replaces each
# column with the float result, whereas the positional form writes float means *in place*
# into columns that are presumably integer-typed, which newer pandas deprecates / rejects.
# NOTE(review): the imputer is fit on the full dataset before the train/test split, so
# test-row statistics influence the training features; consider fitting on the training
# split only.
df[zero_as_missing_cols] = impute.fit_transform(df[zero_as_missing_cols])

In [None]:
# Re-check the summary statistics after imputation (compare the `min` row with the earlier output).
df.describe()

<p style="background-color:Tomato;">All the zero values have been imputed</p>



# **<h2 style="background-color:DodgerBlue;">EDA</h2>**

<p style="background-color:Tomato;">CORRELATION HEATMAP SEABORN</p>


In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
fig = plt.figure(figsize=(15, 6))
sns.set_style("darkgrid")  # set after figure creation, before the axes exist (same order as before)
heatmap_ax = fig.add_subplot(111)
correlations = df.corr()
sns.heatmap(correlations, annot=True, ax=heatmap_ax)
plt.show()

<p style="background-color:Tomato;">lets see the distribution of age across dataset</p>


In [None]:
# Histogram of Age, coloured by Outcome.
ax = px.histogram(
    df,
    x="Age",
    color="Outcome",
    title='Age distribution',
    template="plotly_dark",
)
ax.show()

<p style="background-color:Tomato;">lets see the distribution of Outcome(diabetes or not)</p>

In [None]:
# Donut chart (pie with a 0.5 hole) of the share of each Outcome value.
ax = px.pie(
    df,
    names="Outcome",
    hole=0.5,
    title="chances of Diabetes",
    template="plotly_dark",
)
ax.show()

The dataset is somewhat imbalanced with respect to the outcome: roughly 65% of patients are in the non-diabetic class (Outcome = 0) and only about 34% are in the diabetic class (Outcome = 1).

<p style="background-color:Tomato;">Glucose and age correlation/ distribution across target variable</p>

In [None]:
# Glucose vs Age scatter, coloured by Outcome, marker size scaled by Age,
# with marginal histograms on both axes.
ax = px.scatter(
    df,
    x="Glucose",
    y="Age",
    color="Outcome",
    size="Age",
    size_max=20,
    marginal_x='histogram',
    marginal_y='histogram',
    template="plotly_dark",
    title="age and glucose correlation",
)
ax.show()

<p style="background-color:Tomato;">blood pressure and age correlation/ distribution across target variable</p>

In [None]:
# BloodPressure vs Age scatter, coloured by Outcome, marker size scaled by Age,
# with marginal histograms on both axes.
ax = px.scatter(
    df,
    x="BloodPressure",
    y="Age",
    color="Outcome",
    size="Age",
    size_max=20,
    marginal_x='histogram',
    marginal_y='histogram',
    template="plotly_dark",
    title="age and bloodpressure correlation",
)
ax.show()

<p style="background-color:Tomato;">Insulin and age correlation/ distribution across target variable</p>

In [None]:
# Insulin vs Age scatter, coloured by Outcome, marker size scaled by Age,
# with marginal histograms on both axes.
ax = px.scatter(
    df,
    x="Insulin",
    y="Age",
    color="Outcome",
    size="Age",
    size_max=20,
    marginal_x='histogram',
    marginal_y='histogram',
    template="plotly_dark",
    title="age and Insulin correlation",
)
ax.show()

<p style="background-color:Tomato;">3D scatterplot showing the relationship between insulin and glucose across ages, and how they relate to the target variable</p>

In [None]:
# 3D scatter: Age (x), Glucose (y), Insulin (z), coloured by Outcome.
ax = px.scatter_3d(
    df,
    x="Age",
    y="Glucose",
    z="Insulin",
    color="Outcome",
    template="plotly_dark",
)
ax.show()

<p style="background-color:Tomato;">Visualizing the BMI and diabetes pedigree function</p>

In [None]:
# Keep every column from positional index 6 onward for the parallel-coordinates plot.
# NOTE(review): presumably DiabetesPedigreeFunction, Age and Outcome — confirm against the CSV column order.
dt= df.iloc[:,6:]

In [None]:
# Parallel-coordinates view of the columns selected into `dt`, coloured by Outcome.
ax = px.parallel_coordinates(
    dt,
    color="Outcome",
    template="plotly_dark",
)
ax.show()

In [None]:
# Area chart of BMI against Age, one coloured series per Outcome value.
fig = px.area(
    df,
    x="Age",
    y="BMI",
    color="Outcome",
    template="plotly_dark",
)
fig.show()

In [None]:
# Select the columns at positional indices 1..7 for the box plot (this reuses and overwrites `dt`).
# NOTE(review): presumably Glucose through Age, i.e. without Pregnancies and Outcome — confirm column order.
dt= df.iloc[:,1:8]

In [None]:
# Box plot of every column in `dt`, to compare spreads and spot outliers.
ax = px.box(
    dt,
    template="plotly_dark",
)
ax.show()


# **<h2 style="background-color:DodgerBlue;">MODELS</h2>**

In [None]:
#importing essential libraries
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [None]:
# Per-column variance before any transformation.
df.var()

<p style="background-color:Tomato;">The features have very different (and in some cases very large) variances, so we compress their scales with a log transform below</p>

In [None]:
# Replace each listed column with its natural log, in place on `df`.
# Caution: this cell is not idempotent — running it a second time takes the log again.
for column in ("Glucose", "BloodPressure", "Insulin", "BMI"):
    df[column] = np.log(df[column])

In [None]:
# Same in-place natural-log transform for the remaining two columns.
# Caution: this cell is not idempotent — running it a second time takes the log again.
for column in ("SkinThickness", "Age"):
    df[column] = np.log(df[column])

In [None]:
# Per-column variance after the log transforms, for comparison with the earlier output.
df.var()

In [None]:
#train test split
label= df["Outcome"]
train= df.drop("Outcome",axis= 1)

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    train,
    label,
    test_size=0.3,
    random_state=5,
)

# **<h2 style="background-color:DodgerBlue;">KNN</h2>**

In [None]:
# Score k = 1..49 by 5-fold cross-validation on the TRAINING split only.
# Scoring each k on the test split would make the chosen k (and the accuracy later
# reported on that same split) optimistically biased, so the test split is kept
# untouched here and only used for the final evaluation.
scores = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=k),
        x_train,
        y_train,
        cv=5,
        scoring="accuracy",
    ).mean()
    for k in range(1, 50)
]

In [None]:
# Plot accuracy against the number of neighbours k (scores[0] corresponds to k = 1).
plt.figure(figsize=(15, 6))
# x and y must be passed by keyword: seaborn >= 0.12 accepts only `data` positionally
# and raises TypeError for lineplot(x_values, y_values).
k_curve_ax = sns.lineplot(x=np.arange(1, len(scores) + 1), y=scores)
k_curve_ax.set(xlabel="n_neighbors (k)", ylabel="accuracy")
plt.show()

In [None]:
# Fit KNN with k = 15 on the training split and report accuracy on the test split.
knn = KNeighborsClassifier(n_neighbors=15).fit(x_train, y_train)
knnpred = knn.predict(x_test)
knn_accuracy = accuracy_score(y_test, knnpred)
print("accuracy score: ", knn_accuracy)

In [None]:
#classification report
cr= classification_report(y_test,knnpred)
print(cr)

# **<h2 style="background-color:DodgerBlue;">LOGISTIC REGRESSION</h2>**

In [None]:
# Fit a logistic regression with default settings and report test-split accuracy.
lr = LogisticRegression().fit(x_train, y_train)
lrpred = lr.predict(x_test)
lr_accuracy = accuracy_score(y_test, lrpred)
print("accuracy score: ", lr_accuracy)

In [None]:
#classification report
cr= classification_report(y_test,lrpred)
print(cr)

# **<h2 style="background-color:DodgerBlue;">RANDOM FOREST</h2>**

In [None]:
# Baseline random forest with default hyperparameters.
# random_state is fixed (same seed as the train/test split) so the bootstrap samples and
# feature sub-sampling — and therefore the reported accuracy — are identical on every
# Restart & Run All. The seed also carries over when `rf` is cloned by GridSearchCV.
rf = RandomForestClassifier(random_state=5)
rf.fit(x_train, y_train)
rfpred = rf.predict(x_test)
accuracy_score(y_test, rfpred)

In [None]:
#hyperparameter tuning
params={
    'max_depth': [80, 90, 100],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200]
}

In [None]:
# Exhaustive search over `params` for the random forest, scored with 3-fold cross-validation.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=params,
    cv=3,
)

In [None]:
# Run the random-forest grid search on the training split only.
grid_search.fit(x_train,y_train)

In [None]:
# Best mean cross-validated score found by the search (a training-split CV score, not test accuracy).
grid_search.best_score_

In [None]:
#classification report
cr= classification_report(y_test,rfpred)
print(cr)

# **<h2 style="background-color:DodgerBlue;">AdaBoost</h2>**

In [None]:
# Baseline AdaBoost with default hyperparameters.
# random_state is fixed (same seed as the train/test split) so repeated runs give the same
# model: the underlying trees permute features at each split, so ties can otherwise be
# broken differently from run to run.
ad = AdaBoostClassifier(random_state=5)
ad.fit(x_train, y_train)
adpred = ad.predict(x_test)
accuracy_score(y_test, adpred)

In [None]:
#hyperparameter tuning
params={
   "learning_rate":[0.2,0.1,0.5],
   "n_estimators": [100,150,200]
}

In [None]:
# Exhaustive search over `params` for AdaBoost, scored with 3-fold cross-validation.
# This rebinds `grid_search`, replacing the random-forest search object.
grid_search = GridSearchCV(
    estimator=ad,
    param_grid=params,
    cv=3,
)

In [None]:
# Run the AdaBoost grid search on the training split only.
grid_search.fit(x_train,y_train)

In [None]:
# Hyperparameter combination with the best mean cross-validated score.
grid_search.best_params_

In [None]:
# Best mean cross-validated score found by the search (a training-split CV score, not test accuracy).
grid_search.best_score_

In [None]:
# Use the estimator the AdaBoost grid search selected instead of hard-coding
# learning_rate / n_estimators copied from an earlier run: hard-coded values silently go
# stale whenever the grid, the data or the split changes.
# GridSearchCV was built with its default refit behaviour, so best_estimator_ is already
# fitted on the whole training split with the best parameters.
ad = grid_search.best_estimator_
adpred = ad.predict(x_test)
print("accuracy score: ", accuracy_score(y_test, adpred))

In [None]:
#classification report
cr= classification_report(y_test,adpred)
print(cr)

# **<h2 style="background-color:DodgerBlue;">GradientBoosting</h2>**

In [None]:
# Gradient boosting with default hyperparameters.
# random_state is fixed (same seed as the train/test split) so the per-split feature
# permutation, and therefore the fitted model and its accuracy, are reproducible.
gb = GradientBoostingClassifier(random_state=5)
gb.fit(x_train, y_train)
predgb = gb.predict(x_test)
accuracy_score(y_test, predgb)

In [None]:
#classification report
cr= classification_report(y_test,predgb)
print(cr)

In [None]:
from pandas import DataFrame

# Pair each model's display name with its test-split predictions, in plotting order.
predictions_by_model = [
    ("KNN", knnpred),
    ("GRADIENTBOOST", predgb),
    ("RANDOMFOREST", rfpred),
    ("ADABOOST", adpred),
    ("LOGISTICREGRESSION", lrpred),
]

# One [name, accuracy] row per model, collected into a two-column frame for the bar chart.
model = [[name, accuracy_score(y_test, pred)] for name, pred in predictions_by_model]
dx = DataFrame(model, columns=["model", "score"])

In [None]:
# Bar chart comparing test-split accuracy across the models, one colour per model.
ax = px.bar(
    dx,
    x="model",
    y="score",
    color="model",
    title="scores",
    template="plotly_dark",
)
ax.show()

The scores and models can be further improved using hyperparameter tuning with different parameters