## Diabetes mellitus, commonly known as diabetes, is a metabolic disease that causes high blood sugar. The hormone insulin moves sugar from the blood into your cells to be stored or used for energy. With diabetes, your body either doesn't make enough insulin or can't effectively use the insulin it does make.

![](https://images.squarespace-cdn.com/content/v1/53e3bacbe4b022bcdbe1f538/1504716606889-SRWLJ8EH744M4IHRX3LQ/ke17ZwdGBToddI8pDm48kDrMjE7hBq4fQV3wYHraitJZw-zPPgdn4jUwVcJE1ZvWQUxwkmyExglNqGp0IvTJZUJFbgE-7XRK3dMEBRBhUpzj2bmKhA1a89vhGCTEuFcMrGIAhTIwGn2DOXg1A8iNSPxvh_zK_LmuDa3ZMbEzfBk/Diabetes-is-a-Drag.gif)

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# Get a feel for the data

In [None]:
# Load the Pima Indians diabetes table and preview its first rows.
DATA_PATH = '/kaggle/input/pima-indians-diabetes-database/diabetes.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Summary of the raw table: columns, non-null counts and dtypes.
df.info()

In [None]:
# Distribution of our data: histograms of df's columns, 25 bins each.
df.hist(bins=25,figsize=(20,8))

In [None]:
# Pairwise correlation between the columns, drawn as an annotated heatmap.
corr = df.corr()
f, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    corr,
    ax=ax,
    cmap="YlGnBu",
    annot=True,
)

In [None]:
# Class balance of the target column.
# The column name is passed as `x=`: recent seaborn releases no longer accept
# it as a positional argument alongside `data=`.
ax = sns.countplot(x='Outcome', data=df)
print('1:- Diabetes....|||||....0:- healthy')

# Removing outliers
### In the distribution plots above we can notice outliers in:
* **BMI**  

A BMI of less than 18.5 means that a person is underweight. A BMI of between 18.5 and 24.9 is ideal. A BMI of between 25 and 29.9 is overweight. A BMI over 30 indicates obesity.
<hr>

* **Blood Pressure**

As a general guide: ideal blood pressure is considered to be between 90/60mmHg and 120/80mmHg. High blood pressure is considered to be 140/90mmHg or higher. Low blood pressure is considered to be 90/60mmHg or lower.

<hr>

* **Glucose**

For the majority of healthy individuals, normal blood sugar levels are as follows: Between 4.0 to 5.4 mmol/L (72 to 99 mg/dL) when fasting. Up to 7.8 mmol/L (140 mg/dL) 2 hours after eating.

<hr>

* **SkinThickness**

For adults, the standard normal values for triceps skinfolds are (see Table H): 2.5mm (men) or about 20% fat; 18.0mm (women) or about 30% fat.

In [None]:
# Keep only rows whose BMI, blood pressure and glucose readings are above the
# cut-offs below; anything lower is treated as an outlier.
# .copy() makes df an independent frame, so the in-place df.loc[...] writes
# performed later in the notebook do not raise SettingWithCopyWarning.
plausible = (df.BMI > 10) & (df.BloodPressure > 20) & (df.Glucose > 25)
df = df.loc[plausible].copy()

In [None]:
# Distribution of our data after the outlier filter (25 bins per histogram).
df.hist(bins=25,figsize=(20,8))

In [None]:
# Summary of the filtered table, to see how many rows remain.
df.info()

In [None]:
# Summary statistics for each column of the filtered table.
df.describe()

> **The zeros that remain in these columns are missing values that were recorded as zeros. Take a look at section 3.7 of this paper:**
https://www.sciencedirect.com/science/article/pii/S2352914816300016

In [None]:
# Histogram of SkinThickness (20 bins) before imputation.
df.SkinThickness.hist(bins=20)

In [None]:
# SkinThickness values below 5 are treated as placeholders for missing
# measurements and replaced by the median of their own Outcome class.
# The median is taken over the valid measurements only; including the
# placeholder values themselves would drag the fill value towards zero.
# NOTE: the fill value depends on the Outcome label, so this step cannot be
# applied to unlabelled data.
skin_missing = df.SkinThickness < 5
for outcome in (0, 1):
    in_class = df.Outcome == outcome
    class_median = df.loc[in_class & ~skin_missing, 'SkinThickness'].median()
    # int() keeps the column integer-valued, as in the raw data.
    df.loc[in_class & skin_missing, 'SkinThickness'] = int(class_median)

In [None]:
# Insulin values of exactly 0 are treated as placeholders for missing
# measurements and replaced by the median of their own Outcome class.
# The median is taken over the non-zero measurements only; including the
# zeros themselves would drag the fill value towards zero.
# NOTE: the fill value depends on the Outcome label, so this step cannot be
# applied to unlabelled data.
insulin_missing = df.Insulin == 0
for outcome in (0, 1):
    in_class = df.Outcome == outcome
    class_median = df.loc[in_class & ~insulin_missing, 'Insulin'].median()
    # int() keeps the column integer-valued, as in the raw data.
    df.loc[in_class & insulin_missing, 'Insulin'] = int(class_median)

In [None]:
# Histogram of Insulin (20 bins) after imputation.
df.Insulin.hist(bins=20)

In [None]:
# Spot-check six randomly chosen rows of the cleaned table.
df.sample(6)

# When Should You Use Normalization And Standardization:

* **Normalization** is a good technique to use when you do not know the distribution of your data or when you know the distribution is not Gaussian (a bell curve). Normalization is useful when your data has varying scales and the algorithm you are using does not make assumptions about the distribution of your data, such as k-nearest neighbors and artificial neural networks.

* **Standardization** assumes that your data has a Gaussian (bell curve) distribution. This does not strictly have to be true, but the technique is more effective if your attribute distribution is Gaussian. Standardization is useful when your data has varying scales and the algorithm you are using does make assumptions about your data having a Gaussian distribution, such as linear regression, logistic regression, and linear discriminant analysis.

In [None]:
# Standardise every feature column (everything except the Outcome label).
# NOTE: the scaler is fitted on all rows, before any train/test split.
scaler = StandardScaler()
feature_frame = df.drop(columns=['Outcome'])
data_x = scaler.fit_transform(feature_frame)

In [None]:
# Target vector: the Outcome column as a 1-D NumPy array.
data_y = df['Outcome'].to_numpy()

In [None]:
# Sanity check: feature matrix and target vector must have the same row count.
data_x.shape,data_y.shape

# PCA 

In [None]:
from sklearn.decomposition import PCA

In [None]:
# train_test_split is imported here because this cell runs before the cell
# that imports the rest of sklearn.model_selection; without this import the
# call below raises NameError when the notebook is executed top to bottom.
from sklearn.model_selection import train_test_split

# Hold out a test split with a fixed seed so the partition is reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(data_x, data_y, random_state=998)
xtrain.shape, xtest.shape

> **PCA must be fitted on the training set only, so that the test set shows how it performs on unseen data.**

In [None]:
# Learn a 2-component projection from the training split only.
pca=PCA(n_components=2)
pca.fit(xtrain)

In [None]:
# Project both splits with the PCA fitted on the training data.
pca_xtrain, pca_xtest = (pca.transform(split) for split in (xtrain, xtest))
pca_xtrain.shape, pca_xtest.shape

> Plot our 2D-data

In [None]:
def plot_2d(x_train,y_train,x_test,y_test):
    """Scatter-plot train and test points on one 16x8 figure.

    Column 0 of each array is used as the x coordinate and column 1 as the y
    coordinate; points are coloured by their label. Training points are drawn
    as triangles ('v', alpha 0.9) and test points as circles ('o', alpha 0.8).

    Args:
        x_train: 2-D array of training points (at least two columns).
        y_train: labels for ``x_train``, used as the hue.
        x_test: 2-D array of test points (at least two columns).
        y_test: labels for ``x_test``, used as the hue.
    """
    plt.figure(figsize=(16, 8))
    subsets = (
        (x_train, y_train, 'v', 0.9),
        (x_test, y_test, 'o', 0.8),
    )
    for points, labels, marker, alpha in subsets:
        sns.scatterplot(x=points[:, 0], y=points[:, 1], hue=labels,
                        marker=marker, alpha=alpha)
    

In [None]:
# Show the PCA-projected training and test points together.
plot_2d(pca_xtrain,ytrain, pca_xtest, ytest)

# Machine learning algorithms


In [None]:
# imports we need............
from sklearn.model_selection import cross_val_score, ShuffleSplit, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

## LogisticRegression

In [None]:
def fit(model, cv):
    """Return the mean cross-validation score of ``model``.

    The model is scored on the notebook-level ``data_x`` / ``data_y`` arrays.

    Args:
        model: estimator to evaluate.
        cv: cross-validation splitter or fold count passed to
            ``cross_val_score``.

    Returns:
        The mean of the per-split scores.
    """
    split_scores = cross_val_score(model, data_x, data_y, cv=cv)
    return split_scores.mean()

In [None]:
# Sweep the C parameter of logistic regression over 1..20 and plot the mean
# cross-validated accuracy for each value.
# A fixed random_state makes ShuffleSplit produce the same splits on every
# call, so all C values are compared on identical data.
cv = ShuffleSplit(n_splits=10, test_size=0.18, random_state=998)
c_values = range(1, 21)
acc = []
for c in c_values:
    log_clf = LogisticRegression(C=c)
    acc.append(fit(log_clf, cv))
plt.grid(True)
# Plot against the actual C values, not the list index (which is off by one).
plt.plot(c_values, acc, marker='o')

In [None]:
# Sweep the number of neighbours of a k-NN classifier over 1..75 and plot the
# mean cross-validated accuracy for each value.
# A fixed random_state makes ShuffleSplit produce the same splits on every
# call, so all k values are compared on identical data.
cv = ShuffleSplit(n_splits=10, test_size=0.18, random_state=998)
k_values = range(1, 76)
acc = []
for k in tqdm(k_values):
    knn_clf = KNeighborsClassifier(n_neighbors=k)
    acc.append(fit(knn_clf, cv))
plt.figure(figsize=(12, 5))
plt.grid(True)
# Plot against the actual k values, not the list index (which is off by one).
plt.plot(k_values, acc, marker='o')

In [None]:
# Sweep the C parameter of a support-vector classifier over 1..59 and plot
# the mean cross-validated accuracy for each value.
# A fixed random_state makes ShuffleSplit produce the same splits on every
# call, so all C values are compared on identical data.
cv = ShuffleSplit(n_splits=10, test_size=0.18, random_state=998)
c_values = range(1, 60)
acc = []
for c in tqdm(c_values):
    svc_clf = SVC(C=c)
    acc.append(fit(svc_clf, cv))

plt.figure(figsize=(12, 5))
plt.grid(True)
# Plot against the actual C values, not the list index (which is off by one).
plt.plot(c_values, acc, marker='o')

In [None]:
# Sweep the number of trees of a random forest over 1..151, recording the
# mean cross-validated accuracy both in order (acc) and by tree count (dict_).
# A fixed random_state makes ShuffleSplit produce the same splits on every
# call, so all forest sizes are compared on identical data.
cv = ShuffleSplit(n_splits=10, test_size=0.18, random_state=998)
tree_counts = range(1, 152)
acc = []
dict_ = {}
for n_trees in tqdm(tree_counts):
    rf_clf = RandomForestClassifier(n_estimators=n_trees)
    accuracy = fit(rf_clf, cv)
    acc.append(accuracy)
    dict_[n_trees] = accuracy

plt.figure(figsize=(12, 5))
plt.grid(True)
# Plot against the actual tree counts, not the list index (off by one).
plt.plot(tree_counts, acc, marker='o')

In [None]:
# The six (tree count, accuracy) pairs with the highest accuracy, best first.
best_tree_counts = sorted(dict_, key=dict_.get, reverse=True)[:6]
[(n_trees, dict_[n_trees]) for n_trees in best_tree_counts]

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score,roc_curve, classification_report
# Re-create the train/test partition of the scaled data with the fixed seed 998.
xtrain,xtest,ytrain,ytest=train_test_split(data_x,data_y,random_state=998)

In [None]:
# Train a 101-tree random forest on the training split.
rand_clf=RandomForestClassifier(n_estimators=101)
rand_clf.fit(xtrain,ytrain)

In [None]:
# Score the fitted forest on the held-out split; predict once and reuse.
rf_test_pred = rand_clf.predict(xtest)
print(confusion_matrix(ytest, rf_test_pred))
print('Accuracy of our model is: ', accuracy_score(ytest, rf_test_pred))

In [None]:
# Per-class precision, recall and F1 of the forest on the held-out split.
print(classification_report(ytest, rand_clf.predict(xtest)))

# Save our model

In [None]:
# Mean cross-validated accuracy of a 91-tree forest, using the most recently
# defined `cv` splitter.
# NOTE(review): cross_val_score fits clones of the estimator, so `rand_clf`
# itself is still unfitted after this cell — confirm before persisting it.
rand_clf=RandomForestClassifier(n_estimators=91)
cross_val_score(rand_clf,data_x, data_y, cv=cv).mean()

In [None]:
# IPython magic: change the working directory to /kaggle/working.
%cd /kaggle/working

In [None]:
import pickle
# File name the random-forest model is pickled to.
Pkl_Filename = "Pima_final.pkl"  

In [None]:
# `rand_clf` has only been passed to cross_val_score, which fits clones and
# leaves the original estimator unfitted. Fit it on the full scaled data set
# first, otherwise the pickle would contain a model that cannot predict.
rand_clf.fit(data_x, data_y)

# NOTE: the model is trained on StandardScaler output; the scaler itself is
# not stored in this file, so inputs must be scaled the same way before use.
with open(Pkl_Filename, 'wb') as file:
    pickle.dump(rand_clf, file)

# ANN (artificial neural network)

In [None]:
# Fresh train/test partition (seed 998) for the neural network.
xtrain,xtest,ytrain,ytest=train_test_split(data_x,data_y,random_state=998)

In [None]:
# Label sum (count of Outcome == 1 rows) and total row count for each split.
ytrain.sum(),len(ytrain),ytest.sum(),len(ytest)

In [None]:
# Turn both label vectors into single-column 2-D arrays.
ytrain, ytest = ytrain.reshape(-1, 1), ytest.reshape(-1, 1)

In [None]:
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.regularizers import l1_l2

In [None]:
# Write the whole model (not only its weights) to diabetes.h5 whenever the
# validation loss reaches a new minimum.
check_point = tf.keras.callbacks.ModelCheckpoint(
    filepath='diabetes.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=False,
    verbose=1,
)

In [None]:
# Fully connected binary classifier: five ReLU hidden layers (dropout 0.5
# after the first four) followed by a single sigmoid output unit.
model = Sequential([
    # input_shape excludes the batch dimension: each sample is a flat vector
    # of 8 features. The previous (None, 8) declared a 3-D input that does
    # not match the 2-D (n_samples, 8) arrays passed to fit().
    Dense(80, activation='relu', input_shape=(8,)),
    Dropout(0.5),
    Dense(120, activation='relu', kernel_regularizer=l1_l2()),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(60, activation='relu'),
    Dropout(0.5),
    Dense(30, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(loss='BinaryCrossentropy', optimizer=tf.keras.optimizers.Adam(0.001), metrics=['accuracy'])
model.summary()

In [None]:
# Train for 300 epochs, checkpointing the best model by validation loss.
# NOTE(review): the test split is used as validation data here, so the
# checkpoint is selected on the same data it is evaluated on afterwards.
history=model.fit(xtrain,ytrain,epochs=300,validation_data=(xtest,ytest), callbacks=[check_point])

In [None]:
# Training curves: loss on the left panel, accuracy on the right panel.
history_fig, (loss_ax, accuracy_ax) = plt.subplots(1, 2, num=1, figsize=(25, 12))
curves = history.history

loss_ax.set_xlabel("Epochs")
loss_ax.set_ylabel("Loss")
loss_ax.plot(curves["loss"], label="Training Loss")
loss_ax.plot(curves["val_loss"], label="Validation Loss")
loss_ax.grid(True)
loss_ax.legend()

accuracy_ax.set_xlabel("Epochs")
accuracy_ax.set_ylabel("Accuracy")
accuracy_ax.plot(curves["accuracy"], label="Training Accuracy")
accuracy_ax.plot(curves["val_accuracy"], label="Validation Accuracy")
accuracy_ax.grid(True)
accuracy_ax.legend()

In [None]:
# Reload the model file written by the ModelCheckpoint callback.
model_new=keras.models.load_model('diabetes.h5')

In [None]:
# Evaluate the reloaded model on the held-out split.
model_new.evaluate(xtest,ytest)

In [None]:
# Sequential.predict_classes no longer exists in current Keras releases.
# For a single sigmoid output the class is the predicted probability
# thresholded at 0.5, which is what predict_classes used to return.
ann_test_pred = (model_new.predict(xtest) > 0.5).astype("int32")
print(confusion_matrix(ytest, ann_test_pred))
print('Accuracy of our model is: ', accuracy_score(ytest, ann_test_pred))

In [None]:
# Sequential.predict_classes no longer exists in current Keras releases.
# For a single sigmoid output the class is the predicted probability
# thresholded at 0.5, which is what predict_classes used to return.
ann_report_pred = (model_new.predict(xtest) > 0.5).astype("int32")
print(classification_report(ytest, ann_report_pred))