<a href="https://colab.research.google.com/github/radhika3131/Diabetes-Prediction-Using-Keras/blob/main/Diabetes_Prediction_Using_Keras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install keras



# Step 1 : Import Libraries

In [None]:
from sklearn import linear_model # for handling missing values
import h5py as h5 #To save model weights
from keras.models import Sequential, model_from_json # to create and save model
from keras.utils import plot_model
from keras.layers import Dense
import numpy as np
import os
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from matplotlib import rcParams
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

# Step 2 : Load the Dataset

### Load dataframe from `.csv` file

In [None]:
# The CSV carries its own header line. `header=0` makes pandas consume that line
# as the header and substitute the column names given in `names`; without it the
# header line is parsed as an ordinary record, which turns every column into
# strings and adds a bogus first row.
data = pd.read_csv(
    "/content/diabetes.csv",
    header=0,
    names=['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
           'BMI', 'PedigreeFunction', 'Age', 'Outcome'],
)

In [None]:
# Preview the first 15 records of the loaded frame.
data.head(n=15)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,PedigreeFunction,Age,Outcome
0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
1,6,148,72,35,0,33.6,0.627,50,1
2,1,85,66,29,0,26.6,0.351,31,0
3,8,183,64,0,0,23.3,0.672,32,1
4,1,89,66,23,94,28.1,0.167,21,0
5,0,137,40,35,168,43.1,2.288,33,1
6,5,116,74,0,0,25.6,0.201,30,0
7,3,78,50,32,88,31,0.248,26,1
8,10,115,0,0,0,35.3,0.134,29,0
9,2,197,70,45,543,30.5,0.158,53,1


## Step 3: Data statistics
### Check for missing values

In [None]:
# Number of null entries per column, largest first; show the top five columns.
null_counts = data.isnull().sum()
null_counts.sort_values(ascending=False).head()

Pregnancies      0
Glucose          0
BloodPressure    0
SkinThickness    0
Insulin          0
dtype: int64

### Display dataset description

In [None]:
# Summary statistics for the columns of the loaded frame.
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,PedigreeFunction,Age,Outcome
count,769,769,769,769,769,769,769.0,769,769
unique,18,137,48,52,187,249,518.0,53,3
top,1,100,70,0,0,32,0.254,22,0
freq,135,17,57,227,374,13,6.0,72,500


## Step 4: Create the pairplot using seaborn library

In [None]:
# Pairwise feature relationships, coloured by the Outcome label.
sns.set_context("paper", rc={"axes.labelsize": 18})
plot = sns.pairplot(data, hue='Outcome', palette='Set2', corner=True, height=1.5)

# Tilt the axis labels so the long feature names stay readable on the small panels.
for ax in plot.axes.flat:
    # Some slots of the axes grid are empty (falsy) when only the corner is drawn.
    if not ax:
        continue
    ax.set_xlabel(ax.get_xlabel(), rotation=-55, horizontalalignment='left')
    ax.set_ylabel(ax.get_ylabel(), rotation=-55, horizontalalignment='right')

## Step 5: Missing value handling

### Plot the correlation matrix

In [None]:
# Pairwise correlations; only coefficients with magnitude >= 0.2 are shown,
# weaker ones are blanked out in the heatmap.
corr = data.corr()
strong_enough = corr.abs() >= 0.2
plt.figure(figsize=(10, 8))
sns.heatmap(
    corr.where(strong_enough),
    cmap='viridis',
    vmax=1.0,
    vmin=-1.0,
    linewidths=0.1,
    annot=True,
    annot_kws={"size": 8},
    square=True,
)

### Get the number of missing values in each column

In [None]:
# Work on a copy so the zero counts can be inspected without modifying `data`.
data_copy = data.copy()

# In these columns a value of 0 is treated as "missing" and turned into NaN.
zero_as_missing = ['BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'Glucose']

# Assign the result back instead of calling replace(..., inplace=True) on a
# column selection: the in-place form acts on an intermediate Series, which
# pandas warns about and which does not write through to the frame once
# Copy-on-Write is active.
data_copy[zero_as_missing] = data_copy[zero_as_missing].replace(to_replace=0, value=np.nan)

# Missing-value count per column.
data_copy.isnull().sum()

### Impute feature medians in all columns that will not be predicted


In [None]:
# Replace the zero placeholders in each column with that column's median.
for c in ['BloodPressure', 'BMI', 'SkinThickness', 'Insulin']:
    # Take the median over the non-zero entries only, so the placeholder zeros
    # being replaced do not drag the fill value down.
    observed_median = data.loc[data[c] != 0, c].median()
    # Reassign the column rather than using replace(..., inplace=True) on a
    # column selection, which is not guaranteed to update `data` itself.
    data[c] = data[c].replace(to_replace=0, value=observed_median)

### Use linear regression to predict the missing values in the column you chose for prediction

In [None]:
# Fill the zero placeholders of each listed column with the prediction of a
# linear regression fitted on the rows where that column is non-zero.
for metric in ["Glucose"]:
    is_missing = data[metric] == 0

    # Predictors are all remaining columns of `data`.
    # NOTE(review): this includes 'Outcome', i.e. the label is used to impute a
    # feature - confirm that this is intended.
    X = data.loc[~is_missing].drop(columns=[metric])
    y = data.loc[~is_missing, metric]
    print("Training: ", metric, X.shape, y.shape)

    # Named `regressor` so it is not confused with the Keras `model` built later.
    regressor = linear_model.LinearRegression()
    regressor.fit(X, y)

    X_miss = data.loc[is_missing].drop(columns=[metric])
    print("missing values:", X_miss.shape)

    # Nothing to predict when no placeholder is left (e.g. the cell is re-run);
    # calling predict() on zero rows would raise.
    if not X_miss.empty:
        # Predictions are floats; convert the column first so the assignment
        # does not have to upcast an integer column in place.
        data[metric] = data[metric].astype(float)
        data.loc[is_missing, metric] = regressor.predict(X_miss)

## Step 6: Handling Skewness and Feature Scaling

### Handling Skewness

In [None]:
# Apply the natural logarithm to the listed columns in one vectorised step.
log_columns = ['SkinThickness', 'Insulin', 'BMI', 'PedigreeFunction', 'Age']
data[log_columns] = np.log(data[log_columns])

### Creating Distribution Plots

In [None]:
# One histogram (with KDE overlay) per feature, split by Outcome, on a 4x2 grid.
feature_names = data.columns[:8].tolist()

rcParams['figure.figsize'] = 20, 15
sns.set(font_scale=1)
sns.set_style("white")
sns.set_palette("bright")
plt.subplots_adjust(hspace=0.5)

# Subplot positions are 1-based.
for i, name in enumerate(feature_names, start=1):
    plt.subplot(4, 2, i)
    sns.histplot(data=data, x=name, hue="Outcome", kde=True, palette="BuGn")

### Feature Scaling

In [None]:
# Standardise the first eight columns (the features); the last column is left as is.
# NOTE(review): the scaler is fitted on all rows, before the train/validation/test
# split below - consider fitting it on the training rows only.
scaler = StandardScaler()
train_data = data.to_numpy()
feature_columns = np.s_[:, :8]
train_data[feature_columns] = scaler.fit_transform(train_data[feature_columns])

## Step 7: Dataset Splitting

In [None]:
# Model inputs: columns 0, 1, 5 and 7 of the scaled array
# (Pregnancies, Glucose, BMI, Age in the column order of `data`).
X = train_data[:, [0, 1, 5, 7]]

# One-hot encode the label column. Request a float dtype explicitly: recent pandas
# versions return boolean indicator columns by default, whereas the network is
# trained on numeric targets.
Y = pd.get_dummies(train_data[:, 8], dtype="float32")

# 60% train, then the remaining 40% is halved into validation and test.
X_train, X_assess, Y_train, Y_assess = train_test_split(X, Y, test_size=0.4, random_state=10)
X_val, X_test, Y_val, Y_test = train_test_split(X_assess, Y_assess, test_size=0.5, random_state=10)
print("Y_train, Y_val, Y_test ", Y_train.shape, Y_val.shape, Y_test.shape )
print("X_train, X_val, X_test ", X_train.shape, X_val.shape, X_test.shape)

## Step 8: Creating a neural network model

In [None]:
# Fully connected classifier: two ReLU hidden layers (12 and 10 units) followed
# by a 2-unit softmax output, one unit per class.
model = Sequential([
    Dense(12, input_dim=X_train.shape[1], activation='relu'),
    Dense(10, activation='relu'),
    Dense(2, activation='softmax'),
])

## Step 9: Model Training
### Compiling the model

In [None]:
# The network ends in a 2-unit softmax and is trained on one-hot targets, so use
# the categorical cross-entropy that matches that formulation.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

### Training the model

In [None]:
# Train for 40 epochs in mini-batches of 32, tracking the validation split after
# each epoch; `h` keeps the per-epoch history used for the training curves.
h = model.fit(
    X_train,
    Y_train,
    validation_data=(X_val, Y_val),
    epochs=40,
    batch_size=32,
    verbose=1,
)

### Evaluating the model

In [None]:
# evaluate() returns the loss followed by the compiled metric(s); accuracy is the
# only metric compiled into this model, so it sits at index 1.
scores = model.evaluate(X_test, Y_test)
# Label the value explicitly: newer Keras versions report `model.metrics_names[1]`
# as the generic "compile_metrics" rather than the metric's own name.
print("%s: %.2f%%" % ("accuracy", scores[1]*100))

### Saving the model

In [None]:
# Serialise the architecture (not the weights) to JSON and write it next to the notebook.
model_json = model.to_json()
with open("model.json", mode="w") as architecture_file:
    architecture_file.write(model_json)

### Saving the model weights

In [None]:
# Keras 3 only accepts weight files whose name ends in ".weights.h5"; the name
# still ends in ".h5", so older versions keep writing an HDF5 file.
model.save_weights("model.weights.h5")
print("Saved model to disk")

## Step 10: Plotting the training curves

In [None]:
# Loss and accuracy per epoch, for both the training and the validation split.
plt.rcParams["figure.figsize"] = (12,8)
# Take the x-axis from the recorded history instead of repeating the epoch count,
# so the plot stays valid if the number of training epochs changes.
N = np.arange(len(h.history["loss"]))
plt.style.use("ggplot")
plt.figure()
plt.plot(N, h.history["loss"], label="train_loss")
plt.plot(N, h.history["val_loss"], label="val_loss")
plt.plot(N, h.history['accuracy'], label="train_acc")
plt.plot(N, h.history["val_accuracy"], label="val_acc")
plt.title("Training Loss and Accuracy")
plt.xlabel("Epoch #")
plt.ylabel("Loss/Accuracy")
plt.legend()

## Step 11: Confusion Matrix
### Test set predictions

In [None]:
# True class index per test sample: position of the 1 in each one-hot row.
actuals = np.argmax(Y_test.to_numpy(), axis=1)

# Predicted class index: the output unit with the highest softmax probability.
Y_predicted = model.predict(X_test)
classes = Y_predicted.argmax(axis=1)

### Plot the confusion matrix

In [None]:
# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(actuals, classes)

plt.figure(figsize=(4,3))
# fmt="d" prints the cell counts as plain integers; the default format switches
# to scientific notation once a count reaches three digits.
sns.heatmap(cm, annot=True, fmt="d")
plt.xlabel('Predicted')
plt.ylabel('True')

## Step 12: Compute the Classification Metrics

### Classification report

In [None]:
# Per-class precision / recall / F1 on the test split; class 0 is labelled
# "No-Diabetes" and class 1 "Diabetes".
target_names = ["No-Diabetes", "Diabetes"]
report = classification_report(actuals, classes, target_names=target_names)
print(report)

## End