In [None]:
# Importing the Libraries

import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Loading the dataset
data = pd.read_csv("../input/diabetes/diabetes.csv")
# Keep an independent copy of the raw frame so it can be used later for
# training the model. A plain `data_new = data` would only bind a second name
# to the same object, so any in-place edit of `data` would also change
# `data_new`.
data_new = data.copy()


In [None]:
# Preview the first rows of the loaded dataset
data.head()

In [None]:
# Inspect the dataset summary to check for null values
data.info()

In [None]:
# Summary statistics of the dataset
data.describe()

In [None]:
# The describe() output shows that some values are 0, so count the zero
# entries in every feature column.

zero_counts = {}
for column in (
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
):
    print(f"Number of Zero value in {column}:")
    # Number of rows whose value in this column is exactly 0
    zero_counts[column] = data[data[column] == 0].shape[0]
    print(zero_counts[column])

# Expose the per-column counts under the same short names as before
# (dicts keep insertion order, so the unpacking matches the loop order).
pre, glu, bp, st, ins, bmi, dpf, age = zero_counts.values()

In [None]:
# The counts above show too many 0 values in "Insulin" and "SkinThickness"
# to fill them in sensibly, so both columns are removed from the table used
# for the analysis.

columns_to_drop = ["SkinThickness", "Insulin"]
data = data.drop(columns=columns_to_drop)

data


In [None]:
# Visualisation of data: one bar plot of Outcome against each feature

fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(15, 7))  # 2 x 3 grid of panels

panel_features = [
    "Pregnancies", "Glucose", "BloodPressure",
    "Age", "BMI", "DiabetesPedigreeFunction",
]
# ax.flat walks the grid row by row, so panels are filled in list order.
for panel, feature in zip(ax.flat, panel_features):
    sns.barplot(data=data, x=feature, y="Outcome", ax=panel)

plt.tight_layout()  # tidy the spacing between panels
plt.show()

In [None]:
# The raw values are too numerous to read on the bar plots, so each feature
# is split into 6 quantile bins (labelled 1-6) to make its relation to
# Outcome easier to identify.

bin_labels = [1, 2, 3, 4, 5, 6]
for feature in [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
]:
    # Replace the raw values with their quantile-bin label
    data[feature] = pd.qcut(data[feature], 6, labels=bin_labels)

data.head()

In [None]:
# Let's visualise the same data again, now that the features are binned

fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(15, 7))  # 2 x 3 grid of panels

panel_features = [
    "Pregnancies", "Glucose", "BloodPressure",
    "Age", "BMI", "DiabetesPedigreeFunction",
]
# ax.flat walks the grid row by row, so panels are filled in list order.
for panel, feature in zip(ax.flat, panel_features):
    sns.barplot(data=data, x=feature, y="Outcome", ax=panel)

plt.tight_layout()  # tidy the spacing between panels
plt.show()

In [None]:
# Author's reading of the figure above: all of the plotted features are worth
# keeping for training, so only SkinThickness and Insulin are removed from
# the copy that holds the original values.

data_new = data_new.drop(columns=["SkinThickness", "Insulin"])
data_new

In [None]:
# Split the dataset into the feature matrix X and the target vector y
# (working on data_new because it contains the original, un-binned values).
#
# Columns are selected by name rather than by position: `iloc[:, 0:6]` only
# picks the intended features if SkinThickness and Insulin have already been
# dropped, and silently picks a different set of columns otherwise.

feature_columns = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
]
X = data_new[feature_columns].values
# NOTE(review): the original took the last column as the target; "Outcome"
# is assumed to be that column - confirm against the CSV.
y = data_new["Outcome"].values

In [None]:
# Display the feature matrix X

X

In [None]:
# Display the target vector y

y

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Feature scaling

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Fit the scaler on the training split only, then scale it.
X_train = sc.fit_transform(X_train)
# Scale the test split with the statistics learned from the training split.
# Calling fit_transform here would re-fit the scaler on the test data, so the
# two splits would be standardised with different parameters and test
# information would leak into preprocessing.
X_test = sc.transform(X_test)


In [None]:
# Build and compile the network: two hidden ReLU layers of 6 units each and
# one sigmoid output unit.

# Fix TensorFlow's global seed so random operations (such as weight
# initialisation) start from the same state every time this cell runs.
# tensorflow is already imported as `tf` in the first cell, so it is not
# imported again here.
tf.random.set_seed(0)

ann = tf.keras.models.Sequential()

ann.add(tf.keras.layers.Dense(units=6, activation='relu'))
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))
ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Compiling
ann.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Fit the network on the scaled training split

ann.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=104,
)

In [None]:
# Predict on the test split and print predictions next to the true labels

y_pred = ann.predict(X_test) > 0.5  # threshold the model output at 0.5
predicted_column = y_pred.reshape(len(y_pred), 1)
actual_column = y_test.reshape(len(y_test), 1)
# Column 0: prediction, column 1: true label
print(np.concatenate((predicted_column, actual_column), axis=1))

In [None]:
# Evaluate the model on the test split: confusion matrix and accuracy

from sklearn.metrics import confusion_matrix, accuracy_score

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)
