# Diabetes Prediction Model

Installing required dependencies using pip

In [8]:
!pip install pandas numpy tensorflow sklearn xgboost



Importing necessary libraries

In [9]:
import numpy as np
import pandas as pd
import tensorflow as tf
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import train_test_split
from tensorflow.keras import preprocessing
from tensorflow.keras.models import Sequential
from xgboost import XGBClassifier

### Loading the dataset

In [10]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('archive/diabetes_prediction_dataset.csv')
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() emits a stray trailing "None".
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB
None


### Checking for missing values

In [11]:
# Count the missing entries in every column (isna is the pandas alias of isnull).
missing_per_column = df.isna().sum()
print(missing_per_column)

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64


### Displaying the first few rows of the dataset

In [12]:
# Plain-text preview of the leading rows of the dataset.
leading_rows = df.head()
print(leading_rows)

   gender   age  hypertension  heart_disease smoking_history    bmi  \
0  Female  80.0             0              1           never  25.19   
1  Female  54.0             0              0         No Info  27.32   
2    Male  28.0             0              0           never  27.32   
3  Female  36.0             0              0         current  23.45   
4    Male  76.0             1              1         current  20.14   

   HbA1c_level  blood_glucose_level  diabetes  
0          6.6                  140         0  
1          6.6                   80         0  
2          5.7                  158         0  
3          5.0                  155         0  
4          4.8                  155         0  


### Preprocessing the data

In [13]:
# Feature columns, in the positional order the later encoding cells rely on:
# index 0 is 'gender' and index 3 is 'smoking_history'.
columns_considered = [
    'gender', 'age', 'hypertension', 'smoking_history', 'heart_disease',
    'bmi', 'HbA1c_level', 'blood_glucose_level',
]

# Feature matrix as a NumPy array, target column kept as a pandas Series.
x = df[columns_considered].to_numpy()
y = df['diabetes']

print(x)

[['Female' 80.0 0 ... 25.19 6.6 140]
 ['Female' 54.0 0 ... 27.32 6.6 80]
 ['Male' 28.0 0 ... 27.32 5.7 158]
 ...
 ['Male' 66.0 0 ... 27.83 5.7 155]
 ['Female' 24.0 0 ... 35.42 4.0 100]
 ['Female' 57.0 0 ... 22.43 6.6 90]]


In [14]:
# Integer code assigned to each smoking_history category (column 3 of x).
smoking_codes = {'No Info': 0, 'current': 1, 'ever': 2, 'former': 3, 'never': 4, 'not current': 5}

# Encode from the untouched DataFrame column rather than from x itself, so that
# re-running this cell yields the same result. The previous nested np.where
# chain read the column it was overwriting: on a second run none of the string
# labels matched any more and every row collapsed to 5.
smoking_raw = df['smoking_history']
print("Unique values of Smoking History: ",np.unique(smoking_raw))

smoking_encoded = smoking_raw.map(smoking_codes)
# Fail loudly on an unexpected category instead of silently coding it as 5.
if smoking_encoded.isna().any():
    unknown_smoking = sorted(smoking_raw[smoking_encoded.isna()].astype(str).unique())
    raise ValueError(f"Unmapped smoking_history values: {unknown_smoking}")

# x was built from df in the same row order, so a positional assignment is safe.
x[:,3] = smoking_encoded.astype(int).to_numpy()
print("Unique values of Smoking History after changing to numerical values: ",np.unique(x[:,3]))

Unique values of Smoking History:  ['No Info' 'current' 'ever' 'former' 'never' 'not current']
Unique values of Smoking History after changing to numerical values:  [0 1 2 3 4 5]


In [15]:
# Integer code assigned to each gender category (column 0 of x).
gender_codes = {'Female': 0, 'Male': 1, 'Other': 2}

# Encode from the untouched DataFrame column rather than from x itself, so that
# re-running this cell yields the same result. The previous np.where chain read
# the column it was overwriting: on a second run neither string label matched
# any more and every row collapsed to 2.
gender_raw = df['gender']
print("Unique values of Gender: ",np.unique(gender_raw))

gender_encoded = gender_raw.map(gender_codes)
# Fail loudly on an unexpected category instead of silently coding it as 2.
if gender_encoded.isna().any():
    unknown_gender = sorted(gender_raw[gender_encoded.isna()].astype(str).unique())
    raise ValueError(f"Unmapped gender values: {unknown_gender}")

# x was built from df in the same row order, so a positional assignment is safe.
x[:,0] = gender_encoded.astype(int).to_numpy()
print("Unique values of Gender after changing to numerical values: ",np.unique(x[:,0]))

Unique values of Gender:  ['Female' 'Male' 'Other']
Unique values of Gender after changing to numerical values:  [0 1 2]


### Checking the distribution of target classes

In [16]:
# Tally the rows of each target class to expose the imbalance.
non_diabetes_rows = int((y == 0).sum())
diabetes_rows = int((y == 1).sum())
print("Number of non diabetes record: ",non_diabetes_rows)
print("Number of diabetes record: ",diabetes_rows)

Number of non diabetes record:  91500
Number of diabetes record:  8500


### Handling class imbalance using SMOTE

In [17]:
# NOTE(review): SMOTE is fitted on the full dataset here, before the train/test
# split performed later in the notebook. Synthetic rows interpolated from what
# become test rows can therefore land in the training set, and the test set
# itself contains synthetic rows, so the reported metrics are likely optimistic.
# Consider splitting first and oversampling only the training portion.
# NOTE(review): columns 0 and 3 hold integer category codes; plain SMOTE
# interpolates them as if they were continuous -- SMOTENC may fit better. TODO confirm.

# Cast the object-dtype feature matrix to float explicitly: this raises right
# here if any categorical label was left unencoded (e.g. an encoding cell was
# skipped) instead of failing somewhere inside the sampler.
x_numeric = x.astype(np.float64)

# Resample only the minority class; random_state makes the synthetic rows reproducible.
sm = SMOTE(sampling_strategy='minority', random_state=42)
# Generate synthetic minority rows and return the enlarged feature matrix and target.
oversampled_X, oversampled_Y = sm.fit_resample(x_numeric, y)
print("Number of non diabetes record: ",np.count_nonzero(oversampled_Y==0))
print("Number of diabetes record: ",np.count_nonzero(oversampled_Y==1))

Number of non diabetes record:  91500
Number of diabetes record:  91500


### Normalizing the input data

In [18]:
# Feature-wise normalization layer; adapt() learns each column's mean and
# variance from the oversampled feature matrix.
# NOTE(review): the statistics are learned before the train/test split, so they
# also include rows that later end up in the test set.
norm_l = tf.keras.layers.Normalization(axis=-1)
norm_l.adapt(oversampled_X)

# Wrap the adapted layer in a single-layer model so it can be saved on its own.
norm_model = Sequential()
norm_model.add(norm_l)
norm_model.compile()

# Apply the layer and convert the resulting tensor to a NumPy array.
x_norm = norm_l(oversampled_X).numpy()

### Splitting the dataset into training and testing sets

In [19]:
# Hold out 33% of the normalized, oversampled rows for evaluation; the fixed
# random_state makes the shuffle reproducible.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    x_norm,
    oversampled_Y,
    test_size=0.33,
    random_state=42,
)
for split_label, split_features in (("Training shape: ", Xtrain), ("Test shape: ", Xtest)):
    print(split_label, split_features.shape)

Training shape:  (122610, 8)
Test shape:  (60390, 8)


### Building and training the model

In [20]:
# XGBoost classifier created with its default hyperparameters.
model = XGBClassifier()
# Fit on the training split. This is the cell's last expression, so the
# notebook also displays whatever fit() returns.
model.fit(X=Xtrain, y=Ytrain)

### Evaluating the model

In [22]:
# Predict class labels for the held-out split.
# NOTE(review): the held-out split is drawn from the SMOTE-oversampled data, so
# these scores are not measured on the original class distribution.
Y_pred = model.predict(Xtest)

# Score the predictions against the held-out labels.
f1 = f1_score(Ytest, Y_pred)
precision = precision_score(Ytest, Y_pred)
recall = recall_score(Ytest, Y_pred)
accuracy = accuracy_score(Ytest, Y_pred)

# Report every metric on its own line.
for metric_label, metric_value in (
    ("F1 score:", f1),
    ("Precision: ", precision),
    ("Recall: ", recall),
    ("Accuracy: ", accuracy),
):
    print(metric_label, metric_value)

F1 score: 0.982637668056323
Precision:  0.9970779110461758
Recall:  0.9686097174544495
Accuracy:  0.9828282828282828


### Saving the trained classifier and the normalization model

In [23]:
# Persist the trained classifier and the normalization model.
# `model` is the XGBoost classifier fitted earlier in this notebook (not a
# TensorFlow model); save_model() writes it to the given path.
model.save_model('diabetes_model')
# `norm_model` is the Keras Sequential model wrapping the adapted Normalization layer.
# NOTE(review): newer Keras releases may require a `.keras`/`.h5` file extension here -- TODO confirm.
norm_model.save("Normalized_model")

INFO:tensorflow:Assets written to: Normalized_model\assets
