In [None]:
import numpy as np 
import pandas as pd 
from scipy import stats 
import seaborn as sns 
data= pd.read_csv('../input/diabetes-dataset/diabetes.csv')
data.head()

In [2]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


This data does not contain any null values. So we don't need to worry about filling/dropping values.

In [3]:
data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [4]:
data.duplicated()
data=data.drop_duplicates()
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


The data doesn't contain any duplicate values also. Hence we need not worry about them.

In [5]:
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


All bioparameters are in the range. Hence the data does not have observational/structural errors in them. Hence we need to worry about them.

In [6]:
for x in data.columns:
    z=np.abs(stats.zscore(data[x]))
    print(x+str(z))

Pregnancies0      0.639947
1      0.844885
2      1.233880
3      0.844885
4      1.141852
         ...   
763    1.827813
764    0.547919
765    0.342981
766    0.844885
767    0.844885
Name: Pregnancies, Length: 768, dtype: float64
Glucose0      0.848324
1      1.123396
2      1.943724
3      0.998208
4      0.504055
         ...   
763    0.622642
764    0.034598
765    0.003301
766    0.159787
767    0.873019
Name: Glucose, Length: 768, dtype: float64
BloodPressure0      0.149641
1      0.160546
2      0.263941
3      0.160546
4      1.504687
         ...   
763    0.356432
764    0.046245
765    0.149641
766    0.470732
767    0.046245
Name: BloodPressure, Length: 768, dtype: float64
SkinThickness0      0.907270
1      0.530902
2      1.288212
3      0.154533
4      0.907270
         ...   
763    1.722735
764    0.405445
765    0.154533
766    1.288212
767    0.656358
Name: SkinThickness, Length: 768, dtype: float64
Insulin0      0.692891
1      0.692891
2      0.692891
3      0.

According to the z-score statistical parameter, the above datapoints are considered as outliers. But I dont think these are outliers and I consider that these data are essential for identfying diabetes condition.

The next step is feature scaling. I am doing a comparison of ML and DL for this dataset. For ML, i am considering SVM, which is a distance based algorithm. Hence normalization of data would be appropriate. On the other hand, for DL I am considering ANN, which is gradient descent based algorithm for which standardisation of data would be appropriate, since it can help in faster identification of local minima.

In [7]:
data_norm=data.copy()
for column in data.columns:
    data_norm[column] = (data_norm[column] - data_norm[column].min()) / (data_norm[column].max() - data_norm[column].min()) 
data_norm.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.352941,0.743719,0.590164,0.353535,0.0,0.500745,0.234415,0.483333,1.0
1,0.058824,0.427136,0.540984,0.292929,0.0,0.396423,0.116567,0.166667,0.0
2,0.470588,0.919598,0.52459,0.0,0.0,0.347243,0.253629,0.183333,1.0
3,0.058824,0.447236,0.540984,0.232323,0.111111,0.418778,0.038002,0.0,0.0
4,0.0,0.688442,0.327869,0.353535,0.198582,0.642325,0.943638,0.2,1.0


In [8]:
lis=['Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age','Pregnancies']
def standartization(x):
    x_std = x.copy(deep=True)
    for column in lis:
        x_std[column] = (x_std[column] - x_std[column].mean()) / x_std[column].std() 
    return x_std

data= standartization(data)
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.63953,0.847771,0.149543,0.906679,-0.692439,0.20388,0.468187,1.425067,1
1,-0.844335,-1.122665,-0.160441,0.530556,-0.692439,-0.683976,-0.364823,-0.190548,0
2,1.233077,1.942458,-0.263769,-1.287373,-0.692439,-1.102537,0.604004,-0.105515,1
3,-0.844335,-0.997558,-0.160441,0.154433,0.123221,-0.493721,-0.920163,-1.040871,0
4,-1.141108,0.503727,-1.503707,0.906679,0.765337,1.408828,5.481337,-0.020483,1


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    float64
 1   Glucose                   768 non-null    float64
 2   BloodPressure             768 non-null    float64
 3   SkinThickness             768 non-null    float64
 4   Insulin                   768 non-null    float64
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    float64
 8   Outcome                   768 non-null    int64  
dtypes: float64(8), int64(1)
memory usage: 60.0 KB


In [10]:
data['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [11]:
y=data['Outcome']
x=data.drop(['Outcome'],axis=1)

In [12]:
yn=data_norm['Outcome']
xn=data_norm.drop(['Outcome'],axis=1)

I am creating separate train and test sets for standardised and normalised data. the ones having n as suffix are normalised.

In [13]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest= train_test_split(x,y,test_size=0.15,stratify=y)
print(xtrain.shape)
print(xtest.shape)
print(ytrain.shape)
print(ytest.shape)

(652, 8)
(116, 8)
(652,)
(116,)


In [14]:
from sklearn.model_selection import train_test_split
xntrain,xntest,yntrain,yntest= train_test_split(xn,yn,test_size=0.15,stratify=y)
print(xntrain.shape)
print(xntest.shape)
print(yntrain.shape)
print(yntest.shape)

(652, 8)
(116, 8)
(652,)
(116,)


In [15]:
from sklearn.svm import SVC
svm_model= SVC(kernel='rbf',gamma=8)
svm_model.fit(xntrain,yntrain)

SVC(gamma=8)

In [28]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
predictions= svm_model.predict(xntrain)
percentage=svm_model.score(xntrain,yntrain)
res=confusion_matrix(yntrain,predictions)
print("Training confusion matrix")
print(res)
predictions= svm_model.predict(xntest)
percentage=svm_model.score(xntest,yntest)
res=confusion_matrix(yntest,predictions)
print("validation confusion matrix")
print(res)
print(classification_report(ytest, predictions))
# check the accuracy on the training set
print('training accuracy = '+str(svm_model.score(xntrain, yntrain)*100))
print('testing accuracy = '+str(svm_model.score(xntest, yntest)*100))

Training confusion matrix
[[392  32]
 [ 75 153]]
validation confusion matrix
[[68  8]
 [15 25]]
              precision    recall  f1-score   support

           0       0.64      0.70      0.67        76
           1       0.30      0.25      0.27        40

    accuracy                           0.54       116
   macro avg       0.47      0.47      0.47       116
weighted avg       0.52      0.54      0.53       116

training accuracy = 83.58895705521472
testing accuracy = 80.17241379310344


I trained the SVM without feature scaling and with standardisation. It produced test acc of 55% and 62% for no feature scaling and with standardisation. Hence normalisation is good for distance based algorithms like SVM.

For the DL part, I am considering ANN, comprising of 2 layers of 256 neurons as hidden layers. Considering more neurons and layers resulted in overfitting. Hence i limited with this hyperparameters. It was compiled using adam optimiser and crossentropy loss function.

In [26]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
dl_model = Sequential() 

dl_model.add(Dense(256,  activation = 'relu' ,input_shape=([8]))) #input layer
dl_model.add(Dense(256,  activation = 'relu'))
dl_model.add(Dense(1,activation = 'sigmoid'))
dl_model.summary()
dl_model.compile(optimizer = 'adam' , loss = 'binary_crossentropy' ,metrics = ['accuracy','Precision','Recall','AUC'])

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_8 (Dense)              (None, 256)               2304      
_________________________________________________________________
dense_9 (Dense)              (None, 256)               65792     
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 257       
Total params: 68,353
Trainable params: 68,353
Non-trainable params: 0
_________________________________________________________________


In [27]:
num_epochs = 50
history = dl_model.fit(xtrain ,
                    ytrain ,
                    epochs= num_epochs ,
                    steps_per_epoch=200,
                    validation_data=(xtest ,ytest))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50


In [29]:
dl_model.evaluate(xtrain,ytrain)



[0.04649108648300171,
 0.987730085849762,
 0.9867256879806519,
 0.9780701994895935,
 0.9991880059242249]

In [30]:
dl_model.evaluate(xtest,ytest)



[1.4893760681152344,
 0.681034505367279,
 0.5365853905677795,
 0.550000011920929,
 0.7108553051948547]

As you can see, the ANN produced 68% test accuracy which is way less than SVM. Hence we can say that the ML algorithm produced well than that of DL algorithm. Can we stop with this conclusion or are we missing something??

In [31]:
print(data['Outcome'].value_counts())
df_class_0 = data[data['Outcome'] == 0]
df_class_1 = data[data['Outcome'] == 1]

0    500
1    268
Name: Outcome, dtype: int64


As you can see, there is class imbalance, the amount of diabetes negative is twice than that of diabetes positive. In this scenario, we cant compare the performance of algorithms based on accuracy. So to overcome the class imbalance I oversampled the minority class to the samples of majority class (500). So the total data consits of 1000 samples with equal distribution. I repeated this process for the standardised and normalized datasets.

In [33]:
print(data_norm['Outcome'].value_counts())
df_n_class_0 = data_norm[data_norm['Outcome'] == 0]
df_n_class_1 = data_norm[data_norm['Outcome'] == 1]

0.0    500
1.0    268
Name: Outcome, dtype: int64


In [34]:
df_class_1_over = df_class_1.sample(500, replace=True)
df_test_over = pd.concat([df_class_0, df_class_1_over], axis=0)
df_test_over.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 1 to 397
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               1000 non-null   float64
 1   Glucose                   1000 non-null   float64
 2   BloodPressure             1000 non-null   float64
 3   SkinThickness             1000 non-null   float64
 4   Insulin                   1000 non-null   float64
 5   BMI                       1000 non-null   float64
 6   DiabetesPedigreeFunction  1000 non-null   float64
 7   Age                       1000 non-null   float64
 8   Outcome                   1000 non-null   float64
dtypes: float64(9)
memory usage: 78.1 KB


In [35]:
df_n_class_1_over = df_n_class_1.sample(500, replace=True)
df_test_n_over = pd.concat([df_n_class_0, df_n_class_1_over], axis=0)
df_test_n_over.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 1 to 740
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               1000 non-null   float64
 1   Glucose                   1000 non-null   float64
 2   BloodPressure             1000 non-null   float64
 3   SkinThickness             1000 non-null   float64
 4   Insulin                   1000 non-null   float64
 5   BMI                       1000 non-null   float64
 6   DiabetesPedigreeFunction  1000 non-null   float64
 7   Age                       1000 non-null   float64
 8   Outcome                   1000 non-null   float64
dtypes: float64(9)
memory usage: 78.1 KB


In [37]:
y1=df_test_over['Outcome']
df_test_over=df_test_over.drop(['Outcome'],axis=1)
X1=df_test_over

In [43]:
y1n=df_test_n_over['Outcome']
df_test_n_over=df_test_n_over.drop(['Outcome'],axis=1)
X1n=df_test_n_over

The rest of the parameters are the same. This includes the train test split ratio and the algorithm parameters. Now the SVM and ANN are trained using the upsampled datasets. The same as before, normalized dataset for SVM and standardized dataset for ANN. 

In [44]:
from sklearn.model_selection import train_test_split

X1_s_train,X1_s_test ,y1_s_train, y1_s_test = train_test_split(X1,y1,
                                                   test_size=0.2,
                                                   random_state=0,
                                                  shuffle = True,
                                                  stratify = y1)

print('training data shape is :{}.'.format(X1_s_train.shape))
print('training label shape is :{}.'.format(y1_s_train.shape))
print('testing data shape is :{}.'.format(X1_s_test.shape))
print('testing label shape is :{}.'.format(y1_s_test.shape))

training data shape is :(800, 8).
training label shape is :(800,).
testing data shape is :(200, 8).
testing label shape is :(200,).


In [45]:
from sklearn.model_selection import train_test_split

X1_s_n_train,X1_s_n_test ,y1_s_n_train, y1_s_n_test = train_test_split(X1n,y1n,
                                                   test_size=0.2,
                                                   random_state=0,
                                                  shuffle = True,
                                                  stratify = y1n)

print('training data shape is :{}.'.format(X1_s_n_train.shape))
print('training label shape is :{}.'.format(y1_s_n_train.shape))
print('testing data shape is :{}.'.format(X1_s_n_test.shape))
print('testing label shape is :{}.'.format(y1_s_n_test.shape))

training data shape is :(800, 8).
training label shape is :(800,).
testing data shape is :(200, 8).
testing label shape is :(200,).


In [47]:
from sklearn.svm import SVC
svc_s_model = SVC(kernel='rbf',gamma=8)
svc_s_model.fit(X1_s_n_train, y1_s_n_train)

SVC(gamma=8)

In [49]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
predictions= svc_s_model.predict(X1_s_n_train)
percentage=svc_s_model.score(X1_s_n_train,y1_s_n_train)
res=confusion_matrix(y1_s_n_train,predictions)
print("Training confusion matrix")
print(res)
predictions= svc_s_model.predict(X1_s_n_test)
percentage=svc_s_model.score(X1_s_n_test,y1_s_n_test)
res=confusion_matrix(y1_s_n_test,predictions)
print("validation confusion matrix")
print(res)
print(classification_report(y1_s_n_test, predictions))
# check the accuracy on the training set
print('training accuracy = '+str(svc_s_model.score(X1_s_n_train, y1_s_n_train)*100))
print('testing accuracy = '+str(svc_s_model.score(X1_s_n_test, y1_s_n_test)*100))

Training confusion matrix
[[342  58]
 [ 73 327]]
validation confusion matrix
[[83 17]
 [16 84]]
              precision    recall  f1-score   support

         0.0       0.84      0.83      0.83       100
         1.0       0.83      0.84      0.84       100

    accuracy                           0.83       200
   macro avg       0.84      0.83      0.83       200
weighted avg       0.84      0.83      0.83       200

training accuracy = 83.625
testing accuracy = 83.5


There isn't much change in the accuracy of the algorithm but there is huge improvement in the classification report before and after sampling, especially for the diabetes class. 

In [50]:
num_epochs = 50
history = dl_model.fit(X1_s_train ,
                    y1_s_train ,
                    epochs= num_epochs ,
                    steps_per_epoch=200,
                    validation_data=(X1_s_test ,y1_s_test))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [52]:
dl_model.evaluate(X1_s_train ,
                    y1_s_train)



[0.2244451344013214,
 0.9024999737739563,
 0.8440170884132385,
 0.987500011920929,
 0.9863780736923218]

In [53]:
dl_model.evaluate(X1_s_test ,y1_s_test)



[0.3796664774417877,
 0.875,
 0.8318583965301514,
 0.9399999976158142,
 0.9362000823020935]

The ANN trained on the standardised and upsampled data performed the best result with 88% test accuracy. I have done a lot of work in this notebook, hope this deserves an upvote!! Thanks...
Please do mention if I have done something wrong.