In [94]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
cols=["age","gender","fam_diabetes","highBP","pActivity","bmi","smoking","alcohol","sleep","soundSleep","medicine","junk_food","stress","BP_level","pregnancies","Pdiabetes","urine_freq","diabetic"]
df=pd.read_csv("diabetes_dataset__2019.csv",names=cols).drop(["highBP","Pdiabetes"],axis=1)
df=df.drop(df.index[0:1])
df.head()

Unnamed: 0,age,gender,fam_diabetes,pActivity,bmi,smoking,alcohol,sleep,soundSleep,medicine,junk_food,stress,BP_level,pregnancies,urine_freq,diabetic
1,50-59,Male,no,one hr or more,39,no,no,8,6,no,occasionally,sometimes,high,0,not much,no
2,50-59,Male,no,less than half an hr,28,no,no,8,6,yes,very often,sometimes,normal,0,not much,no
3,40-49,Male,no,one hr or more,24,no,no,6,6,no,occasionally,sometimes,normal,0,not much,no
4,50-59,Male,no,one hr or more,23,no,no,8,6,no,occasionally,sometimes,normal,0,not much,no
5,40-49,Male,no,less than half an hr,27,no,no,8,8,no,occasionally,sometimes,normal,0,not much,no


# Pre-processing

In [4]:
df=df.dropna()
for column in df.columns:
  print("unique values in "+column+" are: ",end="")
  print(df[column].unique())

unique values in age are: ['50-59' '40-49' 'less than 40' '60 or older']
unique values in gender are: ['Male' 'Female']
unique values in fam_diabetes are: ['no' 'yes']
unique values in pActivity are: ['one hr or more' 'less than half an hr' 'none' 'more than half an hr']
unique values in bmi are: ['39' '28' '24' '23' '27' '21' '20' '26' '22' '15' '34' '30' '29' '18'
 '32' '31' '36' '38' '40' '35' '19' '33' '17' '25' '42' '45']
unique values in smoking are: ['no' 'yes']
unique values in alcohol are: ['no' 'yes']
unique values in sleep are: ['8' '6' '10' '7' '11' '9' '4' '5']
unique values in soundSleep are: ['6' '8' '10' '7' '11' '4' '9' '5' '3' '2' '6 ' '1' '0']
unique values in medicine are: ['no' 'yes' 'o']
unique values in junk_food are: ['occasionally' 'very often' 'often' 'always']
unique values in stress are: ['sometimes' 'not at all' 'very often' 'always']
unique values in BP_level are: ['high' 'normal' 'low' 'Low' 'High' 'normal ']
unique values in pregnancies are: ['0' '1' '2'

In [5]:
df["gender"]=(df["gender"]=="Male").astype(int)
df["fam_diabetes"]=(df["fam_diabetes"]=="yes").astype(int)
df["smoking"]=(df["smoking"]=="yes").astype(int)
df["alcohol"]=(df["alcohol"]=="yes").astype(int)
df["medicine"]=df["medicine"].replace("o","no")
df["medicine"]=(df["medicine"]=="yes").astype(int)
df["BP_level"]=df["BP_level"].replace("High","high")
df["BP_level"]=df["BP_level"].replace("Low","low")
df["BP_level"]=df["BP_level"].replace("normal ","normal")
df["diabetic"]=df["diabetic"].replace(" no","no")
df["urine_freq"]=(df["urine_freq"]=="quite often").astype(int)
df["diabetic"]=(df["diabetic"]=="yes").astype(int)
df["bmi"]=pd.to_numeric(df["bmi"],errors="coerce")
df["sleep"]=pd.to_numeric(df["sleep"],errors="coerce")
df["soundSleep"]=pd.to_numeric(df["soundSleep"],errors="coerce")
df["pregnancies"]=pd.to_numeric(df["pregnancies"],errors="coerce")

In [6]:
for column in df.columns:
  print("unique values in "+column+" are: ",end="")
  print(df[column].unique())

unique values in age are: ['50-59' '40-49' 'less than 40' '60 or older']
unique values in gender are: [1 0]
unique values in fam_diabetes are: [0 1]
unique values in pActivity are: ['one hr or more' 'less than half an hr' 'none' 'more than half an hr']
unique values in bmi are: [39 28 24 23 27 21 20 26 22 15 34 30 29 18 32 31 36 38 40 35 19 33 17 25
 42 45]
unique values in smoking are: [0 1]
unique values in alcohol are: [0 1]
unique values in sleep are: [ 8  6 10  7 11  9  4  5]
unique values in soundSleep are: [ 6  8 10  7 11  4  9  5  3  2  1  0]
unique values in medicine are: [0 1]
unique values in junk_food are: ['occasionally' 'very often' 'often' 'always']
unique values in stress are: ['sometimes' 'not at all' 'very often' 'always']
unique values in BP_level are: ['high' 'normal' 'low']
unique values in pregnancies are: [0 1 2 3 4]
unique values in urine_freq are: [0 1]
unique values in diabetic are: [0 1]


In [7]:

encoded=pd.get_dummies(df["age"],prefix="age",dtype=int)
df=pd.concat([encoded,df],axis=1)
df=df.drop("age",axis=1)
encoded=pd.get_dummies(df["pActivity"],prefix="pActivity",dtype=int)
df=pd.concat([encoded,df],axis=1)
df=df.drop("pActivity",axis=1)
encoded=pd.get_dummies(df["junk_food"],prefix="junk_food",dtype=int)
df=pd.concat([encoded,df],axis=1)
df=df.drop("junk_food",axis=1)
encoded=pd.get_dummies(df["stress"],prefix="stress",dtype=int)
df=pd.concat([encoded,df],axis=1)
df=df.drop("stress",axis=1)
encoded=pd.get_dummies(df["BP_level"],prefix="BP_level",dtype=int)
df=pd.concat([encoded,df],axis=1)
df=df.drop("BP_level",axis=1)

In [8]:
x_train,x_test,y_train,y_test=train_test_split(df[df.columns[:-1]],df[df.columns[-1]],test_size=0.2,random_state=42)

In [9]:
def scale_dataset(x,y,oversample=False):
  scaler=StandardScaler()
  x=scaler.fit_transform(x)
  if oversample:
    ros=RandomOverSampler(random_state=42)
    x,y=ros.fit_resample(x,y)
  return x,y

In [10]:
x_train,y_train=scale_dataset(x_train,y_train,oversample=True)
x_test,y_test=scale_dataset(x_test,y_test)

# Models

## KNN

In [81]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

In [82]:
grid_params={"n_neighbors":[1,3,5,7,9],"weights":["uniform","distance"],"p":[1,2,3]}
clf=GridSearchCV(KNeighborsClassifier(),grid_params,cv=5)
clf.fit(x_train,y_train)
result_df=pd.DataFrame(clf.cv_results_)
print("Best score=",clf.best_score_)
print("Best parameters: ",clf.best_params_)
best_model=clf.best_estimator_

#using the model with best parameters

knn_model=best_model
knn_model.fit(x_train,y_train)
y_pred=knn_model.predict(x_test)
print(classification_report(y_test,y_pred))
auc=roc_auc_score(y_test,y_pred)
print("AUC: ",auc)

Best score= 0.9533080748283211
Best parameters:  {'n_neighbors': 9, 'p': 2, 'weights': 'distance'}
              precision    recall  f1-score   support

           0       0.98      0.96      0.97       129
           1       0.91      0.96      0.94        53

    accuracy                           0.96       182
   macro avg       0.95      0.96      0.95       182
weighted avg       0.96      0.96      0.96       182

AUC:  0.9617522305104579


## Naive Bayes

In [83]:
from sklearn.naive_bayes import GaussianNB

In [84]:
nb_model=GaussianNB()
nb_model=nb_model.fit(x_train,y_train)
y_pred=nb_model.predict(x_test)
print(classification_report(y_test,y_pred))
auc=roc_auc_score(y_test,y_pred)
print("AUC: ",auc)

              precision    recall  f1-score   support

           0       0.71      1.00      0.83       129
           1       0.00      0.00      0.00        53

    accuracy                           0.71       182
   macro avg       0.35      0.50      0.41       182
weighted avg       0.50      0.71      0.59       182

AUC:  0.5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Logistic Regression

In [85]:
from sklearn.linear_model import LogisticRegression

In [86]:
grid_params ={'penalty' : ['l2'],
    'C' : [0.001,0.01,0.1,1,10,100],
    'solver' : ['lbfgs','newton-cg','sag','saga'],
    'max_iter' : [100, 1000,2500, 5000]
    }

clf=GridSearchCV(LogisticRegression(),grid_params,cv=5)
clf.fit(x_train,y_train)
result_df=pd.DataFrame(clf.cv_results_)
print("Best score=",clf.best_score_)
print("Best parameters: ",clf.best_params_)

#using best hyperparameters
log_reg_best=clf.best_estimator_
log_reg_best.fit(x_train,y_train)
y_pred=log_reg_best.predict(x_test)
print(classification_report(y_test,y_pred))
auc=roc_auc_score(y_test,y_pred)
print("AUC: ",auc)



Best score= 0.8570068671560502
Best parameters:  {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
              precision    recall  f1-score   support

           0       0.91      0.87      0.89       129
           1       0.71      0.79      0.75        53

    accuracy                           0.85       182
   macro avg       0.81      0.83      0.82       182
weighted avg       0.85      0.85      0.85       182

AUC:  0.8303349422261226


## SVM

In [87]:
from sklearn.svm import SVC

In [88]:
grid_params = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}
clf=GridSearchCV(SVC(),grid_params,cv=5)
clf.fit(x_train,y_train)
result_df=pd.DataFrame(clf.cv_results_)
print("Best score=",clf.best_score_)
print("Best parameters: ",clf.best_params_)

#using best hyperparameters
svm_best=clf.best_estimator_
svm_best.fit(x_train,y_train)
y_pred=svm_best.predict(x_test)
print(classification_report(y_test,y_pred))
auc=roc_auc_score(y_test,y_pred)
print("AUC: ",auc)

Best score= 0.9523324650722236
Best parameters:  {'C': 10, 'gamma': 1, 'kernel': 'rbf'}
              precision    recall  f1-score   support

           0       0.98      0.97      0.98       129
           1       0.93      0.96      0.94        53

    accuracy                           0.97       182
   macro avg       0.96      0.97      0.96       182
weighted avg       0.97      0.97      0.97       182

AUC:  0.9656281995027058


## Decision Tree

In [89]:
from sklearn.tree import DecisionTreeClassifier


In [90]:
grid_params={
    "criterion":["gini","entropy","log_loss"],
    "splitter":["best","random"],
    "max_depth":[None,1,2,3,4,5,7],
    "max_features":["sqrt","log2",0.5,1,3,5,None]
}
clf=GridSearchCV(DecisionTreeClassifier(),grid_params,cv=5)
clf.fit(x_train,y_train)
result_df=pd.DataFrame(clf.cv_results_)
print("Best score=",clf.best_score_)
print("Best parameters: ",clf.best_params_)

#using best hyperparameterd
dt_best=clf.best_estimator_
dt_best.fit(x_train,y_train)
y_pred=dt_best.predict(x_test)
print(classification_report(y_test,y_pred))
auc=roc_auc_score(y_test,y_pred)
print("AUC: ",auc)

Best score= 0.954274212645039
Best parameters:  {'criterion': 'gini', 'max_depth': None, 'max_features': 1, 'splitter': 'random'}
              precision    recall  f1-score   support

           0       0.90      0.85      0.88       129
           1       0.68      0.77      0.73        53

    accuracy                           0.83       182
   macro avg       0.79      0.81      0.80       182
weighted avg       0.84      0.83      0.83       182

AUC:  0.8131490419774755


## Random Forest

In [91]:
from sklearn.ensemble import RandomForestClassifier

In [92]:
grid_params={
    "n_estimators":[50,100,200],
    "criterion":["gini","entropy","log_loss"],
    "max_features":["sqrt","log2",0.5,1,3,5,None],
    "n_jobs":[-1]
}
clf=GridSearchCV(RandomForestClassifier(),grid_params,cv=5)
clf.fit(x_train,y_train)
result_df=pd.DataFrame(clf.cv_results_)
print("Best score=",clf.best_score_)
print("Best parameters: ",clf.best_params_)

#using best hyperparameters
rf_best=clf.best_estimator_
rf_best.fit(x_train,y_train)
y_pred=rf_best.predict(x_test)
print(classification_report(y_test,y_pred))
auc=roc_auc_score(y_test,y_pred)
print("AUC: ",auc)

Best score= 0.9610940089983424
Best parameters:  {'criterion': 'log_loss', 'max_features': 0.5, 'n_estimators': 200, 'n_jobs': -1}
              precision    recall  f1-score   support

           0       0.98      0.97      0.98       129
           1       0.93      0.96      0.94        53

    accuracy                           0.97       182
   macro avg       0.96      0.97      0.96       182
weighted avg       0.97      0.97      0.97       182

AUC:  0.9656281995027058


## Neural Network

In [11]:
import tensorflow
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense,Dropout


In [12]:
pip install -U keras-tuner

Note: you may need to restart the kernel to use updated packages.


In [13]:
import tensorflow as tf
tf.config.list_physical_devices('GPU')


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [121]:
import keras_tuner as kt
from tensorflow.keras.optimizers import SGD, RMSprop, Adam

In [124]:
def build_model(hp):
    nn_model = Sequential()
    count = 0
    for i in range(hp.Int('num_layers', min_value=1, max_value=4)):
        if count == 0:
            nn_model.add(Dense(hp.Int('num_nodes' + str(i), min_value=32, max_value=128, step=32),
                               activation='relu',
                               input_dim=x_train.shape[1]))
        else:
            nn_model.add(Dense(hp.Int('num_nodes' + str(i), min_value=32, max_value=128, step=32),
                               activation='relu'))
        count += 1

    nn_model.add(Dense(1, activation='sigmoid'))
    
    # Define optimizer with the specified learning rate
    optimizer_choice = hp.Choice('optimizer', values=['SGD', 'RMSprop', 'Adam'])
    learning_rate = hp.Choice('learning_rate', values=[1e-4])  # Add different learning rates
    if optimizer_choice == 'SGD':
        optimizer = SGD(learning_rate=learning_rate)
    elif optimizer_choice == 'RMSprop':
        optimizer = RMSprop(learning_rate=learning_rate)
    else:
        optimizer = Adam(learning_rate=learning_rate)

    nn_model.compile(optimizer=optimizer,
                     loss='binary_crossentropy',
                     metrics=['accuracy'])
    return nn_model


In [125]:
tuner=kt.GridSearch(build_model, objective= 'val_accuracy',directory="mydir",project_name="dib5")
tuner.search(x_train,y_train,epochs=5, validation_split=0.2)

Trial 29 Complete [00h 00m 03s]
val_accuracy: 0.5339806079864502

Best val_accuracy So Far: 0.9902912378311157
Total elapsed time: 00h 01m 18s

Search: Running Trial #30

Value             |Best Value So Far |Hyperparameter
2                 |2                 |num_layers
64                |32                |num_nodes0
RMSprop           |Adam              |optimizer
0.0001            |0.0001            |learning_rate
64                |96                |num_nodes1



Exception ignored in: <function WeakKeyDictionary.__init__.<locals>.remove at 0x000001E69DC50700>
Traceback (most recent call last):
  File "C:\Users\subha\miniconda3\envs\tf\lib\weakref.py", line 371, in remove
    self = selfref()
KeyboardInterrupt: 


Epoch 1/5
Epoch 2/5

KeyboardInterrupt: 

In [108]:
result=[]
for i in range(1,20):
    model=tuner.get_best_models(num_models=i)[i-1]
    model.fit(x_train,y_train,epochs=100,initial_epoch=5,validation_data=(x_test,y_test))
    result.append(f"model {i}: {model.evaluate(x_test,y_test)}")

Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epo

KeyboardInterrupt: 

In [104]:
result


['model 1: [0.3073215186595917, 0.9725274443626404]',
 'model 2: [0.5148547291755676, 0.9560439586639404]',
 'model 3: [0.15155695378780365, 0.9505494236946106]',
 'model 4: [0.18674252927303314, 0.9615384340286255]',
 'model 5: [0.14606580138206482, 0.9560439586639404]',
 'model 6: [0.3570745289325714, 0.9615384340286255]',
 'model 7: [0.3437439799308777, 0.9615384340286255]',
 'model 8: [0.6385170221328735, 0.9725274443626404]',
 'model 9: [0.28594180941581726, 0.9560439586639404]',
 'model 10: [0.15249550342559814, 0.9505494236946106]',
 'model 11: [0.44141605496406555, 0.9560439586639404]',
 'model 12: [0.14456388354301453, 0.9615384340286255]',
 'model 13: [0.29338720440864563, 0.9615384340286255]',
 'model 14: [0.453723669052124, 0.9670329689979553]',
 'model 15: [0.4493361711502075, 0.9615384340286255]',
 'model 16: [0.1821203976869583, 0.9505494236946106]',
 'model 17: [0.42640891671180725, 0.9560439586639404]',
 'model 18: [0.41630494594573975, 0.9505494236946106]',
 'model 19

In [114]:
model=tuner.get_best_models(num_models=1)[0]
model.fit(x_train,y_train,epochs=100,initial_epoch=5,validation_data=(x_test,y_test))

Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epo

<keras.callbacks.History at 0x1e8b446d2e0>