# Diabetes Onset Detection using Tensorflow

In [9]:
# import libraries
import pandas as pd
import numpy as np


# read data and rename columns
col_names = ['n_pregnant', 'glucose_concentration', 'blood_pressure (mm Hg)', 
         'skin_thickness (mm)', 'serum_insulin (mu U/ml)', 'BMI', 
         'pedigree_function', 'age', 'class']

df = pd.read_csv("https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv", names=col_names)

df.describe()

Unnamed: 0,n_pregnant,glucose_concentration,blood_pressure (mm Hg),skin_thickness (mm),serum_insulin (mu U/ml),BMI,pedigree_function,age,class
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


Looking at the minumums, we see there are a few categories with zero where there shouldn't be. Let's find and replace all the zero values in glucose concentration, blood pressure, skin thickness, serum insulin, and BMI with NaN and remove those rows. 

In [11]:
columns = ['glucose_concentration', 'blood_pressure (mm Hg)', 
           'skin_thickness (mm)', 'serum_insulin (mu U/ml)', 'BMI']
for col in columns:
    df[col].replace(0, np.NaN, inplace=True)
    
df.dropna(inplace=True)

#confirm new data
df.describe()

Unnamed: 0,n_pregnant,glucose_concentration,blood_pressure (mm Hg),skin_thickness (mm),serum_insulin (mu U/ml),BMI,pedigree_function,age,class
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,3.30102,122.627551,70.663265,29.145408,156.056122,33.086224,0.523046,30.864796,0.331633
std,3.211424,30.860781,12.496092,10.516424,118.84169,7.027659,0.345488,10.200777,0.471401
min,0.0,56.0,24.0,7.0,14.0,18.2,0.085,21.0,0.0
25%,1.0,99.0,62.0,21.0,76.75,28.4,0.26975,23.0,0.0
50%,2.0,119.0,70.0,29.0,125.5,33.2,0.4495,27.0,0.0
75%,5.0,143.0,78.0,37.0,190.0,37.1,0.687,36.0,1.0
max,17.0,198.0,110.0,63.0,846.0,67.1,2.42,81.0,1.0


In [12]:
# create target and feature datasets
dataset = df.values
X = dataset[:, 0:8]
Y = dataset[:, 8].astype(int)

# standardize data
X_standardized = scaler.transform(X)
data = pd.DataFrame(X_standardized)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X)

Now let's build our model in Keras, writing a function in which the layers are defined: <br>
- The input layer  <br>
- A dense layer, where all the neurons are connected to every input <br>
- The output layer


In [13]:
from sklearn.model_selection import GridSearchCV, KFold
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.optimizers import Adam

from keras.layers import Dropout          
# Define a random seed
seed = 42
np.random.seed(seed)
# Start defining the model
def create_model(learn_rate, dropout_rate): # Added the learn rate and dropout variables
    model = Sequential()
    model.add(Dense(8, input_dim = 8, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(dropout_rate))        # Added a Dropout layer here
    model.add(Dense(4, input_dim = 8, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(dropout_rate))        # Added a Dropout layer here       
    model.add(Dense(1, activation='sigmoid'))
    # compile the model
    adam = Adam(lr = learn_rate)            # Defined the learn_rate variable here
    model.compile(loss = 'binary_crossentropy', optimizer = adam, metrics = ['accuracy'])
    return model


Next I'll use a grid search to determine the optimal learn rate and dropout rate. 

In [14]:
# Defined 2 parameters here
model = KerasClassifier(build_fn = create_model, epochs = 100, batch_size = 20, verbose = 0) 
# Defined the grid search parameters here
learn_rate = [0.001, 0.01, 0.1]
dropout_rate = [0.0, 0.1, 0.2]
# Made a dictionary of the grid search parameters here
param_grid = dict(learn_rate=learn_rate, dropout_rate=dropout_rate)
# build and fit the GridSearchCV
grid = GridSearchCV(estimator = model, param_grid = param_grid, cv = KFold(random_state=seed), verbose = 10)
grid_results = grid.fit(X_standardized, Y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] dropout_rate=0.0, learn_rate=0.001 ..............................
[CV] .. dropout_rate=0.0, learn_rate=0.001, score=0.733, total=   4.1s
[CV] dropout_rate=0.0, learn_rate=0.001 ..............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.1s remaining:    0.0s


[CV] .. dropout_rate=0.0, learn_rate=0.001, score=0.763, total=   5.6s
[CV] dropout_rate=0.0, learn_rate=0.001 ..............................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    9.8s remaining:    0.0s


[CV] .. dropout_rate=0.0, learn_rate=0.001, score=0.815, total=   3.6s
[CV] dropout_rate=0.0, learn_rate=0.01 ...............................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   13.4s remaining:    0.0s


[CV] ... dropout_rate=0.0, learn_rate=0.01, score=0.763, total=   6.0s
[CV] dropout_rate=0.0, learn_rate=0.01 ...............................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   19.4s remaining:    0.0s


[CV] ... dropout_rate=0.0, learn_rate=0.01, score=0.756, total=   2.5s
[CV] dropout_rate=0.0, learn_rate=0.01 ...............................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   22.0s remaining:    0.0s


[CV] ... dropout_rate=0.0, learn_rate=0.01, score=0.808, total=   2.5s
[CV] dropout_rate=0.0, learn_rate=0.1 ................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   24.5s remaining:    0.0s


[CV] .... dropout_rate=0.0, learn_rate=0.1, score=0.702, total=   3.4s
[CV] dropout_rate=0.0, learn_rate=0.1 ................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   27.9s remaining:    0.0s


[CV] .... dropout_rate=0.0, learn_rate=0.1, score=0.740, total=   4.0s
[CV] dropout_rate=0.0, learn_rate=0.1 ................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   32.0s remaining:    0.0s


[CV] .... dropout_rate=0.0, learn_rate=0.1, score=0.777, total=   5.0s
[CV] dropout_rate=0.1, learn_rate=0.001 ..............................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   36.9s remaining:    0.0s


[CV] .. dropout_rate=0.1, learn_rate=0.001, score=0.718, total=   5.4s
[CV] dropout_rate=0.1, learn_rate=0.001 ..............................
[CV] .. dropout_rate=0.1, learn_rate=0.001, score=0.763, total=   6.6s
[CV] dropout_rate=0.1, learn_rate=0.001 ..............................
[CV] .. dropout_rate=0.1, learn_rate=0.001, score=0.831, total=   5.6s
[CV] dropout_rate=0.1, learn_rate=0.01 ...............................
[CV] ... dropout_rate=0.1, learn_rate=0.01, score=0.725, total=   4.7s
[CV] dropout_rate=0.1, learn_rate=0.01 ...............................
[CV] ... dropout_rate=0.1, learn_rate=0.01, score=0.779, total=   4.6s
[CV] dropout_rate=0.1, learn_rate=0.01 ...............................
[CV] ... dropout_rate=0.1, learn_rate=0.01, score=0.823, total=   4.7s
[CV] dropout_rate=0.1, learn_rate=0.1 ................................
[CV] .... dropout_rate=0.1, learn_rate=0.1, score=0.611, total=   4.7s
[CV] dropout_rate=0.1, learn_rate=0.1 ................................
[CV] .

[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:  2.5min finished


I'll print the results in a more readable format for easier interpretation:

In [6]:
# summarize the results
print("Best: {0}, using {1}".format(grid_results.best_score_, grid_results.best_params_))
means = grid_results.cv_results_['mean_test_score']
stds = grid_results.cv_results_['std_test_score']
params = grid_results.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print('{0} ({1}) with: {2}'.format(mean, stdev, param))

Best: 0.7908163312442449, using {'dropout_rate': 0.2, 'learn_rate': 0.01}
0.7882652916774457 (0.04183757813687412) with: {'dropout_rate': 0.0, 'learn_rate': 0.001}
0.7602040956214983 (0.03798047799145346) with: {'dropout_rate': 0.0, 'learn_rate': 0.01}
0.7500000033451586 (0.04771839237564345) with: {'dropout_rate': 0.0, 'learn_rate': 0.1}
0.7729591745503095 (0.04779843317135224) with: {'dropout_rate': 0.1, 'learn_rate': 0.001}
0.7857142814568111 (0.037286094022184794) with: {'dropout_rate': 0.1, 'learn_rate': 0.01}
0.7423469314769823 (0.006792465817075864) with: {'dropout_rate': 0.1, 'learn_rate': 0.1}
0.7806122514362238 (0.04782512280485408) with: {'dropout_rate': 0.2, 'learn_rate': 0.001}
0.7908163312442449 (0.028826584263085618) with: {'dropout_rate': 0.2, 'learn_rate': 0.01}
0.724489796526578 (0.01753062747717179) with: {'dropout_rate': 0.2, 'learn_rate': 0.1}


The best outcome was produced by a drop out rate of 0.2 and a learn rate of 0.01, achieving an accuracy of 79.1%. 
