In [None]:
import sys
import pandas
import numpy
import sklearn
import keras

In [None]:
import pandas as pd
import numpy as np

# load the diabetes CSV from the Kaggle input directory into a DataFrame
csv_path = '../input/pima-indians-diabetes-database/diabetes.csv'
data = pd.read_csv(csv_path)

In [None]:
# summary statistics (count, mean, std, min, quartiles, max) of the loaded data
data.describe()

In [None]:
# preprocess data: in these measurement columns a value of 0 is treated as
# "missing", so replace it with NaN (the rows are dropped in a later step)

columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Assign the result back instead of calling replace(..., inplace=True) on
# data[col]: the chained in-place form acts on a column selection and is not
# guaranteed to update `data` (it is a no-op under pandas Copy-on-Write).
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
data[columns] = data[columns].replace(0, np.nan)

data.describe()

In [None]:
# keep only the rows that have no missing value in any column
data = data.dropna()
data.describe()

In [None]:
# turn the cleaned DataFrame into a plain NumPy array and show its dimensions
df = np.asarray(data)
print(np.shape(df))

In [None]:
# features: every column except the last; labels: the last column cast to int
X, y = df[:, :-1], df[:, -1].astype(int)

In [None]:
# show the dimensions of the feature matrix and of the label vector
for arr in (X, y):
    print(arr.shape)


In [None]:
#Normalising data
from sklearn.preprocessing import StandardScaler
# fit() returns the scaler itself, so `sc` is the fitted StandardScaler
scaler = StandardScaler()
sc = scaler.fit(X)

In [None]:
# apply the fitted scaler; the DataFrame wrapper exists only so that
# describe() can display the statistics of the scaled features
X_sc = sc.transform(X)
df = pd.DataFrame(data=X_sc)
df.describe()

In [None]:
#import algorithms
from sklearn import model_selection
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

In [None]:
#define scoring method
scoring = 'accuracy'

# seed for the cross-validation shuffle; defined here so that this cell runs
# on a fresh kernel (it was previously only assigned in a later cell)
seed = 6

#define models to train
model_names = ["K Nearest Neighbors", "Gaussian Process", "Decision Tree", "Random Forest",
               "Neural Network", "AdaBoost", "Naive Bayes",
               "SVM Linear", "SVM RBF", "SVM Sigmoid"]

classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    GaussianProcessClassifier(1.0*RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=50, max_features=1),
    MLPClassifier(alpha=1),
    AdaBoostClassifier(),
    GaussianNB(),
    SVC(kernel='linear'),
    SVC(kernel='rbf'),
    SVC(kernel='sigmoid'),
]

# a list (not a bare zip iterator) so the pairs can be iterated more than once
models = list(zip(model_names, classifiers))

# One splitter shared by all models so every model is scored on the same folds.
# shuffle=True is required for random_state to have any effect; recent
# scikit-learn raises ValueError when random_state is set with shuffle=False.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

# per-model arrays of fold accuracies, and the matching model names
results = []
names = []

for name, model in models:
    cv_results = model_selection.cross_val_score(model, X_sc, y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # printed as: name : mean accuracy(standard deviation across folds)
    msg = "{0} : {1}({2})".format(name, cv_results.mean(), cv_results.std())
    print(msg)

K Nearest Neighbors : 0.7577564102564103(0.07983507922637882)
Gaussian Process : 0.7803205128205128(0.09586540068924587)
Decision Tree : 0.755448717948718(0.07293353566494007)
Random Forest : 0.7803846153846153(0.09916449121642132)
Neural Network : 0.7803205128205127(0.08495756859479006)
AdaBoost : 0.7625(0.07488060296802303)
Naive Bayes : 0.7752564102564102(0.0673090353956351)
SVM Linear : 0.7803846153846153(0.09154609247802299)
SVM RBF : 0.7678205128205129(0.09068045782106383)
SVM Sigmoid : 0.727051282051282(0.07940069737056646)

Let's go ahead with the neural network and optimise it.

In [None]:
from sklearn.model_selection import GridSearchCV, KFold
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasClassifier

In [None]:
def create_model():
    """Build and compile the baseline binary classifier.

    Architecture: 8 input features -> Dense(8, relu) -> Dense(4, relu)
    -> Dense(1, sigmoid).

    Returns:
        A compiled Keras ``Sequential`` model using binary cross-entropy loss,
        the Adam optimizer (learning rate 0.01) and the accuracy metric.
    """
    model = Sequential()
    # only the first layer declares the input size; later layers take their
    # input size from the layer before them (the output layer receives 4
    # values, not 8, so an input_dim=8 there was misleading)
    model.add(Dense(8, input_dim=8, kernel_initializer='normal', activation='relu'))
    model.add(Dense(4, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    #compile the model
    adam = Adam(lr=0.01)
    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
    return model

# build one instance of the network and show its layer/parameter table
model = create_model()
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output
model.summary()

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.optimizers import Adam

# Define a random seed
seed = 6
np.random.seed(seed)

# Start defining the model
def create_model():
    """Build and compile the baseline binary classifier.

    Architecture: 8 input features -> Dense(8, relu) -> Dense(4, relu)
    -> Dense(1, sigmoid).

    Returns:
        A compiled Keras ``Sequential`` model using binary cross-entropy loss,
        the Adam optimizer (learning rate 0.01) and the accuracy metric.
    """
    # create model
    model = Sequential()
    # only the first layer needs the input size; later layers infer it
    model.add(Dense(8, input_dim = 8, kernel_initializer='normal', activation='relu'))
    model.add(Dense(4, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    # compile the model
    adam = Adam(lr = 0.01)
    model.compile(loss = 'binary_crossentropy', optimizer = adam, metrics = ['accuracy'])
    return model

# create the model (wrapped so scikit-learn can treat it as an estimator)
model = KerasClassifier(build_fn = create_model, verbose = 1)

# define the grid search parameters
batch_size = [10, 20, 40]
epochs = [10, 50, 100]

# make a dictionary of the grid search parameters
param_grid = dict(batch_size=batch_size, epochs=epochs)

# build and fit the GridSearchCV
# shuffle=True is required for random_state to have any effect; recent
# scikit-learn raises ValueError when random_state is set with shuffle=False
cv = KFold(n_splits=3, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator = model, param_grid = param_grid, cv = cv, verbose = 10)
grid_results = grid.fit(X_sc, y)

# summarize the results
print("Best: {0}, using {1}".format(grid_results.best_score_, grid_results.best_params_))
means = grid_results.cv_results_['mean_test_score']
# was `grid_result` (undefined name -> NameError)
stds = grid_results.cv_results_['std_test_score']
params = grid_results.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print('{0} ({1}) with: {2}'.format(mean, stdev, param))

Best: 0.7781757712364197, using {'batch_size': 20, 'epochs': 100}

In [None]:
# Do a grid search for learning rate and dropout rate
# import necessary packages 
from keras.layers import Dropout

# Define a random seed
seed = 6
np.random.seed(seed)

# Start defining the model
def create_model(learn_rate, dropout_rate):
    """Build and compile the classifier with tunable learning rate and dropout.

    Architecture: 8 input features -> Dense(8, relu) -> Dropout
    -> Dense(4, relu) -> Dropout -> Dense(1, sigmoid).

    Args:
        learn_rate: learning rate passed to the Adam optimizer.
        dropout_rate: rate passed to both Dropout layers.

    Returns:
        A compiled Keras ``Sequential`` model using binary cross-entropy loss
        and the accuracy metric.
    """
    # create model
    model = Sequential()
    # only the first layer needs the input size; later layers infer it
    model.add(Dense(8, input_dim = 8, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(4, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))

    # compile the model
    adam = Adam(lr = learn_rate)
    model.compile(loss = 'binary_crossentropy', optimizer = adam, metrics = ['accuracy'])
    return model

# create the model; epochs and batch_size are held fixed for this search
model = KerasClassifier(build_fn = create_model, epochs = 100, batch_size = 20, verbose = 0)

# define the grid search parameters
learn_rate = [0.001, 0.01, 0.1]
dropout_rate = [0.0, 0.1, 0.2]

# make a dictionary of the grid search parameters
param_grid = dict(learn_rate=learn_rate, dropout_rate=dropout_rate)

# build and fit the GridSearchCV
# shuffle=True is required for random_state to have any effect; recent
# scikit-learn raises ValueError when random_state is set with shuffle=False
cv = KFold(n_splits=3, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator = model, param_grid = param_grid, cv = cv, verbose = 10)
grid_results = grid.fit(X_sc, y)

# summarize the results
print("Best: {0}, using {1}".format(grid_results.best_score_, grid_results.best_params_))
means = grid_results.cv_results_['mean_test_score']
# was `grid_result` (undefined name -> NameError)
stds = grid_results.cv_results_['std_test_score']
params = grid_results.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print('{0} ({1}) with: {2}'.format(mean, stdev, param))

Best: 0.783163272148, using {'learn_rate': 0.001, 'dropout_rate': 0.0}

In [None]:
# Do a grid search to optimize kernel initialization and activation functions
# Start defining the model
def create_model(activation, init):
    """Build and compile the classifier with tunable activation and initializer.

    Architecture: 8 input features -> Dense(8) -> Dense(4) -> Dense(1, sigmoid).

    Args:
        activation: activation used by both hidden layers.
        init: kernel initializer used by both hidden layers.

    Returns:
        A compiled Keras ``Sequential`` model using binary cross-entropy loss,
        the Adam optimizer (learning rate 0.001) and the accuracy metric.
    """
    # create model
    model = Sequential()
    # only the first layer needs the input size; later layers infer it
    model.add(Dense(8, input_dim = 8, kernel_initializer=init, activation=activation))
    model.add(Dense(4, kernel_initializer=init, activation=activation))
    model.add(Dense(1, activation='sigmoid'))

    # compile the model
    adam = Adam(lr = 0.001)
    model.compile(loss = 'binary_crossentropy', optimizer = adam, metrics = ['accuracy'])
    return model

# create the model; epochs and batch_size are held fixed for this search
model = KerasClassifier(build_fn = create_model, epochs = 100, batch_size = 20, verbose = 0)

# define the grid search parameters
activation = ['softmax', 'relu', 'tanh', 'linear']
init = ['uniform', 'normal', 'zero']

# make a dictionary of the grid search parameters
param_grid = dict(activation=activation, init=init)

# build and fit the GridSearchCV
# shuffle=True is required for random_state to have any effect; recent
# scikit-learn raises ValueError when random_state is set with shuffle=False
cv = KFold(n_splits=3, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator = model, param_grid = param_grid, cv = cv, verbose = 10)
grid_results = grid.fit(X_sc, y)

# summarize the results
print("Best: {0}, using {1}".format(grid_results.best_score_, grid_results.best_params_))
means = grid_results.cv_results_['mean_test_score']
# was `grid_result` (undefined name -> NameError)
stds = grid_results.cv_results_['std_test_score']
params = grid_results.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print('{0} ({1}) with: {2}'.format(mean, stdev, param))

Best: 0.793367353173, using {'activation': 'linear', 'init': 'uniform'}

In [None]:
# Do a grid search to find the optimal number of neurons in each hidden layer
# Start defining the model
def create_model(neuron1, neuron2):
    """Build and compile the classifier with tunable hidden-layer widths.

    Architecture: 8 input features -> Dense(neuron1, linear)
    -> Dense(neuron2, linear) -> Dense(1, sigmoid).

    Args:
        neuron1: number of units in the first hidden layer.
        neuron2: number of units in the second hidden layer.

    Returns:
        A compiled Keras ``Sequential`` model using binary cross-entropy loss,
        the Adam optimizer (learning rate 0.001) and the accuracy metric.
    """
    # create model
    model = Sequential()
    # only the first layer needs the input size; later layers infer it
    model.add(Dense(neuron1, input_dim = 8, kernel_initializer='uniform', activation='linear'))
    model.add(Dense(neuron2, kernel_initializer='uniform', activation='linear'))
    model.add(Dense(1, activation='sigmoid'))

    # compile the model
    adam = Adam(lr = 0.001)
    model.compile(loss = 'binary_crossentropy', optimizer = adam, metrics = ['accuracy'])
    return model

# create the model; epochs and batch_size are held fixed for this search
model = KerasClassifier(build_fn = create_model, epochs = 100, batch_size = 20, verbose = 0)

# define the grid search parameters
neuron1 = [4, 8, 16]
neuron2 = [2, 4, 8]

# make a dictionary of the grid search parameters
param_grid = dict(neuron1=neuron1, neuron2=neuron2)

# build and fit the GridSearchCV
# shuffle=True is required for random_state to have any effect; recent
# scikit-learn raises ValueError when random_state is set with shuffle=False
cv = KFold(n_splits=3, shuffle=True, random_state=seed)
# refit=True retrains the best configuration on all of X_sc so that
# grid.predict() can be used afterwards
grid = GridSearchCV(estimator = model, param_grid = param_grid, cv = cv, refit=True, verbose = 10)
grid_results = grid.fit(X_sc, y)

# summarize the results
print("Best: {0}, using {1}".format(grid_results.best_score_, grid_results.best_params_))
means = grid_results.cv_results_['mean_test_score']
# was `grid_result` (undefined name -> NameError)
stds = grid_results.cv_results_['std_test_score']
params = grid_results.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print('{0} ({1}) with: {2}'.format(mean, stdev, param))

Best: 0.790816335198, using {'neuron1': 16, 'neuron2': 2}

In [None]:
# generate predictions with the estimator the grid search refit on its best
# hyperparameters
# NOTE(review): X_sc is the same data the grid search was fitted on, so these
# are in-sample predictions
y_pred = grid.predict(X_sc)

print(np.shape(y_pred))  # recorded run printed (392L, 1L)

In [None]:
# Generate a classification report
from sklearn.metrics import classification_report, accuracy_score

# The labels live in `y` (lower case); `Y` was never defined and raised
# NameError. Predictions come back as an (n, 1) column, so flatten them to
# 1-D to match the shape of `y` before scoring.
y_pred_flat = np.ravel(y_pred)

print(accuracy_score(y, y_pred_flat))
print(classification_report(y, y_pred_flat))

0.7806122448979592
             precision    recall  f1-score   support

          0       0.81      0.89      0.84       262
          1       0.71      0.57      0.63       130

avg / total       0.77      0.78      0.77       392