## How to tune some hyper-parameter

In [3]:
import numpy as np

import keras

# fix random seed for reproducibility
seed = 7
np.random.seed(seed)

In [10]:
from keras.datasets import mnist

# load dataset
(X_train, y_train), (X_test, y_test) = mnist.load_data()
input_shape = (X_train.shape[1],X_train.shape[2], )

# Making sure that the values are float so that we can get decimal points after division
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

# Normalizing the RGB codes by dividing it to the max RGB value.
X_train /= 255
X_test /= 255

# convert class vectors to binary class matrices
num_classes = len(np.unique(y_train))
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

X_train.shape, y_train.shape, input_shape

((60000, 28, 28), (60000, 10), (28, 28))

In [11]:
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Flatten
from keras.constraints import maxnorm

# Function to create model, required for KerasClassifier
def create_model(neurons=30, dropout_rate=0.5,
                 init_mode='uniform', weight_constraint=0,
                 optimizer='adam', learn_rate=0.01, momentum=0,
                 activation='relu'):
    
    # create model
    inputs = Input(shape=input_shape)
    x = Flatten()(inputs)
    x = Dense(neurons,
              kernel_initializer=init_mode, kernel_constraint=maxnorm(weight_constraint),
              activation=activation)(x)
    
    x = Dropout(dropout_rate)(x)
    outputs = Dense(num_classes,
                   kernel_initializer=init_mode,
                   activation='softmax')(x)
    
    model = Model(inputs=inputs, outputs=outputs)
    # Compile model
    #optimizer = optimizer(lr=learn_rate, momentum=momentum)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# create model
model = create_model()

# Print the model
model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 28, 28)            0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 784)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 30)                23550     
_________________________________________________________________
dropout_2 (Dropout)          (None, 30)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 10)                310       
Total params: 23,860
Trainable params: 23,860
Non-trainable params: 0
_________________________________________________________________


In [12]:
# define the grid search parameters
#batch_size = [8, 16, 32, 64, 128, 256, 512, 1024]
batch_size = [8, 1024]
#epochs = [10, 50, 100]
epochs = [10]
#optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
optimizer = ['SGD']
#learn_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
learn_rate = [0.001, 0.3]
#momentum = [0.0, 0.2, 0.4, 0.6, 0.8, 0.9]
#init_mode = ['uniform', 'lecun_uniform', 'normal', 'zero', 'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform']
#activation = ['softmax', 'softplus', 'softsign', 'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear']
activation = ['relu', 'sigmoid']

#weight_constraint = [1, 2, 3, 4, 5]
#dropout_rate = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
dropout_rate = [0.0, 0.9]
#neurons = [8, 16, 32, 64, 128, 256, 512, 1024]
neurons = [8, 1024]

#param_grid = dict(batch_size=batch_size, epochs=epochs, optimizer=optimizer, learn_rate=learn_rate, momentum=momentum, init_mode=init_mode)
param_grid = dict(batch_size=batch_size
                  ,neurons=neurons
                  ,dropout_rate=dropout_rate
                  ,epochs=epochs
                  ,optimizer=optimizer 
                  ,learn_rate=learn_rate 
                  #,momentum=momentum
                  ,activation=activation
                  #,init_mode=init_mode
                 )

In [13]:
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

model = KerasClassifier(build_fn=create_model, verbose=2)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
# We use only a small part of the dataset
grid_result = grid.fit(X_train[:1000], y_train[:1000], verbose=2)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))



Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Epoch 1/10
 - 1s - loss: 0.3251 - accuracy: 0.9000
Epoch 2/10
 - 0s - loss: 0.3251 - accuracy: 0.9000
Epoch 3/10
 - 0s - loss: 0.3251 - accuracy: 0.9000
Epoch 4/10
 - 0s - loss: 0.3250 - accuracy: 0.9000
Epoch 5/10
 - 1s - loss: 0.3250 - accuracy: 0.9000
Epoch 6/10
 - 0s - loss: 0.3250 - accuracy: 0.9000
Epoch 7/10
 - 0s - loss: 0.3250 - accuracy: 0.9000
Epoch 8/10
 - 0s - loss: 0.3250 - accuracy: 0.9000
Epoch 9/10
 - 0s - loss: 0.3250 - accuracy: 0.9000
Epoch 10/10
 - 0s - loss: 0.3250 - accuracy: 0.9000
Best: 0.900000 using {'activation': 'relu', 'batch_size': 8, 'dropout_rate': 0.0, 'epochs': 10, 'learn_rate': 0.001, 'neurons': 8, 'optimizer': 'SGD'}


In [14]:
# summarize results
means = grid_result.cv_results_['mean_test_score']
#stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
print("mean\t\tparams")
print("--------------------------------")
for mean, param in zip(means, params):
    print("%f with: %r" % (mean, param))

mean		params
--------------------------------
0.900000 with: {'activation': 'relu', 'batch_size': 8, 'dropout_rate': 0.0, 'epochs': 10, 'learn_rate': 0.001, 'neurons': 8, 'optimizer': 'SGD'}
0.900000 with: {'activation': 'relu', 'batch_size': 8, 'dropout_rate': 0.0, 'epochs': 10, 'learn_rate': 0.001, 'neurons': 1024, 'optimizer': 'SGD'}
0.900000 with: {'activation': 'relu', 'batch_size': 8, 'dropout_rate': 0.0, 'epochs': 10, 'learn_rate': 0.3, 'neurons': 8, 'optimizer': 'SGD'}
0.900000 with: {'activation': 'relu', 'batch_size': 8, 'dropout_rate': 0.0, 'epochs': 10, 'learn_rate': 0.3, 'neurons': 1024, 'optimizer': 'SGD'}
0.900000 with: {'activation': 'relu', 'batch_size': 8, 'dropout_rate': 0.9, 'epochs': 10, 'learn_rate': 0.001, 'neurons': 8, 'optimizer': 'SGD'}
0.900000 with: {'activation': 'relu', 'batch_size': 8, 'dropout_rate': 0.9, 'epochs': 10, 'learn_rate': 0.001, 'neurons': 1024, 'optimizer': 'SGD'}
0.900000 with: {'activation': 'relu', 'batch_size': 8, 'dropout_rate': 0.9, 'ep

## Tips for Hyperparameter Optimization

This section lists some handy tips to consider when tuning hyperparameters of your neural network.

* **k-fold Cross Validation.** You can see that the results from the examples in this post show some variance. A default cross-validation of 3 was used, but perhaps k=5 or k=10 would be more stable. Carefully choose your cross validation configuration to ensure your results are stable.
* **Review the Whole Grid.** Do not just focus on the best result, review the whole grid of results and look for trends to support configuration decisions.
* **Parallelize.** Use all your cores if you can, neural networks are slow to train and we often want to try a lot of different parameters. Consider spinning up a lot of AWS instances.
* **Use a Sample of Your Dataset.** Because networks are slow to train, try training them on a smaller sample of your training dataset, just to get an idea of general directions of parameters rather than optimal configurations.
* **Start with Coarse Grids.** Start with coarse-grained grids and zoom into finer grained grids once you can narrow the scope.
* **Do not Transfer Results.** Results are generally problem specific. * Try to avoid favorite configurations on each new problem that you see. It is unlikely that optimal results you discover on one problem will transfer to your next project. Instead look for broader trends like number of layers or relationships between parameters.
* **Reproducibility is a Problem.** Although we set the seed for the random number generator in NumPy, the results are not 100% reproducible. There is more to reproducibility when grid searching wrapped Keras models than is presented in this post.
