In [1]:
import numpy
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

Using TensorFlow backend.


In [2]:
# Set random number generator with constant seed. Ensures we compare model consistently
seed = 7
numpy.random.seed(seed)

In [3]:
# Import complete dataset
from sklearn.datasets import load_boston
features,target = load_boston(return_X_y=True)
Y = target.reshape(506,1)
X = features.astype(float)
#Limit tuning with limited rows to ensure code runs quickly
X_train=X[1:101,:]
Y_train=Y[1:101,:]
print(X_train.shape)
print(Y_train.shape)

(100, 13)
(100, 1)


In [4]:
# Basic single layer model without tuning parameters
def baseline_model():
	# create model
	model = Sequential()
	model.add(Dense(13, input_dim=13, kernel_initializer='normal', activation='relu'))
	model.add(Dense(1, kernel_initializer='normal'))
	# Compile model
	model.compile(loss='mean_squared_error', optimizer='adam')
	return model

In [5]:
estimator = KerasRegressor(build_fn=baseline_model, nb_epoch=100, batch_size=5, verbose=0)

In [6]:
# Evaluae basic single layer model without tuning parameters
kfold = KFold(n_splits=10, random_state=seed)
results = cross_val_score(estimator, X, Y, cv=kfold)
print("Results: %.2f (%.2f) MSE" % (results.mean(), results.std()))

Results: 57.82 (42.25) MSE


In [7]:
# Parameter tuning using Grid Search

In [8]:
# Tuning optimization algorithms

In [9]:
# Model to obtain best optimizer
def create_model_optimizer(optimizer='SGD'):
	# create model
	model = Sequential()
# As a rule of thumb, no. of neurons in hidden layer is kept same as no. of input features
	model.add(Dense(13, input_dim=13, kernel_initializer='normal', activation='relu'))
	model.add(Dense(1, kernel_initializer='normal'))
	# Compile model
	model.compile(loss='mean_squared_error', optimizer=optimizer)
	return model

In [10]:
model = KerasRegressor(build_fn=create_model_optimizer, verbose=0)

In [11]:
# Define the grid search parameters - Optimization Algorithms
optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
param_grid = dict(optimizer=optimizer)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1, scoring='neg_mean_squared_error')
grid_result = grid.fit(X_train, Y_train)

In [12]:
# Summarize results - Optimization Algorithms
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: -70.252350 using {'optimizer': 'Adagrad'}
-1210191.495685 (1015514.115262) with: {'optimizer': 'SGD'}
-264.188642 (146.012773) with: {'optimizer': 'RMSprop'}
-70.252350 (21.798538) with: {'optimizer': 'Adagrad'}
-340.369199 (200.028205) with: {'optimizer': 'Adadelta'}
-188.814646 (52.591622) with: {'optimizer': 'Adam'}
-108.382188 (59.600982) with: {'optimizer': 'Adamax'}
-213.236959 (160.906641) with: {'optimizer': 'Nadam'}


In [13]:
# Tuning Epoch and Batchsize

In [14]:
# Model to optain best Epoch and Batchsize
def create_model_epoch_batchsize():
	# create model
	model = Sequential()
	model.add(Dense(13, input_dim=13, kernel_initializer='normal', activation='relu'))
	model.add(Dense(1, kernel_initializer='normal'))
	# Compile model
	model.compile(loss='mean_squared_error', optimizer='Adagrad')
	return model

In [15]:
model = KerasRegressor(build_fn=create_model_epoch_batchsize, verbose=0)

In [16]:
# Define the grid search parameters - Epochs and Batch Size
batch_size = [1, 3 , 5]
epochs = [200, 400, 500]
param_grid = dict(batch_size=batch_size, epochs=epochs)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1, scoring='neg_mean_squared_error')
grid_result = grid.fit(X_train, Y_train)

In [17]:
# Summarize results - Epochs and Batch Size
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: -20.364575 using {'epochs': 500, 'batch_size': 3}
-22.109959 (4.581594) with: {'epochs': 200, 'batch_size': 1}
-23.964450 (3.626423) with: {'epochs': 400, 'batch_size': 1}
-22.562409 (5.993306) with: {'epochs': 500, 'batch_size': 1}
-20.669693 (4.790279) with: {'epochs': 200, 'batch_size': 3}
-22.360193 (3.535647) with: {'epochs': 400, 'batch_size': 3}
-20.364575 (4.420059) with: {'epochs': 500, 'batch_size': 3}
-20.961925 (6.086711) with: {'epochs': 200, 'batch_size': 5}
-21.101111 (3.962289) with: {'epochs': 400, 'batch_size': 5}
-23.337207 (5.679260) with: {'epochs': 500, 'batch_size': 5}


In [19]:
# Tuning Learning Rate
# Adagrad doesn't have momentum as an hyper parameter. So we look only for Learning Rate

In [20]:
from keras.optimizers import Adagrad

In [25]:
# Model to optain best Learning Rate
def create_model_learningRate(learn_rate=0.1):
	# create model
	model = Sequential()
	model.add(Dense(13, input_dim=13, kernel_initializer='normal', activation='relu'))
	model.add(Dense(1,kernel_initializer='normal'))
	# Compile model
	optimizer = Adagrad(lr=learn_rate)
	model.compile(loss='mean_squared_error', optimizer=optimizer)
	return model

In [26]:
model = KerasRegressor(build_fn=create_model_learningRate, epochs=500, batch_size=3, verbose=0)

In [29]:
# Define the grid search parameters - Learning Rate
learn_rate = [0.1, 0.075, 0.06]
param_grid = dict(learn_rate=learn_rate)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1, scoring='neg_mean_squared_error')
grid_result = grid.fit(X_train, Y_train)

In [30]:
# Summarize results - Learning Rate
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: -16.810857 using {'learn_rate': 0.1}
-16.810857 (4.082318) with: {'learn_rate': 0.1}
-24.376610 (8.526254) with: {'learn_rate': 0.075}
-27.136983 (3.028903) with: {'learn_rate': 0.06}


In [31]:
# Tuning Kernal Initializer

In [32]:
# Model to optain best Kernal Initializer
def create_model_init(init_mode='uniform'):
	# create model
	model = Sequential()
	model.add(Dense(13, input_dim=13, kernel_initializer=init_mode, activation='relu'))
	model.add(Dense(1, kernel_initializer=init_mode))
	# Compile model
	optimizer = Adagrad(lr=0.1)
	model.compile(loss='mean_squared_error', optimizer=optimizer)
	return model

In [33]:
model = KerasRegressor(build_fn=create_model_init, epochs=500, batch_size=3, verbose=0)

In [34]:
# Define the grid search parameters - Kernal Initializer
init_mode = ['uniform', 'lecun_uniform', 'normal', 'zero', 'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform']
param_grid = dict(init_mode=init_mode)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1, scoring='neg_mean_squared_error')
grid_result = grid.fit(X_train, Y_train)

In [35]:
# Summarize results - Kernal Initializer
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: -22.710047 using {'init_mode': 'uniform'}
-22.710047 (6.437737) with: {'init_mode': 'uniform'}
-50.615799 (27.585039) with: {'init_mode': 'lecun_uniform'}
-39.134081 (25.993987) with: {'init_mode': 'normal'}
-72.045191 (22.079562) with: {'init_mode': 'zero'}
-25.417910 (8.388176) with: {'init_mode': 'glorot_normal'}
-28.496305 (7.501855) with: {'init_mode': 'glorot_uniform'}
-39.720868 (22.390881) with: {'init_mode': 'he_normal'}
-25.305118 (1.867289) with: {'init_mode': 'he_uniform'}


In [36]:
# Tuning Activation Function

In [37]:
# Model to optain best Activation function
def create_model_activation(activation='softmax'):
	# create model
	model = Sequential()
	model.add(Dense(13, input_dim=13, kernel_initializer='uniform', activation=activation))
	model.add(Dense(1, kernel_initializer='uniform'))
	# Compile model
	optimizer = Adagrad(lr=0.1)
	model.compile(loss='mean_squared_error', optimizer=optimizer)
	return model

In [38]:
model = KerasRegressor(build_fn=create_model_activation, epochs=500, batch_size=3, verbose=0)

In [39]:
# Define the grid search parameters - Activation function
activation = ['softmax', 'softplus', 'softsign', 'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear']
param_grid = dict(activation=activation)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1, scoring='neg_mean_squared_error')
grid_result = grid.fit(X_train, Y_train)

In [40]:
# Summarize results - Activation function
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: -15.623848 using {'activation': 'softsign'}
-39.875107 (8.515270) with: {'activation': 'softmax'}
-40.768370 (26.455653) with: {'activation': 'softplus'}
-15.623848 (3.964667) with: {'activation': 'softsign'}
-30.994681 (13.384043) with: {'activation': 'relu'}
-40.888661 (11.598617) with: {'activation': 'tanh'}
-40.742364 (11.609105) with: {'activation': 'sigmoid'}
-40.744912 (11.583928) with: {'activation': 'hard_sigmoid'}
-21.720744 (7.037041) with: {'activation': 'linear'}


In [41]:
# Tuning Dropout Regularization

In [42]:
from keras.constraints import maxnorm
from keras.layers import Dropout

In [43]:
# Model to optain best Droput Regularization
def create_model_weightRegularization(dropout_rate=0.0, weight_constraint=0):
	# create model
	model = Sequential()
	model.add(Dense(13, input_dim=13, kernel_initializer='uniform', activation='softsign', kernel_constraint=maxnorm(weight_constraint)))
	model.add(Dropout(dropout_rate))
	model.add(Dense(1, kernel_initializer='uniform'))
	# Compile model
	optimizer = Adagrad(lr=0.1)
	model.compile(loss='mean_squared_error', optimizer=optimizer)
	return model

In [44]:
model = KerasRegressor(build_fn=create_model_weightRegularization, epochs=500, batch_size=3, verbose=0)

In [45]:
# Define the grid search parameters - Dropout Regularization
weight_constraint = [1, 2, 3, 4, 5]
dropout_rate = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
param_grid = dict(dropout_rate=dropout_rate, weight_constraint=weight_constraint)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1, scoring='neg_mean_squared_error')
grid_result = grid.fit(X_train, Y_train)

In [46]:
# Summarize results - Dropout Regularization
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: -14.406803 using {'dropout_rate': 0.1, 'weight_constraint': 2}
-17.805588 (4.512625) with: {'dropout_rate': 0.0, 'weight_constraint': 1}
-16.308607 (4.748015) with: {'dropout_rate': 0.0, 'weight_constraint': 2}
-20.491957 (10.771632) with: {'dropout_rate': 0.0, 'weight_constraint': 3}
-22.948010 (4.417483) with: {'dropout_rate': 0.0, 'weight_constraint': 4}
-20.002946 (6.896428) with: {'dropout_rate': 0.0, 'weight_constraint': 5}
-19.564888 (7.021127) with: {'dropout_rate': 0.1, 'weight_constraint': 1}
-14.406803 (4.420103) with: {'dropout_rate': 0.1, 'weight_constraint': 2}
-19.047971 (6.350501) with: {'dropout_rate': 0.1, 'weight_constraint': 3}
-20.343022 (11.022341) with: {'dropout_rate': 0.1, 'weight_constraint': 4}
-22.215568 (8.599333) with: {'dropout_rate': 0.1, 'weight_constraint': 5}
-18.685232 (3.958685) with: {'dropout_rate': 0.2, 'weight_constraint': 1}
-20.402755 (7.756556) with: {'dropout_rate': 0.2, 'weight_constraint': 2}
-21.318868 (3.774191) with: {'dropout_rat

In [47]:
# Tuning No. of Neurons

In [48]:
# Model to optain best no. of neurons
def create_model_neurons(neurons=1):
	# create model
	model = Sequential()
	model.add(Dense(neurons, input_dim=13, kernel_initializer='uniform', activation='softsign', kernel_constraint=maxnorm(2)))
	model.add(Dropout(0.1))
	model.add(Dense(1, kernel_initializer='uniform'))
	# Compile model
	optimizer = Adagrad(lr=0.1)
	model.compile(loss='mean_squared_error', optimizer=optimizer)
	return model

In [49]:
model = KerasRegressor(build_fn=create_model_neurons, epochs=500, batch_size=3, verbose=0)

In [50]:
# Define the grid search parameters - No. of neurons
neurons = [5, 10, 15, 20, 25, 30]
param_grid = dict(neurons=neurons)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1, scoring='neg_mean_squared_error')
grid_result = grid.fit(X_train, Y_train)

In [51]:
# Summarize results - No. of neurons
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: -13.830474 using {'neurons': 25}
-30.164018 (9.190192) with: {'neurons': 5}
-20.192496 (6.956323) with: {'neurons': 10}
-17.994083 (5.175047) with: {'neurons': 15}
-16.983643 (5.959624) with: {'neurons': 20}
-13.830474 (3.903831) with: {'neurons': 25}
-15.500631 (4.090846) with: {'neurons': 30}


In [53]:
#Base model with best parameters

In [54]:
def create_model_base():
	# create model
	model = Sequential()
	model.add(Dense(25, input_dim=13, kernel_initializer='uniform', activation='softsign', kernel_constraint=maxnorm(2)))
	model.add(Dropout(0.1))
	model.add(Dense(1, kernel_initializer='uniform'))
	# Compile model
	optimizer = Adagrad(lr=0.1)
	model.compile(loss='mean_squared_error', optimizer=optimizer)
	return model

In [55]:
# Evaluate model with complete dataset
model = KerasRegressor(build_fn=create_model_base, epochs=500, batch_size=3, verbose=0)
kfold = KFold(n_splits=10, random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold)
print("Results: %.2f (%.2f) MSE" % (results.mean(), results.std()))

Results: 32.74 (26.01) MSE


In [56]:
# The above result shows decrease in error as we tuned the parameters

# Standardizing Scales
# The features in Boston dataset are all numerical. 
# The scales of each feature in the dataset is different. 
# It is a good practice to normalize the features to get a more precise model

In [58]:
# Evaluate model with standardized dataset
numpy.random.seed(seed)
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=create_model_base, epochs=500, batch_size=3, verbose=0)))
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=10, random_state=seed)
results = cross_val_score(pipeline, X, Y, cv=kfold)
print("Results: %.2f (%.2f) MSE" % (results.mean(), results.std()))

Results: 33.63 (42.42) MSE


In [59]:
# Deeper model

# Theoretically, we can expect an increase in performace as we increase the number of layers.
# Our model might learn higher order features as we increase layer

In [60]:
def create_larger():
	# create model
	model = Sequential()
	model.add(Dense(25, input_dim=13, kernel_initializer='uniform', activation='softsign', kernel_constraint=maxnorm(2)))
	model.add(Dropout(0.1))
	model.add(Dense(10, kernel_initializer='uniform', activation='softsign'))
	model.add(Dense(5, kernel_initializer='uniform', activation='softsign'))
	model.add(Dense(1, kernel_initializer='uniform'))
	# Compile model
	optimizer = Adagrad(lr=0.1)
	model.compile(loss='mean_squared_error', optimizer=optimizer)
	return model
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=create_larger, epochs=500, batch_size=3, verbose=0)))
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=10, random_state=seed)
results = cross_val_score(pipeline, X, Y, cv=kfold)
print("Results: %.2f (%.2f) MSE" % (results.mean(), results.std()))

Results: 22.35 (28.52) MSE


In [61]:
# The deeper model has certainly lesser error than our base model

# Wider model

# Another approach to build a model with increased performance capability is to increase the number of neurons
# In general, the performance of Deeper model is better than Wider model
# In some cases, Wider model might perform better. Hence, it is essential to always try with different combinations

In [62]:
def create_wider():
	# create model
	model = Sequential()
	model.add(Dense(35, input_dim=13, kernel_initializer='uniform', activation='softsign', kernel_constraint=maxnorm(2)))
	model.add(Dropout(0.1))
	model.add(Dense(1, kernel_initializer='uniform'))
	# Compile model
	optimizer = Adagrad(lr=0.1)
	model.compile(loss='mean_squared_error', optimizer=optimizer)
	return model
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=create_wider, epochs=500, batch_size=3, verbose=0)))
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=10, random_state=seed)
results = cross_val_score(pipeline, X, Y, cv=kfold)
print("Results: %.2f (%.2f) MSE" % (results.mean(), results.std()))

Results: 30.07 (31.71) MSE


In [None]:
# The wider model performs better than standardized baseline model, but is not as good as Deeper model