In [181]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
from keras.callbacks import EarlyStopping

In [182]:
# Load the Boston housing bunch and copy its feature matrix into an ndarray.
# NOTE(review): recent scikit-learn releases no longer ship load_boston — confirm the pinned version.
boston = load_boston()
features = np.array(boston['data'])

In [183]:
# Regression target: copy the bunch's target vector into its own ndarray.
target = np.array(boston['target'])

In [184]:
# Split into training and test sets (25% of the rows are held out for testing).
# A fixed random_state makes the split, and every test MSE computed from it,
# repeatable across kernel restarts.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    features, target, test_size=0.25, random_state=42
)

In [185]:
# Show the dimensions of the training features and the training target.
print(Xtrain.shape, ytrain.shape, sep='\n')

# Number of feature columns; used as the input width of every network below.
n_cols = Xtrain.shape[-1]

(379, 13)
(379,)


### Basic model

In [186]:
# Baseline network: three 100-unit relu hidden layers feeding one output unit.
model = Sequential([
    Dense(100, activation='relu', input_shape=(n_cols,)),
    Dense(100, activation='relu'),
    Dense(100, activation='relu'),
    Dense(1),
])

In [187]:
# Compile and fit model.
# 25% of the training rows are held out by Keras for validation.
# The epoch count is pinned to 10 (the value the recorded run used) because
# the default for `epochs` is not the same in every Keras release.
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(Xtrain, ytrain, validation_split=0.25, epochs=10)

Train on 284 samples, validate on 95 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x224c9642400>

In [188]:
# Test-set MSE of the baseline model; predict() returns a 2-D array, so its
# first column is taken to line up with the 1-D target.
basic_preds = model.predict(Xtest)[:, 0]
mean_squared_error(ytest, basic_preds)

65.377005699694195

### Try different learning rates

Note that in these experiments the SGD optimizer did not work well for this continuous regression problem, whereas Adam did (as shown above). In the DataCamp slides, SGD is demonstrated on a classification problem instead.

Further, using the SGD optimizer with an explicit learning rate on a Sequential model in which every hidden layer uses the 'relu' activation leads to a loss of nan. I've changed the activation function of the last hidden layer to 'tanh' to get this exercise to work. 'softmax' and 'sigmoid' also work to a certain extent. However, in reality I wouldn't use the structure below for a regression network.

In [189]:
def get_new_model(n_features=None):
    """Build a fresh, uncompiled network for the learning-rate experiment.

    Parameters
    ----------
    n_features : int, optional
        Width of the input layer. When omitted, the column count of the
        notebook-level training matrix ``Xtrain`` is used.

    Returns
    -------
    Sequential
        Three 100-unit hidden layers (relu, relu, tanh) followed by a single
        output unit with no activation argument.
    """
    if n_features is None:
        # The previous version read an undefined global `X`, which fails on a
        # fresh kernel; `Xtrain` is the matrix the notebook actually trains on.
        n_features = Xtrain.shape[1]
    mod = Sequential()
    mod.add(Dense(100, activation='relu', input_shape=(n_features,)))
    mod.add(Dense(100, activation='relu'))
    # Activation would be 'relu' normally, but is 'tanh' here to get the SGD learning rate to work
    mod.add(Dense(100, activation='tanh'))
    mod.add(Dense(1))
    return mod

In [190]:
# Learning rates to sweep with plain SGD, from very small to very large.
lr_to_test = [0.0001, 0.001, 0.01, 0.1, 0.5, 1]

for lr in lr_to_test:
    print('\nTesting model with learning rate %f' % lr)
    # A brand-new model per rate so no weights carry over between runs.
    model2 = get_new_model()
    # NOTE(review): newer Keras releases name this argument `learning_rate` — confirm against the pinned version.
    sgd = SGD(lr=lr)
    model2.compile(optimizer=sgd, loss='mean_squared_error')
    # Fit on the training split: the previous `X` / `y` names are not defined
    # anywhere in this notebook. Epochs are pinned to the 10 seen in the
    # recorded output because the Keras default differs between releases.
    model2.fit(Xtrain, ytrain, epochs=10)


Testing model with learning rate 0.000100
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Testing model with learning rate 0.001000
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Testing model with learning rate 0.010000
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Testing model with learning rate 0.100000
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Testing model with learning rate 0.500000
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Testing model with learning rate 1.000000
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Early stopping

In [194]:
# Callback that ends training once the monitored quantity has gone 3 epochs
# without improving; it is passed to the fit() calls in the cells below.
early_stopping_monitor = EarlyStopping(patience=3)

# Same architecture as the baseline: three 100-unit relu layers, one output.
model3 = Sequential([
    Dense(100, input_shape=(n_cols,), activation='relu'),
    Dense(100, activation='relu'),
    Dense(100, activation='relu'),
    Dense(1),
])

In [195]:
# Adam + MSE, up to 30 epochs, with early stopping watching the 25% validation split.
model3.compile(loss='mean_squared_error', optimizer='adam')
model3.fit(
    Xtrain,
    ytrain,
    epochs=30,
    validation_split=0.25,
    callbacks=[early_stopping_monitor],
)

Train on 284 samples, validate on 95 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30


<keras.callbacks.History at 0x224d07e6e10>

In [196]:
# Test-set MSE for the early-stopped model (first column of the 2-D predictions).
model3_preds = model3.predict(Xtest)[:, 0]
mean_squared_error(ytest, model3_preds)

49.616126585894719

### More experimenting

#### More layers

In [197]:
# Deeper variant: ten 100-unit relu hidden layers (the first one declares the
# input shape), followed by a single output unit.
model4 = Sequential(
    [Dense(100, activation='relu', input_shape=(n_cols,))]
    + [Dense(100, activation='relu') for _ in range(9)]
    + [Dense(1)]
)

In [198]:
# Train the deeper network with the same recipe: Adam, MSE, <=30 epochs, early stopping.
model4.compile(loss='mean_squared_error', optimizer='adam')
model4.fit(
    Xtrain,
    ytrain,
    epochs=30,
    validation_split=0.25,
    callbacks=[early_stopping_monitor],
)

Train on 284 samples, validate on 95 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30


<keras.callbacks.History at 0x224d0d76a20>

In [200]:
# Test-set MSE for the ten-hidden-layer model.
model4_preds = model4.predict(Xtest)[:, 0]
mean_squared_error(ytest, model4_preds)

64.667983680161015

#### More nodes

In [201]:
# Wider variant: three relu hidden layers of 1000 units each, one output unit.
model5 = Sequential([
    Dense(1000, input_shape=(n_cols,), activation='relu'),
    Dense(1000, activation='relu'),
    Dense(1000, activation='relu'),
    Dense(1),
])

In [202]:
# Train the wider network: Adam, MSE, <=30 epochs, early stopping on the validation split.
model5.compile(loss='mean_squared_error', optimizer='adam')
model5.fit(
    Xtrain,
    ytrain,
    epochs=30,
    validation_split=0.25,
    callbacks=[early_stopping_monitor],
)

Train on 284 samples, validate on 95 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30


<keras.callbacks.History at 0x224d27fcb00>

In [203]:
# Test-set MSE for the three-layer, 1000-unit model.
model5_preds = model5.predict(Xtest)[:, 0]
mean_squared_error(ytest, model5_preds)

45.810263704275414

#### More layers and nodes

In [204]:
# Deeper and wider variant: ten 1000-unit relu hidden layers (the first one
# declares the input shape), followed by a single output unit.
model6 = Sequential(
    [Dense(1000, activation='relu', input_shape=(n_cols,))]
    + [Dense(1000, activation='relu') for _ in range(9)]
    + [Dense(1)]
)

In [205]:
# Train the deep-and-wide network: Adam, MSE, <=30 epochs, early stopping.
model6.compile(loss='mean_squared_error', optimizer='adam')
model6.fit(
    Xtrain,
    ytrain,
    epochs=30,
    validation_split=0.25,
    callbacks=[early_stopping_monitor],
)

Train on 284 samples, validate on 95 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30


<keras.callbacks.History at 0x224d03f6ac8>

In [206]:
# Test-set MSE for the ten-layer, 1000-unit model.
model6_preds = model6.predict(Xtest)[:, 0]
mean_squared_error(ytest, model6_preds)

67.420533055254552

#### 1,000 nodes, 2 layers

In [211]:
# Shallow-but-wide variant: two relu hidden layers of 1000 units, one output unit.
model7 = Sequential([
    Dense(1000, input_shape=(n_cols,), activation='relu'),
    Dense(1000, activation='relu'),
    Dense(1),
])

In [212]:
# Train the two-layer network: Adam, MSE, <=30 epochs, early stopping.
model7.compile(loss='mean_squared_error', optimizer='adam')
model7.fit(
    Xtrain,
    ytrain,
    epochs=30,
    validation_split=0.25,
    callbacks=[early_stopping_monitor],
)

Train on 284 samples, validate on 95 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x224d3a09c50>

In [213]:
# Test-set MSE for the two-layer, 1000-unit model.
model7_preds = model7.predict(Xtest)[:, 0]
mean_squared_error(ytest, model7_preds)

52.323601616701026

### Conclusions

Based on the above, it seems that for this particular application, fewer layers with more nodes per layer is optimal. Specifically, 3 hidden layers with 1,000 nodes each produced the lowest test MSE (about 45.8).