## Reduce Overfitting with Dropouts
1. How dropout works
2. Dropout on input layers
3. Dropout on hidden layers

Dropout is a regularization technique for neural network models proposed by Srivastava, et al. in their 2014 paper *Dropout: A Simple Way to Prevent Neural Networks from Overfitting*. Dropout is a technique where randomly selected neurons are ignored during training. They are "dropped out" randomly. This means that their contribution to the activation of downstream neurons is temporarily removed on the forward pass, and no weight updates are applied to those neurons on the backward pass.

Dropout is only used during the training of a model and is not used when evaluating the model.

In [3]:
import numpy
from pandas import read_csv
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.optimizers import SGD
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [10]:
# Seed NumPy's global RNG so repeated runs start from the same state
numpy.random.seed(7)

# Read the header-less sonar CSV and keep its raw value matrix
dataframe = read_csv("Accessory_files/sonar.csv", header=None)
dataset = dataframe.values

# Columns 0-59 become the float feature matrix, column 60 holds the labels
X, Y = dataset[:, 0:60].astype(float), dataset[:, 60]

# Fit the label encoder and turn the class labels into integer codes
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(Y)

# Baseline network: two hidden layers, no dropout
def create_baseline():
    """Build and compile the baseline classifier.

    The network takes 60 inputs and stacks Dense layers of 60 and 30
    ReLU units followed by a single sigmoid output unit. It is compiled
    with binary cross-entropy loss, an SGD optimizer (lr=0.01,
    momentum=0.8) and the accuracy metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    layers = [
        Dense(60, input_dim=60, kernel_initializer='normal', activation='relu'),
        Dense(30, kernel_initializer='normal', activation='relu'),
        Dense(1, kernel_initializer='normal', activation='sigmoid'),
    ]
    model = Sequential(layers)

    # Plain SGD with momentum; decay and Nesterov are explicitly disabled
    optimizer = SGD(lr=0.01, momentum=0.8, decay=0.0, nesterov=False)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# Chain standardization and the Keras classifier into one estimator, so the
# scaler is refit on the training part of every cross-validation fold
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', KerasClassifier(build_fn=create_baseline, epochs=300, batch_size=16, verbose=0)),
]
pipeline = Pipeline(estimators)

# Score with shuffled, seeded 10-fold stratified cross-validation
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
results = cross_val_score(pipeline, X, encoded_Y, cv=kfold)

# Report mean and standard deviation of the fold scores as percentages
print("Baseline: %.2f%% (%.2f%%)" % (results.mean() * 100, results.std() * 100))

Baseline: 82.14% (7.30%)


## With Dropouts
Dropout can be applied to the input neurons, also called the visible layer. In the example below we add a new Dropout layer between the input (or visible) layer and the first hidden layer. The dropout rate is set to 20%, meaning one in five inputs will be randomly excluded from each update cycle.
Additionally, as recommended in the original paper on dropout, a constraint is imposed on the weights for each hidden layer, ensuring that the maximum norm of the weights does not exceed a value of 3. This is done by setting the `kernel_constraint` argument on the Dense class when constructing the layers. The momentum was increased from 0.8 to 0.9. The original dropout paper also recommends raising the learning rate by roughly an order of magnitude; the code immediately below keeps it at 0.01, and the final example further down raises it to 0.1. Continuing on from the baseline example above, the code below exercises the same network with input dropout.

In [13]:
from keras.layers import Dropout
from keras.constraints import maxnorm
# Seed NumPy's global RNG so repeated runs start from the same state
numpy.random.seed(7)

# Read the header-less sonar CSV and keep its raw value matrix
dataframe = read_csv("Accessory_files/sonar.csv", header=None)
dataset = dataframe.values

# Columns 0-59 become the float feature matrix, column 60 holds the labels
X, Y = dataset[:, 0:60].astype(float), dataset[:, 60]

# Fit the label encoder and turn the class labels into integer codes
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(Y)

# Same network as the baseline, with dropout applied to the inputs
def create_baseline():
    """Build and compile the classifier with dropout on the input layer.

    A Dropout layer with rate 0.2 sits in front of the 60 inputs. The two
    hidden Dense layers (60 and 30 ReLU units) carry a max-norm kernel
    constraint of 3; the output is a single sigmoid unit. The model is
    compiled with binary cross-entropy loss, an SGD optimizer (lr=0.01,
    momentum=0.9) and the accuracy metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    layers = [
        # Randomly drops 20% of the input features on each training update
        Dropout(0.2, input_shape=(60,)),
        Dense(60, kernel_initializer='normal', activation='relu', kernel_constraint=maxnorm(3)),
        Dense(30, kernel_initializer='normal', activation='relu', kernel_constraint=maxnorm(3)),
        Dense(1, kernel_initializer='normal', activation='sigmoid'),
    ]
    model = Sequential(layers)

    # Momentum is 0.9 here; the learning rate stays at 0.01
    optimizer = SGD(lr=0.01, momentum=0.9, decay=0.0, nesterov=False)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# Chain standardization and the Keras classifier into one estimator, so the
# scaler is refit on the training part of every cross-validation fold
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', KerasClassifier(build_fn=create_baseline, epochs=300, batch_size=16, verbose=0)),
]
pipeline = Pipeline(estimators)

# Score with shuffled, seeded 10-fold stratified cross-validation
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
results = cross_val_score(pipeline, X, encoded_Y, cv=kfold)

# NOTE(review): the label still reads "Baseline" although this run scores
# the input-dropout model built by this cell's create_baseline()
print("Baseline: %.2f%% (%.2f%%)" % (results.mean() * 100, results.std() * 100))

Baseline: 87.49% (4.87%)


In [15]:
from keras.layers import Dropout
from keras.constraints import maxnorm
# Seed NumPy's global RNG so repeated runs start from the same state
numpy.random.seed(7)

# Read the header-less sonar CSV and keep its raw value matrix
dataframe = read_csv("Accessory_files/sonar.csv", header=None)
dataset = dataframe.values

# Columns 0-59 become the float feature matrix, column 60 holds the labels
X, Y = dataset[:, 0:60].astype(float), dataset[:, 60]

# Fit the label encoder and turn the class labels into integer codes
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(Y)

# Same network as the baseline, with dropout after each hidden layer
def create_baseline():
    """Build and compile the classifier with dropout on the hidden layers.

    The network takes 60 inputs. Each hidden Dense layer (60 and 30 ReLU
    units, max-norm kernel constraint of 3) is followed by a Dropout
    layer with rate 0.2; the output is a single sigmoid unit. The model
    is compiled with binary cross-entropy loss, an SGD optimizer
    (lr=0.01, momentum=0.9) and the accuracy metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    layers = [
        Dense(60, input_dim=60, kernel_initializer='normal', activation='relu', kernel_constraint=maxnorm(3)),
        Dropout(0.2),
        Dense(30, kernel_initializer='normal', activation='relu', kernel_constraint=maxnorm(3)),
        Dropout(0.2),
        Dense(1, kernel_initializer='normal', activation='sigmoid'),
    ]
    model = Sequential(layers)

    # Momentum is 0.9 here; the learning rate stays at 0.01
    optimizer = SGD(lr=0.01, momentum=0.9, decay=0.0, nesterov=False)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# Chain standardization and the Keras classifier into one estimator, so the
# scaler is refit on the training part of every cross-validation fold
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', KerasClassifier(build_fn=create_baseline, epochs=300, batch_size=16, verbose=0)),
]
pipeline = Pipeline(estimators)

# Score with shuffled, seeded 10-fold stratified cross-validation
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
results = cross_val_score(pipeline, X, encoded_Y, cv=kfold)

# NOTE(review): the label still reads "Baseline" although this run scores
# the hidden-layer-dropout model built by this cell's create_baseline()
print("Baseline: %.2f%% (%.2f%%)" % (results.mean() * 100, results.std() * 100))

Baseline: 82.68% (6.48%)


## Tips for using dropouts
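1. Choose a dropout rate between 20% and 50%; 20% is a reasonable starting point.
2. Use a larger network, since dropout reduces the effective capacity available during training.
3. Apply dropout on both the input (visible) layer and the hidden layers.
4. Increase the learning rate and use a high momentum value (e.g. 0.9).
5. Constrain the size of the network weights, e.g. with a max-norm constraint.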

In [16]:
from keras.layers import Dropout
from keras.constraints import maxnorm
# Seed NumPy's global RNG so repeated runs start from the same state
numpy.random.seed(7)

# Read the header-less sonar CSV and keep its raw value matrix
dataframe = read_csv("Accessory_files/sonar.csv", header=None)
dataset = dataframe.values

# Columns 0-59 become the float feature matrix, column 60 holds the labels
X, Y = dataset[:, 0:60].astype(float), dataset[:, 60]

# Fit the label encoder and turn the class labels into integer codes
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(Y)

# Same network as the baseline, with dropout on the inputs and hidden layers
def create_baseline():
    """Build and compile the classifier with dropout on every layer input.

    A Dropout layer with rate 0.2 sits in front of the 60 inputs and
    after each hidden Dense layer (60 and 30 ReLU units, max-norm kernel
    constraint of 3); the output is a single sigmoid unit. The model is
    compiled with binary cross-entropy loss, an SGD optimizer (lr=0.1,
    momentum=0.9) and the accuracy metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    layers = [
        Dropout(0.2, input_shape=(60,)),
        Dense(60, kernel_initializer='normal', activation='relu', kernel_constraint=maxnorm(3)),
        Dropout(0.2),
        Dense(30, kernel_initializer='normal', activation='relu', kernel_constraint=maxnorm(3)),
        Dropout(0.2),
        Dense(1, kernel_initializer='normal', activation='sigmoid'),
    ]
    model = Sequential(layers)

    # Learning rate of 0.1 combined with momentum 0.9
    optimizer = SGD(lr=0.1, momentum=0.9, decay=0.0, nesterov=False)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# Chain standardization and the Keras classifier into one estimator, so the
# scaler is refit on the training part of every cross-validation fold
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', KerasClassifier(build_fn=create_baseline, epochs=300, batch_size=16, verbose=0)),
]
pipeline = Pipeline(estimators)

# Score with shuffled, seeded 10-fold stratified cross-validation
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
results = cross_val_score(pipeline, X, encoded_Y, cv=kfold)

# NOTE(review): the label still reads "Baseline" although this run scores
# the input-plus-hidden dropout model built by this cell's create_baseline()
print("Baseline: %.2f%% (%.2f%%)" % (results.mean() * 100, results.std() * 100))

Baseline: 84.59% (7.11%)
