In [None]:
# 1. L1/L2 Regularization

from keras import regularizers
from keras.layers import Dense

# NOTE(review): `model` is assumed to be an existing Sequential model created
# in an earlier cell -- it is not defined in this snippet.
model.add(Dense(64, input_dim=64,
                kernel_regularizer=regularizers.l2(0.01)))

# Here, 0.01 is the lambda value (the regularization strength). It is a
# hyperparameter as well and can be tuned, e.g. with grid search.
# Use regularizers.l1(...) for an L1 penalty, or
# regularizers.l1_l2(l1=..., l2=...) to combine both.

In [None]:
# 2. Dropout

from keras.layers import Dense, Dropout
from keras.models import Sequential

# NOTE(review): `input_num_units`, `hidden1_num_units` and `output_num_units`
# are expected to be defined in an earlier cell.
model = Sequential([
    # Keras 2 takes the layer width as the first positional argument (`units`);
    # `output_dim=` is the deprecated Keras 1 spelling.
    Dense(hidden1_num_units, input_dim=input_num_units, activation='relu'),
    Dropout(0.25),
    # Only the first layer of a Sequential model needs an input size; later
    # layers infer it from the previous layer's output.
    Dense(output_num_units, activation='softmax'),
])

# Dropout randomly zeroes 25% of the previous layer's outputs. A fresh random
# mask is drawn at every training step (every batch), not once per epoch, and
# dropout is switched off at inference time.
# This behaves like training an ensemble of thinned networks, which often
# generalizes better than a single network.

In [None]:
# 3. Data Augmentation (in Keras, we use ImageDataGenerator for this)

from keras.preprocessing.image import ImageDataGenerator

# Randomly flip training images left/right while batches are generated.
datagen = ImageDataGenerator(horizontal_flip=True)

# NOTE(review): `train` is presumably the array of training images defined
# elsewhere. `fit` only computes dataset statistics, which are needed for the
# featurewise_* / zca_whitening options; it is harmless but not required when
# only horizontal_flip is enabled.
datagen.fit(train)

In [None]:
# 4. Early Stopping

from keras.callbacks import EarlyStopping

# 'val_loss' is the validation loss Keras logs after each epoch; it is only
# available when validation data (validation_split / validation_data) is
# passed to fit(). An unknown metric name is not logged, so early stopping
# would never trigger.
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

# monitor  - which logged metric to watch
# patience - number of epochs with no further improvement after which the training will be stopped
#
# Pass the callback to training, e.g.:
#   model.fit(X_train, y_train, validation_split=0.2, callbacks=[early_stopping])
early_stopping

In [None]:
# 5. Batch Normalization
# Generally, one uses this between the linear and nonlinear layers
# (it normalizes the input to your non-linear function).

from keras.layers import Activation, BatchNormalization, Dense, Dropout
from keras.models import Sequential
from keras.optimizers import SGD

# instantiate model
model = Sequential()

# we can think of this chunk as the input layer
# (Keras 2 spells the initializer argument `kernel_initializer`; `init=` is Keras 1)
model.add(Dense(64, input_dim=14, kernel_initializer='random_uniform'))
model.add(BatchNormalization())
model.add(Activation('tanh'))
model.add(Dropout(0.5))

# The same linear -> BatchNormalization -> activation pattern applies to
# convolutional layers, e.g. Conv2D(64, (3, 3), use_bias=False) followed by
# BatchNormalization() and Activation('relu').

# we can think of this chunk as the hidden layer
model.add(Dense(64, kernel_initializer='random_uniform'))
model.add(BatchNormalization())
model.add(Activation('tanh'))
model.add(Dropout(0.5))

# we can think of this chunk as the output layer
model.add(Dense(2, kernel_initializer='random_uniform'))
model.add(BatchNormalization())
model.add(Activation('softmax'))

# setting up the optimization of our weights
sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
# A 2-unit softmax output pairs with categorical cross-entropy.
# NOTE(review): this assumes y_train is one-hot encoded with 2 columns -- confirm.
# Accuracy reporting is requested here via `metrics`; the old
# `show_accuracy=True` argument of fit() no longer exists.
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

# running the fitting (`epochs` replaces the Keras 1 `nb_epoch` argument)
# NOTE(review): `X_train` / `y_train` are expected to be defined elsewhere.
model.fit(X_train, y_train, epochs=20, batch_size=16, validation_split=0.2, verbose=2)

# For CNNs
# Convolutional layers cannot be appended to the dense model above (it is
# already compiled/fitted and outputs 2-D tensors), so build a separate model.

from keras.layers import Activation, BatchNormalization, Conv2D, Dropout, MaxPooling2D
from keras.models import Sequential

# NOTE(review): `inputShape` (shape of one input image) and `chanDim` (index of
# the channel axis, presumably -1 for channels-last data) are expected to be
# defined elsewhere -- confirm.
cnn_model = Sequential()

# Only the first layer needs `input_shape`.
cnn_model.add(Conv2D(64, (3, 3), padding="same", input_shape=inputShape))
cnn_model.add(Activation("relu"))
# In this variant batch normalization is applied after the activation and
# normalizes along the channel axis.
cnn_model.add(BatchNormalization(axis=chanDim))
cnn_model.add(Conv2D(64, (3, 3), padding="same"))
cnn_model.add(Activation("relu"))
cnn_model.add(BatchNormalization(axis=chanDim))
cnn_model.add(MaxPooling2D(pool_size=(2, 2)))
cnn_model.add(Dropout(0.25))

In [None]:
# Batch Normalization in TensorFlow (graph-mode `tf.layers` API)

# Boolean flag handed to every batch-norm layer; evaluates to False unless a
# value is fed for it.
training = tf.placeholder_with_default(False, shape=(), name='training')


def _batch_norm(inputs):
    """Apply batch normalization with the settings shared by all layers here."""
    return tf.layers.batch_normalization(inputs, training=training, momentum=0.9)


# NOTE(review): `X`, `n_hidden_layers` (used as the number of units per hidden
# layer) and `n_outputs` are expected to be defined elsewhere.
# NOTE(review): with this API the moving-average updates are usually collected
# in tf.GraphKeys.UPDATE_OPS and must be run alongside the training op --
# confirm the training cell does this.

# First hidden layer: dense -> batch norm -> ELU
hidden1 = tf.layers.dense(X, n_hidden_layers, name='hidden1')
bn1 = _batch_norm(hidden1)
bn1_act = tf.nn.elu(bn1)

# Second hidden layer: dense -> batch norm -> ELU
hidden2 = tf.layers.dense(bn1_act, n_hidden_layers, name='hidden2')
bn2 = _batch_norm(hidden2)
bn2_act = tf.nn.elu(bn2)

# Output layer: dense -> batch norm (no activation applied here)
logits_before_bn = tf.layers.dense(bn2_act, n_outputs, name='outputs')
logits = _batch_norm(logits_before_bn)

In [None]:
# 6. Gradient Clipping

from keras.optimizers import RMSprop

# Option A - clip by value: every gradient component is clipped to
# a maximum value of 0.5 and a minimum value of -0.5.
rmsprop_clipvalue = RMSprop(clipvalue=0.5)

# Option B - clip by norm: each gradient is rescaled so that its L2 norm
# is at most 1.0.
rmsprop_clipnorm = RMSprop(clipnorm=1.0)

# Compile with whichever optimizer you prefer; clip-by-value is used here.
# NOTE(review): `model` is expected to be defined in an earlier cell.
model.compile(loss='mse', optimizer=rmsprop_clipvalue)

In [None]:
# Gradient Clipping in TensorFlow

# NOTE(review): `learning_rate` and `loss` are expected to be defined elsewhere.
threshold = 1.0
optimizer = tf.train.GradientDescentOptimizer(learning_rate)

# Split minimize() into its two halves so the gradients can be modified
# between computing and applying them.
grads_and_vars = optimizer.compute_gradients(loss)
capped_gvs = [
    (tf.clip_by_value(grad, -threshold, threshold), var)
    for grad, var in grads_and_vars
    # compute_gradients yields None for variables that do not influence `loss`;
    # those cannot be clipped, so skip them.
    if grad is not None
]
training_op = optimizer.apply_gradients(capped_gvs)
# This computes the gradients and clips each component to between -1.0 and 1.0.

In [None]:
# Optimizers in NN

# 1. Momentum optimizer
# 2. Nesterov Accelerated Gradient
# 3. AdaGrad
# 4. RMSProp
# 5. Adam optimization
# 6. SGD