In [None]:
# Glorot or He initialization mitigates the unstable (vanishing/exploding)
# gradients problem. Keras uses Glorot initialization by default.
# To use He initialization instead, pass kernel_initializer="he_normal":
keras.layers.Dense(
    units=10,
    activation="relu",
    kernel_initializer="he_normal",
)
# He-style scaling (scale=2) based on fan_avg rather than fan_in,
# sampled from a uniform distribution:
he_avg_init = keras.initializers.VarianceScaling(
    scale=2.0,
    mode="fan_avg",
    distribution="uniform",
)
keras.layers.Dense(
    units=10,
    activation="sigmoid",
    kernel_initializer=he_avg_init,
)
# Leaky ReLU keeps a small non-zero slope for negative inputs, so neurons
# cannot "die" (get stuck outputting only 0) the way plain ReLU neurons can.
keras.layers.Dense(units=10, kernel_initializer="he_normal")
# Add the LeakyReLU layer right after the layer whose output it should activate.
# alpha is the slope of the function for negative inputs (how much it "leaks").
keras.layers.LeakyReLU(alpha=0.2)
# SELU is a self-normalizing activation that also avoids dying neurons;
# it is meant to be paired with LeCun normal initialization.
layer = keras.layers.Dense(
    units=10,
    activation="selu",
    kernel_initializer="lecun_normal",
)


In [None]:
# Batch normalization: good initialization prevents unstable gradients at the
# start of training, but they can still come back later on.
# Adding a normalization operation just before or after each hidden layer's
# activation function addresses this.
# BN also acts as a regularizer, which reduces (but does not eliminate) the
# need for other regularization techniques such as dropout.

# Here BN is applied after each hidden layer's activation.
model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=[28, 28]))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(units=300, activation="elu", kernel_initializer="he_normal"))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(units=100, activation="elu", kernel_initializer="he_normal"))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(units=10, activation="softmax"))

# BN can also go *before* the activation function: leave the activation out of
# the Dense layer and add it as a separate Activation layer after the BN layer.
# use_bias=False because the BN layer already learns its own offset, so a
# Dense bias term would be redundant.
model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(300, kernel_initializer="he_normal", use_bias=False),
    keras.layers.BatchNormalization(),
    keras.layers.Activation("elu"),
    keras.layers.Dense(100, kernel_initializer="he_normal", use_bias=False),
    keras.layers.BatchNormalization(),
    keras.layers.Activation("elu"),
    keras.layers.Dense(10, activation="softmax"),
])

In [None]:
# Gradient clipping is another way to mitigate the exploding gradients problem:
# during backpropagation, gradients are clipped so they never exceed a threshold.

# clipvalue=1.0 clips every component of the gradient vector to [-1.0, 1.0];
# note that this can change the direction of the gradient vector.
optimizer = keras.optimizers.SGD(clipvalue=1.0)
# clipnorm=1.0 rescales the whole gradient when its L2 norm exceeds 1.0, which
# preserves its direction. This assignment replaces the clipvalue optimizer
# above, so the model is compiled with the clipnorm variant.
optimizer = keras.optimizers.SGD(clipnorm=1.0)
model.compile(loss="mse", optimizer=optimizer)

In [None]:
# Transfer learning: reuse the layers of a pretrained network.
model_a = keras.models.load_model("my_model_A.h5")
# Build a new model from all of model_a's layers except its output layer,
# then add a fresh output layer for the new task.
model_b_on_a = keras.models.Sequential(model_a.layers[:-1])
model_b_on_a.add(keras.layers.Dense(1, activation="sigmoid"))
# model_b_on_a shares its layers with model_a, so training it also updates
# model_a's layers. To avoid this, clone model_a and build on the clone's
# layers instead. clone_model() copies only the architecture, so the weights
# must be copied over explicitly.
model_a_clone = keras.models.clone_model(model_a)
model_a_clone.set_weights(model_a.get_weights())

In [None]:
# Freeze the reused layers (everything except the new output layer) so their
# weights stay the same during the first rounds of training.
# A Sequential model is not subscriptable; slice its `.layers` list instead.
for layer in model_b_on_a.layers[:-1]:
    layer.trainable = False

# The model must be compiled after changing `trainable` for it to take effect.
# Now only the new output layer updates its weights; the reused layers stay frozen.
model_b_on_a.compile(loss="binary_crossentropy", optimizer="sgd", metrics=["accuracy"])
history = model_b_on_a.fit(x_train_b, y_train_b, epochs=4, validation_data=(x_valid_b, y_valid_b))

In [None]:
# Now unfreeze the reused layers so they can be fine-tuned.
# A Sequential model is not subscriptable; slice its `.layers` list instead.
for layer in model_b_on_a.layers[:-1]:
    layer.trainable = True
# Use a lower learning rate so fine-tuning does not wreck the reused weights.
# (`learning_rate` is the supported argument name; `lr` is a deprecated alias.)
optimizer = keras.optimizers.SGD(learning_rate=1e-4)
# Recompile (required after changing `trainable`) and train the whole model.
model_b_on_a.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])
history = model_b_on_a.fit(x_train_b, y_train_b, epochs=4, validation_data=(x_valid_b, y_valid_b))

In [None]:
# Optimizers
# Each assignment below replaces the previous `optimizer`; they are listed as
# alternatives, not meant to be combined.
# (`learning_rate` is the supported argument name; `lr` is a deprecated alias.)

# Momentum: picks up speed while it keeps moving in one direction,
# unlike plain SGD which takes regular steps.
optimizer = keras.optimizers.SGD(learning_rate=0.001, momentum=0.9)
# Nesterov accelerated gradient: measures the gradient slightly ahead,
# in the direction of the momentum, instead of at the current position.
optimizer = keras.optimizers.SGD(learning_rate=0.001, momentum=0.9, nesterov=True)
# Adagrad works well on simple problems, but its learning rate decays so much
# over time that it often stops too early on deep networks, so avoid it there.
# initial_accumulator_value is the starting value of the per-parameter
# accumulator used in the division term;
# epsilon is a small constant added to avoid division by zero.
optimizer = keras.optimizers.Adagrad(learning_rate=0.001, initial_accumulator_value=0.1, epsilon=1e-07)
# Use RMSProp for more complex problems: it applies exponential decay
# (rho) to the accumulator, so it does not grow as fast.
optimizer = keras.optimizers.RMSprop(learning_rate=0.001, rho=0.9)
# Adam combines RMSProp with momentum; it is the most popular choice for deep networks.
optimizer = keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)
# Nadam is Adam plus the Nesterov trick of looking ahead of the current position.

In [None]:
# Power scheduling: the learning rate shrinks as training progresses,
# lr = lr0 / (1 + decay * iteration), dropping quickly at first and then
# more and more slowly.
# (`learning_rate` is the supported argument name; `lr` is a deprecated alias.)
# NOTE(review): `decay` is a legacy optimizer argument; newer Keras releases may
# ignore or reject it -- confirm against the installed version. A
# keras.optimizers.schedules.InverseTimeDecay schedule is the usual replacement.

optimizer = keras.optimizers.SGD(learning_rate=0.01, decay=1e-4)

# Exponential decay: the learning rate drops by a factor of 10 every s epochs

def exponential_decay(lr0, s):
    """Build an exponential-decay schedule function.

    Args:
        lr0: learning rate at epoch 0.
        s: number of epochs over which the learning rate shrinks tenfold.

    Returns:
        A function mapping an epoch index to lr0 * 0.1 ** (epoch / s).
    """
    def exponential_decay_fn(epoch):
        """Return the learning rate to use at the given epoch."""
        decay_factor = 0.1 ** (epoch / s)
        return lr0 * decay_factor

    return exponential_decay_fn


# Despite its name, this is a schedule function (epoch -> learning rate),
# not an optimizer.
exponential_decay_optimizer = exponential_decay(lr0=0.01, s=20)

# Wrap the schedule function in a callback; Keras calls it at the start of
# each epoch to set the optimizer's learning rate.
lr_scheduler = keras.callbacks.LearningRateScheduler(exponential_decay_optimizer)

# Pass the callback to fit() to apply the schedule during training.
# NOTE(review): training data (x_train_scaled, y_train) and validation data
# (x_valid_b, y_valid_b) appear to come from different datasets -- confirm.
history = model.fit(
    x_train_scaled,
    y_train,
    epochs=4,
    validation_data=(x_valid_b, y_valid_b),
    callbacks=[lr_scheduler],
)


In [None]:
# Piecewise constant scheduling: use a fixed learning rate for a range of
# epochs, then a smaller one for the next range, and so on.
# Pass this function to a LearningRateScheduler callback, the same way as the
# function returned by exponential_decay.

def piecewise_constant_fn(epoch):
    """Return the learning rate for the given epoch.

    Epochs 0-4 use 0.01, epochs 5-14 use 0.005, and every later epoch uses 0.001.
    """
    # (exclusive upper epoch bound, learning rate), checked in order
    for upper_bound, rate in ((5, 0.01), (15, 0.005)):
        if epoch < upper_bound:
            return rate
    return 0.001
    
# Performance scheduling: this callback multiplies the learning rate by 0.5
# whenever the monitored metric (validation loss by default) has not improved
# for 5 consecutive epochs.
lr_scheduler = keras.callbacks.ReduceLROnPlateau(
    factor=0.5,
    patience=5,
)

# Alternatively, use one of Keras's built-in schedules and hand it to the
# optimizer in place of a fixed learning rate.
# s = number of training steps in 20 epochs (assuming a batch size of 32)
s = 20 * len(x_train) // 32
learning_rate = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.01,
    decay_steps=s,
    decay_rate=0.1,
)
optimizer = keras.optimizers.SGD(learning_rate=learning_rate)

In [None]:
# Weight regularization: keras.regularizers provides l1(), l2() and l1_l2().
# Here an l2 penalty with factor 0.01 is applied to the layer's kernel weights.

layer = keras.layers.Dense(
    units=100,
    kernel_regularizer=keras.regularizers.l2(0.01),
    kernel_initializer="he_normal",
    activation="elu",
)

In [None]:
# functools.partial pre-fills the arguments you would otherwise repeat on every call, giving the function the defaults you want

from functools import partial

# Dense-layer factory with the shared defaults baked in; any keyword passed at
# call time overrides the corresponding default.
RegularizedDense = partial(
    keras.layers.Dense,
    kernel_regularizer=keras.regularizers.l2(0.01),
    kernel_initializer="he_normal",
    activation="elu",
)

model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=[28, 28]))
model.add(RegularizedDense(300))
model.add(RegularizedDense(100))
# The output layer overrides the activation and initializer defaults.
model.add(RegularizedDense(10, activation="softmax", kernel_initializer="glorot_uniform"))

In [None]:
# Dropout
import numpy as np

# Dropout (rate=0.2) is applied to the inputs of every Dense layer.
model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=[28, 28]))
model.add(keras.layers.Dropout(rate=0.2))
model.add(keras.layers.Dense(units=300, activation="elu", kernel_initializer="he_normal"))
model.add(keras.layers.Dropout(rate=0.2))
model.add(keras.layers.Dense(units=100, activation="elu", kernel_initializer="he_normal"))
model.add(keras.layers.Dropout(rate=0.2))
model.add(keras.layers.Dense(units=10, activation="softmax"))

# MC Dropout: keep dropout active at inference time (training=True), make many
# stochastic predictions for the same inputs, and average them.

# Stack 100 forward passes; axis 0 of the result indexes the passes.
y_probas = np.stack([model(x_test_scaled, training=True) for _ in range(100)])

# Average over the passes (axis 0) to get one prediction per test instance.
y_proba = y_probas.mean(axis=0)

# A cleaner way to implement MC Dropout: a Dropout layer that is always active

class MCDropout(keras.layers.Dropout):
    """Dropout layer that stays active regardless of the training/inference mode."""

    def call(self, inputs):
        """Apply dropout to `inputs`, always forcing training=True."""
        return super(MCDropout, self).call(inputs, training=True)
    
# Max-norm regularization: instead of adding a penalty term to the loss, it
# constrains the weights of each neuron so their norm does not exceed a maximum.
# For convolutional layers, set the axis explicitly: max_norm(1., axis=[0, 1, 2])
keras.layers.Dense(
    units=100,
    kernel_constraint=keras.constraints.max_norm(1.0),
    kernel_initializer="he_normal",
    activation="elu",
)