# CH11 Training Deep Neural Networks


In [1]:
import tensorflow as tf

In [2]:
# Display the installed TensorFlow version so readers know which release the
# outputs below were produced with.
tf.__version__

'2.10.1'

### Activation functions and He initialization


In [3]:
# 50-unit ReLU layer whose kernel is initialized with the "he_normal" scheme.
dense = tf.keras.layers.Dense(
    units=50,
    activation="relu",
    kernel_initializer="he_normal",
)

In [9]:
# Inspect which sampling distribution the layer's kernel initializer uses.
dense.kernel_initializer.distribution

'truncated_normal'

In [10]:
# Custom initializer: VarianceScaling with scale=2, based on fan_avg, sampling
# from a uniform distribution.
he_avg_init = tf.keras.initializers.VarianceScaling(
    scale=2.0,
    mode="fan_avg",
    distribution="uniform",
)

# Pass the initializer object (rather than a string name) to the layer.
dense = tf.keras.layers.Dense(
    units=50,
    activation="sigmoid",
    kernel_initializer=he_avg_init,
)

Leaky ReLU


In [11]:
# LeakyReLU is created as a layer object and handed to Dense as its activation.
leaky_relu = tf.keras.layers.LeakyReLU(alpha=0.2)
dense = tf.keras.layers.Dense(
    units=50,
    activation=leaky_relu,
    kernel_initializer="he_normal",
)

ELU


In [12]:
# ELU is available by name, so a plain string is enough.
dense = tf.keras.layers.Dense(
    units=50,
    activation="elu",
    kernel_initializer="he_normal",
)

`SELU` can make a network self-normalize, but only under a few conditions:


• The input features must be standardized: mean 0 and standard deviation 1.

• Every hidden layer’s weights must be initialized using LeCun normal initialization. In Keras, this means setting `kernel_initializer="lecun_normal"`.

• The self-normalizing property is only guaranteed with plain MLPs.

• You cannot use regularization techniques like ℓ1 or ℓ2 regularization, max-norm, batch-norm, or regular dropout.


In [13]:
# SELU paired with LeCun normal initialization, as the conditions above require.
dense = tf.keras.layers.Dense(
    units=50,
    activation="selu",
    kernel_initializer="lecun_normal",
)

An example of a self-normalizing network using SELU:


In [14]:
# Seed TensorFlow's global RNG before any layer is created.
tf.random.set_seed(42)

# Deep MLP: flattened 28x28 input -> 100 SELU hidden layers of 100 units each
# (LeCun normal init) -> 10-way softmax output.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=[28, 28]))
for i in range(100):
    model.add(
        tf.keras.layers.Dense(
            units=100,
            activation="selu",
            kernel_initializer="lecun_normal",
        )
    )
model.add(tf.keras.layers.Dense(units=10, activation="softmax"))

In [15]:
# Plain SGD with a small learning rate; accuracy is tracked during training.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
    metrics=["accuracy"],
)

# Load Fashion MNIST and hold out the last 5,000 training samples for validation.
fashion_mnist = tf.keras.datasets.fashion_mnist.load_data()
(X_train_full, y_train_full), (X_test, y_test) = fashion_mnist
X_train, X_valid = X_train_full[:-5000], X_train_full[-5000:]
y_train, y_valid = y_train_full[:-5000], y_train_full[-5000:]

# Divide every split by 255.
X_train = X_train / 255
X_valid = X_valid / 255
X_test = X_test / 255

class_names = [
    "T-shirt/top", "Trouser", "Pullover", "Dress", "Coat",
    "Sandal", "Shirt", "Sneaker", "Bag", "Ankle boot",
]

In [16]:
# Per-pixel statistics are computed on the training split only and then reused
# for the validation and test splits, so all three share the same scaling.
pixel_means = X_train.mean(axis=0, keepdims=True)
pixel_stds = X_train.std(axis=0, keepdims=True)

X_train_scaled, X_valid_scaled, X_test_scaled = (
    (split - pixel_means) / pixel_stds
    for split in (X_train, X_valid, X_test)
)

In [17]:
# Train the SELU network on the standardized inputs for 5 epochs.
history = model.fit(
    x=X_train_scaled,
    y=y_train,
    epochs=5,
    validation_data=(X_valid_scaled, y_valid),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


The network managed to learn, despite how deep it is. Now look at what happens if we try to use the ReLU activation function instead:


In [18]:
# Re-seed TensorFlow's global RNG so this ReLU baseline is as reproducible as the
# seeded SELU network it is compared against.
tf.random.set_seed(42)

# Same depth and width as the SELU network (100 hidden layers of 100 units), but
# with ReLU activations and He-normal initialization.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=[28, 28]))
for layer in range(100):
    model.add(tf.keras.layers.Dense(100, activation="relu",
                                    kernel_initializer="he_normal"))
model.add(tf.keras.layers.Dense(10, activation="softmax"))

In [19]:
# Same loss, optimizer and learning rate as the SELU model was compiled with, so
# the two runs differ only in activation function and initializer.
model.compile(loss="sparse_categorical_crossentropy",
              optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
              metrics=["accuracy"])

In [20]:
# Train the ReLU network on the same standardized data and for the same number
# of epochs as the SELU network. Note that `history` is overwritten here.
history = model.fit(X_train_scaled, y_train, epochs=5,
                    validation_data=(X_valid_scaled, y_valid))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Not great at all: we suffered from the vanishing/exploding gradients problem.


In [None]:
# Reset Keras' global state and re-seed TensorFlow's RNG before moving on to the
# next set of examples.
tf.keras.backend.clear_session()
tf.random.set_seed(42)

GELU, Swish and Mish


In [22]:
def mish(x):
    """Mish activation: ``x * tanh(softplus(x))``, applied via TensorFlow math ops."""
    softplus_x = tf.math.softplus(x)
    return x * tf.math.tanh(softplus_x)


# GELU: pass the activation function object from tf.keras.activations
dense_gelu = tf.keras.layers.Dense(
    units=50,
    activation=tf.keras.activations.gelu,
    kernel_initializer="he_normal",
)

# Swish: referenced by its string name
dense_swish = tf.keras.layers.Dense(
    units=50,
    activation="swish",
    kernel_initializer="he_normal",
)

# Mish: pass the custom `mish` function as the activation callable
dense_mish = tf.keras.layers.Dense(
    units=50,
    activation=mish,
    kernel_initializer="he_normal",
)

---


### Batch Normalization


In [23]:
# Start the Batch Normalization section from a clean Keras state and a fixed seed.
tf.keras.backend.clear_session()
tf.random.set_seed(42)

In [26]:
from tensorflow.keras.layers import Flatten, Dense, BatchNormalization

# Batch Normalization is applied right after flattening the input and after each
# hidden layer's ReLU activation.
model = tf.keras.Sequential(
    layers=[
        Flatten(input_shape=[28, 28]),
        BatchNormalization(),
        Dense(units=300, activation="relu", kernel_initializer="he_normal"),
        BatchNormalization(),
        Dense(units=100, activation="relu", kernel_initializer="he_normal"),
        BatchNormalization(),
        Dense(units=10, activation="softmax"),
    ]
)

In [27]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_2 (Flatten)         (None, 784)               0         
                                                                 
 batch_normalization (BatchN  (None, 784)              3136      
 ormalization)                                                   
                                                                 
 dense (Dense)               (None, 300)               235500    
                                                                 
 batch_normalization_1 (Batc  (None, 300)              1200      
 hNormalization)                                                 
                                                                 
 dense_1 (Dense)             (None, 100)               30100     
                                                                 
 batch_normalization_2 (Batc  (None, 100)             

In [33]:
# List each variable of the first BatchNormalization layer (model.layers[1])
# together with its `trainable` flag.
[(var.name, var.trainable) for var in model.layers[1].variables]

[('batch_normalization/gamma:0', True),
 ('batch_normalization/beta:0', True),
 ('batch_normalization/moving_mean:0', False),
 ('batch_normalization/moving_variance:0', False)]

In [34]:
# Compile with default-configured SGD and train briefly on the un-standardized
# (only /255-scaled) inputs.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="sgd",
    metrics="accuracy",
)
model.fit(
    x=X_train,
    y=y_train,
    epochs=2,
    validation_data=(X_valid, y_valid),
)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1d70fb05db0>

Sometimes applying BN before the activation function works better (there's a debate on this topic). Moreover, the layer before a BatchNormalization layer does not need bias terms: the BatchNormalization layer has its own offset parameters, so the extra biases would be a waste of parameters. You can set `use_bias=False` when creating those layers:


In [35]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Activation
tf.keras.backend.clear_session()
tf.random.set_seed(42)

# BN is placed *before* each hidden activation. The hidden Dense layers drop
# their bias (use_bias=False) because the BatchNormalization layer that follows
# already learns an offset (beta); a Dense bias there would only add redundant
# parameters.
model = Sequential([
    Flatten(input_shape=[28, 28]),
    Dense(300, kernel_initializer='he_normal', use_bias=False),
    BatchNormalization(),
    Activation('relu'),
    Dense(100, kernel_initializer='he_normal', use_bias=False),
    BatchNormalization(),
    Activation('relu'),
    # The output layer is not followed by BN, so it keeps its bias.
    Dense(10),
    Activation('softmax')
])

model.compile(loss="sparse_categorical_crossentropy", optimizer="sgd",
              metrics="accuracy")
model.fit(X_train, y_train, epochs=1, validation_data=(X_valid, y_valid))



<keras.callbacks.History at 0x1d70fb04c40>

### Gradient Clipping

Gradient clipping is used to mitigate exploding gradients.


All tf.keras.optimizers accept clipnorm or clipvalue arguments:


In [37]:
# clipvalue=1.0: clip gradients by value (per the Keras optimizer docs, each
# gradient value is clipped so it does not exceed this threshold).
optim = tf.keras.optimizers.SGD(clipvalue=1.0)
# Recompile the existing model with the clipping optimizer (no metrics requested).
model.compile(loss="sparse_categorical_crossentropy", optimizer=optim)

In [38]:
# clipnorm=1.0: clip gradients by norm instead of by value (per the Keras
# optimizer docs, a gradient is rescaled when its norm exceeds this threshold).
optim = tf.keras.optimizers.SGD(clipnorm=1.0)
# Recompiling replaces whatever optimizer the model was previously compiled with.
model.compile(loss="sparse_categorical_crossentropy", optimizer=optim)

---
