# Training Deep Neural Nets

## Vanishing / Exploding Gradients Problems

### Xavier and He Initialization
- normal distribution with mean 0 and standard deviation $\sigma$
- uniform distribution between $-r$ and $+r$

### Nonsaturating Activation Functions
- ELU (exponential linear unit)
    - slower to compute
    - $\alpha = 1$
- leaky ReLU, RReLU (randomized leaky ReLU), PReLU (parametric leaky ReLU)
    - $\alpha = 0.01$
    - RReLU if overfitting
    - PReLU if a huge training set
- ReLU
- tanh
- logistic

### Batch Normalization

### Gradient Clipping

## ......

## Faster Optimizers

### Momentum Optimization

### Nesterov Accelerated Gradient

### AdaGrad

### RMSProp

### Adam Optimization

### Learning Rate Scheduling

In [22]:
import tensorflow as tf
import numpy as np
from functools import partial

# Clear the default graph so that re-running this cell does not add a second
# copy of every op and variable to it.
tf.reset_default_graph()

# Network dimensions and training hyperparameters used by the rest of the cell.
n_inputs = 28*28  # one input feature per pixel of a flattened 28x28 image
n_hidden1 = 300  # units in the first hidden layer
n_hidden2 = 100  # units in the second hidden layer
n_outputs = 10  # width of the output (logits) layer
learning_rate = 0.01
batch_size = 25
n_epochs = 10

# Graph inputs; the leading None leaves the batch dimension open.
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
# NOTE: `(None)` is plain None rather than a 1-tuple, so the shape of `y` is
# left unspecified here.
y = tf.placeholder(tf.int32, shape=(None), name="y")
# training = tf.placeholder_with_default(False, shape=(), name="training") # Batch Normalization

with tf.name_scope("dnn"):
    # He initialization draws weights with variance 2 / fan_in.
    # tf.variance_scaling_initializer defaults to scale=1.0 (variance
    # 1 / fan_in), so the factor of 2 has to be requested explicitly.
    he_init = tf.variance_scaling_initializer(scale=2.0)
    hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, kernel_initializer=he_init, name="hidden1") # ReLU activation
    # hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.elu, kernel_initializer=he_init, name="hidden1") # ELU activation
    # NOTE(review): SELU is normally paired with a scale=1.0 initializer; confirm before enabling the next line.
    # hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.selu, kernel_initializer=he_init, name="hidden1") # SELU activation
    hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, kernel_initializer=he_init, name="hidden2")
    # No activation on the output layer: the loss consumes raw logits.
    logits = tf.layers.dense(hidden2, n_outputs, name="outputs")
    
    
    # The two string literals below are disabled variants of the network
    # (they are never executed). Enabling either one also requires the
    # `training` placeholder that is commented out above this scope.
    '''
    # batch normalization
    hidden1 = tf.layers.dense(X, n_hidden1, name="hidden1")
    bn1 = tf.layers.batch_normalization(hidden1, training=training, momentum=0.9)
    bn1_act = tf.nn.elu(bn1)
    
    hidden2 = tf.layers.dense(bn1_act, n_hidden2, name="hidden2")
    bn2 = tf.layers.batch_normalization(hidden2, training=training, momentum=0.9)
    bn2_act = tf.nn.elu(bn2)
    
    logits_before_bn = tf.layers.dense(bn2_act, n_outputs, name="outputs")
    logits = tf.layers.batch_normalization(logits_before_bn, training=training, momentum=0.9)
    '''
    
    '''
    # batch normalization with partial
    my_batch_norm_layer = partial(tf.layers.batch_normalization, training=training, momentum=0.9)
    my_dense_layer = partial(tf.layers.dense, kernel_initializer=he_init)
    
    hidden1 = my_dense_layer(X, n_hidden1, name="hidden1")
    bn1 = my_batch_norm_layer(hidden1)
    bn1_act = tf.nn.elu(bn1)
    hidden2 = my_dense_layer(bn1_act, n_hidden2, name="hidden2")
    bn2 = my_batch_norm_layer(hidden2)
    bn2_act = tf.nn.elu(bn2)
    logits_before_bn = my_dense_layer(bn2_act, n_outputs, name="outputs")
    logits = my_batch_norm_layer(logits_before_bn)
    '''

# Loss: per-example softmax cross-entropy between the integer labels `y` and
# the raw `logits`, averaged over the batch.
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")
    
# Training step. Exactly one optimizer line is active (Adam); the commented
# lines are the other optimizers listed in the notes above.
with tf.name_scope("train"):
    # optimizer = tf.train.GradientDescentOptimizer(learning_rate) # GD
    # optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=0.9) # Momentum
    # optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=0.9, use_nesterov=True) # NAG
    # optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate) # AdaGrad
    # optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate, momentum=0.9, decay=0.9, epsilon=1e-10) # RMSProp
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) # Adam
    training_op = optimizer.minimize(loss)
    
    # Disabled variant (unused string literal): exponentially decayed
    # learning rate driven by a non-trainable `global_step` counter.
    '''
    # learning rate scheduling
    initial_learning_rate = 0.1
    decay_steps = 10000
    decay_rate = 1/10
    global_step = tf.Variable(0, trainable=False, name="global_step")
    learning_rate = tf.train.exponential_decay(initial_learning_rate, global_step, decay_steps, decay_rate)
    optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.9)
    training_op = optimizer.minimize(loss, global_step=global_step)
    '''
    
    # Disabled variant (unused string literal): clip every gradient
    # element-wise to [-threshold, threshold] before applying it.
    '''
    # Gradient Clipping
    threshold = 1.0
    grads_and_vars = optimizer.compute_gradients(loss)
    capped_gvs = [(tf.clip_by_value(grad, -threshold, threshold), var) for grad, var in grads_and_vars]
    training_op = optimizer.apply_gradients(capped_gvs)
    '''
    
# Accuracy: fraction of examples whose label is the top-1 entry of `logits`.
with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

# Created after the whole graph is built so they see every variable in it.
init = tf.global_variables_initializer()
saver = tf.train.Saver()

# Fetch MNIST through Keras, flatten every image to a 28*28 float32 vector
# scaled by 1/255, and cast the labels to int32.
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()
X_train = np.reshape(X_train.astype(np.float32), (-1, 28*28)) / 255.0
X_test = np.reshape(X_test.astype(np.float32), (-1, 28*28)) / 255.0
y_train, y_test = y_train.astype(np.int32), y_test.astype(np.int32)
# Hold out the first 5000 training examples as the validation set.
X_valid, y_valid = X_train[:5000], y_train[:5000]
X_train, y_train = X_train[5000:], y_train[5000:]

def shuffle_batch(X, y, batch_size):
    """Yield shuffled mini-batches that together cover ``X`` and ``y`` once.

    A new random permutation of the sample indices is drawn on every call
    and split into ``len(X) // batch_size`` nearly equal chunks, so a batch
    can hold a few more than ``batch_size`` samples when the sizes do not
    divide evenly.

    Args:
        X: Array of samples, indexable with an integer index array.
        y: Array of labels, indexed with the same indices as ``X``.
        batch_size: Target number of samples per batch; must be positive.

    Yields:
        ``(X_batch, y_batch)`` pairs selected with identical indices.

    Raises:
        ValueError: When iteration starts, if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    n_samples = len(X)
    if n_samples == 0:
        # Nothing to batch.
        return
    rnd_idx = np.random.permutation(n_samples)
    # np.array_split rejects 0 sections, which used to crash whenever there
    # were fewer samples than batch_size; fall back to a single batch.
    n_batches = max(1, n_samples // batch_size)
    for batch_idx in np.array_split(rnd_idx, n_batches):
        yield X[batch_idx], y[batch_idx]

# extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) # batch normalization

# Train for n_epochs, report accuracy on every even epoch, then checkpoint.
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
            # sess.run([training_op, extra_update_ops], feed_dict={training:True, X:X_batch, y:y_batch}) # batch normalization
        if epoch % 2 != 0:
            continue
        # X_batch / y_batch still hold the last mini-batch of this epoch.
        acc_batch = sess.run(accuracy, feed_dict={X: X_batch, y: y_batch})
        acc_valid = sess.run(accuracy, feed_dict={X: X_valid, y: y_valid})
        print(epoch, "Batch accuracy:", acc_batch, "Validation accuracy:", acc_valid)
    save_path = saver.save(sess, "./mymodel/my_model_final.ckpt")

0 Batch accuracy: 1.0 Validation accuracy: 0.9574
2 Batch accuracy: 0.96 Validation accuracy: 0.9618
4 Batch accuracy: 1.0 Validation accuracy: 0.9598
6 Batch accuracy: 1.0 Validation accuracy: 0.9644
8 Batch accuracy: 1.0 Validation accuracy: 0.9642


## Reusing Pretrained Layers

In [10]:
'''
from IPython.display import clear_output, Image, display, HTML
def strip_consts(graph_def, max_const_size=32):
    """Strip large constant values from graph_def."""
    strip_def = tf.GraphDef()
    for n0 in graph_def.node:
        n = strip_def.node.add() 
        n.MergeFrom(n0)
        if n.op == 'Const':
            tensor = n.attr['value'].tensor
            size = len(tensor.tensor_content)
            if size > max_const_size:
                tensor.tensor_content = "<stripped %d bytes>"%size
    return strip_def

def show_graph(graph_def, max_const_size=32):
    """Visualize TensorFlow graph."""
    if hasattr(graph_def, 'as_graph_def'):
        graph_def = graph_def.as_graph_def()
    strip_def = strip_consts(graph_def, max_const_size=max_const_size)
    code = """
        <script>
          function load() {{
            document.getElementById("{id}").pbtxt = {data};
          }}
        </script>
        <link rel="import" href="https://tensorboard.appspot.com/tf-graph-basic.build.html" onload=load()>
        <div style="height:600px">
          <tf-graph-basic id="{id}"></tf-graph-basic>
        </div>
    """.format(data=repr(str(strip_def)), id='graph'+str(np.random.rand()))

    iframe = """
        <iframe seamless style="width:1200px;height:620px;border:0" srcdoc="{}"></iframe>
    """.format(code.replace('"', '&quot;'))
    display(HTML(iframe))
'''

'\nfrom IPython.display import clear_output, Image, display, HTML\ndef strip_consts(graph_def, max_const_size=32):\n    """Strip large constant values from graph_def."""\n    strip_def = tf.GraphDef()\n    for n0 in graph_def.node:\n        n = strip_def.node.add() \n        n.MergeFrom(n0)\n        if n.op == \'Const\':\n            tensor = n.attr[\'value\'].tensor\n            size = len(tensor.tensor_content)\n            if size > max_const_size:\n                tensor.tensor_content = "<stripped %d bytes>"%size\n    return strip_def\n\ndef show_graph(graph_def, max_const_size=32):\n    """Visualize TensorFlow graph."""\n    if hasattr(graph_def, \'as_graph_def\'):\n        graph_def = graph_def.as_graph_def()\n    strip_def = strip_consts(graph_def, max_const_size=max_const_size)\n    code = """\n        <script>\n          function load() {{\n            document.getElementById("{id}").pbtxt = {data};\n          }}\n        </script>\n        <link rel="import" href="https://te

## Avoiding Overfitting Through Regularization

### Early Stopping

### $l_1$ and $l_2$ Regularization

In [1]:
import tensorflow as tf
import numpy as np
from functools import partial

# Clear the default graph so this cell builds its network from scratch.
tf.reset_default_graph()

# Layer sizes: flattened 28x28 inputs, two hidden layers, n_outputs logits.
n_inputs = 28*28
n_hidden1 = 300
n_hidden2 = 50
n_outputs = 10

# Graph inputs; the leading None leaves the batch dimension open.
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
# NOTE: `(None)` is plain None rather than a 1-tuple, so the shape of `y` is
# left unspecified here.
y = tf.placeholder(tf.int32, shape=(None), name="y")

# Factor passed to the l1 regularizer below.
scale = 0.001

# Dense-layer factory with ReLU activation and an l1 penalty on the kernel
# pre-filled; individual calls can still override these keyword arguments.
my_dense_layer = partial(tf.layers.dense, activation = tf.nn.relu, kernel_regularizer = tf.contrib.layers.l1_regularizer(scale))

with tf.name_scope("dnn"):
    hidden1 = my_dense_layer(X, n_hidden1, name="hidden1")
    hidden2 = my_dense_layer(hidden1, n_hidden2, name="hidden2")
    # The output layer overrides the activation to None so it emits raw logits.
    logits = my_dense_layer(hidden2, n_outputs, activation = None, name="outputs")

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    base_loss = tf.reduce_mean(xentropy, name="avg_xentropy")
    # Total loss = mean cross-entropy + every term found in the
    # REGULARIZATION_LOSSES collection of the graph.
    reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    loss = tf.add_n([base_loss] + reg_losses, name="loss")

# Accuracy: fraction of examples whose label is the top-1 entry of `logits`.
with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")

learning_rate = 0.01

# Plain gradient descent on the regularized loss.
with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

# Training-loop settings used further down in this cell.
n_epochs = 10
batch_size = 25

# Fetch MNIST through Keras, flatten every image to a 28*28 float32 vector
# scaled by 1/255, and cast the labels to int32.
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()
X_train = np.reshape(X_train.astype(np.float32), (-1, 28*28)) / 255.0
X_test = np.reshape(X_test.astype(np.float32), (-1, 28*28)) / 255.0
y_train, y_test = y_train.astype(np.int32), y_test.astype(np.int32)
# Hold out the first 5000 training examples as the validation set.
X_valid, y_valid = X_train[:5000], y_train[:5000]
X_train, y_train = X_train[5000:], y_train[5000:]

def shuffle_batch(X_train, y_train, batch_size):
    """Yield shuffled mini-batches that together cover the inputs once.

    A new random permutation of the sample indices is drawn on every call
    and split into ``len(X_train) // batch_size`` nearly equal chunks, so a
    batch can hold a few more than ``batch_size`` samples when the sizes do
    not divide evenly.

    Args:
        X_train: Array of samples, indexable with an integer index array.
        y_train: Array of labels, indexed with the same indices as
            ``X_train``.
        batch_size: Target number of samples per batch; must be positive.

    Yields:
        ``(X_batch, y_batch)`` pairs selected with identical indices.

    Raises:
        ValueError: When iteration starts, if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    n_samples = len(X_train)
    if n_samples == 0:
        # Nothing to batch.
        return
    rnd_idx = np.random.permutation(n_samples)
    # np.array_split rejects 0 sections, which used to crash whenever there
    # were fewer samples than batch_size; fall back to a single batch.
    n_batches = max(1, n_samples // batch_size)
    for batch_idx in np.array_split(rnd_idx, n_batches):
        yield X_train[batch_idx], y_train[batch_idx]
    

# Train for n_epochs, printing validation accuracy after each one, then
# write a checkpoint.
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        accuracy_val = sess.run(accuracy, feed_dict={X: X_valid, y: y_valid})
        print(epoch, "Validation accuracy:", accuracy_val)

    save_path = saver.save(sess, "./mymodel/my_model_final.ckpt")

0 Validation accuracy: 0.912
1 Validation accuracy: 0.913
2 Validation accuracy: 0.9156
3 Validation accuracy: 0.918
4 Validation accuracy: 0.92
5 Validation accuracy: 0.9226
6 Validation accuracy: 0.9332
7 Validation accuracy: 0.9294
8 Validation accuracy: 0.9308
9 Validation accuracy: 0.9358


### Dropout

In [3]:
import tensorflow as tf
import numpy as np
from functools import partial

# Clear the default graph so this cell builds its network from scratch.
tf.reset_default_graph()

# Layer sizes: flattened 28x28 inputs, two hidden layers, n_outputs logits.
n_inputs = 28*28
n_hidden1 = 300
n_hidden2 = 50
n_outputs = 10

# Graph inputs; the leading None leaves the batch dimension open.
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
# NOTE: `(None)` is plain None rather than a 1-tuple, so the shape of `y` is
# left unspecified here.
y = tf.placeholder(tf.int32, shape=(None), name="y")

# Defaults to False, so the dropout layers are only in training mode when
# the caller explicitly feeds training=True.
training = tf.placeholder_with_default(False, shape=(), name="training")
dropout_rate = 0.5

# Dropout is applied to the raw inputs as well as to both hidden layers.
X_drop = tf.layers.dropout(X, dropout_rate, training=training)

with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(X_drop, n_hidden1, activation=tf.nn.relu, name="hidden1")
    hidden1_drop = tf.layers.dropout(hidden1, dropout_rate, training=training)
    hidden2 = tf.layers.dense(hidden1_drop, n_hidden2, activation=tf.nn.relu, name="hidden2")
    hidden2_drop = tf.layers.dropout(hidden2, dropout_rate, training=training)
    # No activation on the output layer: the loss consumes raw logits.
    logits = tf.layers.dense(hidden2_drop, n_outputs, name="outputs")

# Loss: per-example softmax cross-entropy, averaged over the batch.
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

# Accuracy: fraction of examples whose label is the top-1 entry of `logits`.
with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")

learning_rate = 0.01

with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

# Training-loop settings used further down in this cell.
n_epochs = 10
batch_size = 25

# Fetch MNIST through Keras, flatten every image to a 28*28 float32 vector
# scaled by 1/255, and cast the labels to int32.
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()
X_train = np.reshape(X_train.astype(np.float32), (-1, 28*28)) / 255.0
X_test = np.reshape(X_test.astype(np.float32), (-1, 28*28)) / 255.0
y_train, y_test = y_train.astype(np.int32), y_test.astype(np.int32)
# Hold out the first 5000 training examples as the validation set.
X_valid, y_valid = X_train[:5000], y_train[:5000]
X_train, y_train = X_train[5000:], y_train[5000:]

def shuffle_batch(X_train, y_train, batch_size):
    """Yield shuffled mini-batches that together cover the inputs once.

    A new random permutation of the sample indices is drawn on every call
    and split into ``len(X_train) // batch_size`` nearly equal chunks, so a
    batch can hold a few more than ``batch_size`` samples when the sizes do
    not divide evenly.

    Args:
        X_train: Array of samples, indexable with an integer index array.
        y_train: Array of labels, indexed with the same indices as
            ``X_train``.
        batch_size: Target number of samples per batch; must be positive.

    Yields:
        ``(X_batch, y_batch)`` pairs selected with identical indices.

    Raises:
        ValueError: When iteration starts, if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    n_samples = len(X_train)
    if n_samples == 0:
        # Nothing to batch.
        return
    rnd_idx = np.random.permutation(n_samples)
    # np.array_split rejects 0 sections, which used to crash whenever there
    # were fewer samples than batch_size; fall back to a single batch.
    n_batches = max(1, n_samples // batch_size)
    for batch_idx in np.array_split(rnd_idx, n_batches):
        yield X_train[batch_idx], y_train[batch_idx]
    

# Train with dropout switched on (training=True is fed for each step);
# validation accuracy is evaluated with the placeholder's default of False.
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch, training: True})
        accuracy_val = sess.run(accuracy, feed_dict={X: X_valid, y: y_valid})
        print(epoch, "Validation accuracy:", accuracy_val)

    save_path = saver.save(sess, "./mymodel/my_model_final.ckpt")

0 Validation accuracy: 0.8704
1 Validation accuracy: 0.9002
2 Validation accuracy: 0.9174
3 Validation accuracy: 0.925
4 Validation accuracy: 0.9342
5 Validation accuracy: 0.9392
6 Validation accuracy: 0.945
7 Validation accuracy: 0.9484
8 Validation accuracy: 0.9508
9 Validation accuracy: 0.9538


### Max-Norm Regularization

In [4]:
import tensorflow as tf
import numpy as np
from functools import partial

# Clear the default graph so this cell builds its network from scratch.
tf.reset_default_graph()

# Layer sizes: flattened 28x28 inputs, two hidden layers, n_outputs logits.
n_inputs = 28*28
n_hidden1 = 300
n_hidden2 = 50
n_outputs = 10

# Graph inputs; the leading None leaves the batch dimension open.
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
# NOTE: `(None)` is plain None rather than a 1-tuple, so the shape of `y` is
# left unspecified here.
y = tf.placeholder(tf.int32, shape=(None), name="y")

with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name="hidden1")
    hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, name="hidden2")
    # No activation on the output layer: the loss consumes raw logits.
    logits = tf.layers.dense(hidden2, n_outputs, name="outputs")

# Loss: per-example softmax cross-entropy, averaged over the batch.
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

# Accuracy: fraction of examples whose label is the top-1 entry of `logits`.
with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")

learning_rate = 0.01

with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)
    
# Max-norm ops. Each layer's kernel is looked up by its graph name, rescaled
# so that its norm along axis 1 is at most `threshold`, and written back to
# the variable. These ops are separate from `training_op`; the training loop
# has to run them itself after each step.
threshold = 1.0
weights = tf.get_default_graph().get_tensor_by_name("hidden1/kernel:0")
clipped_weights = tf.clip_by_norm(weights, clip_norm=threshold, axes=1)
clip_weights = tf.assign(weights, clipped_weights)

# Same clipping for the second hidden layer (the output layer is not clipped).
weights2 = tf.get_default_graph().get_tensor_by_name("hidden2/kernel:0")
clipped_weights2 = tf.clip_by_norm(weights2, clip_norm=threshold, axes=1)
clip_weights2 = tf.assign(weights2, clipped_weights2)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

# Training-loop settings used further down in this cell.
n_epochs = 10
batch_size = 25

# Fetch MNIST through Keras, flatten every image to a 28*28 float32 vector
# scaled by 1/255, and cast the labels to int32.
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()
X_train = np.reshape(X_train.astype(np.float32), (-1, 28*28)) / 255.0
X_test = np.reshape(X_test.astype(np.float32), (-1, 28*28)) / 255.0
y_train, y_test = y_train.astype(np.int32), y_test.astype(np.int32)
# Hold out the first 5000 training examples as the validation set.
X_valid, y_valid = X_train[:5000], y_train[:5000]
X_train, y_train = X_train[5000:], y_train[5000:]

def shuffle_batch(X_train, y_train, batch_size):
    """Yield shuffled mini-batches that together cover the inputs once.

    A new random permutation of the sample indices is drawn on every call
    and split into ``len(X_train) // batch_size`` nearly equal chunks, so a
    batch can hold a few more than ``batch_size`` samples when the sizes do
    not divide evenly.

    Args:
        X_train: Array of samples, indexable with an integer index array.
        y_train: Array of labels, indexed with the same indices as
            ``X_train``.
        batch_size: Target number of samples per batch; must be positive.

    Yields:
        ``(X_batch, y_batch)`` pairs selected with identical indices.

    Raises:
        ValueError: When iteration starts, if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    n_samples = len(X_train)
    if n_samples == 0:
        # Nothing to batch.
        return
    rnd_idx = np.random.permutation(n_samples)
    # np.array_split rejects 0 sections, which used to crash whenever there
    # were fewer samples than batch_size; fall back to a single batch.
    n_batches = max(1, n_samples // batch_size)
    for batch_idx in np.array_split(rnd_idx, n_batches):
        yield X_train[batch_idx], y_train[batch_idx]
    

# Train for n_epochs; after every gradient step the two max-norm assign ops
# are run so both hidden kernels are clipped back before the next step.
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
            sess.run(clip_weights)
            sess.run(clip_weights2)
        accuracy_val = sess.run(accuracy, feed_dict={X: X_valid, y: y_valid})
        print(epoch, "Validation accuracy:", accuracy_val)

    save_path = saver.save(sess, "./mymodel/my_model_final.ckpt")

0 Validation accuracy: 0.9246
1 Validation accuracy: 0.9394
2 Validation accuracy: 0.9514
3 Validation accuracy: 0.9584
4 Validation accuracy: 0.9618
5 Validation accuracy: 0.9648
6 Validation accuracy: 0.9676
7 Validation accuracy: 0.9696
8 Validation accuracy: 0.973
9 Validation accuracy: 0.9714


### Data Augmentation