## 用 TensorFlow 来实现批量归一化

In [1]:
import tensorflow as tf
from tensorflow.contrib.layers import batch_norm
from tensorflow.contrib.layers import fully_connected

# Layer widths: flattened 28x28 input, two hidden layers, 10 output units.
n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 100
n_outputs = 10

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='X')
# Scalar boolean fed at run time and forwarded to every batch_norm call below.
is_training = tf.placeholder(tf.bool, shape=(), name='is_training')

# Keyword arguments handed to batch_norm through `normalizer_params`.
# NOTE(review): per the contrib batch_norm docs, updates_collections=None should make
# the moving-average updates run in place -- confirm against the installed TF version.
bn_params = dict(
    is_training=is_training,
    decay=0.99,
    updates_collections=None,
)

# Each fully connected layer is followed by batch normalization (normalizer_fn).
hidden1 = fully_connected(
    X, n_hidden1,
    scope='hidden1',
    normalizer_fn=batch_norm,
    normalizer_params=bn_params,
)
hidden2 = fully_connected(
    hidden1, n_hidden2,
    scope='hidden2',
    normalizer_fn=batch_norm,
    normalizer_params=bn_params,
)
# Output layer: no activation function, but still batch-normalized.
logits = fully_connected(
    hidden2, n_outputs,
    scope='outputs',
    activation_fn=None,
    normalizer_fn=batch_norm,
    normalizer_params=bn_params,
)

Instructions for updating:
Colocations handled automatically by placer.


In [2]:
# arg_scope() builds a scope of default arguments: its first argument is a list of
# functions, and the remaining arguments are passed automatically to those functions.

with tf.contrib.framework.arg_scope(
        [fully_connected],
        normalizer_fn=batch_norm,
        normalizer_params=bn_params):
    # reuse=tf.AUTO_REUSE is passed because the same scope names were already used
    # when the previous cell built its layers.
    hidden1 = fully_connected(
        X, n_hidden1, scope='hidden1', reuse=tf.AUTO_REUSE)
    hidden2 = fully_connected(
        hidden1, n_hidden2, scope='hidden2', reuse=tf.AUTO_REUSE)
    logits = fully_connected(
        hidden2, n_outputs, scope='outputs', activation_fn=None, reuse=tf.AUTO_REUSE)

In [3]:
# Rely on contrib as little as possible.
# tf.layers.dense() is not compatible with the tf.contrib.layers.arg_scope() approach
# used in the book, so Python's functools.partial() is used instead.

from functools import partial

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int64, shape=None, name='y')
# Evaluates to False unless a value is fed explicitly.
training = tf.placeholder_with_default(False, shape=(), name='training')

# Batch-normalization layer with the shared `training` flag and momentum pre-bound.
my_batch_norm_layer = partial(
    tf.layers.batch_normalization,
    training=training,
    momentum=0.9,
)

# Dense layer with reuse=tf.AUTO_REUSE pre-bound; every dense layer below uses it.
reusable_dense = partial(tf.layers.dense, reuse=tf.AUTO_REUSE)

# Pattern per hidden layer: dense -> batch norm -> ELU activation.
hidden1 = reusable_dense(X, n_hidden1, name="hidden1")
bn1 = my_batch_norm_layer(hidden1)
bn1_act = tf.nn.elu(bn1)

hidden2 = reusable_dense(bn1_act, n_hidden2, name="hidden2")
bn2 = my_batch_norm_layer(hidden2)
bn2_act = tf.nn.elu(bn2)

# Output layer: batch-normalized, with no activation applied.
logits_before_bn = reusable_dense(bn2_act, n_outputs, name="outputs")
logits = my_batch_norm_layer(logits_before_bn)

Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Use keras.layers.batch_normalization instead.


完整代码如下：

In [4]:
from functools import partial
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf


# Layer widths: flattened 28x28 input, two hidden layers, 10 output classes.
n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 100
n_outputs = 10

mnist = input_data.read_data_sets("data/")

batch_norm_momentum = 0.9
learning_rate = 0.01

# Start from an empty default graph so that this cell is self-contained.
# Building into the graph left over from the previous cells had three side effects:
#   * the placeholders had to be borrowed from an earlier cell (hidden state);
#   * the dense layers re-used the already existing `hidden1/kernel`, ... variables,
#     so `kernel_initializer=he_init` was silently ignored;
#   * UPDATE_OPS also contained the batch-norm updates of the earlier network.
tf.reset_default_graph()

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='X')
y = tf.placeholder(tf.int64, shape=(None,), name='y')
# Extra placeholder for batch norm: False unless fed, True during training steps.
training = tf.placeholder_with_default(False, shape=(), name='training')

with tf.name_scope("dnn"):
    # Initializer for the dense-layer weights.
    he_init = tf.contrib.layers.variance_scaling_initializer()

    my_batch_norm_layer = partial(
        tf.layers.batch_normalization,
        training=training,
        momentum=batch_norm_momentum
    )

    my_dense_layer = partial(
        tf.layers.dense,
        kernel_initializer=he_init
    )

    # Pattern per hidden layer: dense -> batch norm -> ELU activation.
    hidden1 = my_dense_layer(X, n_hidden1, name='hidden1')
    bn1 = tf.nn.elu(my_batch_norm_layer(hidden1))
    hidden2 = my_dense_layer(bn1, n_hidden2, name='hidden2')
    bn2 = tf.nn.elu(my_batch_norm_layer(hidden2))
    logits_before_bn = my_dense_layer(bn2, n_outputs, name='outputs')
    logits = my_batch_norm_layer(logits_before_bn)

    # Backward-compatible aliases for the names this cell originally (mis)spelled.
    logists_before_bn, logists = logits_before_bn, logits

with tf.name_scope('loss'):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name='loss')

with tf.name_scope('train'):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

with tf.name_scope("eval"):
    # A prediction is correct when the true label is the top-1 logit.
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

init = tf.global_variables_initializer()
saver = tf.train.Saver()

n_epoches = 10
batch_size = 200
# Note: because tf.layers.batch_normalization() is used instead of
# tf.contrib.layers.batch_norm() (as in the book), the extra update ops required by
# batch normalization must be run explicitly:
# sess.run([training_op, extra_update_ops], ...).
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epoches):
        for iteration in range(mnist.train.num_examples // batch_size):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run([training_op, extra_update_ops],
                     feed_dict={training: True, X: X_batch, y: y_batch})
        # `training` is not fed here, so it falls back to its default (False).
        accuracy_val = accuracy.eval(feed_dict={X: mnist.test.images,
                                                y: mnist.test.labels})
        print(epoch, 'Test accuracy:', accuracy_val)

Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
Instructions for updating:
Please write your own downloading logic.
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting data/train-images-idx3-ubyte.gz
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting data/train-labels-idx1-ubyte.gz
Extracting data/t10k-images-idx3-ubyte.gz
Extracting data/t10k-labels-idx1-ubyte.gz
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
Instructions for updating:
Use tf.cast instead.
0 Test accuracy: 0.8869
1 Test accuracy: 0.9086
2 Test accuracy: 0.9204
3 Test accuracy: 0.929
4 Test accuracy: 0.9374
5 Test accuracy: 0.9419
6 Test accuracy: 0.9475
7 Test accuracy: 0.9498
8 Test accuracy: 0.9549
9 Test accuracy: 0.9567


## 梯度裁剪

一般情况下，大家倾向于使用批量归一化，但是梯度裁剪对于 RNN 网络非常有效，可以减轻梯度爆炸问题。

In [None]:
# Gradient clipping: compute the gradients, clip every component to
# [-threshold, threshold], then apply the clipped gradients.
threshold = 1.0
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
grads_and_vars = optimizer.compute_gradients(loss)
# compute_gradients() yields (None, var) for trainable variables that `loss` does not
# depend on (e.g. layers built by other cells in the same graph). tf.clip_by_value()
# cannot take None, so those pairs are skipped.
capped_gvs = [
    (tf.clip_by_value(grad, -threshold, threshold), var)
    for grad, var in grads_and_vars
    if grad is not None
]
training_op = optimizer.apply_gradients(capped_gvs)

## 重用预训练图层

从头开始训练一个非常庞大的 DNN 并不明智，大多数时候应该试着找一个能处理相似问题的已有的神经网络，然后重用它的低层网络，这叫做迁移学习。这不仅能极大地提升训练速度，也能在很大程度上减少所需的训练数据。

## 重用 TensorFlow 模型

通常我们只想要重用原来模型的一部分，一种简单的解决方案就是配置 Saver，使之在还原的时候只还原所有参数的一个子集。