In [1]:
import sys
import sklearn
import tensorflow as tf
from tensorflow import keras
import numpy as np
import os

# Seed every RNG the notebook relies on. NumPy alone is not enough: Keras
# weight initialisation and the per-epoch shuffling in `fit` draw from
# TensorFlow's random generator, so it is seeded as well.
np.random.seed(42)
tf.random.set_seed(42)

In [2]:
# Download (or read from the local cache) the Fashion-MNIST train/test arrays.
(X_train_full, y_train_full), (X_test, y_test) = (
    keras.datasets.fashion_mnist.load_data()
)

# Divide by 255 so pixel intensities (presumably 0-255) land in [0, 1].
X_train_full, X_test = X_train_full / 255.0, X_test / 255.0

# The first 5000 training samples are held out for validation;
# the remainder is the actual training set.
X_valid, y_valid = X_train_full[:5000], y_train_full[:5000]
X_train, y_train = X_train_full[5000:], y_train_full[5000:]

### 在隐藏层的激活函数后添加BN层

In [3]:
# Variant 1: BatchNormalization placed AFTER each hidden layer's activation.
# Every BN layer carries four values per input feature: gamma and beta
# (trainable) plus the moving mean and moving variance (not trainable).
model_after = keras.models.Sequential()

# Flatten each 28x28 image into a 784-element vector, then normalise the inputs.
model_after.add(keras.layers.Flatten(input_shape=[28, 28]))
model_after.add(keras.layers.BatchNormalization())

# Hidden block 1: Dense + ReLU, followed by BN.
model_after.add(keras.layers.Dense(300, activation="relu"))
model_after.add(keras.layers.BatchNormalization())

# Hidden block 2: Dense + ReLU, followed by BN.
model_after.add(keras.layers.Dense(100, activation="relu"))
model_after.add(keras.layers.BatchNormalization())

# Output layer: one softmax probability per class.
model_after.add(keras.layers.Dense(10, activation="softmax"))

### 在隐藏层的激活函数前添加BN层

#### BN层每个输入都包含一个偏移参数，可从上一层中删除偏置项（传递use_bias=False即可）

In [4]:
# Variant 2: BatchNormalization placed BEFORE each hidden layer's activation.
# To do that, the hidden Dense layers are created without an activation and
# the activation is appended as a separate layer after the BN layer.
#
# NOTE(review): this variant uses "elu" while `model_after` uses "relu";
# confirm that difference is intentional if the two models are compared.
model_before = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),  # flatten each 28x28 image
    keras.layers.BatchNormalization(),

    # The BN layer that follows has its own offset (beta), so the Dense
    # bias would be redundant and is switched off.
    keras.layers.Dense(300, use_bias=False),
    keras.layers.BatchNormalization(),
    keras.layers.Activation("elu"),

    keras.layers.Dense(100, use_bias=False),
    keras.layers.BatchNormalization(),
    keras.layers.Activation("elu"),

    # Output layer. No BN layer follows it, so it keeps its bias, and it
    # must emit softmax probabilities because the model is compiled with
    # loss="sparse_categorical_crossentropy", which by default expects
    # probabilities rather than raw logits.
    keras.layers.Dense(10, activation="softmax")
])

#### γ（缩放参数）和β（偏移参数）是可通过反向传播进行训练的

#### μ和σ（对应变量 moving_mean 和 moving_variance）是训练过程中用移动平均估计出的统计量，不可训练，不受反向传播影响

In [5]:
model_after.summary()
# The first BN layer adds 4 x 784 = 3136 parameters.

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 784)               0         
                                                                 
 batch_normalization (BatchN  (None, 784)              3136      
 ormalization)                                                   
                                                                 
 dense (Dense)               (None, 300)               235500    
                                                                 
 batch_normalization_1 (Batc  (None, 300)              1200      
 hNormalization)                                                 
                                                                 
 dense_1 (Dense)             (None, 100)               30100     
                                                                 
 batch_normalization_2 (Batc  (None, 100)              400

In [6]:
# layers[0] is the Flatten layer, so index 1 is the first BN layer.
bn1 = model_after.layers[1]
# Pair each variable's name with its `trainable` flag.
list((v.name, v.trainable) for v in bn1.variables)

[('batch_normalization/gamma:0', True),
 ('batch_normalization/beta:0', True),
 ('batch_normalization/moving_mean:0', False),
 ('batch_normalization/moving_variance:0', False)]

In [7]:
# Show which container type the layer uses to expose its variables.
type(bn1.variables)

list

In [8]:
# Both models share the same training setup: plain SGD with a learning rate
# of 1e-3, sparse categorical cross-entropy loss, and accuracy as the metric.
# Each model receives its own optimizer instance.
model_after.compile(
    optimizer=keras.optimizers.SGD(learning_rate=1e-3),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
model_before.compile(
    optimizer=keras.optimizers.SGD(learning_rate=1e-3),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

In [9]:
# Train the BN-after-activation model for 10 epochs, passing the held-out
# split as validation data.
history_after = model_after.fit(
    X_train,
    y_train,
    validation_data=(X_valid, y_valid),
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [10]:
# Train the BN-before-activation model for 10 epochs, passing the held-out
# split as validation data.
history_before = model_before.fit(
    X_train,
    y_train,
    validation_data=(X_valid, y_valid),
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
