### Examining the Batch Normalization method

    * From the paper "Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift" by
      Sergey Ioffe and Christian Szegedy
      
    * Coding the method using custom layers in tensorflow 2.1.0 to understand the math and comparing against the API
    
    * Using MNIST data set

In [1]:
import tensorflow as tf

In [3]:
tf.__version__

'2.1.0'

In [2]:
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

### Loading MNIST data

In [4]:
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

In [5]:
print(x_train.shape, x_test.shape)

(60000, 28, 28) (10000, 28, 28)


#### Checking min and max values of the input

In [6]:
np.max(x_train)

255

In [7]:
np.min(x_train)

0

#### Scaling the data using the Standard Scaler

In [8]:
from sklearn.preprocessing import StandardScaler

In [9]:
stdscaler = StandardScaler(with_mean=True, with_std=True)

In [10]:
stdscaler_fit = stdscaler.fit(x_train.reshape(-1, 28*28))

In [11]:
stdscaler_fit.mean_.shape

(784,)

In [12]:
# average mean
# NOTE(review): np.sqrt is applied here, so the value displayed is the square root of the
# average per-pixel mean (the average mean itself is roughly 5.77**2 ~ 33.3).
# The sqrt is only needed in the "average std" cell, where it turns the mean variance into a std.
np.sqrt(stdscaler_fit.mean_.mean())

5.772211140440891

In [13]:
stdscaler_fit.n_samples_seen_

60000

In [14]:
stdscaler_fit.scale_.shape

(784,)

In [15]:
# average std
np.sqrt(stdscaler_fit.var_.mean())

66.12879201995706

In [16]:
x_train_scaled =  stdscaler_fit.transform(x_train.reshape(-1,28*28)).reshape(-1,28,28)
x_train_scaled.shape

(60000, 28, 28)

In [17]:
x_test_scaled =  stdscaler_fit.transform(x_test.reshape(-1,28*28)).reshape(-1,28,28)
x_test_scaled.shape

(10000, 28, 28)

#### Checking the min and max values

In [18]:
np.min(x_train_scaled)

-1.2742078920822268

In [19]:
np.max(x_train_scaled)

244.94693302873063

In [20]:
np.var(x_train_scaled)

0.9145408163265558

In [21]:
np.mean(x_train_scaled)

-2.1974863349995617e-18

#### One-hot encoding the y labels

In [22]:
np.unique(y_train, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=uint8),
 array([5923, 6742, 5958, 6131, 5842, 5421, 5918, 6265, 5851, 5949]))

    * The labels are more or less balanced

In [23]:
np.unique(y_test, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=uint8),
 array([ 980, 1135, 1032, 1010,  982,  892,  958, 1028,  974, 1009]))

In [24]:
y_train_coded =  tf.keras.utils.to_categorical(y_train, num_classes=10)
y_train_coded.shape

(60000, 10)

In [25]:
print("Label: ", y_train[0],'\n',"One-hot encoded: ", y_train_coded[0])

Label:  5 
 One-hot encoded:  [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]


In [26]:
y_test_coded =  tf.keras.utils.to_categorical(y_test, num_classes=10)
y_test_coded.shape

(10000, 10)

#### Creating validation data

In [27]:
from sklearn.utils import shuffle

In [28]:
# for reproducibility
random_seed = 100 

In [29]:
x_train_scaled.shape

(60000, 28, 28)

In [30]:
y_train_coded.shape

(60000, 10)

In [31]:
x_train_scaled, y_train_coded = shuffle(x_train_scaled, y_train_coded, random_state = random_seed) 

In [32]:
x_train_scaled.shape

(60000, 28, 28)

In [33]:
y_train_coded.shape

(60000, 10)

In [34]:
x_valid_scaled = x_train_scaled[:5000]
y_valid_coded = y_train_coded[:5000]

In [35]:
x_valid_scaled.shape

(5000, 28, 28)

In [36]:
y_valid_coded.shape

(5000, 10)

In [37]:
x_train_scaled = x_train_scaled[5000:]
y_train_coded = y_train_coded[5000:]

#### Creating a tf dataset for training on a Model with dense layers

In [38]:
train_dataset = tf.data.Dataset.from_tensor_slices((x_train_scaled.reshape(-1, 784), y_train_coded))

In [39]:
valid_dataset = tf.data.Dataset.from_tensor_slices((x_valid_scaled.reshape(-1, 784), y_valid_coded))

In [40]:
test_dataset = tf.data.Dataset.from_tensor_slices((x_test_scaled.reshape(-1,784), y_test_coded))

In [41]:
minibatch =  60 # in the paper

In [42]:
# NOTE(review): y_train is the original, un-split label array, so this is 60000 even though
# 5000 rows were moved into the validation split above. The shuffle buffer is therefore large
# enough to hold each of the train / validation / test datasets in full.
buffer_size = len(y_train)
buffer_size

60000

In [43]:
# Training input pipeline, in this order:
#   1. shuffle individual examples (a new order on every pass over the data),
#   2. group them into fixed-size minibatches, dropping the incomplete last batch,
#   3. prefetch batches.
train_dataset = (
    train_dataset
    .shuffle(buffer_size=buffer_size, seed=random_seed, reshuffle_each_iteration=True)
    .batch(batch_size=minibatch, drop_remainder=True)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)

In [44]:
# Validation input pipeline: one fixed shuffle order (not reshuffled between passes),
# minibatches of the same size as training (the smaller final batch is kept), then prefetch.
valid_dataset = (
    valid_dataset
    .shuffle(buffer_size=buffer_size, seed=random_seed, reshuffle_each_iteration=False)
    .batch(batch_size=minibatch)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)

In [45]:
# Test input pipeline: one fixed shuffle order (not reshuffled between passes),
# minibatches of the same size as training (the smaller final batch is kept), then prefetch.
test_dataset = (
    test_dataset
    .shuffle(buffer_size=buffer_size, seed=random_seed, reshuffle_each_iteration=False)
    .batch(batch_size=minibatch)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)

In [46]:
# Peek at the first test batch: show the feature shape and the one-hot label of its first example.
for features, labels in test_dataset.take(1):

    print(features[0].shape, '\n')
    print(labels[0])

(784,) 

tf.Tensor([0. 1. 0. 0. 0. 0. 0. 0. 0. 0.], shape=(10,), dtype=float32)


### Building a Model using Dense Layers and Batch Norm from Keras

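    * Using the model architecture from the paper (Section 4.1): three fully connected hidden layers of 100 units each, with batch normalization applied before the nonlinearity. Note that this notebook uses tanh activations and orthogonal weight initialization, whereas the paper uses sigmoid activations and small random Gaussian initial weights.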

In [47]:
tf.keras.backend.clear_session()

In [48]:
input_layer = tf.keras.Input(shape = (784,), name = 'input')

In [49]:
units1, units2, units3 = 100,100,100

In [50]:
kerasdense1 = tf.keras.layers.Dense(units = units1, activation=None, 
                                    kernel_initializer=tf.keras.initializers.Orthogonal(gain=1,seed=random_seed))(input_layer)

In [51]:
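# Draft of a single, shared batch norm layer (kept commented out for reference; see the note below it).
# The mean and variance are calculated per minibatch.
# This draft set momentum = 0.0; the bn_layer() function that is actually used keeps momentum = 0.99.
# Population statistics will be calculated separately for inference.
# The moving mean and variance are initialized to zeros and ones, i.e. the statistics of
# data standardized by the StandardScaler fit.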

# """bn_layer = tf.keras.layers.BatchNormalization(axis = [-1], momentum=0.0, epsilon=0.001, center=True, scale=True, 
#                                              beta_initializer = tf.keras.initializers.zeros(), 
#                                               gamma_initializer = tf.keras.initializers.ones(), 
#                                              moving_mean_initializer = tf.keras.initializers.zeros(), 
#                                              moving_variance_initializer = tf.keras.initializers.ones(), trainable=True)"""
# the "bn_layer" object can be used only once. Have to write a new batch_norm for every layer. Hence, will use a function

In [52]:
def bn_layer(axis = None, momentum = 0.99, epsilon = 0.001):
    
    """
    Returns a new, not yet connected, Keras batch norm layer.
    
    A fresh layer is created on every call so that each dense layer gets its own
    gamma / beta / moving mean / moving variance.
    
    Parameters:
    axis: list of integers (or a single integer) naming the *feature* axis, i.e. the axis whose
    entries each keep their own mean, variance, gamma and beta. The statistics are reduced over
    all the other axes, so for a (minibatch, features) input [-1] gives per-feature mean and
    variance calculated across the minibatch. None (the default) is treated as [-1].
    
    momentum: momentum of the moving mean and moving variance. Default is 0.99.
    
    epsilon: small constant added to the variance to avoid division by zero. Default is 0.001.
    
    Returns:
    tf.keras.layers.BatchNormalization with beta and the moving mean initialized to zeros,
    and gamma and the moving variance initialized to ones.
    
    Anyways, will be cross-checking against custom code later
    
    """
    
    # None replaces the former mutable default argument [-1]; a new list is built per call.
    if axis is None:
        axis = [-1]
    
    return tf.keras.layers.BatchNormalization(axis = axis, momentum=momentum, epsilon=epsilon, center=True, scale=True, 
                                             beta_initializer = tf.keras.initializers.zeros(), 
                                              gamma_initializer = tf.keras.initializers.ones(), 
                                             moving_mean_initializer = tf.keras.initializers.zeros(), 
                                             moving_variance_initializer = tf.keras.initializers.ones(), trainable=True)

In [53]:
# NOTE: training=True is hard-coded in the layer call, so this layer normalizes with the mean and
# variance of the current minibatch in fit(), evaluate() and predict() alike; the moving
# mean / variance are not used for normalization. The same applies to the later batch norm layers.
kerasdensebn1 = bn_layer(axis = [-1])(kerasdense1, training=True)

In [54]:
activation1 = tf.keras.layers.Activation(activation = tf.nn.tanh)(kerasdensebn1)

In [55]:
kerasdense2 = tf.keras.layers.Dense(units = units2, activation=None, 
                                    kernel_initializer=tf.keras.initializers.Orthogonal(gain=1,seed=random_seed))(activation1)
kerasdensebn2 = bn_layer(axis = [-1])(kerasdense2, training=True)
activation2 = tf.keras.layers.Activation(activation = tf.nn.tanh)(kerasdensebn2)

In [56]:
kerasdense3 = tf.keras.layers.Dense(units = units3, activation=None, 
                                    kernel_initializer=tf.keras.initializers.Orthogonal(gain=1,seed=random_seed))(activation2)
kerasdensebn3 = bn_layer(axis = [-1])(kerasdense3, training=True)
activation3 = tf.keras.layers.Activation(activation = tf.nn.tanh)(kerasdensebn3)

In [57]:
output_layer = tf.keras.layers.Dense(units = 10, activation=None,
                                    kernel_initializer=tf.keras.initializers.Orthogonal(gain=1,seed=random_seed))(activation3)
# no softmax activation

In [58]:
keras_fc_model = tf.keras.Model(inputs = [input_layer], outputs = [output_layer], name = 'kerasmodel1')

In [59]:
keras_fc_model.summary()

Model: "kerasmodel1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 784)]             0         
_________________________________________________________________
dense (Dense)                (None, 100)               78500     
_________________________________________________________________
batch_normalization (BatchNo (None, 100)               400       
_________________________________________________________________
activation (Activation)      (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
batch_normalization_1 (Batch (None, 100)               400       
_________________________________________________________________
activation_1 (Activation)    (None, 100)               

In [60]:
keras_fc_model.losses

[]

In [61]:
len(keras_fc_model.weights)

20

In [62]:
# batch norm layer variables
keras_fc_model.layers[2].weights

[<tf.Variable 'batch_normalization/gamma:0' shape=(100,) dtype=float32, numpy=
 array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       dtype=float32)>,
 <tf.Variable 'batch_normalization/beta:0' shape=(100,) dtype=float32, numpy=
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

#### Compiling

In [63]:
keras_fc_model.compile(optimizer =  tf.keras.optimizers.Adam(learning_rate=0.001), 
                      loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True), 
                      metrics = [tf.keras.metrics.CategoricalAccuracy()])

In [64]:
# Using tf.keras.metrics.AUC() gives:

# """InvalidArgumentError:  assertion failed: [predictions must be >= 0] [Condition x >= y did not hold element-wise:] 
# [x (kerasmodel1/dense_3/BiasAdd:0) = ] [[0.7673769 2.2196033 1.78565049...]...] [y (metrics/auc/Cast_1/x:0) = ] [0]
#	 [[{{node metrics/auc/assert_greater_equal/Assert/AssertGuard/else/_1/Assert}}]] [Op:__inference_distributed_function_103831]"""

# To use this metric, the predictions have to be between 0 and 1, which implies the model would need to output softmax probabilities instead of raw logits.
                      
   

In [65]:
keras_fc_model.name

'kerasmodel1'

#### Callbacks

##### Learning rate
    * The Adam optimizer already adapts the learning rate per parameter, so no learning-rate scheduling callback is used

##### Not setting EarlyStopping

##### Save best model

In [66]:
import os

In [67]:
keras_savedmodels = 'keras_savedmodels'

# Create the checkpoint directory if it is missing. exist_ok=True makes re-running this cell a
# no-op and avoids the check-then-create gap of os.path.exists() followed by os.mkdir().
os.makedirs(keras_savedmodels, exist_ok=True)

In [68]:
cb_savemodel = tf.keras.callbacks.ModelCheckpoint(os.path.join(keras_savedmodels, 'model_{epoch}-{val_loss:.3f}.h5'), mode = 'min',
                                                  monitor = 'val_loss',
                                                 verbose = 1, save_best_only=True)

##### Tensorboard

In [69]:
keras_models_logs = 'keras_models_logs'

# Create the TensorBoard log directory if it is missing. exist_ok=True makes re-running this cell
# a no-op and avoids the check-then-create gap of os.path.exists() followed by os.mkdir().
os.makedirs(keras_models_logs, exist_ok=True)

In [70]:
cb_tboard = tf.keras.callbacks.TensorBoard(log_dir=keras_models_logs,histogram_freq=1, write_graph=True,
                                          write_images=True)

In [71]:
cblist = [cb_savemodel, cb_tboard]

#### Fitting

In [72]:
epochs1=10

In [73]:
# train_dataset is already shuffled (reshuffled on every epoch) and batched by its tf.data pipeline.
# fit() ignores its `shuffle` argument for tf.data.Dataset inputs, so it is not passed here.
keras_history1 = keras_fc_model.fit(train_dataset, epochs=epochs1, verbose = 1, callbacks=cblist,
                                   validation_data=valid_dataset)

Train for 916 steps, validate for 84 steps
Epoch 1/10
Epoch 00001: val_loss improved from inf to 0.21771, saving model to keras_savedmodels/model_1-0.218.h5
Epoch 2/10
Epoch 00002: val_loss improved from 0.21771 to 0.16106, saving model to keras_savedmodels/model_2-0.161.h5
Epoch 3/10
Epoch 00003: val_loss improved from 0.16106 to 0.15292, saving model to keras_savedmodels/model_3-0.153.h5
Epoch 4/10
Epoch 00004: val_loss improved from 0.15292 to 0.13310, saving model to keras_savedmodels/model_4-0.133.h5
Epoch 5/10
Epoch 00005: val_loss improved from 0.13310 to 0.13114, saving model to keras_savedmodels/model_5-0.131.h5
Epoch 6/10
Epoch 00006: val_loss improved from 0.13114 to 0.12137, saving model to keras_savedmodels/model_6-0.121.h5
Epoch 7/10
Epoch 00007: val_loss did not improve from 0.12137
Epoch 8/10
Epoch 00008: val_loss did not improve from 0.12137
Epoch 9/10
Epoch 00009: val_loss did not improve from 0.12137
Epoch 10/10
Epoch 00010: val_loss improved from 0.12137 to 0.11878,

#### Testing

In [80]:
keras_fc_model.evaluate(test_dataset)



[0.11143299131751797, 0.9711]

### Exploring Layers 

In [74]:
keras_fc_model.layers

[<tensorflow.python.keras.engine.input_layer.InputLayer at 0x7f33f44998d0>,
 <tensorflow.python.keras.layers.core.Dense at 0x7f340c27a8d0>,
 <tensorflow.python.keras.layers.normalization_v2.BatchNormalization at 0x7f340c228dd8>,
 <tensorflow.python.keras.layers.core.Activation at 0x7f340c228f60>,
 <tensorflow.python.keras.layers.core.Dense at 0x7f340c1faeb8>,
 <tensorflow.python.keras.layers.normalization_v2.BatchNormalization at 0x7f340c1fe048>,
 <tensorflow.python.keras.layers.core.Activation at 0x7f340c1fe2b0>,
 <tensorflow.python.keras.layers.core.Dense at 0x7f340c1a4d68>,
 <tensorflow.python.keras.layers.normalization_v2.BatchNormalization at 0x7f340c1ac198>,
 <tensorflow.python.keras.layers.core.Activation at 0x7f340c1ac438>,
 <tensorflow.python.keras.layers.core.Dense at 0x7f340c1fab70>]

In [75]:
bn1 = keras_fc_model.layers[2]

In [79]:
bn1.weights

[<tf.Variable 'batch_normalization/gamma:0' shape=(100,) dtype=float32, numpy=
 array([1.2269483, 1.1349252, 1.3410976, 0.9179288, 1.2856462, 1.3862823,
        1.3595926, 1.2745571, 1.5197902, 1.3166919, 1.4408834, 1.2638034,
        1.2629784, 1.2740654, 1.415783 , 1.4326637, 1.4493729, 1.3970267,
        1.3195894, 1.3734947, 1.3420596, 1.1721978, 1.2968619, 1.5249897,
        1.0370381, 1.3912894, 1.5404277, 1.3771281, 1.4850557, 1.3972007,
        1.4163164, 1.1762176, 1.1790816, 1.3884237, 1.3342997, 1.303082 ,
        1.1480755, 1.2126306, 1.2919899, 1.1597459, 1.485534 , 1.2900969,
        1.221908 , 1.2401632, 1.4433057, 1.4404896, 1.508135 , 1.3095189,
        1.4880893, 1.3983922, 1.2017698, 1.2355157, 1.4851977, 1.5640281,
        1.5173646, 1.3068354, 1.62453  , 1.5176066, 1.4924428, 1.2629938,
        1.327193 , 1.4941337, 1.6335568, 1.4370104, 1.5315871, 1.3623064,
        1.284233 , 1.1915743, 1.4893486, 1.3606577, 1.1597953, 1.3838465,
        1.2619832, 1.2759168, 1.5

In [78]:
bn1.trainable_weights

[<tf.Variable 'batch_normalization/gamma:0' shape=(100,) dtype=float32, numpy=
 array([1.2269483, 1.1349252, 1.3410976, 0.9179288, 1.2856462, 1.3862823,
        1.3595926, 1.2745571, 1.5197902, 1.3166919, 1.4408834, 1.2638034,
        1.2629784, 1.2740654, 1.415783 , 1.4326637, 1.4493729, 1.3970267,
        1.3195894, 1.3734947, 1.3420596, 1.1721978, 1.2968619, 1.5249897,
        1.0370381, 1.3912894, 1.5404277, 1.3771281, 1.4850557, 1.3972007,
        1.4163164, 1.1762176, 1.1790816, 1.3884237, 1.3342997, 1.303082 ,
        1.1480755, 1.2126306, 1.2919899, 1.1597459, 1.485534 , 1.2900969,
        1.221908 , 1.2401632, 1.4433057, 1.4404896, 1.508135 , 1.3095189,
        1.4880893, 1.3983922, 1.2017698, 1.2355157, 1.4851977, 1.5640281,
        1.5173646, 1.3068354, 1.62453  , 1.5176066, 1.4924428, 1.2629938,
        1.327193 , 1.4941337, 1.6335568, 1.4370104, 1.5315871, 1.3623064,
        1.284233 , 1.1915743, 1.4893486, 1.3606577, 1.1597953, 1.3838465,
        1.2619832, 1.2759168, 1.5

In [131]:
import gc

In [144]:
gc.collect()

14265