# Batch Normalization
http://r2rt.com/implementing-batch-normalization-in-tensorflow.html

enables the use of higher learning rates, acts as a regularizer and can speed up training by 14 times

### Problem to solve:
Changes in model parameters during learning change the distributions of the outputs of each hidden layer. This means that later layers need to adapt to these (often noisy) changes during training.

TOASK so this means that ideally we would put a batch normalize in front of every non-linear layer ?.. (in order to give the non linearity an easy problem, and not extreme values)

the batch normalizing transform given above restricts the inputs to the activation function to a prescribed normal distribution. TOASK why? What will happen if we set gamma=1 and beta=0 ?

In [1]:
#Batch norm params
epsilon = 1e-3 #for not dividing by zero

## Multilayers using plain Tensorflow

In [2]:
import sys
mlpdir = '/home/student/Dropbox/msc_Artificial_Intelligence/mlp_Machine_Learning_Practical/mlpractical'
sys.path.append(mlpdir)

In [3]:
from mlp.data_providers import MNISTDataProvider
from mylibs.jupyter_notebook_helper import show_graph
import tensorflow as tf
import numpy as np
import os
import datetime
import math

In [4]:
# Seed a random number generator
seed = 16011984
rng = np.random.RandomState(seed)

In [5]:
config = tf.ConfigProto(log_device_placement=True, allow_soft_placement=True)
config.gpu_options.allow_growth = True

In [6]:
totalTrain = 50000
totalValid = 10000

In [7]:
train_data = MNISTDataProvider('train', batch_size=totalTrain/1000, rng=rng, shuffle_order=False) #all 50000
valid_data = MNISTDataProvider('valid', batch_size=totalValid/1000, rng=rng, shuffle_order=False) #all 10000

In [8]:
def getInitialWeights(inDim, outDim):
    #https://www.tensorflow.org/how_tos/variables/
    #http://www.inf.ed.ac.uk/teaching/courses/mlpr/2016/notes/w4b_neural_net_intro.html        
    return tf.random_normal([inDim, outDim], mean=0, stddev=0.1/math.sqrt(outDim))

In [9]:
def batchNormWrapper_byExponentialMovingAvg(bnId, ins, training, epsilon = 1e-3):
    outputDim = ins.get_shape()[-1]
    
    pop_mean = tf.Variable(tf.zeros(outputDim), trainable=False, name='pm%d' % bnId)
    pop_var = tf.Variable(tf.ones(outputDim), trainable=False, name='pv%d' % bnId)
    
    beta_offset = tf.Variable(tf.zeros(outputDim), name='bo%d' % bnId)    
    scale_gamma = tf.Variable(tf.ones(outputDim), name='sg%d' % bnId)
    
    #given that on axis=0 is where the batches extend (we want mean and var for each attribute)
    batch_mean, batch_var = tf.nn.moments(ins,[0])
    
    decay = 0.999 # use numbers closer to 1 if you have more data
    mean_of_train = tf.assign(pop_mean, pop_mean * decay + batch_mean * (1 - decay)) #we just want to use the 
    var_of_train = tf.assign(pop_var, pop_var * decay + batch_var * (1 - decay))
    
    
    with tf.control_dependencies([mean_of_train, var_of_train]):
        normalized = tf.nn.batch_normalization(ins,
                                               tf.cond(training, lambda: batch_mean, lambda: pop_mean),
                                               tf.cond(training, lambda: batch_var, lambda: pop_var),
                                               beta_offset, scale_gamma, epsilon)

    return normalized

In [10]:
tf.reset_default_graph()

graph = tf.Graph() #create new graph

with graph.as_default():
    with tf.name_scope('params'):
        training = tf.placeholder(tf.bool, name="training")
    
    with tf.name_scope('data'):
        inputs = tf.placeholder(tf.float32, [None, 784], 'inputs')
        targets = tf.placeholder(tf.float32, [None, 10], 'targets')

    with tf.name_scope('affineLayer1'):
        weights_affine1 = tf.Variable(getInitialWeights(inDim = 784, outDim=100), name = 'w1')
        biases_affine1 = tf.Variable(tf.zeros(100), name='b1')
        out_affine1 = tf.matmul(inputs, weights_affine1) + biases_affine1

    with tf.name_scope('batchNorm1'):
        #given that on axis=0 is where the batches extend (we want mean and var for each attribute)
        #MANUAL WAY:
        #batch1_mean, batch1_var1 = tf.nn.moments(out_affine1, axes=[0])
        #z1_hat = (out_affine1 - batch1_mean) / tf.sqrt(batch1_var1 + epsilon)
        #batchNorm1 = tf.Variable(tf.ones(100)) * z1_hat + tf.Variable(tf.zeros(100)) #so this is learnable
        batchNorm1 = batchNormWrapper_byExponentialMovingAvg(1, out_affine1, training)

    with tf.name_scope('sigmoidLayer1'):
        non_linearity1 = tf.nn.sigmoid(batchNorm1, name='nonlinearity1')

    with tf.name_scope('affineLayer2'):
        weights_affine2 = tf.Variable(getInitialWeights(inDim = 100, outDim=100), name = 'w2')
        biases_affine2 = tf.Variable(tf.zeros(100), name='b2')
        out_affine2 = tf.matmul(non_linearity1, weights_affine2) + biases_affine2

    with tf.name_scope('batchNorm2'):
        #GOOD ONLY FOR TRAINING WAY:
        #batch2_mean, batch2_var = tf.nn.moments(out_affine2,axes=[0])
        #batchNorm2 = tf.nn.batch_normalization(out_affine2, batch2_mean, batch2_var,
        #                                       tf.Variable(tf.zeros(100)), tf.Variable(tf.ones(100)), epsilon)
        batchNorm2 = batchNormWrapper_byExponentialMovingAvg(2, out_affine2, training)

    with tf.name_scope('sigmoidLayer2'):
        non_linearity2 = tf.nn.sigmoid(batchNorm2)

    with tf.name_scope('affineLayer3'):
        weights_affine3 = tf.Variable(getInitialWeights(inDim = 100, outDim=10), name = 'w3')
        biases_affine3 = tf.Variable(tf.zeros(10), name='b3')
        out_affine3 = tf.matmul(non_linearity2, weights_affine3) + biases_affine3

    with tf.name_scope('error'):
        per_datapoint_errors = tf.nn.softmax_cross_entropy_with_logits(out_affine3, targets)
        error = tf.reduce_mean(per_datapoint_errors)

    with tf.name_scope('accuracy'):
        per_datapoint_pred_is_correct = tf.equal(tf.argmax(out_affine3, axis=1), tf.argmax(targets, axis=1))
        accuracy = tf.reduce_mean(tf.cast(per_datapoint_pred_is_correct, tf.float32))

    with tf.name_scope('training'):
        train_step = tf.train.GradientDescentOptimizer(learning_rate=0.1).minimize(error)

    tf.summary.scalar('error', error)
    tf.summary.scalar('accuracy', accuracy)
    summary_op = tf.summary.merge_all()
    
    init = tf.global_variables_initializer()

In [11]:
show_graph(graph)

In [12]:
def getTrainWriter(graph):
    #tensorboard --logdir=tf_batchnorm
    folder = "tf_batchnorm"

    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    return tf.summary.FileWriter(
        logdir=os.path.join(folder, timestamp, 'train'),
        graph=graph
    )

In [13]:
def getValidWriter(graph):
    #tensorboard --logdir=tf_batchnorm
    folder = "tf_batchnorm"

    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    return tf.summary.FileWriter(
        os.path.join(folder, timestamp, 'valid'),
        graph=graph
    )

In [14]:
def getWriters(graph):
    #tensorboard --logdir=tf_batchnorm
    folder = "tf_batchnorm"

    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    train_writer = tf.summary.FileWriter(
        logdir=os.path.join(folder, timestamp, 'train'),
        graph=graph
    )
    valid_writer = tf.summary.FileWriter(
        os.path.join(folder, timestamp, 'valid'),
        graph=graph
    )
    
    return train_writer, valid_writer

In [15]:
train_writer, valid_writer = getWriters(graph)

In [16]:
num_epoch = 20

with tf.Session(graph=graph, config=config) as sess:
    sess.run(init)
    
    for e in range(num_epoch):
        train_error = 0.
        train_acc = 0.

        validation_error = 0.
        valid_acc = 0.

        step = 0
        for input_batch, target_batch in train_data:
            _, batch_error, batch_acc, train_summary = sess.run(
                [train_step, error, accuracy, summary_op],
                feed_dict={inputs: input_batch, targets: target_batch, training: True})
            
            train_error += batch_error
            train_acc += batch_acc

            train_writer.add_summary(train_summary, global_step = e * train_data.num_batches + step)
            step += 1


        if (e+1)%1 == 0:
            step = 0
            for input_batch, target_batch in valid_data:
                batch_error, batch_acc, valid_summary = sess.run(
                    [error, accuracy, summary_op],
                    feed_dict={inputs: input_batch, targets: target_batch, training: False})
                
                validation_error += batch_error
                valid_acc += batch_acc

                valid_writer.add_summary(valid_summary, global_step = e*valid_data.num_batches + step)
                step += 1

        train_error /= train_data.num_batches
        train_acc /= train_data.num_batches

        validation_error /= valid_data.num_batches
        valid_acc /= valid_data.num_batches

        #print type(validation_error)

        print 'End of epoch %d: train error = %.2f, train accuracy = %.2f, valid error = %.2f, valid accuracy = %.2f'\
            % (e + 1, train_error, train_acc, validation_error, valid_acc)

End of epoch 1: train error = 0.54, train accuracy = 0.88, valid error = 1.30, valid accuracy = 0.68
End of epoch 2: train error = 0.29, train accuracy = 0.92, valid error = 0.31, valid accuracy = 0.92
End of epoch 3: train error = 0.23, train accuracy = 0.93, valid error = 0.22, valid accuracy = 0.94
End of epoch 4: train error = 0.19, train accuracy = 0.94, valid error = 0.19, valid accuracy = 0.94
End of epoch 5: train error = 0.16, train accuracy = 0.95, valid error = 0.17, valid accuracy = 0.95
End of epoch 6: train error = 0.14, train accuracy = 0.96, valid error = 0.16, valid accuracy = 0.95
End of epoch 7: train error = 0.12, train accuracy = 0.97, valid error = 0.15, valid accuracy = 0.96
End of epoch 8: train error = 0.10, train accuracy = 0.97, valid error = 0.14, valid accuracy = 0.96
End of epoch 9: train error = 0.08, train accuracy = 0.98, valid error = 0.13, valid accuracy = 0.96
End of epoch 10: train error = 0.07, train accuracy = 0.98, valid error = 0.13, valid accur

In [26]:
num_epoch = 10

sess = tf.InteractiveSession(graph=graph, config=config)
sess.run(init)

for e in range(num_epoch):
    train_error = 0.
    train_acc = 0.

    validation_error = 0.
    valid_acc = 0.

    step = 0
    for input_batch, target_batch in train_data:
        _, batch_error, batch_acc, train_summary = sess.run(
            [train_step, error, accuracy, summary_op],
            feed_dict={inputs: input_batch, targets: target_batch, training: True})

        train_error += batch_error
        train_acc += batch_acc

        train_writer.add_summary(train_summary, global_step = e * train_data.num_batches + step)
        step += 1


    if (e+1)%1 == 0:
        step = 0
        for input_batch, target_batch in valid_data:
            batch_error, batch_acc, valid_summary = sess.run(
                [error, accuracy, summary_op],
                feed_dict={inputs: input_batch, targets: target_batch, training: False})

            validation_error += batch_error
            valid_acc += batch_acc

            valid_writer.add_summary(valid_summary, global_step = e*valid_data.num_batches + step)
            step += 1

    train_error /= train_data.num_batches
    train_acc /= train_data.num_batches

    validation_error /= valid_data.num_batches
    valid_acc /= valid_data.num_batches

    #print type(validation_error)

    print 'End of epoch %d: train error = %.2f, train accuracy = %.2f, valid error = %.2f, valid accuracy = %.2f'\
        % (e + 1, train_error, train_acc, validation_error, valid_acc)

End of epoch 1: train error = 0.54, train accuracy = 0.88, valid error = 1.23, valid accuracy = 0.76
End of epoch 2: train error = 0.29, train accuracy = 0.92, valid error = 0.30, valid accuracy = 0.93


KeyboardInterrupt: 

In [28]:
[v.name for v in tf.trainable_variables()]

[u'affineLayer1/w1:0',
 u'affineLayer1/b1:0',
 u'batchNorm1/bo1:0',
 u'batchNorm1/sg1:0',
 u'affineLayer2/w2:0',
 u'affineLayer2/b2:0',
 u'batchNorm2/bo2:0',
 u'batchNorm2/sg2:0',
 u'affineLayer3/w3:0',
 u'affineLayer3/b3:0']

In [25]:
sess.close()