# Part 3 - keras, distributions, optimizers, tensorboard, saving

Now that we have implemented and trained a model from scratch, we are ready introduce a selected list of useful higher-level API and utilities that make your life easier. 

In [1]:
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import matplotlib.pyplot as plt
import numpy as np
import os
import datetime

## Hyperparams & constants

In [2]:
dim_in, dim_out = 784, 10
dims_hidden = [128, 64]
activations_hidden = ["relu", "relu"]
batch_size = 128
learning_rate = 5e-4

num_iterations_train = 20000

now = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
log_path = os.path.join("logs", now)

In [3]:
os.makedirs(log_path, exist_ok=True)

## Data

In [4]:
# This time we will use integer labels directly instead of one_hot. 
data = input_data.read_data_sets("data/MNIST/", one_hot=False)  

Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
Instructions for updating:
Please write your own downloading logic.
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting data/MNIST/train-images-idx3-ubyte.gz
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting data/MNIST/train-labels-idx1-ubyte.gz
Extracting data/MNIST/t10k-images-idx3-ubyte.gz
Extracting data/MNIST/t10k-labels-idx1-ubyte.gz
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.


## Model - TODO: Make tf keras Model with print? and tf.summary_writer (is probably automatically in keras).

In [5]:
class MLP():
    """ Multi-Layer-Perceptron """
    def __init__(self, name: str, dim_in: int, dims: tuple, activations: tuple):
        self.name = name
        self.dim_in = dim_in
        self.dim_out = dims[-1]
        self.shp_in = (dim_in,)
        self.shp_out = (dims[-1],)
        
        dims_in = (dim_in,) + tuple(dims[:-1])
        dims_out = dims
        self.layers = list()
        with tf.variable_scope(name, reuse=False): 
            for idx_hidden, (dim_in, dim_out, activation) in enumerate(zip(dims_in, dims_out, activations)):
                kernel_initializer, bias_initializer = self.get_initializers_for(activation=activation)
                # ***********************************
                layer = tf.keras.layers.Dense(dim_out, 
                                              activation=activation, 
                                              name="layer_{}".format(idx_hidden), 
                                              kernel_initializer=kernel_initializer, 
                                              bias_initializer=bias_initializer,
                                             )
                layer.build(dim_in)
                # ***********************************
                self.layers.append(layer)

    def __call__(self, x):
        h = x
        for layer in self.layers:
            h = layer(h)
        return h
    
    def get_initializers_for(self, activation: str, distribution: str = "uniform", mode : str = "fan_in"):
        """ Helper function to choose appropriate initialization method, depending on the activation function. """
        if not isinstance(activation, str):
            activation = activation.__name__

        if activation in [None, 'linear']:
            scale = 1.0
        elif activation is "relu":
            scale = 2.0  
        elif activation is "tanh":
            scale = 1.32
        else:
            raise ValueError("unexpected activation function: {}".format(activation))

        kernel_initializer = tf.initializers.variance_scaling(
            scale=scale,
            mode=mode,
            distribution=distribution,
        )
        bias_initializer = tf.initializers.constant(0.0)
        return kernel_initializer, bias_initializer

In [6]:
tf.reset_default_graph()  # Might be helpful, if you rebuild your model.

# Batch size None means that batch size is define by the data provided by the feed_dict to session.run()
labels = tf.placeholder(shape=[None], dtype=tf.int32, name="targets")  
inputs = tf.placeholder(shape=[None, dim_in], dtype=tf.float32, name="inputs")
model = MLP(name="MLP", 
            dim_in=dim_in, 
            dims=dims_hidden + [dim_out],  # last dimension is for output layer.
            activations=activations_hidden + ["linear"],  # Output has no activation function.
           )
logits = model(inputs) 

**Note:** I strongly recommend to use tf.distributions. 
This way, you are explicit about your assumptions about the data distribution. 
Furthermore, your loss function directly follows from this assumption, i.e. your objective is to maximize the log-likelihood $\log p_{\theta}(y ~|~ x)$, where $\theta$ are the model parameters (variables in tensorflow). 

In [7]:
def get_accuracy(logits, labels):
    with tf.name_scope("accuracy"): 
        correct_prediction = tf.equal(tf.cast(tf.argmax(logits, axis=-1), tf.int32), labels)
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    return accuracy

def get_loglikelihood(logits, labels):
#     with tf.name_scope("likelihood_distribution"):
    likelihood_dist = tf.distributions.Categorical(  # tf.contrib.distributions.OneHotCategorical
        logits=logits, name="predicted_labels")  
    with tf.name_scope("log-likelihood"):
        loglikelihood = tf.reduce_sum(likelihood_dist.log_prob(labels))  
    return loglikelihood

In [8]:
loss = - get_loglikelihood(logits, labels)
accuracy = get_accuracy(logits, labels)

optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_step = optimizer.minimize(loss)

### Inspect model on tensorboard

In [9]:
# To visualize the graph on Tensorboard, we must add it to the summaries.
summary_writer = tf.summary.FileWriter(log_path)
summary_writer.add_graph(tf.get_default_graph())
 
# We can do the same in one line, by passing the graph to the constructor of the FileWriter
# tf.summary.FileWriter(log_path, tf.get_default_graph())

In [10]:
# We can also add some other useful statistics to tensorboard and log them periodically (during training).
# We do this by creating a unique "summary_key" (just a string), which we add to the graph collections.
print("Graph collections, BEFORE we added our summaries: {}".format(
    tf.get_default_graph().collections))
validation_summary_key = tf.get_default_graph().unique_name("validation_summaries")

# Add loss to our summaries key in the graph collections
tf.summary.scalar("loss", loss, collections=[validation_summary_key])

# Let's also add some gradient statistics. This is often useful for debugging
variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
gradients = tf.gradients(loss, variables)
for var, grad in zip(variables, gradients):
    tf.summary.histogram(name="gradient/" + var.name.replace(':', '/'), 
                         values=grad, 
                         collections=[validation_summary_key])
    tf.summary.scalar(name="gradient_norm/" + var.name.replace(':', '/'),
                      tensor=tf.norm(grad), 
                      collections=[validation_summary_key])
    
    tf.summary.histogram(name="parameter/" + var.name.replace(':', '/'), 
                         values=var, 
                         collections=[validation_summary_key])
    tf.summary.scalar(name="parameter_norm/" + var.name.replace(':', '/'),
                      tensor=tf.norm(var), 
                      collections=[validation_summary_key])

# Let's make one summary, that we can run with as a single operation. (summarize is an operation)
validation_summaries = tf.summary.merge_all(key=validation_summary_key)

print("Graph collections, AFTER we added our summaries: {}".format(
    tf.get_default_graph().collections))
print("We should have a summary key: {}".format(validation_summary_key))

Graph collections, BEFORE we added our summaries: [('__variable_store',), ('__varscope',), 'trainable_variables', 'variables', 'update_ops', 'train_op']
Graph collections, AFTER we added our summaries: [('__variable_store',), ('__varscope',), 'trainable_variables', 'variables', 'update_ops', 'train_op', 'validation_summaries']
We should have a summary key: validation_summaries


In [11]:
# # Training
# # ema = tf.train.ExponentialMovingAverage(decay=0.98, zero_debias=True)
# # ema.apply([loss])

# tf.metrics.mean(
#     values,
#     metrics_collections=None,
#     updates_collections=None,
#     name=None
# )

In your terminal do: 
<br>
tensorboard --logdir=PATH_TO_LOG_FOLDER

In [12]:
saver = tf.train.Saver(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))
sess = tf.Session()
sess.run(tf.global_variables_initializer()) # Initializes weights and biases.

In [13]:
best_val_accuracy = -np.inf
for iter_train in range(num_iterations_train):  
    batch_inputs, batch_labels = data.train.next_batch(batch_size=batch_size, shuffle=True)
    feed_dict = {inputs: batch_inputs, labels: batch_labels}
    sess.run(train_step, feed_dict=feed_dict)  
        
    if iter_train % 500 == 0 or iter_train == num_iterations_train - 1:  # Validate
        feed_dict = {inputs: data.validation.images, labels: data.validation.labels}
        val_summary, val_loss, val_accuracy = sess.run(
            [validation_summaries, loss, accuracy], feed_dict=feed_dict)
        print("iter {} / {}, validation accuracy = {}, loss = {}".format(
            iter_train, num_iterations_train, val_accuracy, val_loss))
        
        # Log summaries to TB (val loss, gradient norm, gradient histogram)
        summary_writer.add_summary(val_summary, iter_train)
        
        # Save our session at every validation, in case our program stops.
        saver.save(sess=sess, save_path=os.path.join(log_path, "session"))
        # Let's also save the current best model somewhere else.
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            saver.save(sess=sess, save_path=os.path.join(log_path, "best_model", "session"))

iter 0 / 20000, validation accuracy = 0.09700000286102295, loss = 11675.59375
iter 500 / 20000, validation accuracy = 0.9435999989509583, loss = 1002.8472900390625
iter 1000 / 20000, validation accuracy = 0.9595999717712402, loss = 702.9844970703125
iter 1500 / 20000, validation accuracy = 0.9649999737739563, loss = 584.0706787109375
iter 2000 / 20000, validation accuracy = 0.9710000157356262, loss = 499.24322509765625
iter 2500 / 20000, validation accuracy = 0.9739999771118164, loss = 458.9294738769531
iter 3000 / 20000, validation accuracy = 0.9721999764442444, loss = 429.9996337890625
iter 3500 / 20000, validation accuracy = 0.978600025177002, loss = 380.74542236328125
iter 4000 / 20000, validation accuracy = 0.9771999716758728, loss = 386.56158447265625
iter 4500 / 20000, validation accuracy = 0.9782000184059143, loss = 382.8121032714844
iter 5000 / 20000, validation accuracy = 0.9787999987602234, loss = 369.7127990722656
iter 5500 / 20000, validation accuracy = 0.9782000184059143,

In [14]:
def get_test_accuracy():
    test_accuracies = list()
    data.test._index_in_epoch = 0 
    for iter_test in range(int(data.test.num_examples / batch_size)):  # Do 10k iterations of gradient descent
        batch_inputs, batch_labels = data.test.next_batch(batch_size=batch_size, shuffle=False)
        feed_dict = {inputs:batch_inputs, labels: batch_labels}
        test_accuracies.append(sess.run(accuracy, feed_dict=feed_dict))
    test_accuracy = np.mean(test_accuracies)
    print("Test accuracy is: {}".format(test_accuracy))
    return test_accuracy

In [15]:
test_accuracy_model = get_test_accuracy()

Test accuracy is: 0.9788661599159241


### Restoring a saved model

Saving and restoring your model parameters is pretty simple using tf.train.Saver.
We have already created the saver object, where we provided all variables registered to the default graph. It is also possible to save only subsets of variables, such as all trainable variables (tf.GraphKeys.TRAINABLE_VARIABLES). 

In [16]:
sess.run(tf.global_variables_initializer()) # Re-initialize (overwrite) model parameters
test_accuracy_random = get_test_accuracy()

Test accuracy is: 0.09354967623949051


In [17]:
saver.restore(sess, os.path.join(log_path, "session"))
test_accuracy_model = get_test_accuracy()

INFO:tensorflow:Restoring parameters from logs/2018-06-23-15-53-00/session
Test accuracy is: 0.9788661599159241


In [18]:
saver.restore(sess, os.path.join(log_path, "best_model", "session"))  # Load model that performed best on val set
test_accuracy_model = get_test_accuracy()

INFO:tensorflow:Restoring parameters from logs/2018-06-23-15-53-00/best_model/session
Test accuracy is: 0.9783653616905212


Restoring the model was very simple, since our default graph still had all variables registered.
<br>
Otherwise, we must first build the same model with the same variable names, and then load the stored variables.
<br>
Try restarting the kernel of this notebook.
Then run the code that builds the model, create the saver, and last run the previous cells to load the model. 