Deep Learning
=============

Assignment 4
------------

Previously in `2_fullyconnected.ipynb` and `3_regularization.ipynb`, we trained fully connected networks to classify [notMNIST](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html) characters.

The goal of this assignment is to make the neural network convolutional.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
from six.moves import range

In [2]:
pickle_file = 'notMNIST.pickle'

# NOTE(review): pickle.load can execute code embedded in the file, so only
# load a pickle that you produced yourself or otherwise trust.
with open(pickle_file, 'rb') as f:
  save = pickle.load(f)

# Pull the three splits out of the loaded dictionary.
train_dataset, train_labels = save['train_dataset'], save['train_labels']
valid_dataset, valid_labels = save['valid_dataset'], save['valid_labels']
test_dataset, test_labels = save['test_dataset'], save['test_labels']
del save  # drop the container so the gc can reclaim it

# Report the shape of every split as a quick sanity check.
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (32000, 28, 28) (32000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat into a TensorFlow-friendly shape:
- convolutions need the image data formatted as a cube (width by height by #channels)
- labels as float 1-hot encodings.

In [3]:
# Geometry of the input images and size of the label set.
image_size = 28
num_labels = 10
num_channels = 1 # grayscale

import numpy as np

def reformat(dataset, labels):
  """Convert images and integer labels into the float32 arrays the model expects.

  Args:
    dataset: array that can be reshaped to
      (-1, image_size, image_size, num_channels).
    labels: 1-D array of integer class ids.

  Returns:
    A (images, one_hot) tuple: `images` has shape
    (N, image_size, image_size, num_channels) and `one_hot` has shape
    (N, num_labels); both are float32.
  """
  images = dataset.reshape((-1, image_size, image_size, num_channels))
  # Broadcasting a label column against the row of class ids yields a boolean
  # matrix with a single True per row, i.e. the 1-hot encoding.
  class_ids = np.arange(num_labels)
  one_hot = class_ids == labels[:, None]
  return images.astype(np.float32), one_hot.astype(np.float32)
# Reformat each split in turn and report its new shape right away.
train_dataset, train_labels = reformat(train_dataset, train_labels)
print('Training set', train_dataset.shape, train_labels.shape)

valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
print('Validation set', valid_dataset.shape, valid_labels.shape)

test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (32000, 28, 28, 1) (32000, 10)
Validation set (10000, 28, 28, 1) (10000, 10)
Test set (10000, 28, 28, 1) (10000, 10)


In [4]:
def accuracy(predictions, labels):
  """Return the percentage of rows whose arg-max class matches the label.

  Args:
    predictions: 2-D array of per-class scores, one row per example.
    labels: 2-D array with the same layout; the arg-max of each row is taken
      as the true class.

  Returns:
    The share of matching rows, scaled to the range 0-100.
  """
  predicted_classes = np.argmax(predictions, 1)
  true_classes = np.argmax(labels, 1)
  num_correct = np.sum(predicted_classes == true_classes)
  return 100.0 * num_correct / predictions.shape[0]

Let's build a small network with two convolutional layers, followed by one fully connected layer. Convolutional networks are more expensive computationally, so we'll limit its depth and number of fully connected nodes. **Edit:** Get tensorboard working.

In [5]:
# REF: https://www.tensorflow.org/get_started/summaries_and_tensorboard
def variable_summaries(var, hist=True):
  """Register TensorBoard summaries (mean, stddev, max, min, histogram) for a tensor.

  Args:
    var: tensor to summarise.
    hist: when false, the histogram summary is skipped and only the scalar
      summaries are registered.
  """
  with tf.name_scope('summaries'):
    average = tf.reduce_mean(var)
    tf.summary.scalar('mean', average)
    with tf.name_scope('stddev'):
      squared_deviation = tf.square(var - average)
      spread = tf.sqrt(tf.reduce_mean(squared_deviation))
    tf.summary.scalar('stddev', spread)
    for tag, reducer in (('max', tf.reduce_max), ('min', tf.reduce_min)):
      tf.summary.scalar(tag, reducer(var))
    if not hist:
      return
    tf.summary.histogram('histogram', var)

In [23]:
batch_size = 128
patch_size = 5
depth = 16
num_hidden = 64

# Each stride-2 'SAME' convolution halves the spatial size, rounding up, so
# after the two convolutions the feature map is ceil(image_size / 4) pixels on
# a side. Written explicitly because `image_size // 4 * image_size // 4` is
# evaluated left to right and only gives the right answer for some sizes.
fc_input_side = -(-image_size // 4)

graph = tf.Graph()

with graph.as_default():

  # Input data: the training batch is fed at run time, while the validation
  # and test sets are baked into the graph as constants.
  with tf.name_scope("data"):
    with tf.name_scope("train"):
      tf_train_dataset = tf.placeholder(
        tf.float32, shape=(batch_size, image_size, image_size, num_channels))
      tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    with tf.name_scope("valid"):
      tf_valid_dataset = tf.constant(valid_dataset)
    with tf.name_scope("test"):
      tf_test_dataset = tf.constant(test_dataset)

  # Variables: two convolution kernels followed by two dense layers.
  with tf.name_scope("Weights/Biases"):
    layer1_weights = tf.Variable(tf.truncated_normal(
        [patch_size, patch_size, num_channels, depth], stddev=0.1))
    layer1_biases = tf.Variable(tf.zeros([depth]))
    layer2_weights = tf.Variable(tf.truncated_normal(
        [patch_size, patch_size, depth, depth], stddev=0.1))
    layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))
    layer3_weights = tf.Variable(tf.truncated_normal(
        [fc_input_side * fc_input_side * depth, num_hidden], stddev=0.1))
    layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
    layer4_weights = tf.Variable(tf.truncated_normal(
        [num_hidden, num_labels], stddev=0.1))
    layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))

  # Model.
  def model(data):
    """Build the forward pass for `data` and return the class logits.

    `data` must have a fully defined static shape, because the flattening
    step reads the batch size from it. Every call adds a fresh set of ops and
    summaries to the graph; the variables are shared between calls.
    """
    with tf.name_scope("Layer1"):
      with tf.name_scope("Conv1"):
        conv1 = tf.nn.conv2d(data, layer1_weights, [1, 2, 2, 1], padding='SAME', name="Conv1")
        variable_summaries(conv1)
      with tf.name_scope("Logits1"):
        logits1 = conv1 + layer1_biases
        variable_summaries(logits1)
      with tf.name_scope("Act1"):
        hidden1 = tf.nn.relu(logits1, name="Relu1")
        variable_summaries(hidden1)
    with tf.name_scope("Layer2"):
      with tf.name_scope("Conv2"):
        conv2 = tf.nn.conv2d(hidden1, layer2_weights, [1, 2, 2, 1], padding='SAME', name="Conv2")
        variable_summaries(conv2)
      with tf.name_scope("Logits2"):
        logits2 = conv2 + layer2_biases
        variable_summaries(logits2)
      # Named "Act2" (not a second "Logits2") so the activation gets its own
      # scope, matching Act1 and Act3.
      with tf.name_scope("Act2"):
        hidden2 = tf.nn.relu(logits2, name="Relu2")
        variable_summaries(hidden2)
    with tf.name_scope("Reshape"):
      # Flatten each example's feature map into one row for the dense layers.
      shape = hidden2.get_shape().as_list()
      reshape = tf.reshape(hidden2, [shape[0], shape[1] * shape[2] * shape[3]])
    with tf.name_scope("Layer3"):
      with tf.name_scope("Logits3"):
        logits3 = tf.matmul(reshape, layer3_weights) + layer3_biases
        variable_summaries(logits3)
      with tf.name_scope("Act3"):
        hidden3 = tf.nn.relu(logits3, name="Relu3")
        variable_summaries(hidden3)
    with tf.name_scope("Layer4"):
      logits4 = tf.matmul(hidden3, layer4_weights) + layer4_biases
      variable_summaries(logits4)
    return logits4

  # Training computation.
  logits = model(tf_train_dataset)
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf_train_labels))

  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)

  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
  test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [7]:
num_steps = 1001

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  for step in range(num_steps):
    # Slide a batch-sized window over the training set, wrapping around
    # before the window would run past the end.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    batch_data = train_dataset[offset:offset + batch_size]
    batch_labels = train_labels[offset:offset + batch_size]
    feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    # Only report progress on every 50th step.
    if step % 50 != 0:
      continue
    print('Minibatch loss at step %d: %f' % (step, l))
    print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
    print('Validation accuracy: %.1f%%' % accuracy(
      session.run(valid_prediction), valid_labels))
  print('Test accuracy: %.1f%%' % accuracy(session.run(test_prediction), test_labels))

Initialized
Minibatch loss at step 0: 3.471792
Minibatch accuracy: 10.2%
Validation accuracy: 9.7%
Minibatch loss at step 50: 0.896388
Minibatch accuracy: 70.3%
Validation accuracy: 71.8%
Minibatch loss at step 100: 0.655574
Minibatch accuracy: 83.6%
Validation accuracy: 77.3%
Minibatch loss at step 150: 0.593063
Minibatch accuracy: 82.8%
Validation accuracy: 79.0%
Minibatch loss at step 200: 0.638354
Minibatch accuracy: 82.0%
Validation accuracy: 79.8%
Minibatch loss at step 250: 0.804501
Minibatch accuracy: 75.0%
Validation accuracy: 80.8%
Minibatch loss at step 300: 0.652016
Minibatch accuracy: 76.6%
Validation accuracy: 80.5%
Minibatch loss at step 350: 0.695189
Minibatch accuracy: 78.9%
Validation accuracy: 81.5%
Minibatch loss at step 400: 0.571845
Minibatch accuracy: 85.9%
Validation accuracy: 81.7%
Minibatch loss at step 450: 0.532243
Minibatch accuracy: 83.6%
Validation accuracy: 82.2%
Minibatch loss at step 500: 0.675998
Minibatch accuracy: 80.5%
Validation accuracy: 82.5%
Mi

Bringing a few things in from the fullyconnected project.

In [24]:
def run_batch_graph(graph, save_layers, num_steps = 1001, summary_path = './summary/', earliest_stop = 300, stopping_threshold = 3, batch_size=128, report_learning_rate=False):
    """Train the notebook's current model with TensorBoard logging and early stopping.

    The tensors and arrays used for training and evaluation
    (``tf_train_dataset``, ``tf_train_labels``, ``optimizer``,
    ``train_prediction``, ``valid_prediction``, ``test_prediction``,
    ``train_dataset``, ``train_labels``, ``valid_labels``, ``test_labels`` and,
    when ``report_learning_rate`` is set, ``learning_rate``) are looked up as
    notebook globals, so they must belong to ``graph``.

    Args:
      graph: the ``tf.Graph`` to run.
      save_layers: variables handed to ``tf.train.Saver`` for checkpointing.
      num_steps: number of loop iterations; the last one evaluates the test
        set instead of training.
      summary_path: directory prefix; summaries and checkpoints go to
        ``summary_path + 'train'`` and ``summary_path + 'test'``.
      earliest_stop: early stopping is only considered at reporting steps
        after this step; a negative value disables it.
      stopping_threshold: early stopping triggers when the absolute gap
        between minibatch and validation accuracy exceeds this value.
      batch_size: number of training examples fed per step.
      report_learning_rate: also fetch and print the global ``learning_rate``
        tensor at reporting steps.

    Returns:
      None. Progress is printed and written to the summary directories.
    """
    with tf.Session(graph=graph) as session:
        # Merge all the summaries and write them out under summary_path.
        merged_summary = tf.summary.merge_all()
        train_writer = tf.summary.FileWriter(summary_path+'train', graph=graph)
        test_writer = tf.summary.FileWriter(summary_path+'test')
        tf.global_variables_initializer().run()
        print('Initialized')

        def get_feed_dict(step):
            """Return the feed dict for `step` plus the raw (data, labels) batch."""
            # Pick an offset within the training data, which has been randomized.
            # Note: we could use better randomization across epochs.
            offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
            # Generate a minibatch.
            batch_data = train_dataset[offset:(offset + batch_size), :]
            batch_labels = train_labels[offset:(offset + batch_size), :]
            # The keys are the placeholder nodes of the graph to be fed, the
            # values are the numpy arrays to feed to them.
            feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
            return feed_dict, (batch_data, batch_labels)

        def test_step(step):
            """Evaluate the test set (no training), log summaries, return accuracy."""
            # The merged summaries include ops built on the training
            # placeholders, so a minibatch still has to be fed.
            feed_dict, _ = get_feed_dict(step)
            summary, test = session.run([merged_summary, test_prediction], feed_dict=feed_dict)
            test_writer.add_summary(summary, step)
            test = accuracy(test, test_labels)
            print("Step {}: Test: {}%".format(step, test))
            return test

        def train_step(step, verify=False, summary=False):
            """Run one optimizer step.

            With `summary` the merged summaries are written to the train
            writer. With `verify` the minibatch and validation accuracies are
            printed and returned as (train, valid); otherwise None is returned.
            """
            feed_dict, (_, batch_labels) = get_feed_dict(step)
            # Assemble only the fetches this call needs, so every
            # verify/summary combination goes through a single code path.
            fetches = {'optimizer': optimizer}
            if summary:
                fetches['summary'] = merged_summary
            if verify:
                fetches['train'] = train_prediction
                fetches['valid'] = valid_prediction
                if report_learning_rate:
                    fetches['lr'] = learning_rate
            results = session.run(fetches, feed_dict=feed_dict)

            if summary:
                train_writer.add_summary(results['summary'], step)
            if not verify:
                return None

            train = accuracy(results['train'], batch_labels)
            valid = accuracy(results['valid'], valid_labels)
            if report_learning_rate:
                print("Step {}: Train: {}% Valid: {}% Lr: {}".format(step, train, valid, results['lr']))
            else:
                print("Step {}: Train: {}% Valid: {}%".format(step, train, valid))
            return train, valid

        # Checkpointing of the requested variables.
        saver = tf.train.Saver(save_layers)

        # Report roughly ten times per run; never let the interval reach zero,
        # which would make the modulo below divide by zero.
        report_every = max(1, num_steps // 10)

        # Bound before the loop so the interrupt handler can always use it.
        step = 0

        # Main Loop
        try:
            for step in range(num_steps):
                # Final step: measure test accuracy without training.
                if (step == num_steps-1):
                    test_step(step)

                # Reporting step: checkpoint, then train while measuring
                # minibatch and validation accuracy.
                elif (step % report_every == 0):
                    saver.save(session, summary_path+"train/single_layer{}.chk".format(step), global_step=step)
                    train, valid = train_step(step, verify=True, summary=True)

                    # Early Stopping
                    if abs(train-valid)>stopping_threshold and step>earliest_stop and earliest_stop>=0:
                        print("Stopping Early!")
                        test_step(step)
                        break

                # Normal, fast, training step
                else:
                    train_step(step)
        except KeyboardInterrupt:
            print("Stopping Early!")
            test_step(step)
        finally:
            # Flush pending events to disk and release the event files.
            train_writer.close()
            test_writer.close()

Does it work on our graph?

In [25]:
# Train the stride-2 convolutional graph; a negative earliest_stop turns
# early stopping off.
run_batch_graph(
    graph,
    save_layers=[
        layer1_weights, layer1_biases,
        layer2_weights, layer2_biases,
        layer3_weights, layer3_biases,
        layer4_weights, layer4_biases,
    ],
    num_steps=1001,
    summary_path='./summary/convolutions/basic/',
    earliest_stop=-1,
    batch_size=batch_size,
    report_learning_rate=False)

Initialized
Step 0: Train: 5.46875% Valid: 10.0%
Step 100: Train: 86.71875% Valid: 77.29%
Step 200: Train: 82.03125% Valid: 80.41%
Step 300: Train: 81.25% Valid: 81.11%
Step 400: Train: 84.375% Valid: 82.54%
Step 500: Train: 81.25% Valid: 83.04%
Step 600: Train: 81.25% Valid: 83.08%
Step 700: Train: 83.59375% Valid: 83.72%
Step 800: Train: 86.71875% Valid: 83.56%
Step 900: Train: 87.5% Valid: 84.14%
Step 1000: Test: 91.15%


---
Problem 1
---------

The convolutional model above uses convolutions with stride 2 to reduce the dimensionality. Replace the strides by a max pooling operation (`nn.max_pool()`) of stride 2 and kernel size 2.

---

In [31]:
batch_size = 128
patch_size = 5
depth = 16
num_hidden = 64

# Each stride-2 'SAME' max pool halves the spatial size, rounding up, so after
# the two pooling steps the feature map is ceil(image_size / 4) pixels on a
# side. Written explicitly because `image_size // 4 * image_size // 4` is
# evaluated left to right and only gives the right answer for some sizes.
fc_input_side = -(-image_size // 4)

graph = tf.Graph()

with graph.as_default():

  # Input data: the training batch is fed at run time, while the validation
  # and test sets are baked into the graph as constants.
  with tf.name_scope("data"):
    with tf.name_scope("train"):
      tf_train_dataset = tf.placeholder(
        tf.float32, shape=(batch_size, image_size, image_size, num_channels))
      tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    with tf.name_scope("valid"):
      tf_valid_dataset = tf.constant(valid_dataset)
    with tf.name_scope("test"):
      tf_test_dataset = tf.constant(test_dataset)

  # Variables: two convolution kernels followed by two dense layers.
  with tf.name_scope("Weights/Biases"):
    layer1_weights = tf.Variable(tf.truncated_normal(
        [patch_size, patch_size, num_channels, depth], stddev=0.1))
    layer1_biases = tf.Variable(tf.zeros([depth]))
    layer2_weights = tf.Variable(tf.truncated_normal(
        [patch_size, patch_size, depth, depth], stddev=0.1))
    layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))
    layer3_weights = tf.Variable(tf.truncated_normal(
        [fc_input_side * fc_input_side * depth, num_hidden], stddev=0.1))
    layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
    layer4_weights = tf.Variable(tf.truncated_normal(
        [num_hidden, num_labels], stddev=0.1))
    layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))

  # Model.
  def model(data):
    """Build the forward pass for `data` and return the class logits.

    Convolutions run at stride 1; the size reduction comes from a 2x2 max
    pool of stride 2 after each activation. `data` must have a fully defined
    static shape, because the flattening step reads the batch size from it.
    Every call adds a fresh set of ops and summaries to the graph; the
    variables are shared between calls.
    """
    with tf.name_scope("Layer1"):
      with tf.name_scope("Conv1"):
        conv1 = tf.nn.conv2d(data, layer1_weights, [1, 1, 1, 1], padding='SAME')
        variable_summaries(conv1)
      with tf.name_scope("Logits1"):
        logits1 = conv1 + layer1_biases
        variable_summaries(logits1)
      with tf.name_scope("Act1"):
        hidden1 = tf.nn.relu(logits1, name="Relu1")
        variable_summaries(hidden1)
      with tf.name_scope("Pooling1"):
        pool1 = tf.nn.max_pool(hidden1, [1, 2, 2, 1], [1, 2, 2, 1], padding="SAME")
        variable_summaries(pool1)
    with tf.name_scope("Layer2"):
      with tf.name_scope("Conv2"):
        conv2 = tf.nn.conv2d(pool1, layer2_weights, [1, 1, 1, 1], padding='SAME')
        variable_summaries(conv2)
      with tf.name_scope("Logits2"):
        logits2 = conv2 + layer2_biases
        variable_summaries(logits2)
      # Named "Act2" (not a second "Logits2") so the activation gets its own
      # scope, matching Act1 and Act3.
      with tf.name_scope("Act2"):
        hidden2 = tf.nn.relu(logits2, name="Relu2")
        variable_summaries(hidden2)
      with tf.name_scope("Pooling2"):
        pool2 = tf.nn.max_pool(hidden2, [1, 2, 2, 1], [1, 2, 2, 1], padding="SAME")
        variable_summaries(pool2)
    with tf.name_scope("Reshape"):
      # Flatten each example's pooled feature map into one row for the dense
      # layers.
      shape = pool2.get_shape().as_list()
      reshape = tf.reshape(pool2, [shape[0], shape[1] * shape[2] * shape[3]])
    with tf.name_scope("Layer3"):
      with tf.name_scope("Logits3"):
        logits3 = tf.matmul(reshape, layer3_weights) + layer3_biases
        variable_summaries(logits3)
      with tf.name_scope("Act3"):
        hidden3 = tf.nn.relu(logits3, name="Relu3")
        variable_summaries(hidden3)
    with tf.name_scope("Layer4"):
      logits4 = tf.matmul(hidden3, layer4_weights) + layer4_biases
      variable_summaries(logits4)
    return logits4

  # Training computation.
  logits = model(tf_train_dataset)
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf_train_labels))

  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)

  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
  test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [None]:
# Train the max-pooling graph; a negative earliest_stop turns early stopping
# off.
run_batch_graph(
    graph,
    save_layers=[
        layer1_weights, layer1_biases,
        layer2_weights, layer2_biases,
        layer3_weights, layer3_biases,
        layer4_weights, layer4_biases,
    ],
    num_steps=1001,
    summary_path='./summary/convolutions/pooling/',
    earliest_stop=-1,
    batch_size=batch_size,
    report_learning_rate=False)

Initialized
Step 0: Train: 9.375% Valid: 10.0%


Now let's use both:

In [None]:
batch_size = 128
patch_size = 5
depth = 16
num_hidden = 64

# This network halves the spatial size four times (two stride-2 'SAME'
# convolutions and two stride-2 'SAME' max pools), rounding up each time, so
# the feature map entering the dense layers is ceil(image_size / 16) pixels on
# a side (2 for 28x28 input), not image_size // 4.
fc_input_side = -(-image_size // 16)

graph = tf.Graph()

with graph.as_default():

  # Input data: the training batch is fed at run time, while the validation
  # and test sets are baked into the graph as constants.
  with tf.name_scope("data"):
    with tf.name_scope("train"):
      tf_train_dataset = tf.placeholder(
        tf.float32, shape=(batch_size, image_size, image_size, num_channels))
      tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    with tf.name_scope("valid"):
      tf_valid_dataset = tf.constant(valid_dataset)
    with tf.name_scope("test"):
      tf_test_dataset = tf.constant(test_dataset)

  # Variables: two convolution kernels followed by two dense layers.
  with tf.name_scope("Weights/Biases"):
    layer1_weights = tf.Variable(tf.truncated_normal(
        [patch_size, patch_size, num_channels, depth], stddev=0.1))
    layer1_biases = tf.Variable(tf.zeros([depth]))
    layer2_weights = tf.Variable(tf.truncated_normal(
        [patch_size, patch_size, depth, depth], stddev=0.1))
    layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))
    # Sized to the flattened output of Pooling2 so the matmul in Layer3 lines up.
    layer3_weights = tf.Variable(tf.truncated_normal(
        [fc_input_side * fc_input_side * depth, num_hidden], stddev=0.1))
    layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
    layer4_weights = tf.Variable(tf.truncated_normal(
        [num_hidden, num_labels], stddev=0.1))
    layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))

  # Model.
  def model(data):
    """Build the forward pass for `data` and return the class logits.

    Uses stride-2 convolutions and a 2x2 max pool of stride 2 after each
    activation. Only the activation and pooling outputs are summarised.
    `data` must have a fully defined static shape, because the flattening
    step reads the batch size from it. Every call adds a fresh set of ops and
    summaries to the graph; the variables are shared between calls.
    """
    with tf.name_scope("Layer1"):
      with tf.name_scope("Conv1"):
        conv1 = tf.nn.conv2d(data, layer1_weights, [1, 2, 2, 1], padding='SAME')
      with tf.name_scope("Logits1"):
        logits1 = conv1 + layer1_biases
      with tf.name_scope("Act1"):
        hidden1 = tf.nn.relu(logits1, name="Relu1")
        variable_summaries(hidden1)
      with tf.name_scope("Pooling1"):
        pool1 = tf.nn.max_pool(hidden1, [1, 2, 2, 1], [1, 2, 2, 1], padding="SAME")
        variable_summaries(pool1)
    with tf.name_scope("Layer2"):
      with tf.name_scope("Conv2"):
        conv2 = tf.nn.conv2d(pool1, layer2_weights, [1, 2, 2, 1], padding='SAME')
      with tf.name_scope("Logits2"):
        logits2 = conv2 + layer2_biases
      # Named "Act2" (not a second "Logits2") so the activation gets its own
      # scope, matching Act1 and Act3.
      with tf.name_scope("Act2"):
        hidden2 = tf.nn.relu(logits2, name="Relu2")
        variable_summaries(hidden2)
      with tf.name_scope("Pooling2"):
        pool2 = tf.nn.max_pool(hidden2, [1, 2, 2, 1], [1, 2, 2, 1], padding="SAME")
        variable_summaries(pool2)
    with tf.name_scope("Reshape"):
      # Flatten the pooled output. The tensor being reshaped must be the same
      # one the shape was read from, otherwise the element counts disagree.
      shape = pool2.get_shape().as_list()
      reshape = tf.reshape(pool2, [shape[0], shape[1] * shape[2] * shape[3]])
    with tf.name_scope("Layer3"):
      with tf.name_scope("Logits3"):
        logits3 = tf.matmul(reshape, layer3_weights) + layer3_biases
      with tf.name_scope("Act3"):
        hidden3 = tf.nn.relu(logits3, name="Relu3")
        variable_summaries(hidden3)
    with tf.name_scope("Layer4"):
      logits4 = tf.matmul(hidden3, layer4_weights) + layer4_biases
    return logits4

  # Training computation.
  logits = model(tf_train_dataset)
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf_train_labels))

  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)

  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
  test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [None]:
# Train the graph that combines strided convolutions with max pooling; a
# negative earliest_stop turns early stopping off.
run_batch_graph(
    graph,
    save_layers=[
        layer1_weights, layer1_biases,
        layer2_weights, layer2_biases,
        layer3_weights, layer3_biases,
        layer4_weights, layer4_biases,
    ],
    num_steps=1001,
    summary_path='./summary/convolutions/both/',
    earliest_stop=-1,
    batch_size=batch_size,
    report_learning_rate=False)

---
Problem 2
---------

Try to get the best performance you can using a convolutional net. Look for example at the classic [LeNet5](http://yann.lecun.com/exdb/lenet/) architecture, adding Dropout, and/or adding learning rate decay. **Edit:** Also try inception learning.

---

## Problem 3
Get Tensorboard Images working and get inverse pooling working.