Deep Learning
=============

Assignment 4
------------

Previously in `2_fullyconnected.ipynb` and `3_regularization.ipynb`, we trained fully connected networks to classify [notMNIST](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html) characters.

The goal of this assignment is to make the neural network convolutional.

In [None]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
import sys
from six.moves import cPickle as pickle
from six.moves import range

# http://stackoverflow.com/questions/29772158/make-ipython-notebook-print-in-real-time
# Keep a handle on the current stdout object before sys.stdout is rebound to a
# flushing wrapper at the end of this cell.
oldsysstdout = sys.stdout
class flushfile():
    """File-like wrapper that flushes the wrapped stream after every write.

    Attributes that the wrapper does not define itself are looked up on the
    wrapped object, so the wrapper can stand in for it (for example as
    ``sys.stdout``).

    Args:
        f: the object to wrap; it must provide ``write`` and ``flush``.
    """

    def __init__(self, f):
        self.f = f

    def __getattr__(self, name):
        # __getattr__ only runs when normal lookup on the wrapper has failed.
        # If 'f' itself is missing (instance created without __init__), stop
        # here instead of recursing through self.f forever.
        if name == 'f':
            raise AttributeError(name)
        # getattr() honours the wrapped object's own __getattr__, so wrappers
        # can be nested (e.g. when the notebook cell is executed twice).
        return getattr(self.f, name)

    def write(self, x):
        """Write ``x`` to the wrapped stream and flush it immediately."""
        self.f.write(x)
        self.f.flush()

    def flush(self):
        """Flush the wrapped stream."""
        self.f.flush()
# From here on, everything written to sys.stdout goes through the flushing wrapper.
sys.stdout = flushfile(sys.stdout)

In [None]:
pickle_file = 'notMNIST.pickle'

# NOTE: unpickling can execute arbitrary code; only load a pickle file that
# you produced yourself.
with open(pickle_file, 'rb') as f:
    save = pickle.load(f)

# Pull the three splits out of the loaded dict.
train_dataset, train_labels = save['train_dataset'], save['train_labels']
valid_dataset, valid_labels = save['valid_dataset'], save['valid_labels']
test_dataset, test_labels = save['test_dataset'], save['test_labels']
del save  # drop the container so the gc can reclaim it

# Report the shape of every split as a quick sanity check.
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Reformat into a TensorFlow-friendly shape:
- convolutions need the image data formatted as a cube (width by height by #channels)
- labels as float 1-hot encodings.

In [None]:
# Shape constants for the images and labels. The reformatted arrays are stored
# in this same dict later in the cell.
datasets = dict(image_size=28, label_count=10, channel_count=1)
# Number of pixels in one (square) image.
datasets["total_image_size"] = datasets["image_size"] ** 2

def reformat(dataset, labels, name):
    """Reshape images into 4-D float32 cubes and one-hot encode the labels.

    Args:
        dataset: array of images; reshaped to
            (-1, image_size, image_size, channel_count) using the sizes stored
            in the module-level ``datasets`` dict.
        labels: 1-D array of integer class ids.
        name: human-readable split name used in the printed summary.

    Returns:
        Tuple ``(dataset, labels)`` of float32 arrays; ``labels`` has one
        column per class (``datasets["label_count"]`` columns).
    """
    side = datasets["image_size"]
    cube_shape = (-1, side, side, datasets["channel_count"])
    dataset = dataset.reshape(cube_shape).astype(np.float32)
    # Compare every label against the row of class ids:
    # 0 -> [1.0, 0.0, 0.0 ...], 2 -> [0.0, 0.0, 1.0 ...]
    class_ids = np.arange(datasets["label_count"])
    labels = (labels[:, None] == class_ids).astype(np.float32)
    print(name + " set", dataset.shape, labels.shape)
    return dataset, labels
# Reformat each split and store the result under "<split>" / "<split>_labels".
for _split, _images, _targets, _title in (
    ("train", train_dataset, train_labels, "Training"),
    ("valid", valid_dataset, valid_labels, "Validation"),
    ("test", test_dataset, test_labels, "Test"),
):
    datasets[_split], datasets[_split + "_labels"] = reformat(_images, _targets, _title)
del _split, _images, _targets, _title  # keep the loop temporaries out of the namespace

print(datasets.keys())

In [None]:
def accuracy(predictions, labels):
    """Return the percentage of rows whose arg-max class matches the label.

    Args:
        predictions: 2-D array of per-class scores, one row per example.
        labels: 2-D array of the same layout (e.g. one-hot encodings).

    Returns:
        Accuracy as a percentage in the range 0 to 100.
    """
    predicted_classes = np.argmax(predictions, 1)
    expected_classes = np.argmax(labels, 1)
    hit_count = np.sum(predicted_classes == expected_classes)
    return 100.0 * hit_count / predictions.shape[0]

Let's build a small network with two convolutional layers, followed by one fully connected layer. Convolutional networks are more expensive computationally, so we'll limit its depth and number of fully connected nodes.

In [None]:
def run_graph(graph_info, data, step_count, report_every=50):
    """Train a graph with minibatch gradient descent and print progress.

    Args:
        graph_info: dict describing the graph. The keys read here are
            "graph", "batch_size", "train", "labels", "optimizer", "loss",
            "predictions", "valid" and "test".
        data: dict providing the arrays "train", "train_labels",
            "valid_labels" and "test_labels".
        step_count: index of the last training step; steps 0..step_count
            (inclusive) are executed.
        report_every: print minibatch loss/accuracy and validation accuracy
            whenever the step index is a multiple of this value.

    Returns:
        None. Results are printed; the test accuracy is printed once after
        the last step.
    """
    with tf.Session(graph=graph_info["graph"]) as session:
        tf.initialize_all_variables().run()
        print("Initialized")
        batch_size = graph_info["batch_size"]
        # The nodes fetched on every step do not change, so build the list once.
        targets = [graph_info["optimizer"], graph_info["loss"], graph_info["predictions"]]
        # `range` (not `xrange`) so the loop also runs on Python 3.
        for step in range(step_count + 1):
            # Pick an offset within the training data, which has been randomized.
            # Note: we could use better randomization across epochs.
            offset = (step * batch_size) % (data["train_labels"].shape[0] - batch_size)
            # Generate a minibatch.
            batch_data = data["train"][offset:(offset + batch_size), :, :, :]
            batch_labels = data["train_labels"][offset:(offset + batch_size), :]
            # Prepare a dictionary telling the session where to feed the minibatch.
            # The key of the dictionary is the placeholder node of the graph to be fed,
            # and the value is the numpy array to feed to it.
            feed_dict = {graph_info["train"]: batch_data, graph_info["labels"]: batch_labels}
            _, l, predictions = session.run(targets, feed_dict=feed_dict)
            if step % report_every == 0:
                print("Minibatch loss at step", step, ":", l)
                print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
                print("Validation accuracy: %.1f%%" % accuracy(graph_info["valid"].eval(), data["valid_labels"]))
        print("Test accuracy: %.1f%%" % accuracy(graph_info["test"].eval(), data["test_labels"]))

In [None]:
def convnet_two_layer(batch_size, patch_size, depth, hidden_size, data):
    """Build a graph with two stride-2 convolutions and one hidden dense layer.

    Layers: conv (stride 2, 'SAME') + ReLU, conv (stride 2, 'SAME') + ReLU,
    flatten, fully connected + ReLU, fully connected logits. The loss is the
    mean softmax cross-entropy, minimised by gradient descent with a fixed
    learning rate of 0.05.

    Args:
        batch_size: number of examples per training minibatch (fixes the
            placeholder shapes).
        patch_size: side length of the square convolution kernels.
        depth: number of output channels of both convolutions.
        hidden_size: number of units in the hidden fully connected layer.
        data: dict providing "image_size", "label_count", "channel_count" and
            the arrays "valid" and "test".

    Returns:
        dict with the keys "graph", "batch_size", "train", "labels", "loss",
        "optimizer", "predictions", "valid" and "test".
    """
    image_size = data["image_size"]
    label_count = data["label_count"]
    channel_count = data["channel_count"]
    # A stride-2 'SAME' convolution produces ceil(size / 2) rows and columns.
    # Apply that twice so the dense layer matches the real feature map for any
    # image size, not only for multiples of 4.
    reduced_size = (image_size + 1) // 2
    reduced_size = (reduced_size + 1) // 2
    flat_size = reduced_size * reduced_size * depth
    graph = tf.Graph()
    with graph.as_default():
        # Input data.
        train = tf.placeholder(tf.float32, shape=(batch_size, image_size, image_size, channel_count))
        labels = tf.placeholder(tf.float32, shape=(batch_size, label_count))
        valid = tf.constant(data["valid"])
        test = tf.constant(data["test"])

        # Variables.
        layer1_weights = tf.Variable(tf.truncated_normal([patch_size, patch_size, channel_count, depth], stddev=0.1))
        layer1_biases = tf.Variable(tf.zeros([depth]))
        layer2_weights = tf.Variable(tf.truncated_normal([patch_size, patch_size, depth, depth], stddev=0.1))
        layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))
        layer3_weights = tf.Variable(tf.truncated_normal([flat_size, hidden_size], stddev=0.1))
        layer3_biases = tf.Variable(tf.constant(1.0, shape=[hidden_size]))
        layer4_weights = tf.Variable(tf.truncated_normal([hidden_size, label_count], stddev=0.1))
        layer4_biases = tf.Variable(tf.constant(1.0, shape=[label_count]))

        # Model.
        def model(images):
            """Return the logits for a batch of images (shared weights)."""
            conv = tf.nn.conv2d(images, layer1_weights, [1, 2, 2, 1], padding='SAME')
            hidden = tf.nn.relu(conv + layer1_biases)
            conv = tf.nn.conv2d(hidden, layer2_weights, [1, 2, 2, 1], padding='SAME')
            hidden = tf.nn.relu(conv + layer2_biases)
            # Flatten every example's feature map into one row for the dense layer.
            shape = hidden.get_shape().as_list()
            reshape = tf.reshape(hidden, [shape[0], shape[1] * shape[2] * shape[3]])
            hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)
            return tf.matmul(hidden, layer4_weights) + layer4_biases

        # Training computation.
        logits = model(train)
        # Keyword arguments: newer TensorFlow releases reject positional use and
        # order the parameters differently.
        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels))

        info = {
            "graph": graph,
            "batch_size": batch_size,
            "train": train,
            "labels": labels,
            "loss": loss,
            "optimizer": tf.train.GradientDescentOptimizer(0.05).minimize(loss),

            # Predictions for the training, validation, and test data.
            "predictions": tf.nn.softmax(logits),
            "valid": tf.nn.softmax(model(valid)),
            "test": tf.nn.softmax(model(test))
        }
    return info

In [None]:
# Build the two-layer convolutional graph and train it with step_count=1000.
graph_2conv = convnet_two_layer(
    data=datasets,
    batch_size=16,
    patch_size=5,
    depth=16,
    hidden_size=64,
)

run_graph(graph_2conv, data=datasets, step_count=1000)

---
Problem 1
---------

The convolutional model above uses convolutions with stride 2 to reduce the dimensionality. Replace the strides by a max pooling operation (`nn.max_pool()`) of stride 2 and kernel size 2.

---

In [None]:
def poolnet_two_layer(batch_size, patch_size, depth, hidden_size, data):
    """Build the two-layer convolutional graph with max pooling instead of strides.

    Each convolution runs with stride 1 ('SAME' padding) and is followed by a
    ReLU and a 2x2 max pool with stride 2, so the pooling does the
    down-sampling that the strided convolutions would otherwise do. The
    pooled features are flattened and fed through one hidden fully connected
    layer and a logits layer. The loss is the mean softmax cross-entropy,
    minimised by gradient descent with a fixed learning rate of 0.05.

    Args:
        batch_size: number of examples per training minibatch (fixes the
            placeholder shapes).
        patch_size: side length of the square convolution kernels.
        depth: number of output channels of both convolutions.
        hidden_size: number of units in the hidden fully connected layer.
        data: dict providing "image_size", "label_count", "channel_count" and
            the arrays "valid" and "test".

    Returns:
        dict with the keys "graph", "batch_size", "train", "labels", "loss",
        "optimizer", "predictions", "valid" and "test".
    """
    image_size = data["image_size"]
    label_count = data["label_count"]
    channel_count = data["channel_count"]
    # A stride-2 'SAME' pool produces ceil(size / 2) rows and columns; two
    # pools are applied, so halve (rounding up) twice.
    reduced_size = (image_size + 1) // 2
    reduced_size = (reduced_size + 1) // 2
    flat_size = reduced_size * reduced_size * depth
    graph = tf.Graph()
    with graph.as_default():
        # Input data.
        train = tf.placeholder(tf.float32, shape=(batch_size, image_size, image_size, channel_count))
        labels = tf.placeholder(tf.float32, shape=(batch_size, label_count))
        valid = tf.constant(data["valid"])
        test = tf.constant(data["test"])

        # Variables.
        layer1_weights = tf.Variable(tf.truncated_normal([patch_size, patch_size, channel_count, depth], stddev=0.1))
        layer1_biases = tf.Variable(tf.zeros([depth]))
        layer2_weights = tf.Variable(tf.truncated_normal([patch_size, patch_size, depth, depth], stddev=0.1))
        layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))
        layer3_weights = tf.Variable(tf.truncated_normal([flat_size, hidden_size], stddev=0.1))
        layer3_biases = tf.Variable(tf.constant(1.0, shape=[hidden_size]))
        layer4_weights = tf.Variable(tf.truncated_normal([hidden_size, label_count], stddev=0.1))
        layer4_biases = tf.Variable(tf.constant(1.0, shape=[label_count]))

        # Model.
        def model(images):
            """Return the logits for a batch of images (shared weights)."""
            # Stride-1 convolution keeps the spatial size; the pool halves it.
            conv = tf.nn.conv2d(images, layer1_weights, [1, 1, 1, 1], padding='SAME')
            hidden = tf.nn.relu(conv + layer1_biases)
            pool = tf.nn.max_pool(hidden, [1, 2, 2, 1], [1, 2, 2, 1], padding='SAME')
            conv = tf.nn.conv2d(pool, layer2_weights, [1, 1, 1, 1], padding='SAME')
            hidden = tf.nn.relu(conv + layer2_biases)
            pool = tf.nn.max_pool(hidden, [1, 2, 2, 1], [1, 2, 2, 1], padding='SAME')
            # Flatten every example's feature map into one row for the dense layer.
            shape = pool.get_shape().as_list()
            reshape = tf.reshape(pool, [shape[0], shape[1] * shape[2] * shape[3]])
            hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)
            return tf.matmul(hidden, layer4_weights) + layer4_biases

        # Training computation.
        logits = model(train)
        # Keyword arguments: newer TensorFlow releases reject positional use and
        # order the parameters differently.
        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels))

        info = {
            "graph": graph,
            "batch_size": batch_size,
            "train": train,
            "labels": labels,
            "loss": loss,
            "optimizer": tf.train.GradientDescentOptimizer(0.05).minimize(loss),

            # Predictions for the training, validation, and test data.
            "predictions": tf.nn.softmax(logits),
            "valid": tf.nn.softmax(model(valid)),
            "test": tf.nn.softmax(model(test))
        }
    return info

In [None]:
# Build the max-pooling graph for Problem 1 and train it with step_count=1000.
graph_2pool = poolnet_two_layer(
    data=datasets,
    batch_size=16,
    patch_size=5,
    depth=16,
    hidden_size=64,
)

run_graph(graph_2pool, data=datasets, step_count=1000)

---
Problem 2
---------

Try to get the best performance you can using a convolutional net. Look for example at the classic [LeNet5](http://yann.lecun.com/exdb/lenet/) architecture, adding Dropout, and/or adding learning rate decay.

---

In [None]:
def convnet_optimize(batch_size, patch_sizes, depth, hidden_sizes, data, rate_alpha=0.05, decay_rate=1.0, decay_steps=1000):
    """Build a two-convolution graph with per-layer kernel sizes and learning rate decay.

    Layers: conv (stride 2, 'SAME') + ReLU, conv (stride 2, 'SAME') + ReLU,
    flatten, fully connected + ReLU, fully connected logits. The loss is the
    mean softmax cross-entropy, minimised by gradient descent whose learning
    rate follows ``tf.train.exponential_decay``.

    Args:
        batch_size: number of examples per training minibatch (fixes the
            placeholder shapes).
        patch_sizes: sequence of kernel side lengths; element 0 is used for
            the first convolution and element 1 for the second.
        depth: number of output channels of both convolutions.
        hidden_sizes: sequence of hidden layer sizes; only element 0 is used.
        data: dict providing "image_size", "label_count", "channel_count" and
            the arrays "valid" and "test".
        rate_alpha: initial learning rate.
        decay_rate: decay factor passed to ``exponential_decay``.
        decay_steps: decay period (in optimizer steps) passed to
            ``exponential_decay``.

    Returns:
        dict with the keys "graph", "batch_size", "train", "labels", "loss",
        "optimizer", "predictions", "valid" and "test".
    """
    image_size = data["image_size"]
    label_count = data["label_count"]
    channel_count = data["channel_count"]
    # A stride-2 'SAME' convolution produces ceil(size / 2) rows and columns.
    # Apply that twice so the dense layer matches the real feature map for any
    # image size, not only for multiples of 4.
    reduced_size = (image_size + 1) // 2
    reduced_size = (reduced_size + 1) // 2
    flat_size = reduced_size * reduced_size * depth
    graph = tf.Graph()
    with graph.as_default():
        # Input data.
        train = tf.placeholder(tf.float32, shape=(batch_size, image_size, image_size, channel_count))
        labels = tf.placeholder(tf.float32, shape=(batch_size, label_count))
        valid = tf.constant(data["valid"])
        test = tf.constant(data["test"])

        # Variables.
        layer1_weights = tf.Variable(tf.truncated_normal([patch_sizes[0], patch_sizes[0], channel_count, depth], stddev=0.1))
        layer1_biases = tf.Variable(tf.zeros([depth]))
        layer2_weights = tf.Variable(tf.truncated_normal([patch_sizes[1], patch_sizes[1], depth, depth], stddev=0.1))
        layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))
        layer3_weights = tf.Variable(tf.truncated_normal([flat_size, hidden_sizes[0]], stddev=0.1))
        layer3_biases = tf.Variable(tf.constant(1.0, shape=[hidden_sizes[0]]))
        layer4_weights = tf.Variable(tf.truncated_normal([hidden_sizes[0], label_count], stddev=0.1))
        layer4_biases = tf.Variable(tf.constant(1.0, shape=[label_count]))

        # Model.
        def model(images):
            """Return the logits for a batch of images (shared weights)."""
            conv = tf.nn.conv2d(images, layer1_weights, [1, 2, 2, 1], padding='SAME')
            hidden = tf.nn.relu(conv + layer1_biases)
            conv = tf.nn.conv2d(hidden, layer2_weights, [1, 2, 2, 1], padding='SAME')
            hidden = tf.nn.relu(conv + layer2_biases)
            # Flatten every example's feature map into one row for the dense layer.
            shape = hidden.get_shape().as_list()
            reshape = tf.reshape(hidden, [shape[0], shape[1] * shape[2] * shape[3]])
            hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)
            return tf.matmul(hidden, layer4_weights) + layer4_biases

        # Training computation.
        logits = model(train)
        # Keyword arguments: newer TensorFlow releases reject positional use and
        # order the parameters differently.
        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels))

        # Step counter incremented by the optimizer; it is bookkeeping, not a
        # model parameter, so keep it out of the trainable variables.
        global_step = tf.Variable(0, trainable=False)
        learning_rate = tf.train.exponential_decay(rate_alpha, global_step, decay_steps, decay_rate)

        info = {
            "graph": graph,
            "batch_size": batch_size,
            "train": train,
            "labels": labels,
            "loss": loss,
            "optimizer": tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step),

            # Predictions for the training, validation, and test data.
            "predictions": tf.nn.softmax(logits),
            "valid": tf.nn.softmax(model(valid)),
            "test": tf.nn.softmax(model(test))
        }
    return info

In [None]:
# First tuning run: larger second kernel, default learning-rate settings,
# step_count=5000 with a report every 500 steps.
graph_connive = convnet_optimize(
    data=datasets,
    batch_size=16,
    patch_sizes=[5, 10],
    depth=16,
    hidden_sizes=[64],
)

run_graph(graph_connive, data=datasets, step_count=5000, report_every=500)

In [None]:
# Second tuning run: more channels, a wider hidden layer, and a learning rate
# that decays with a period equal to the length of the run.
optimal_steps = 5000

graph_connive = convnet_optimize(
    data=datasets,
    batch_size=16,
    patch_sizes=[5, 10],
    depth=20,
    hidden_sizes=[128],
    rate_alpha=0.05,
    decay_rate=0.90,
    decay_steps=optimal_steps,
)

run_graph(graph_connive, data=datasets, step_count=optimal_steps, report_every=500)