Deep Learning
=============

Assignment 3
------------

Previously in `2_fullyconnected.ipynb`, you trained a logistic regression and a neural network model.

The goal of this assignment is to explore regularization techniques.

In [30]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle

First reload the data we generated in `1_notmnist.ipynb`.

In [31]:
# Relative path to the pickle file holding the train/valid/test splits.
pickle_file = 'notMNIST.pickle'

# NOTE(security): unpickling can execute arbitrary code -- only load a file
# that you generated yourself.
with open(pickle_file, 'rb') as handle:
  splits = pickle.load(handle)

# Pull each split out of the dictionary stored in the pickle.
train_dataset, train_labels = splits['train_dataset'], splits['train_labels']
valid_dataset, valid_labels = splits['valid_dataset'], splits['valid_labels']
test_dataset, test_labels = splits['test_dataset'], splits['test_labels']
del splits  # drop the container reference so the gc can reclaim it

# Report the shape of every split as a sanity check.
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat into a shape that's more adapted to the models we're going to train:
- data as a flat matrix,
- labels as float 1-hot encodings.

In [32]:
image_size = 28  # each image is image_size x image_size pixels
num_labels = 10  # number of target classes

def reformat(dataset, labels):
  """Flatten the images and one-hot encode the labels.

  The function is idempotent: labels that are already an (N, num_labels)
  matrix are passed through unchanged, so re-running the notebook cell that
  overwrites its inputs with the reformatted arrays stays correct instead
  of producing an (N, 1, num_labels) array.

  Args:
    dataset: array that can be reshaped to (-1, image_size * image_size).
    labels: 1-D array of integer class ids, or an (N, num_labels) matrix
      that is already one-hot encoded.

  Returns:
    Tuple (dataset, labels) of float32 arrays with shapes
    (N, image_size * image_size) and (N, num_labels).
  """
  dataset = np.asarray(dataset).reshape((-1, image_size * image_size)).astype(np.float32)
  labels = np.asarray(labels)
  if labels.ndim == 2 and labels.shape[1] == num_labels:
    # Already encoded by an earlier call: only normalise the dtype.
    return dataset, labels.astype(np.float32)
  # Map 1 to [0.0, 1.0, 0.0 ...], 2 to [0.0, 0.0, 1.0 ...]
  labels = (np.arange(num_labels) == labels[:, None]).astype(np.float32)
  return dataset, labels
# Reformat every split; the results are bound back to the original names.
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)

# Print the new shapes so the flattening / one-hot encoding can be checked.
for split_name, split_data, split_labels in (
    ('Training set', train_dataset, train_labels),
    ('Validation set', valid_dataset, valid_labels),
    ('Test set', test_dataset, test_labels)):
  print(split_name, split_data.shape, split_labels.shape)

Training set (200000, 784) (200000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)


In [33]:
def accuracy(predictions, labels):
  """Return the percentage of rows classified correctly.

  A row counts as correct when the index of its largest entry in
  `predictions` equals the index of the largest entry in the same row of
  `labels`.

  Args:
    predictions: 2-D array of per-class scores, one row per example.
    labels: 2-D array with the same number of rows (e.g. one-hot targets).

  Returns:
    Accuracy as a value between 0 and 100.
  """
  predicted_classes = np.argmax(predictions, axis=1)
  true_classes = np.argmax(labels, axis=1)
  num_correct = np.sum(predicted_classes == true_classes)
  return 100.0 * num_correct / predictions.shape[0]

---
Problem 1
---------

Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compute the L2 loss for a tensor `t` using `nn.l2_loss(t)`. The right amount of regularization should improve your validation / test accuracy.

---

In [34]:
# Hyperparameters for the L2-regularised one-hidden-layer network.
hidden_size = 1024  # number of units in the hidden layer
beta = 0.005        # weight of the L2 penalty added to the loss
batch_size = 128    # examples per minibatch
num_steps = 3001    # number of minibatch updates to run

In [41]:
graph = tf.Graph()
with graph.as_default():
    num_pixels = image_size * image_size

    # Minibatches are fed at run time through placeholders; the validation
    # and test sets are embedded in the graph as constants.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, num_pixels))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Parameters: input -> hidden (W_1, b_1) and hidden -> output (W_2, b_2).
    W_1 = tf.Variable(tf.truncated_normal([num_pixels, hidden_size]))
    b_1 = tf.Variable(tf.zeros([hidden_size]))
    W_2 = tf.Variable(tf.truncated_normal([hidden_size, num_labels]))
    b_2 = tf.Variable(tf.zeros([num_labels]))

    def forward(inputs):
        """Return the logits of the one-hidden-layer ReLU network for `inputs`."""
        hidden = tf.nn.relu(tf.matmul(inputs, W_1) + b_1)
        return tf.matmul(hidden, W_2) + b_2

    # Training loss: mean cross-entropy plus the L2 penalty on both weight matrices.
    logits = forward(tf_train_dataset)
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf_train_labels))
    l2_penalty = beta * (tf.nn.l2_loss(W_1) + tf.nn.l2_loss(W_2))
    loss = cross_entropy + l2_penalty

    optimiser = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # Class probabilities for the current minibatch and the held-out sets.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(forward(tf_valid_dataset))
    test_prediction = tf.nn.softmax(forward(tf_test_dataset))

In [36]:
with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()

    num_train = train_labels.shape[0]
    for step in range(num_steps):
        # Slide a batch_size-wide window over the training set, wrapping
        # around so that every slice holds exactly batch_size rows.
        start = (step * batch_size) % (num_train - batch_size)
        stop = start + batch_size
        batch_data = train_dataset[start:stop, :]
        batch_labels = train_labels[start:stop, :]

        # One optimisation step; also fetch the loss and the predictions
        # for this minibatch.
        _, minibatch_loss, predictions = session.run(
            [optimiser, loss, train_prediction],
            feed_dict={tf_train_dataset: batch_data, tf_train_labels: batch_labels})

        # Report progress every 500 steps.
        if step % 500 == 0:
            print("Minibatch loss at step %d: %f" % (step, minibatch_loss))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Minibatch loss at step 0: 1973.348267
Minibatch accuracy: 7.8%
Validation accuracy: 29.4%
Minibatch loss at step 500: 127.998314
Minibatch accuracy: 82.0%
Validation accuracy: 80.8%
Minibatch loss at step 1000: 10.961746
Minibatch accuracy: 82.0%
Validation accuracy: 85.2%
Minibatch loss at step 1500: 1.383894
Minibatch accuracy: 86.7%
Validation accuracy: 84.2%
Minibatch loss at step 2000: 0.686888
Minibatch accuracy: 88.3%
Validation accuracy: 84.8%
Minibatch loss at step 2500: 0.545936
Minibatch accuracy: 88.3%
Validation accuracy: 84.9%
Minibatch loss at step 3000: 0.693761
Minibatch accuracy: 83.6%
Validation accuracy: 85.4%
Test accuracy: 92.0%


---
Problem 2
---------
Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?

---

In [None]:
# Hyperparameters for the overfitting experiment.
hidden_size = 1024  # number of units in the hidden layer
beta = 0.005        # weight of the L2 penalty added to the loss
batch_size = 128    # examples per minibatch
num_steps = 3001    # number of minibatch updates to run

In [45]:
graph = tf.Graph()
with graph.as_default():
    num_pixels = image_size * image_size

    # Minibatches are fed at run time through placeholders; the validation
    # and test sets are embedded in the graph as constants.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, num_pixels))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Parameters: input -> hidden (W_1, b_1) and hidden -> output (W_2, b_2).
    W_1 = tf.Variable(tf.truncated_normal([num_pixels, hidden_size]))
    b_1 = tf.Variable(tf.zeros([hidden_size]))
    W_2 = tf.Variable(tf.truncated_normal([hidden_size, num_labels]))
    b_2 = tf.Variable(tf.zeros([num_labels]))

    def forward(inputs):
        """Return the logits of the one-hidden-layer ReLU network for `inputs`."""
        hidden = tf.nn.relu(tf.matmul(inputs, W_1) + b_1)
        return tf.matmul(hidden, W_2) + b_2

    # Training loss: mean cross-entropy plus the L2 penalty on both weight matrices.
    logits = forward(tf_train_dataset)
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf_train_labels))
    l2_penalty = beta * (tf.nn.l2_loss(W_1) + tf.nn.l2_loss(W_2))
    loss = cross_entropy + l2_penalty

    optimiser = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # Class probabilities for the current minibatch and the held-out sets.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(forward(tf_valid_dataset))
    test_prediction = tf.nn.softmax(forward(tf_test_dataset))

In [46]:
# Restrict training to the first n_batches minibatches' worth of examples
# to provoke overfitting.
n_batches = 5
t_train_dataset = train_dataset[: batch_size * n_batches, :]
t_train_labels = train_labels[: batch_size * n_batches, :]

# Number of complete minibatches actually available in the restricted set.
n_full_batches = t_train_labels.shape[0] // batch_size

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()

    for step in range(num_steps):
        # Cycle through every complete minibatch of the restricted set.
        # The earlier `(step * batch_size) % (N - batch_size)` formula only
        # produced offsets 0, 128, 256 and 384 for N = 640, so the last of
        # the five minibatches was never used for training.
        offset = (step % n_full_batches) * batch_size

        batch_data = t_train_dataset[offset:(offset + batch_size), :]
        batch_labels = t_train_labels[offset:(offset + batch_size), :]

        # One optimisation step; also fetch the loss and the predictions
        # for this minibatch.
        feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
        _, l, predictions = session.run([optimiser, loss, train_prediction], feed_dict=feed_dict)
        if (step % 500 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Minibatch loss at step 0: 1967.082397
Minibatch accuracy: 10.0%
Validation accuracy: 20.5%
Minibatch loss at step 500: 140.748947
Minibatch accuracy: 100.0%
Validation accuracy: 46.4%
Minibatch loss at step 1000: 11.522995
Minibatch accuracy: 100.0%
Validation accuracy: 45.5%
Minibatch loss at step 1500: 1.002739
Minibatch accuracy: 100.0%
Validation accuracy: 53.3%
Minibatch loss at step 2000: 0.149725
Minibatch accuracy: 100.0%
Validation accuracy: 52.7%
Minibatch loss at step 2500: 0.076398
Minibatch accuracy: 100.0%
Validation accuracy: 53.0%
Minibatch loss at step 3000: 0.068079
Minibatch accuracy: 100.0%
Validation accuracy: 53.3%
Test accuracy: 59.2%


---
Problem 3
---------
Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides `nn.dropout()` for that, but you have to make sure it's only inserted during training.

What happens to our extreme overfitting case?

---

In [47]:
# Hyperparameters for the dropout experiment.
hidden_size = 1024  # number of units in the hidden layer
beta = 0.005        # weight of the L2 penalty added to the loss
dropout = 0.5       # value handed to tf.nn.dropout for the hidden layer
batch_size = 128    # examples per minibatch
num_steps = 3001    # number of minibatch updates to run

In [55]:
graph = tf.Graph()
with graph.as_default():
    # Minibatches are fed at run time through placeholders; the validation
    # and test sets are embedded in the graph as constants.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Parameters: input -> hidden (W_1, b_1) and hidden -> output (W_2, b_2).
    W_1 = tf.Variable(tf.truncated_normal([image_size * image_size, hidden_size]))
    b_1 = tf.Variable(tf.zeros([hidden_size]))

    W_2 = tf.Variable(tf.truncated_normal([hidden_size, num_labels]))
    b_2 = tf.Variable(tf.zeros([num_labels]))

    def hidden_layer(inputs):
        """Return the ReLU activations of the hidden layer for `inputs`."""
        return tf.nn.relu(tf.matmul(inputs, W_1) + b_1)

    def output_layer(hidden):
        """Return the logits computed from hidden-layer activations."""
        return tf.matmul(hidden, W_2) + b_2

    # Training path: dropout is applied to the hidden activations only here.
    h1 = hidden_layer(tf_train_dataset)
    logits = output_layer(tf.nn.dropout(h1, dropout))
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf_train_labels))
    loss += beta * (tf.nn.l2_loss(W_1) + tf.nn.l2_loss(W_2))

    optimiser = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # Evaluation path: no dropout and no extra rescaling. tf.nn.dropout
    # already scales the kept activations by 1 / keep_prob during training,
    # so multiplying by `dropout` again here would shrink the hidden
    # activations relative to what the output layer was trained on.
    train_prediction = tf.nn.softmax(output_layer(h1))
    valid_prediction = tf.nn.softmax(output_layer(hidden_layer(tf_valid_dataset)))
    test_prediction = tf.nn.softmax(output_layer(hidden_layer(tf_test_dataset)))

In [56]:
with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()

    num_train = train_labels.shape[0]
    for step in range(num_steps):
        # Slide a batch_size-wide window over the training set, wrapping
        # around so that every slice holds exactly batch_size rows.
        start = (step * batch_size) % (num_train - batch_size)
        stop = start + batch_size
        batch_data = train_dataset[start:stop, :]
        batch_labels = train_labels[start:stop, :]

        # One optimisation step; also fetch the loss and the predictions
        # for this minibatch.
        _, minibatch_loss, predictions = session.run(
            [optimiser, loss, train_prediction],
            feed_dict={tf_train_dataset: batch_data, tf_train_labels: batch_labels})

        # Report progress every 500 steps.
        if step % 500 == 0:
            print("Minibatch loss at step %d: %f" % (step, minibatch_loss))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Minibatch loss at step 0: 2035.303101
Minibatch accuracy: 11.7%
Validation accuracy: 26.6%
Minibatch loss at step 500: 129.852051
Minibatch accuracy: 84.4%
Validation accuracy: 78.8%
Minibatch loss at step 1000: 11.064855
Minibatch accuracy: 80.5%
Validation accuracy: 83.9%
Minibatch loss at step 1500: 1.457402
Minibatch accuracy: 86.7%
Validation accuracy: 84.3%
Minibatch loss at step 2000: 0.799505
Minibatch accuracy: 89.1%
Validation accuracy: 83.1%
Minibatch loss at step 2500: 0.668907
Minibatch accuracy: 83.6%
Validation accuracy: 84.5%
Minibatch loss at step 3000: 0.817785
Minibatch accuracy: 84.4%
Validation accuracy: 83.8%
Test accuracy: 90.6%


---
Problem 4
---------

Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is [97.1%](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html?showComment=1391023266211#c8758720086795711595).

One avenue you can explore is to add multiple layers.

Another one is to use learning rate decay:

    global_step = tf.Variable(0)  # count the number of steps taken.
    learning_rate = tf.train.exponential_decay(0.5, global_step, ...)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
 
 ---

> A couple of 4 layer MLPs with 1024-300-50 hidden neurons respectively. We divided the noisy set into 5/6 train 1/6 valid and kept the clean set for testing. We [got] 97.1% accuracy on the test set at 412 epoch with early stopping, linear decay of the learning rate, a hard constraint on the norm of the weights and tanh activation units. We get approximately 93 on valid and 98 on train. The train set is easy to overfit (you can get 100% accuracy on train if you continue training). One could probably do better if they pursue hyper-optimization further. We used Torch 7.

In [61]:
# Hyperparameters for the learning-rate-decay run.
hidden_size = 1024  # number of units in the hidden layer
beta = 0.005        # weight of the L2 penalty added to the loss
dropout = 0.5       # value handed to tf.nn.dropout for the hidden layer
batch_size = 128    # examples per minibatch
num_steps = 10000   # number of minibatch updates to run

graph = tf.Graph()
with graph.as_default():
    # Minibatches are fed at run time through placeholders; the validation
    # and test sets are embedded in the graph as constants.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Parameters: input -> hidden (W_1, b_1) and hidden -> output (W_2, b_2).
    W_1 = tf.Variable(tf.truncated_normal([image_size * image_size, hidden_size]))
    b_1 = tf.Variable(tf.zeros([hidden_size]))

    W_2 = tf.Variable(tf.truncated_normal([hidden_size, num_labels]))
    b_2 = tf.Variable(tf.zeros([num_labels]))

    def hidden_layer(inputs):
        """Return the ReLU activations of the hidden layer for `inputs`."""
        return tf.nn.relu(tf.matmul(inputs, W_1) + b_1)

    def output_layer(hidden):
        """Return the logits computed from hidden-layer activations."""
        return tf.matmul(hidden, W_2) + b_2

    # Training path: dropout is applied to the hidden activations only here.
    h1 = hidden_layer(tf_train_dataset)
    logits = output_layer(tf.nn.dropout(h1, dropout))
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf_train_labels))
    loss += beta * (tf.nn.l2_loss(W_1) + tf.nn.l2_loss(W_2))

    # Learning rate starts at 0.5 and decays by a factor of 0.9 per 1000
    # steps; minimize() increments global_step on every update.
    global_step = tf.Variable(0, trainable=False)
    learning_rate = tf.train.exponential_decay(0.5, global_step, 1000, 0.9)
    optimiser = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)

    # Evaluation path: no dropout and no extra rescaling. tf.nn.dropout
    # already scales the kept activations by 1 / keep_prob during training,
    # so multiplying by `dropout` again here would shrink the hidden
    # activations relative to what the output layer was trained on.
    train_prediction = tf.nn.softmax(output_layer(h1))
    valid_prediction = tf.nn.softmax(output_layer(hidden_layer(tf_valid_dataset)))
    test_prediction = tf.nn.softmax(output_layer(hidden_layer(tf_test_dataset)))
    
with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()

    num_train = train_labels.shape[0]
    for step in range(num_steps):
        # Slide a batch_size-wide window over the training set, wrapping
        # around so that every slice holds exactly batch_size rows.
        start = (step * batch_size) % (num_train - batch_size)
        stop = start + batch_size
        batch_data = train_dataset[start:stop, :]
        batch_labels = train_labels[start:stop, :]

        # One optimisation step; also fetch the loss and the predictions
        # for this minibatch.
        _, minibatch_loss, predictions = session.run(
            [optimiser, loss, train_prediction],
            feed_dict={tf_train_dataset: batch_data, tf_train_labels: batch_labels})

        # Report progress every 500 steps.
        if step % 500 == 0:
            print("Minibatch loss at step %d: %f" % (step, minibatch_loss))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Minibatch loss at step 0: 2064.397705
Minibatch accuracy: 14.8%
Validation accuracy: 44.1%
Minibatch loss at step 500: 137.985764
Minibatch accuracy: 83.6%
Validation accuracy: 80.2%
Minibatch loss at step 1000: 14.068832
Minibatch accuracy: 81.2%
Validation accuracy: 84.4%
Minibatch loss at step 1500: 2.102200
Minibatch accuracy: 82.0%
Validation accuracy: 83.9%
Minibatch loss at step 2000: 0.866945
Minibatch accuracy: 86.7%
Validation accuracy: 84.2%
Minibatch loss at step 2500: 0.647678
Minibatch accuracy: 85.2%
Validation accuracy: 84.6%
Minibatch loss at step 3000: 0.830914
Minibatch accuracy: 83.6%
Validation accuracy: 84.1%
Minibatch loss at step 3500: 0.598386
Minibatch accuracy: 86.7%
Validation accuracy: 84.3%
Minibatch loss at step 4000: 0.710131
Minibatch accuracy: 86.7%
Validation accuracy: 83.7%
Minibatch loss at step 4500: 0.652547
Minibatch accuracy: 85.9%
Validation accuracy: 84.6%
Minibatch loss at step 5000: 0.579147
Minibatch accuracy: 89.1%
Validation accuracy: 84.