# Convolution for handwritten digits

In this exercise you investigate the implementation of a convolutional neural network for handwritten digit recognition. Most of the code is already written. Your task is to understand the code, write code to train the model, and add extra layers to the network.

In [1]:
%matplotlib inline
from IPython.display import Image
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
# Plot styling: large "poster" fonts on a white background without tick marks.
sns.set_context("poster")
sns.set_style('white', {'axes.linewidth': 0, 'xtick.major.size': 0.0,
 'xtick.minor.size': 0.0, 'ytick.major.size': 0.0,
 'ytick.minor.size': 0.0,
 'figure.figsize': (10, 6)})
plt.rcParams['figure.figsize'] = (10, 6)


# Load MNIST into data_dir; labels are loaded one-hot encoded.
data_dir = '/tmp/mnist'
mnist = input_data.read_data_sets(data_dir, one_hot=True)
# Recover the integer digit label from each one-hot row.
train_labels = np.argmax(mnist.train.labels, axis=1)
# -1 lets NumPy infer the number of training images instead of hard-coding it,
# so this also works if the training split has a different size.
train_images = mnist.train.images.reshape(-1, 28, 28)

def view_heatmap(image, label=""):
    """Plot `image` as a grayscale heatmap with both axes hidden.

    Args:
        image: Image data handed to `plt.imshow` (for example one entry of
            `train_images`).
        label: Optional label shown in the title as 'Label is <label>'.
            No title is drawn when it is None or the empty string.
    """
    # Compare the string form rather than relying on truthiness: the digit
    # label 0 is falsy but is a perfectly valid label and must get a title.
    if label is not None and str(label) != "":
        plt.title('Label is {label}'.format(label=label))
    plt.imshow(image, cmap='gray')
    cur_axes = plt.gca()
    cur_axes.get_xaxis().set_visible(False)
    cur_axes.get_yaxis().set_visible(False)

Extracting /tmp/mnist/train-images-idx3-ubyte.gz
Extracting /tmp/mnist/train-labels-idx1-ubyte.gz
Extracting /tmp/mnist/t10k-images-idx3-ubyte.gz
Extracting /tmp/mnist/t10k-labels-idx1-ubyte.gz


In [2]:
# Each training row holds all pixels of one image; assuming the images are
# square, the side length is the square root of the row length.
pixels_per_image = mnist.train.images.shape[1]
image_size = int(np.sqrt(pixels_per_image))
image_channels = 1
print("The input images have {0} x {0} pixels".format(image_size))
print("The input images have {0} color channel".format(image_channels))

The input images have 28 x 28 pixels
The input images have 1 color channel


### Reshaping input

In [3]:
# Start from an empty default graph so that re-running the notebook from
# this cell rebuilds the network from scratch.
tf.reset_default_graph()

# Reshaping of input (see Problem 1 for the meaning of each dimension)
x = tf.placeholder(dtype=tf.float32, shape=[None, 784])
x_reshape = tf.reshape(x, shape=[-1, 28, 28, 1])
print(x_reshape.shape)

(?, 28, 28, 1)


#### Problem 1: Reshape input
1. Explain the shape of `x_reshape`. What is each dimension used for?
2. Why do we need to reshape the input?

### Convolutions
`tf.nn.conv2d(input, filter, strides, padding)`

Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
and a filter / kernel tensor of shape
`[filter_height, filter_width, in_channels, out_channels]`, this op computes a 2-D convolution of the input with the filters.


In [94]:
# Input
layer_input = x_reshape
in_height = image_size
in_width = image_size
in_channels = image_channels

# Filter dimensions
filter_height = 5
filter_width = 5
out_channels = 32


# Convolution parameters
# With padding = 'SAME' the padding is chosen by tf.nn.conv2d itself (see
# Problem 2). `zero_padding` is never passed to the op; it only enters the
# textbook output-size formula below.
stride = 1
zero_padding = 0

# Derived quantities
weight_shape = [filter_height, filter_width, in_channels, out_channels]
# Output size for an explicit padding of `zero_padding` pixels per side.
# Integer division keeps the result a whole number of pixels.
out_height = (in_height - filter_height + 2*zero_padding)//stride + 1
out_width = (in_width - filter_width + 2*zero_padding)//stride + 1


with tf.variable_scope('conv_1', reuse=tf.AUTO_REUSE):
    # He-style initialization: zero-mean normal with stddev sqrt(2 / fan_in).
    # The value must be passed as `stddev`; the first positional argument of
    # tf.random_normal_initializer is the mean.
    fan_in = filter_height * filter_width * in_channels
    random_init = tf.random_normal_initializer(stddev=np.sqrt(2.0/fan_in), dtype=tf.float32)
    zero_init = tf.zeros_initializer(dtype=tf.float32)
    # Create filter weights and one bias per output channel (biases start at zero)
    W = tf.get_variable('W', shape=weight_shape, initializer=random_init)
    b = tf.get_variable('b', shape=[out_channels], initializer=zero_init)
    convolution = tf.nn.conv2d(input=layer_input, filter=W, strides=[1, stride, stride, 1], padding='SAME')
    conf1_output = tf.nn.relu(tf.nn.bias_add(convolution, b))

print("Output layer shape: ", conf1_output.shape)

Output layer shape:  (?, 28, 28, 32)


#### Problem 2: Strides and padding
The padding parameter of `tf.nn.conv2d` is set to 'SAME'. With this setting, `tf.nn.conv2d` uses a padding $p$ that depends on the stride $s$.
1. Write down the output dimensions of the convolutional layer for stride 1, 2, 3, and 4.
1. What is the value of $p$ for $s=1, 2, 3, 4$? Explain your reasoning.
1. Figure out a formula for the padding number $p$ used by `tf.nn.conv2d` if the stride is $s$.

### Max pooling

Change the stride of the convolutional layer back to 1, and re-run that cell. The code cell below should then output

    Output layer shape:  (?, 14, 14, 32)

In [95]:
with tf.variable_scope('pool_1', reuse=tf.AUTO_REUSE):
    k = 2
    # The pooling window and the stride share one spec: size 1 along the
    # batch and channel dimensions, k along height and width.
    pool_window = [1, k, k, 1]
    pool1_output = tf.nn.max_pool(conf1_output, ksize=pool_window, strides=pool_window, padding='SAME')

print("Output layer shape: ", pool1_output.shape)

Output layer shape:  (?, 14, 14, 32)


#### Problem 3: Max pooling
The max pooling operation makes use of padding too. Again we use the setting `padding='SAME'`.
1. What is max pooling used for?
2. Write down the dimensions of the output of the pooling layer for k=2, 3, and 4.
2. What padding numbers are being used? Explain your reasoning.

### Fully connected layer

Change $k$ in the max pooling layer back to 2, and re-run that cell.

In [31]:
# Before sending the output of the pooling layer into the fully connected layer,
# we have to flatten it.
fc_input = tf.reshape(pool1_output, [-1, 14 * 14 * 32])

with tf.variable_scope('fc_1', reuse=tf.AUTO_REUSE):
    weight_init = tf.random_uniform_initializer(minval=-1, maxval=1, dtype=tf.float32)
    bias_init = tf.zeros_initializer()
    W = tf.get_variable('W', shape=(14*14*32, 10), initializer=weight_init)
    # One bias per output class; (10,) is a real one-element tuple, unlike (10).
    b = tf.get_variable('b', shape=(10,), initializer=bias_init)
    # Affine map: the bias has to be added, otherwise `b` is created but
    # never used and never trained.
    fc_output = tf.matmul(fc_input, W) + b

### Activation function for classification

In [99]:
# The fully connected output is used directly as the class logits;
# softmax turns the logits into class probabilities.
y_logits = fc_output
y_proba = tf.nn.softmax(y_logits)

### Loss function and minimization

In [119]:
# Input of labels: one row of 10 values per example
y = tf.placeholder(dtype=tf.float32, shape=(None, 10), name='y')

# Cross entropy is computed per example from the logits, then averaged
# over the batch to give a scalar loss.
per_example_loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=y, logits=y_logits)
loss = tf.reduce_mean(per_example_loss)

# Plain gradient descent; running train_op performs one update step.
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
train_op = optimizer.minimize(loss)

### Variable initializer

In [120]:
# Op that initializes the graph's global variables; it only takes effect
# when it is run inside a session (e.g. before the first training step).
global_var_init = tf.global_variables_initializer()

### Training session


#### Problem 4: batch training
Create a new session and train the network in a loop over 100 batches. Generate training batches of 64 examples with
```
batch_x, batch_y = mnist.train.next_batch(64)
```
Print the training loss at each iteration.

In [1]:
# Problem 4: session for batch training

## Stacking convolutional layers

#### Problem 5: Stacking layers
1. After the first pooling layer, add a second convolutional layer with 64 filters of size 5 * 5 and stride 1
2. After the second convolutional layer, add a max pooling layer with k=2
3. Adapt the fully connected layer to work with the output of the second pooling layer

Your code should eventually print 

    Input shape: (?, 28, 28, 1)
    Conf_1 output shape:  (?, 28, 28, 32)
    Pool_1 output shape:  (?, 14, 14, 32)
    Conf_2 output shape:  (?, 14, 14, 64)
    Pool_2 output shape:  (?, 7, 7, 64)
    Fc_1 output shape:  (?, 10)
    

In [33]:
# Rebuild the network from an empty graph so the layers defined earlier in
# the notebook do not interfere with the stacked version.
tf.reset_default_graph()
# Reshaping of input
x = tf.placeholder(tf.float32, [None, 784])
x_reshape = tf.reshape(x, [-1, 28, 28, 1])
print("Input shape:", x_reshape.shape)


## Convolutional layer 1

# Input
conf1_input = x_reshape
in_height = image_size
in_width = image_size
in_channels = image_channels

# Filter dimensions
filter_height = 5
filter_width = 5
out_channels = 32

# Convolution parameters
# With padding = 'SAME' the padding is chosen by tf.nn.conv2d itself (see
# Problem 2); `zero_padding` is never passed to the op.
stride = 1
zero_padding = 0

# Derived quantities
weight_shape = [filter_height, filter_width, in_channels, out_channels]


with tf.variable_scope('conv_1', reuse=tf.AUTO_REUSE):
    # He-style initialization: zero-mean normal with stddev sqrt(2 / fan_in).
    # The value must be passed as `stddev`; the first positional argument of
    # tf.random_normal_initializer is the mean.
    fan_in = filter_height * filter_width * in_channels
    random_init = tf.random_normal_initializer(stddev=np.sqrt(2.0/fan_in), dtype=tf.float32)
    zero_init = tf.zeros_initializer(dtype=tf.float32)
    # Create filter weights and one bias per output channel (biases start at zero)
    W = tf.get_variable('W', shape=weight_shape, initializer=random_init)
    b = tf.get_variable('b', shape=[out_channels], initializer=zero_init)
    convolution = tf.nn.conv2d(input=conf1_input, filter=W, strides=[1, stride, stride, 1], padding='SAME')
    conf1_output = tf.nn.relu(tf.nn.bias_add(convolution, b))

print("Conf_1 output shape: ", conf1_output.shape)

## Max pooling layer 1
with tf.variable_scope('pool_1', reuse=tf.AUTO_REUSE):
    k = 2
    pool1_output = tf.nn.max_pool(conf1_output, ksize=[1, k, k, 1], strides=[1, k, k, 1], padding='SAME')
    print("Pool_1 output shape: ", pool1_output.shape)

Input shape: (?, 28, 28, 1)
Conf_1 output shape:  (?, 28, 28, 32)
Pool_1 output shape:  (?, 14, 14, 32)


In [34]:
## Problem 5.1 Add convolutional layer 2 here

In [35]:
## Problem 5.2 Add max pooling layer 2 here

In [36]:
## Problem 5.3 Adapt fully connected layer

# Before sending the output of the pooling layer into the fully connected layer,
# we have to flatten it.
fc_input = tf.reshape(pool1_output, [-1, 14 * 14 * 32])

with tf.variable_scope('fc_1', reuse=tf.AUTO_REUSE):
    weight_init = tf.random_uniform_initializer(minval=-1, maxval=1, dtype=tf.float32)
    bias_init = tf.zeros_initializer()
    W = tf.get_variable('W', shape=(14*14*32, 10), initializer=weight_init)
    # One bias per output class; (10,) is a real one-element tuple, unlike (10).
    b = tf.get_variable('b', shape=(10,), initializer=bias_init)
    # Affine map: the bias has to be added, otherwise `b` is created but
    # never used and never trained.
    fc1_output = tf.matmul(fc_input, W) + b
    print("Fc_1 output shape: ", fc1_output.shape)
    

Fc_1 output shape:  (?, 10)
