Deep Learning
=============

Assignment 4
------------

Previously in `2_fullyconnected.ipynb` and `3_regularization.ipynb`, we trained fully connected networks to classify [notMNIST](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html) characters.

The goal of this assignment is make the neural network convolutional.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from tensorflow.contrib import slim
from six.moves import cPickle as pickle
from six.moves import range

In [2]:
image_size = 28
num_labels = 10
channels = 1
batch_size = 16
class NotMNIST:
    def __init__(self):
        class Train:
            def __init__(self):
                self.images = []
                self.labels = []
                self.batch_counter = 0

            @property
            def num_examples(self):
                return len(self.images)

            def next_batch(self, num):
                if self.batch_counter + num >= len(self.labels):
                    batch_images = self.images[self.batch_counter:]
                    batch_labels = self.labels[self.batch_counter:]
                    left = num - len(batch_labels)
                    self.batch_counter = left
                else:
                    batch_images = self.images[self.batch_counter:self.batch_counter + num]
                    batch_labels = self.labels[self.batch_counter:self.batch_counter + num]
                    self.batch_counter += num
                return batch_images, batch_labels

        class Test:
            def __init__(self):
                self.images = []
                self.labels = []
        
        class Validation:
            def __init__(self):
                self.images = []
                self.labels = []

        self.train = Train()
        self.test = Test()
        self.validation = Validation()
            
        # self.load_data(self.reformat((-1, image_size * image_size)))
        self.load_data(self.reformat((-1, image_size, image_size, channels)))
        
    def reformat(self, shape):
        return lambda dataset, labels: (
                dataset.reshape(shape).astype(np.float32),
                (np.arange(num_labels) == labels[:, None]).astype(np.float32)
            )
        
    def load_data(self, reformat):

        pickle_file = '/data/notMNIST.pickle'

        with open(pickle_file, 'rb') as f:
            save = pickle.load(f)
            train_dataset = save['train_dataset']
            train_labels = save['train_labels']
            valid_dataset = save['valid_dataset']
            valid_labels = save['valid_labels']
            test_dataset = save['test_dataset']
            test_labels = save['test_labels']
            del save  # 删除内存文件，等待gc回收释放内存
        
        train_dataset, train_labels = reformat(train_dataset, train_labels)
        valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
        test_dataset, test_labels = reformat(test_dataset, test_labels)
        print('Training set', train_dataset.shape, train_labels.shape)
        print('Validation set', valid_dataset.shape, valid_labels.shape)
        print('Test set', test_dataset.shape, test_labels.shape)
        self.train.images = train_dataset
        self.train.labels = train_labels
        self.test.images = test_dataset
        self.test.labels = test_labels  

not_mnist = NotMNIST()

Training set (300000, 28, 28, 1) (300000, 10)
Validation set (15000, 28, 28, 1) (15000, 10)
Test set (15000, 28, 28, 1) (15000, 10)


In [3]:
# 训练参数
learning_rate = 0.01  # 梯度下降步长，寻找最优解的下降步长
training_epochs = 25  # 迭代轮数
batch_size = 50  # 批次训练数据集大小
display_step = 1
num_channels = 1

# tf图输入
x = tf.placeholder(tf.float32, [None, image_size, image_size, num_channels])  # mnist 数据集图片大小为28*28=784，placeholder为占位符
y = tf.placeholder(tf.float32, [None, num_labels])  # 识别A-J的数据，一共10个类别

# CNN参数
patch_size = 5
depth = 16
num_hidden = 64

# Variables.
w1 = tf.Variable(tf.truncated_normal([patch_size, patch_size, num_channels, 6], stddev=0.1))
b1 = tf.Variable(tf.zeros([6]))

w3 = tf.Variable(tf.truncated_normal([patch_size, patch_size, 6, 16], stddev=0.1))
b3 = tf.Variable(tf.zeros([16]))

w5 = tf.Variable(tf.truncated_normal([5 * 5 * 16, 120], stddev=0.1))
b5 = tf.Variable(tf.zeros([120]))

w6 = tf.Variable(tf.truncated_normal([120, 84], stddev=0.1))
b6 = tf.Variable(tf.zeros([84]))

w7 = tf.Variable(tf.truncated_normal([84, num_labels], stddev=0.1))
b7 = tf.Variable(tf.zeros([num_labels]))

# Model.
def model(data):
    conv = tf.nn.conv2d(data, w1, [1, 1, 1, 1], padding='SAME')
    hidden = tf.nn.relu(conv + b1)
    hidden = tf.nn.avg_pool(hidden, [1, 2, 2, 1], [1, 2, 2, 1], 'SAME')
    
    conv = tf.nn.conv2d(hidden, w3, [1, 1, 1, 1], padding='VALID')
    hidden = tf.nn.relu(conv + b3)
    hidden = tf.nn.avg_pool(hidden, [1, 2, 2, 1], [1, 2, 2, 1], 'SAME')
    
    shape = hidden.get_shape().as_list()
    reshape = tf.reshape(hidden, [-1, shape[1] * shape[2] * shape[3]])
    
    hidden = tf.nn.relu(tf.matmul(reshape, w5) + b5)
    hidden = tf.nn.relu(tf.matmul(hidden, w6) + b6)
    noise = tf.random_normal(shape=tf.shape(hidden), mean=0.0, stddev=0.1, dtype=tf.float32)
    hidden = noise + hidden
    return tf.matmul(hidden, w7) + b7

# 构造模型
pred = tf.nn.softmax(model(x))  # Softmax

# 损失函数：交叉墒
loss = tf.reduce_mean(-tf.reduce_sum(y * tf.log(pred), reduction_indices=1))
# 使用梯度下降算法寻找最优解
# API tf.train.GradientDescentOptimizer
# TODO 创建梯度下降优化方法
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
# 初始化所有变量
init = tf.global_variables_initializer()
saver = tf.train.Saver()

def accuracy(pred, y, test_data):
    # 测试模型
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    # 计算准确度
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    return accuracy.eval({x: test_data.images, y: test_data.labels})

# 启动tf图
with tf.Session() as sess:
    sess.run(init)  # 初始化所有变量

    # 迭代训练
    for epoch in range(training_epochs):
        avg_loss = 0.
        total_batch = int(not_mnist.train.num_examples / batch_size)  # 计算数据集总共有多少批次数据
        # 训练训练所有批次数据
        for i in range(total_batch):
            batch_xs, batch_ys = not_mnist.train.next_batch(batch_size)
            # 运行优化操作和损失函数计算操作，获取损失值
            _, c = sess.run([optimizer, loss], feed_dict={x: batch_xs,
                                                          y: batch_ys})
            # 计算平均损失
            avg_loss += c / total_batch
        # 打印显示
        if (epoch + 1) % display_step == 0:
            print("Epoch:", '%04d' % (epoch + 1), "loss:", "{:.9f}".format(avg_loss),
                  "accuracy:", accuracy(pred, y, not_mnist.test))

    print("Optimization Finished!")
    print("Accuracy:", accuracy(pred, y, not_mnist.test))
    saver.save(sess, 'my-model')

Epoch: 0001 loss: 0.714518250 accuracy: 0.9134
Epoch: 0002 loss: 0.460661827 accuracy: 0.929666
Epoch: 0003 loss: 0.409295347 accuracy: 0.935266
Epoch: 0004 loss: 0.379474094 accuracy: 0.940866
Epoch: 0005 loss: 0.359003248 accuracy: 0.943733
Epoch: 0006 loss: 0.344224773 accuracy: 0.9462
Epoch: 0007 loss: 0.332242314 accuracy: 0.947333
Epoch: 0008 loss: 0.322451069 accuracy: 0.9498
Epoch: 0009 loss: 0.314036303 accuracy: 0.9506
Epoch: 0010 loss: 0.306613015 accuracy: 0.9512
Epoch: 0011 loss: 0.300141698 accuracy: 0.9526
Epoch: 0012 loss: 0.294244232 accuracy: 0.953866
Epoch: 0013 loss: 0.289031032 accuracy: 0.9546
Epoch: 0014 loss: 0.284457519 accuracy: 0.954533
Epoch: 0015 loss: 0.279909228 accuracy: 0.955933
Epoch: 0016 loss: 0.275655867 accuracy: 0.9564
Epoch: 0017 loss: 0.271777932 accuracy: 0.956866
Epoch: 0018 loss: 0.268040385 accuracy: 0.957266
Epoch: 0019 loss: 0.264848874 accuracy: 0.957466
Epoch: 0020 loss: 0.261701117 accuracy: 0.9582
Epoch: 0021 loss: 0.258486331 accuracy

In [None]:
def weight_variable(shape, stddev=0.1):
    return tf.Variable(tf.truncated_normal(shape, stddev))


def bias_variable(shape, default_value=0.1):
    return tf.Variable(tf.constant(default_value, shape=shape))

# Variables.
w1 = tf.Variable(tf.truncated_normal([5, 5, 1, 6], stddev=0.1))
b1 = tf.Variable(tf.zeros([6]))

w3 = tf.Variable(tf.truncated_normal([5, 5, 6, 16], stddev=0.1))
b3 = tf.Variable(tf.zeros([16]))

w5 = tf.Variable(tf.truncated_normal([400, 120], stddev=0.1))
b5 = tf.Variable(tf.zeros([120]))

w6 = tf.Variable(tf.truncated_normal([120, 84], stddev=0.1))
b6 = tf.Variable(tf.zeros([84]))

w7 = tf.Variable(tf.truncated_normal([84, 10], stddev=0.1))
b7 = tf.Variable(tf.zeros([10]))


x = tf.placeholder(tf.float32, [None, image_size, image_size, channels], name='x-input')
y_ = tf.placeholder(tf.float32, [None, num_labels], name='y-input')

conv1 = tf.nn.relu(tf.nn.conv2d(x, w1, strides=[1, 1, 1, 1], padding='SAME') + b1)
hidden1 = tf.nn.avg_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

conv2 = tf.nn.relu(tf.nn.conv2d(hidden1, w3, strides=[1, 1, 1, 1], padding='VALID') + b3)
hidden2 = tf.nn.avg_pool(conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

shape = hidden2.get_shape().as_list()
conv_out = tf.reshape(hidden2, [-1, shape[1] * shape[2] * shape[3]])

full1 = tf.nn.relu(tf.matmul(conv_out, w5) + b5)
full2 = tf.nn.relu(tf.matmul(full1, w6) + b6)
y = tf.nn.relu(tf.matmul(full2, w7) + b7)

cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
optimizer = tf.train.AdamOptimizer(0.01).minimize(cross_entropy)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

def accuracy(pred, y, test_data):
    # 测试模型
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    # 计算准确度
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    return accuracy.eval({x: test_data.images, y: test_data.labels})

# 启动tf图
with tf.Session() as sess:
    sess.run(init)  # 初始化所有变量

    # 迭代训练
    total_batch = int(not_mnist.train.num_examples / batch_size)  # 计算数据集总共有多少批次数据
    # 训练训练所有批次数据
    for i in range(total_batch):
        batch_xs, batch_ys = not_mnist.train.next_batch(batch_size)
        # 运行优化操作和损失函数计算操作，获取损失值
        _, c = sess.run([optimizer, cross_entropy], feed_dict={x: batch_xs,
                                                      y_: batch_ys})
        
    # 打印显示
        if i % 500 == 0:
            print("Epoch:", '%04d' % (i + 1), 
                  "loss:", "{:.9f}".format(c),
                  "accuracy:", accuracy(y, y_, not_mnist.test))

    print("Optimization Finished!")
    print("Accuracy:", accuracy(y, y, not_mnist.test))
    saver.save(sess, '/data/my-model')

Epoch: 0001 loss: 2.296831846 accuracy: 0.2122
Epoch: 0501 loss: 1.399394631 accuracy: 0.544
Epoch: 1001 loss: 1.170395136 accuracy: 0.552867
Epoch: 1501 loss: 1.242914677 accuracy: 0.555733
Epoch: 2001 loss: 1.512181997 accuracy: 0.538133
Epoch: 2501 loss: 1.613656998 accuracy: 0.5594
Epoch: 3001 loss: 1.540232062 accuracy: 0.562933
Epoch: 3501 loss: 1.639102697 accuracy: 0.559533
Epoch: 4001 loss: 1.629338264 accuracy: 0.5608
Epoch: 4501 loss: 1.280507088 accuracy: 0.560067


In [None]:
def weight_variable(shape, stddev=0.1):
    return tf.Variable(tf.truncated_normal(shape, stddev))


def bias_variable(shape, default_value=0.1):
    return tf.Variable(tf.constant(default_value, shape=shape))
    


def variable_summaries(var):
    with tf.name_scope('summaries'):
        mean = tf.reduce_mean(var)
        tf.summary.scalar('mean', mean)
        with tf.name_scope('stddev'):
            stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
        tf.summary.scalar('stddev', stddev)
        tf.summary.scalar('max', tf.reduce_max(var))
        tf.summary.scalar('min', tf.reduce_min(var))
        tf.summary.histogram('histogram', var)       
        

def cnn_layer(input_tensor, 
              filter=[5, 5, 1, 16], 
              strides=[1, 1, 1, 1], 
              padding='SAME',
              act=tf.nn.relu,
              layer_name='conv'):
    with tf.name_scope(layer_name):
        with tf.name_scope('convolution_weights'):
            weights = weight_variable(filter)
            variable_summaries(weights)

        with tf.name_scope('convolution_biases'):
            biases = bias_variable(shape=[filter[-1]])
            variable_summaries(biases)

        with tf.name_scope('convolution_Wx_plus_b'):
            preactivate = tf.nn.conv2d(input_tensor, weights, strides, padding) + biases
            tf.summary.histogram('pre_activations', preactivate)

        activations = act(preactivate, name='activation')
        tf.summary.histogram('activations', activations)
    return activations

def pool_layer(input_tensor, 
               ksize=[1, 2, 2, 1], 
               strides=[1, 1, 1, 1], 
               padding='SAME',  
               pool_func=tf.nn.max_pool,
               layer_name='pool'):
    with tf.name_scope(layer_name):
        with tf.name_scope('pooling'):
            preactivate = pool_func(
                input_tensor,
                ksize=ksize,
                strides=strides,
                padding=padding
            )
            tf.summary.histogram('pre_activations', preactivate)
    return preactivate

def nn_layer(input_tensor, 
             input_dim, 
             output_dim, 
             layer_name='full', 
             act=tf.nn.relu):
    with tf.name_scope(layer_name):
        with tf.name_scope('weights'):
            weights = weight_variable([input_dim, output_dim])
            variable_summaries(weights)

        with tf.name_scope('biases'):
            biases = bias_variable([output_dim])
            variable_summaries(biases)

        with tf.name_scope('Wx_plus_b'):
            preactivate = tf.matmul(input_tensor, weights) + biases
            tf.summary.histogram('pre_activations', preactivate)

        activations = act(preactivate, name='activation')
        tf.summary.histogram('activations', activations)
    return activations


def define_input(image_size=28, number_labels=10, channels=1):
    with tf.name_scope('input'):
        x = tf.placeholder(tf.float32, [None, image_size, image_size, channels], name='x-input')
        y_ = tf.placeholder(tf.float32, [None, number_labels], name='y-input')

    with tf.name_scope('input_reshape'):
        image_shaped_input = tf.reshape(x, [-1, image_size, image_size, 1])
        tf.summary.image('input', image_shaped_input, number_labels)
    return x, y_


def main(learning_rate=0.01, max_steps=3001, batch_size=16):

    x, y_ = define_input(28, 10)
    
    conv1 = cnn_layer(x, filter=[5, 5, 1, 6], strides=[1, 1, 1, 1], padding='SAME', layer_name="conv1")
    hidden1 = pool_layer(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],padding='SAME',layer_name="pool1", pool_func=tf.nn.avg_pool)
    
    conv2 = cnn_layer(hidden1, filter=[5, 5, 6, 16], strides=[1, 1, 1, 1], padding='VALID', layer_name="conv2")
    hidden2 = pool_layer(conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],padding='SAME',layer_name="pool2", pool_func=tf.nn.avg_pool)
    
    shape = hidden2.get_shape().as_list()
    conv_out = tf.reshape(hidden2, [-1, shape[1] * shape[2] * shape[3]])
    
    full1 = nn_layer(conv_out, input_dim=400, output_dim=120, layer_name='full1')
    full2 = nn_layer(full1, input_dim=120, output_dim=84, layer_name='full2')
    y = nn_layer(full2, input_dim=84, output_dim=10, layer_name='full3')

#     with tf.name_scope('dropout'):
#         keep_prob = tf.placeholder(tf.float32)
#         tf.summary.scalar('dropout_keep_probability', keep_prob)
#         droped = tf.nn.dropout(hidden1, keep_prob)

    with tf.name_scope('cross_entropy'):
        cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
    tf.summary.scalar('cross_entropy', cross_entropy)

    with tf.name_scope('train'):
        train_step = tf.train.AdamOptimizer(learning_rate).minimize(cross_entropy)

    with tf.name_scope('accuracy'):
        with tf.name_scope('correct_prediction'):
            correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
        with tf.name_scope('accuracy'):
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.summary.scalar('accuracy', accuracy)

    merged = tf.summary.merge_all()
    

    init = tf.global_variables_initializer()
    
    # 启动tf图
    with tf.Session() as sess:
        train_writer = tf.summary.FileWriter('/data/summary/train', sess.graph)
        sess.run(init)
        
        avg_loss = 0.
        total_batch = int(not_mnist.train.num_examples / batch_size)  # 计算数据集总共有多少批次数据
        print('total batch: ', total_batch)
        for step in range(total_batch):
            
            batch_xs, batch_ys = not_mnist.train.next_batch(batch_size)
            
            if step % 500 == 99:
                run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()
                summary, _, acc = sess.run([merged, train_step, accuracy],
                                           feed_dict={x: batch_xs, y_: batch_ys},
                                           options=run_options,
                                           run_metadata=run_metadata)
                train_writer.add_run_metadata(run_metadata, 'step%03d' % step)
                train_writer.add_summary(summary, step)
                print('Adding run metadata for %s and the accuracy is %s' % (step, acc))
            else:
                summary, _, acc = sess.run([merged, train_step, accuracy], 
                                           feed_dict={x: batch_xs, y_: batch_ys})
                train_writer.add_summary(summary, step)
                
#             if (step % 500 == 0):
#                 summary, acc = sess.run([merged, accuracy], 
#                                         feed_dict={
#                                             x: not_mnist.validation.images, 
#                                             y_: not_mnist.validation.labels
#                                         })

#                 train_writer.add_summary(summary, step)
#                 print('Validation Accuracy at step %s: %s' % (step, acc))

        summary, acc = sess.run([merged, accuracy], 
                                feed_dict={
                                    x: not_mnist.test.images, 
                                    y_: not_mnist.test.labels
                                })
        train_writer.add_summary(summary, step + 1)
        
        print('Total Accuracy at step %s: %s' % (step + 1, acc))

        train_writer.close()
        
main()

total batch:  6250


In [None]:
#AlexNet
def cnn_layers(inputs, scope, end_points_collection, dropout_keep_prob=0.8, is_training=True):
    # Collect outputs for conv2d and max_pool2d.
    
    with slim.arg_scope([slim.conv2d, slim.fully_connected, slim.max_pool2d], 
                        outputs_collections=[end_points_collection]):
    
        net = slim.conv2d(inputs, 64, [5, 5], 4, padding='VALID', scope='conv1')
        net = slim.max_pool2d(net, [3, 3], 2, scope='pool1')

        net = slim.conv2d(net, 192, [5, 5], scope='conv2')
        net = slim.max_pool2d(net, [3, 3], 2, scope='pool2')

        net = slim.conv2d(net, 384, [3, 3], scope='conv3')
        net = slim.conv2d(net, 384, [3, 3], scope='conv4')
        net = slim.conv2d(net, 256, [3, 3], scope='conv5')
        net = slim.max_pool2d(net, [3, 3], 2, scope='pool5')

    with slim.arg_scope([slim.conv2d],
                      weights_initializer=trunc_normal(0.005),
                      biases_initializer=tf.constant_initializer(0.1),
                      outputs_collections=[end_points_collection]):
        
        net = slim.conv2d(net, 4096, [5, 5], padding='VALID', scope='fc6')
        net = slim.dropout(net, dropout_keep_prob, is_training=is_training, scope='dropout6')
        net = slim.conv2d(net, 4096, [1, 1], scope='fc7')
        net = slim.dropout(net, dropout_keep_prob, is_training=is_training, scope='dropout7')

    return net, end_points_collection

In [2]:
pickle_file = '/data/notMNIST.pickle'

with open(pickle_file, 'rb') as f:
    save = pickle.load(f)
    train_dataset = save['train_dataset']
    train_labels = save['train_labels']
    valid_dataset = save['valid_dataset']
    valid_labels = save['valid_labels']
    test_dataset = save['test_dataset']
    test_labels = save['test_labels']
    del save  # hint to help gc free up memory
    print('Training set', train_dataset.shape, train_labels.shape)
    print('Validation set', valid_dataset.shape, valid_labels.shape)
    print('Test set', test_dataset.shape, test_labels.shape)

Training set (300000, 28, 28) (300000,)
Validation set (15000, 28, 28) (15000,)
Test set (15000, 28, 28) (15000,)


In [11]:
def pca(dataset):
    print(dataset[0][:10])
    dataset -= np.mean(dataset, axis = 0) # 对数据进行零中心化(重要)
    print(dataset[0][:10])
    cov = np.dot(dataset.T, dataset) / dataset.shape[0] # 得到数据的协方差矩阵
    U,S,V = np.linalg.svd(cov)
    print(U.shape)
    print(U)
    Xrot = np.dot(dataset,U) # 对数据去相关性
    
    return Xrot
valid_dataset = valid_dataset.reshape((-1, 784))
valid_dataset = pca(valid_dataset)
print('Validation set', valid_dataset.shape)

[ 10.44325256   0.01589015  -2.32210875   1.28891873   0.65020078
   3.08960986  -0.15546146  -1.31447566  -3.82819867   5.22964573]
[ 10.44325256   0.01589018  -2.32210875   1.28891873   0.65020078
   3.08960986  -0.15546148  -1.31447566  -3.82819867   5.22964573]
(784, 784)
[[ -1.00000000e+00   1.73947623e-09  -2.83921137e-10 ...,  -4.16588075e-12
   -6.10994458e-12   4.72130538e-12]
 [ -1.73947623e-09  -1.00000000e+00   6.19605145e-10 ...,  -1.35096708e-11
    1.06539274e-11  -2.74584847e-12]
 [  2.83921164e-10  -6.19604701e-10  -1.00000000e+00 ...,   4.45589052e-12
   -3.71321073e-12  -8.49392778e-12]
 ..., 
 [  4.16588205e-12   1.35096725e-11  -4.45589312e-12 ...,  -1.00000000e+00
    5.07014946e-08   3.44333984e-09]
 [ -6.10994588e-12   1.06539291e-11  -3.71322070e-12 ...,   5.07014946e-08
    1.00000000e+00  -2.61279727e-08]
 [  4.72130798e-12  -2.74584587e-12  -8.49392865e-12 ...,   3.44334228e-09
    2.61279727e-08   1.00000000e+00]]
Validation set (15000, 784)


Reformat into a TensorFlow-friendly shape:
- convolutions need the image data formatted as a cube (width by height by #channels)
- labels as float 1-hot encodings.

In [3]:
image_size = 28
num_labels = 10
num_channels = 1 # grayscale

import numpy as np

def reformat(dataset, labels):
    dataset = dataset.reshape((-1, image_size, image_size, num_channels)).astype(np.float32)
    labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
    return dataset, labels

train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (300000, 28, 28, 1) (300000, 10)
Validation set (15000, 28, 28, 1) (15000, 10)
Test set (15000, 28, 28, 1) (15000, 10)


In [0]:
def accuracy(predictions, labels):
  return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
          / predictions.shape[0])

Let's build a small network with two convolutional layers, followed by one fully connected layer. Convolutional networks are more expensive computationally, so we'll limit its depth and number of fully connected nodes.

In [0]:
batch_size = 16
patch_size = 5
depth = 16
num_hidden = 64

graph = tf.Graph()

with graph.as_default():

  # Input data.
  tf_train_dataset = tf.placeholder(
    tf.float32, shape=(batch_size, image_size, image_size, num_channels))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
  
  # Variables.
  layer1_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, num_channels, depth], stddev=0.1))
  layer1_biases = tf.Variable(tf.zeros([depth]))
  layer2_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, depth, depth], stddev=0.1))
  layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))
  layer3_weights = tf.Variable(tf.truncated_normal(
      [image_size // 4 * image_size // 4 * depth, num_hidden], stddev=0.1))
  layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
  layer4_weights = tf.Variable(tf.truncated_normal(
      [num_hidden, num_labels], stddev=0.1))
  layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))
  
  # Model.
  def model(data):
    conv = tf.nn.conv2d(data, layer1_weights, [1, 2, 2, 1], padding='SAME')
    hidden = tf.nn.relu(conv + layer1_biases)
    conv = tf.nn.conv2d(hidden, layer2_weights, [1, 2, 2, 1], padding='SAME')
    hidden = tf.nn.relu(conv + layer2_biases)
    shape = hidden.get_shape().as_list()
    reshape = tf.reshape(hidden, [shape[0], shape[1] * shape[2] * shape[3]])
    hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)
    return tf.matmul(hidden, layer4_weights) + layer4_biases
  
  # Training computation.
  logits = model(tf_train_dataset)
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))
    
  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)
  
  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
  test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [0]:
num_steps = 1001

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  for step in range(num_steps):
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 50 == 0):
      print('Minibatch loss at step %d: %f' % (step, l))
      print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
      print('Validation accuracy: %.1f%%' % accuracy(
        valid_prediction.eval(), valid_labels))
  print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0 : 3.51275
Minibatch accuracy: 6.2%
Validation accuracy: 12.8%
Minibatch loss at step 50 : 1.48703
Minibatch accuracy: 43.8%
Validation accuracy: 50.4%
Minibatch loss at step 100 : 1.04377
Minibatch accuracy: 68.8%
Validation accuracy: 67.4%
Minibatch loss at step 150 : 0.601682
Minibatch accuracy: 68.8%
Validation accuracy: 73.0%
Minibatch loss at step 200 : 0.898649
Minibatch accuracy: 75.0%
Validation accuracy: 77.8%
Minibatch loss at step 250 : 1.3637
Minibatch accuracy: 56.2%
Validation accuracy: 75.4%
Minibatch loss at step 300 : 1.41968
Minibatch accuracy: 62.5%
Validation accuracy: 76.0%
Minibatch loss at step 350 : 0.300648
Minibatch accuracy: 81.2%
Validation accuracy: 80.2%
Minibatch loss at step 400 : 1.32092
Minibatch accuracy: 56.2%
Validation accuracy: 80.4%
Minibatch loss at step 450 : 0.556701
Minibatch accuracy: 81.2%
Validation accuracy: 79.4%
Minibatch loss at step 500 : 1.65595
Minibatch accuracy: 43.8%
Validation accuracy: 79.6%

---
Problem 1
---------

The convolutional model above uses convolutions with stride 2 to reduce the dimensionality. Replace the strides by a max pooling operation (`nn.max_pool()`) of stride 2 and kernel size 2.

---

---
Problem 2
---------

Try to get the best performance you can using a convolutional net. Look for example at the classic [LeNet5](http://yann.lecun.com/exdb/lenet/) architecture, adding Dropout, and/or adding learning rate decay.

---