https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/udacity/3_regularization.ipynb

# Deep Learning

# Assignment 3

Previously in 2_fullyconnected.ipynb, you trained a logistic regression and a neural network model.

The goal of this assignment is to explore regularization techniques.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
#from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle

First reload the data we generated in 1_notmnist.ipynb.

In [2]:
pickle_file = 'notMNIST.pickle'

with open(pickle_file, 'rb') as f:
  save = pickle.load(f)
  train_dataset = save['train_dataset']
  train_labels = save['train_labels']
  valid_dataset = save['valid_dataset']
  valid_labels = save['valid_labels']
  test_dataset = save['test_dataset']
  test_labels = save['test_labels']
  del save  # hint to help gc free up memory
  print('Training set', train_dataset.shape, train_labels.shape)
  print('Validation set', valid_dataset.shape, valid_labels.shape)
  print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat into a shape that's more adapted to the models we're going to train:
- data as a flat matrix,
- labels as float 1-hot encodings.

In [3]:
def accuracy(predictions, labels):
  return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
          / predictions.shape[0])

# Problem 1

Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compute the L2 loss for a tensor t using nn.l2_loss(t). The right amount of regularization should improve your validation / test accuracy.

In [4]:
image_size = 28
num_labels = 10

In [5]:
from sklearn.linear_model import LogisticRegression

n = 5000
C_array = [10,1,0.1,0.001]

for C in C_array:
    print ("C: %s" % C)
    model = LogisticRegression(penalty='l2', C=C, multi_class='multinomial', solver='newton-cg')
    train_x = train_dataset[0:n].reshape(n, (image_size*image_size))
    train_y = train_labels[0:n]
    test_x = test_dataset.reshape(len(test_dataset), (image_size*image_size))
    test_y = test_labels
    model.fit(train_x, train_y)
    print ("Training error (%s samples):" % n, model.score(train_x, train_y))
    print ("Test error", model.score(test_x, test_y))
    print (" ")

C: 10
Training error (5000 samples): 0.9976
Test error 0.8174
 
C: 1
Training error (5000 samples): 0.9742
Test error 0.8453
 
C: 0.1
Training error (5000 samples): 0.9138
Test error 0.8734
 
C: 0.001
Training error (5000 samples): 0.8132
Test error 0.8681
 


In [6]:
def reformat(dataset, labels):
  dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
  # Map 0 to [1.0, 0.0, 0.0 ...], 1 to [0.0, 1.0, 0.0 ...]
  labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
  return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 784) (200000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)


In [7]:
batch_size = 128
num_hidden_units = 1024

graph = tf.Graph()
with graph.as_default():

    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Variables.
    w1 = tf.Variable(tf.truncated_normal([image_size * image_size, 1024]))
    b1 = tf.Variable(tf.zeros([num_hidden_units]))
    w2 = tf.Variable(tf.truncated_normal([num_hidden_units, num_labels]))
    b2 = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    layer_1 = tf.matmul(tf_train_dataset, w1) + b1
    hidden_layer = tf.nn.relu(layer_1)
    layer_2 = tf.matmul(hidden_layer, w2) + b2
    
    l2_penalty = 1e-3
    unregularized_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=layer_2))
    l2_loss = l2_penalty*(tf.nn.l2_loss(w1) + tf.nn.l2_loss(w2))
    loss = unregularized_loss + l2_loss

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(layer_2)
    valid_prediction = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(tf_valid_dataset, w1) + b1), w2) + b2)
    test_prediction = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(tf_test_dataset, w1) + b1), w2) + b2)

In [8]:
num_steps = 3001

with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 500 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(
        valid_prediction.eval(), valid_labels))
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 673.392334
Minibatch accuracy: 11.7%
Validation accuracy: 31.0%
Minibatch loss at step 500: 205.795181
Minibatch accuracy: 78.1%
Validation accuracy: 78.8%
Minibatch loss at step 1000: 115.076981
Minibatch accuracy: 79.7%
Validation accuracy: 80.9%
Minibatch loss at step 1500: 68.685402
Minibatch accuracy: 89.1%
Validation accuracy: 83.2%
Minibatch loss at step 2000: 41.546864
Minibatch accuracy: 87.5%
Validation accuracy: 85.3%
Minibatch loss at step 2500: 25.281002
Minibatch accuracy: 86.7%
Validation accuracy: 85.5%
Minibatch loss at step 3000: 15.567230
Minibatch accuracy: 85.2%
Validation accuracy: 86.7%
Test accuracy: 93.2%


# Problem 2

Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?

In [9]:
batch_size = 128
num_hidden_units = 1024

graph = tf.Graph()
with graph.as_default():

    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Variables.
    w1 = tf.Variable(tf.truncated_normal([image_size * image_size, 1024]))
    b1 = tf.Variable(tf.zeros([num_hidden_units]))
    w2 = tf.Variable(tf.truncated_normal([num_hidden_units, num_labels]))
    b2 = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    layer_1 = tf.matmul(tf_train_dataset, w1) + b1
    hidden_layer = tf.nn.relu(layer_1)
    layer_2 = tf.matmul(hidden_layer, w2) + b2
    
    l2_penalty = 1e-3
    unregularized_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=layer_2))
    l2_loss = l2_penalty*(tf.nn.l2_loss(w1) + tf.nn.l2_loss(w2))
    loss = unregularized_loss + l2_loss

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(layer_2)
    valid_prediction = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(tf_valid_dataset, w1) + b1), w2) + b2)
    test_prediction = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(tf_test_dataset, w1) + b1), w2) + b2)

In [10]:
num_steps = 101
num_batches = 3

with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    # offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    offset = step % num_batches
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 2 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(
        valid_prediction.eval(), valid_labels))
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 622.863342
Minibatch accuracy: 16.4%
Validation accuracy: 36.2%
Minibatch loss at step 2: 1521.764648
Minibatch accuracy: 26.6%
Validation accuracy: 37.8%
Minibatch loss at step 4: 801.419434
Minibatch accuracy: 46.1%
Validation accuracy: 54.1%
Minibatch loss at step 6: 339.672455
Minibatch accuracy: 92.2%
Validation accuracy: 64.1%
Minibatch loss at step 8: 316.860779
Minibatch accuracy: 96.9%
Validation accuracy: 64.5%
Minibatch loss at step 10: 312.155731
Minibatch accuracy: 99.2%
Validation accuracy: 64.6%
Minibatch loss at step 12: 310.375732
Minibatch accuracy: 100.0%
Validation accuracy: 64.6%
Minibatch loss at step 14: 309.755585
Minibatch accuracy: 100.0%
Validation accuracy: 64.6%
Minibatch loss at step 16: 309.137146
Minibatch accuracy: 100.0%
Validation accuracy: 64.6%
Minibatch loss at step 18: 308.519257
Minibatch accuracy: 100.0%
Validation accuracy: 64.6%
Minibatch loss at step 20: 307.902985
Minibatch accuracy: 100.0%
Validation ac

# Problem 3

Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides nn.dropout() for that, but you have to make sure it's only inserted during training.

In [11]:
batch_size = 128
num_hidden_units = 1024

graph = tf.Graph()
with graph.as_default():

    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    
    keep_prob = tf.placeholder(tf.float32)

    # Variables.
    w1 = tf.Variable(tf.truncated_normal([image_size * image_size, 1024]))
    b1 = tf.Variable(tf.zeros([num_hidden_units]))
    w2 = tf.Variable(tf.truncated_normal([num_hidden_units, num_labels]))
    b2 = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    layer_1 = tf.matmul(tf_train_dataset, w1) + b1
    hidden_layer = tf.nn.relu(layer_1)
    hidden_layer_with_dropout = tf.nn.dropout(hidden_layer, keep_prob)
    layer_2 = tf.matmul(hidden_layer_with_dropout, w2) + b2
    
    l2_penalty = 1e-3
    unregularized_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=layer_2))
    l2_loss = l2_penalty*(tf.nn.l2_loss(w1) + tf.nn.l2_loss(w2))
    loss = unregularized_loss + l2_loss

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(layer_2)
    valid_prediction = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(tf_valid_dataset, w1) + b1), w2) + b2)
    test_prediction = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(tf_test_dataset, w1) + b1), w2) + b2)

In [12]:
num_steps = 3001

with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, keep_prob : 0.5}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 500 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(
        valid_prediction.eval(), valid_labels))
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 787.969360
Minibatch accuracy: 10.9%
Validation accuracy: 33.0%
Minibatch loss at step 500: 219.193542
Minibatch accuracy: 69.5%
Validation accuracy: 80.0%
Minibatch loss at step 1000: 116.960251
Minibatch accuracy: 72.7%
Validation accuracy: 80.8%
Minibatch loss at step 1500: 69.918243
Minibatch accuracy: 86.7%
Validation accuracy: 82.0%
Minibatch loss at step 2000: 41.431843
Minibatch accuracy: 86.7%
Validation accuracy: 83.4%
Minibatch loss at step 2500: 25.255077
Minibatch accuracy: 80.5%
Validation accuracy: 84.2%
Minibatch loss at step 3000: 15.455159
Minibatch accuracy: 86.7%
Validation accuracy: 85.0%
Test accuracy: 91.7%


What happens to our extreme overfitting case?

In [13]:
num_steps = 101
num_batches = 3

with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    # offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    offset = step % num_batches
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, keep_prob : 0.5}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 2 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(
        valid_prediction.eval(), valid_labels))
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 849.643433
Minibatch accuracy: 7.8%
Validation accuracy: 33.1%
Minibatch loss at step 2: 1186.224243
Minibatch accuracy: 39.8%
Validation accuracy: 27.6%
Minibatch loss at step 4: 564.095459
Minibatch accuracy: 65.6%
Validation accuracy: 56.5%
Minibatch loss at step 6: 344.874481
Minibatch accuracy: 89.1%
Validation accuracy: 62.5%
Minibatch loss at step 8: 341.759155
Minibatch accuracy: 90.6%
Validation accuracy: 64.9%
Minibatch loss at step 10: 325.092133
Minibatch accuracy: 95.3%
Validation accuracy: 66.2%
Minibatch loss at step 12: 312.037262
Minibatch accuracy: 98.4%
Validation accuracy: 66.3%
Minibatch loss at step 14: 317.565735
Minibatch accuracy: 95.3%
Validation accuracy: 66.1%
Minibatch loss at step 16: 313.015839
Minibatch accuracy: 97.7%
Validation accuracy: 67.8%
Minibatch loss at step 18: 309.518372
Minibatch accuracy: 99.2%
Validation accuracy: 67.9%
Minibatch loss at step 20: 310.710693
Minibatch accuracy: 98.4%
Validation accuracy

# Problem 4

Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is 97.1%.

One avenue you can explore is to add multiple layers.

Another one is to use learning rate decay:

global_step = tf.Variable(0)  # count the number of steps taken.

learning_rate = tf.train.exponential_decay(0.5, global_step, ...)

optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)

In [14]:
batch_size = 128
num_hidden_layer_1_units = 1024
num_hidden_layer_2_units = 512

graph = tf.Graph()
with graph.as_default():

    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Variables.
    w1 = tf.Variable(tf.truncated_normal([image_size * image_size, num_hidden_layer_1_units], stddev=np.sqrt(2.0 / (image_size * image_size))))
    b1 = tf.Variable(tf.zeros([num_hidden_layer_1_units]))
    w2 = tf.Variable(tf.truncated_normal([num_hidden_layer_1_units, num_hidden_layer_2_units], stddev=np.sqrt(2.0 / num_hidden_layer_1_units)))
    b2 = tf.Variable(tf.zeros([num_hidden_layer_2_units]))
    w3 = tf.Variable(tf.truncated_normal([num_hidden_layer_2_units, num_labels], stddev=np.sqrt(2.0 / num_hidden_layer_2_units)))
    b3 = tf.Variable(tf.zeros([num_labels]))
    global_step = tf.Variable(0) # count the number of steps taken

    # Training computation.
    hidden_layer_1 = tf.nn.dropout(tf.nn.relu(tf.matmul(tf_train_dataset, w1) + b1), keep_prob=0.5)
    hidden_layer_2 = tf.nn.dropout(tf.nn.relu(tf.matmul(hidden_layer_1, w2) + b2), keep_prob=0.5)
    layer_3 = tf.matmul(hidden_layer_2, w3) + b3
    
    l2_penalty = 1e-3
    unregularized_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=layer_3))
    l2_loss = l2_penalty*(tf.nn.l2_loss(w1) + tf.nn.l2_loss(w2) + tf.nn.l2_loss(w3))
    loss = unregularized_loss + l2_loss

    # Optimizer.
    #learning_rate = tf.train.exponential_decay(0.5, global_step, 1000, 0.65, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(layer_3)
    valid_prediction = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(tf.nn.relu(tf.matmul(tf_valid_dataset, w1) + \
                                                                               b1), w2) + b2), w3) + b3)
    test_prediction = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(tf.nn.relu(tf.matmul(tf_test_dataset, w1) + \
                                                                              b1), w2) + b2), w3) + b3)

In [15]:
num_steps = 5001

with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 500 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(
        valid_prediction.eval(), valid_labels))
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 3.807815
Minibatch accuracy: 9.4%
Validation accuracy: 33.7%
Minibatch loss at step 500: 1.238387
Minibatch accuracy: 88.3%
Validation accuracy: 84.6%
Minibatch loss at step 1000: 1.077341
Minibatch accuracy: 85.2%
Validation accuracy: 85.7%
Minibatch loss at step 1500: 0.721555
Minibatch accuracy: 89.1%
Validation accuracy: 86.2%
Minibatch loss at step 2000: 0.649908
Minibatch accuracy: 93.0%
Validation accuracy: 86.5%
Minibatch loss at step 2500: 0.664468
Minibatch accuracy: 87.5%
Validation accuracy: 86.4%
Minibatch loss at step 3000: 0.625172
Minibatch accuracy: 86.7%
Validation accuracy: 86.7%
Minibatch loss at step 3500: 0.736265
Minibatch accuracy: 83.6%
Validation accuracy: 87.0%
Minibatch loss at step 4000: 0.566973
Minibatch accuracy: 89.1%
Validation accuracy: 87.3%
Minibatch loss at step 4500: 0.533519
Minibatch accuracy: 89.8%
Validation accuracy: 86.5%
Minibatch loss at step 5000: 0.575380
Minibatch accuracy: 88.3%
Validation accuracy