Deep Learning
=============

Assignment 3
------------

Previously in `2_fullyconnected.ipynb`, you trained a logistic regression and a neural network model.

The goal of this assignment is to explore regularization techniques.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle

First reload the data we generated in `1_notmnist.ipynb`.

In [2]:
pickle_file = 'notMNIST.pickle'

# NOTE: unpickling can execute arbitrary code, so only load a pickle file that
# you generated yourself.
with open(pickle_file, 'rb') as f:
  save = pickle.load(f)

# Pull the three splits out of the loaded dictionary.
train_dataset, train_labels = save['train_dataset'], save['train_labels']
valid_dataset, valid_labels = save['valid_dataset'], save['valid_labels']
test_dataset, test_labels = save['test_dataset'], save['test_labels']
del save  # hint to help gc free up memory

print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (32000, 28, 28) (32000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat into a shape that's more adapted to the models we're going to train:
- data as a flat matrix,
- labels as float 1-hot encodings.

In [3]:
image_size = 28
num_labels = 10

def reformat(dataset, labels):
  """Flatten the images and one-hot encode the labels.

  Args:
    dataset: array that can be reshaped to rows of image_size * image_size values.
    labels: 1-D array of integer class ids.

  Returns:
    A (flat_images, one_hot_labels) pair, both of dtype float32.
  """
  pixels_per_image = image_size * image_size
  flat_images = dataset.reshape((-1, pixels_per_image)).astype(np.float32)
  # Comparing the row [0 .. num_labels-1] with a column of labels broadcasts to
  # one row per example: 1 becomes [0.0, 1.0, 0.0 ...], 2 becomes [0.0, 0.0, 1.0 ...]
  class_ids = np.arange(num_labels)
  one_hot_labels = (class_ids == labels[:, None]).astype(np.float32)
  return flat_images, one_hot_labels
# Replace every raw split with its flattened / one-hot encoded version.
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)

# Report the resulting shapes of each split.
for split_name, split_data, split_labels in (
    ('Training set', train_dataset, train_labels),
    ('Validation set', valid_dataset, valid_labels),
    ('Test set', test_dataset, test_labels)):
  print(split_name, split_data.shape, split_labels.shape)

Training set (32000, 784) (32000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)


In [4]:
def accuracy(predictions, labels):
  """Return the percentage of rows whose arg-max class matches the label's.

  Both arguments are 2-D arrays with one row per example; the class of a row
  is the index of its largest entry.
  """
  predicted_classes = np.argmax(predictions, 1)
  expected_classes = np.argmax(labels, 1)
  num_correct = np.sum(predicted_classes == expected_classes)
  return 100.0 * num_correct / predictions.shape[0]

---
Problem 1
---------

Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compute the L2 loss for a tensor `t` using `nn.l2_loss(t)`. The right amount of regularization should improve your validation / test accuracy.

---

Bringing a few things in from the fullyconnected project.

In [7]:
# REF: https://www.tensorflow.org/get_started/summaries_and_tensorboard
def variable_summaries(var):
  """Record mean, stddev, max, min and a histogram of `var` for TensorBoard.

  All ops are created under a 'summaries' name scope nested inside whatever
  name scope is active at the call site. Nothing is returned.
  """
  with tf.name_scope('summaries'):
    avg = tf.reduce_mean(var)
    tf.summary.scalar('mean', avg)
    with tf.name_scope('stddev'):
      squared_deviation = tf.square(var - avg)
      spread = tf.sqrt(tf.reduce_mean(squared_deviation))
    tf.summary.scalar('stddev', spread)
    largest = tf.reduce_max(var)
    tf.summary.scalar('max', largest)
    smallest = tf.reduce_min(var)
    tf.summary.scalar('min', smallest)
    tf.summary.histogram('histogram', var)

In [32]:
def run_batch_graph(graph, save_layers, num_steps = 1001, summary_path = './summary/', earliest_stop = 300, stopping_threshold = 3, batch_size=128, report_learning_rate=False):
    """Train the model defined in `graph` on minibatches and report accuracy.

    The function reads several notebook globals rather than taking them as
    arguments: the graph nodes `tf_train_dataset`, `tf_train_labels`,
    `optimizer`, `train_prediction`, `valid_prediction`, `test_prediction`
    (and `learning_rate` when `report_learning_rate` is set), plus the numpy
    arrays `train_dataset`, `train_labels`, `valid_labels` and `test_labels`.
    They must match the graph that is passed in.

    Args:
        graph: the tf.Graph to run in a new session.
        save_layers: handed to tf.train.Saver; the variables to checkpoint.
        num_steps: number of loop iterations. The last iteration
            (step == num_steps - 1) only evaluates the test set.
        summary_path: prefix for the summary directories; 'train' and 'test'
            are appended to it. Checkpoints are written under the 'train' one.
        earliest_stop: early stopping is only considered on steps greater than
            this value; a negative value disables early stopping.
        stopping_threshold: early stopping triggers when the absolute gap
            between minibatch and validation accuracy exceeds this value.
        batch_size: number of training rows fed per step.
        report_learning_rate: when True, also fetch and print `learning_rate`
            on reporting steps.
    """
    # Reporting happens ten times per run. Clamp to 1 so that num_steps < 10
    # does not turn the modulo below into a division by zero.
    report_every = max(num_steps // 10, 1)

    with tf.Session(graph=graph) as session:
        # Merge all the summaries and write them out under summary_path.
        merged_summary = tf.summary.merge_all()
        train_writer = tf.summary.FileWriter(summary_path+'train', graph=graph)
        test_writer = tf.summary.FileWriter(summary_path+'test')
        try:
            tf.global_variables_initializer().run()
            print('Initialized')

            def get_feed_dict(step):
                # Pick an offset within the training data, which has been randomized.
                # Note: we could use better randomization across epochs.
                offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
                # Generate a minibatch.
                batch_data = train_dataset[offset:(offset + batch_size), :]
                batch_labels = train_labels[offset:(offset + batch_size), :]
                # Prepare a dictionary telling the session where to feed the minibatch.
                # The key of the dictionary is the placeholder node of the graph to be fed,
                # and the value is the numpy array to feed to it.
                feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
                return feed_dict, (batch_data, batch_labels)

            def test_step(step):
                # A training minibatch is still fed here: the merged summaries are
                # evaluated on it and written to the test writer. The optimizer is
                # not run, so no training happens on this step.
                feed_dict, _ = get_feed_dict(step)
                summary, test = session.run([merged_summary, test_prediction], feed_dict=feed_dict)
                test_writer.add_summary(summary, step)
                test = accuracy(test, test_labels)
                print("Step {}: Test: {}%".format(step, test))
                return test

            def train_step(step, verify=False):
                feed_dict, (_, batch_labels) = get_feed_dict(step)
                if not verify:
                    summary, _ = session.run([merged_summary, optimizer], feed_dict=feed_dict)
                    train_writer.add_summary(summary, step)
                    return None

                # Reporting step: still runs the optimizer, but also fetches the
                # predictions (and optionally the learning rate) in the same call.
                fetches = [merged_summary, optimizer, train_prediction, valid_prediction]
                if report_learning_rate:
                    fetches.append(learning_rate)
                results = session.run(fetches, feed_dict=feed_dict)
                train_writer.add_summary(results[0], step)
                train = accuracy(results[2], batch_labels)
                valid = accuracy(results[3], valid_labels)
                if report_learning_rate:
                    print("Step {}: Train: {}% Valid: {}% Lr: {}".format(step, train, valid, results[4]))
                else:
                    print("Step {}: Train: {}% Valid: {}%".format(step, train, valid))
                return train, valid

            # Checkpoints taken on every reporting step.
            saver = tf.train.Saver(save_layers)

            # Main Loop
            step = 0  # keeps `step` defined if the loop is interrupted immediately
            try:
                for step in range(num_steps):
                    # On the final step, measure test accuracy without training.
                    if (step == num_steps-1):
                        test_step(step)

                    # Ten times per run, checkpoint and report training and
                    # validation accuracy.
                    elif (step % report_every == 0):
                        saver.save(session, summary_path+"train/single_layer{}.chk".format(step))
                        train, valid = train_step(step, verify=True)

                        # Early Stopping
                        if abs(train-valid)>stopping_threshold and step>earliest_stop and earliest_stop>=0:
                            print("Stopping Early!")
                            test_step(step)
                            break

                    # Normal, fast, training step
                    else:
                        train_step(step)
            except KeyboardInterrupt:
                print("Stopping Early!")
                test_step(step)
        finally:
            # Flush pending events and release the event files even when the
            # run fails or is interrupted.
            train_writer.close()
            test_writer.close()

Here we modify `graph3` from the fully connected project as needed, adding an L2 penalty on the weights to the loss.

In [30]:
batch_size = 128
hidden_size = 1024
start_learning_rate = .5
w = .01  # The l2 loss rates for weights
graph = tf.Graph()
with graph.as_default():
  # Record the hyperparameters of this run as scalar summaries.
  tf.summary.scalar('image_size', image_size)
  tf.summary.scalar('batch_size', batch_size)
  tf.summary.scalar('num_labels', num_labels)
  # Logged under its own tag; it previously reused the 'num_labels' tag.
  tf.summary.scalar('hidden_size', hidden_size)
  tf.summary.scalar('learning_rate', start_learning_rate)
  tf.summary.scalar('w',w)
  
  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  with tf.name_scope('data'):
    with tf.name_scope('train'):
      tf_train_dataset = tf.placeholder(tf.float32,
                                        shape=(batch_size, image_size * image_size))
      tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    with tf.name_scope('valid'):
      tf_valid_dataset = tf.constant(valid_dataset)
    with tf.name_scope('test'):
      tf_test_dataset = tf.constant(test_dataset)
  
  # Feed Forward Layer 1 (hidden layer with ReLU activation)
  with tf.name_scope('ff1'):
    with tf.name_scope('weights'):
        weights1 = tf.Variable(
            tf.truncated_normal([image_size * image_size, hidden_size]))
        variable_summaries(weights1)
    with tf.name_scope('biases'):
        biases1 = tf.Variable(tf.zeros([hidden_size]))
        variable_summaries(biases1)
    with tf.name_scope('linear'):
        logits1 = tf.matmul(tf_train_dataset, weights1) + biases1
        variable_summaries(logits1)
    with tf.name_scope('activation'):
        act1 = tf.nn.relu(logits1)
        variable_summaries(act1)
    
  # Feed Forward Layer 2 (output layer, produces the class logits)
  with tf.name_scope('ff2'):
    with tf.name_scope('weights'):
        weights2 = tf.Variable(
            tf.truncated_normal([hidden_size, num_labels]))
        variable_summaries(weights2)
    with tf.name_scope('biases'):
        biases2 = tf.Variable(tf.zeros([num_labels]))
        variable_summaries(biases2)
    with tf.name_scope('linear'):
        logits2 = tf.matmul(act1, weights2) + biases2
        variable_summaries(logits2)
  
  with tf.name_scope('error'):
    diff = tf.nn.softmax_cross_entropy_with_logits(logits=logits2, labels=tf_train_labels)
    # The scalar L2 penalty is added to every per-example cross entropy before
    # averaging, so the value logged as 'cross_entropy' includes the penalty.
    loss = tf.reduce_mean(diff + w*(tf.nn.l2_loss(weights1) + tf.nn.l2_loss(weights2)))
    tf.summary.scalar('cross_entropy', loss)
    
  # Optimizer.
  # We are going to find the minimum of this loss using gradient descent.
  with tf.name_scope('opt'):
    optimizer = tf.train.GradientDescentOptimizer(start_learning_rate).minimize(loss)
  
  # Predictions for the training, validation, and test data.
  # These are not part of training, but merely here so that we can report
  # accuracy figures as we train.
  with tf.name_scope('out'):
    train_prediction = tf.nn.softmax(logits2)
    valid_prediction = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(tf_valid_dataset, weights1) + biases1), weights2) + biases2)
    test_prediction = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(tf_test_dataset, weights1) + biases1), weights2) + biases2)

In [31]:
# Train the L2-regularized network; summaries go to the l2loss directory.
run_batch_graph(
    graph,
    save_layers=[weights1, biases1, weights2, biases2],
    num_steps=1000,
    earliest_stop=-1,  # negative value switches early stopping off
    stopping_threshold=10,
    batch_size=batch_size,
    summary_path="./summary/regularization/l2loss/")

Initialized
Step 0: Train: 8.59375% Valid: 17.3%
Step 100: Train: 79.6875% Valid: 77.24%
Step 200: Train: 79.6875% Valid: 79.83%
Step 300: Train: 81.25% Valid: 81.95%
Step 400: Train: 89.0625% Valid: 84.04%
Step 500: Train: 84.375% Valid: 83.9%
Step 600: Train: 82.8125% Valid: 83.48%
Step 700: Train: 81.25% Valid: 83.06%
Step 800: Train: 83.59375% Valid: 82.68%
Step 900: Train: 85.15625% Valid: 82.79%
Step 999: Test: 89.84%


---
Problem 2
---------
Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?

---

In [39]:
# Save full dataset, but only the first time this cell runs. On a re-run
# train_dataset is already truncated, and overwriting the backup with it would
# make the full training set unrecoverable without reloading the pickle.
if 'save_training' not in globals():
  save_training = (train_dataset, train_labels)

# Slice it to a smaller set (three minibatches). Slicing from the backup keeps
# the cell idempotent.
train_dataset = save_training[0][:batch_size*3]
train_labels  = save_training[1][:batch_size*3]

In [33]:
# Same graph as before, now trained on the truncated training set.
run_batch_graph(
    graph,
    save_layers=[weights1, biases1, weights2, biases2],
    num_steps=1000,
    earliest_stop=-1,  # negative value switches early stopping off
    stopping_threshold=10,
    batch_size=batch_size,
    summary_path="./summary/regularization/fewbatch/")

Initialized
Step 0: Train: 14.0625% Valid: 18.29%
Step 100: Train: 100.0% Valid: 70.89%
Step 200: Train: 100.0% Valid: 71.11%
Step 300: Train: 100.0% Valid: 71.83%
Step 400: Train: 100.0% Valid: 72.46%
Step 500: Train: 100.0% Valid: 73.97%
Step 600: Train: 100.0% Valid: 75.01%
Step 700: Train: 100.0% Valid: 75.21%
Step 800: Train: 100.0% Valid: 75.53%
Step 900: Train: 100.0% Valid: 75.61%
Step 999: Test: 83.82%


---
Problem 3
---------
Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides `nn.dropout()` for that, but you have to make sure it's only inserted during training.

What happens to our extreme overfitting case?

---

In [52]:
keep_prob = .20 # Tried several values, it needs to be this low surprisingly
graph = tf.Graph()
with graph.as_default():
  # Record the hyperparameters of this run as scalar summaries.
  tf.summary.scalar('image_size', image_size)
  tf.summary.scalar('batch_size', batch_size)
  tf.summary.scalar('num_labels', num_labels)
  # Logged under its own tag; it previously reused the 'num_labels' tag.
  tf.summary.scalar('hidden_size', hidden_size)
  tf.summary.scalar('learning_rate', start_learning_rate)
  tf.summary.scalar('w',w)
  
  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  with tf.name_scope('data'):
    with tf.name_scope('train'):
      tf_train_dataset = tf.placeholder(tf.float32,
                                        shape=(batch_size, image_size * image_size))
      tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    with tf.name_scope('valid'):
      tf_valid_dataset = tf.constant(valid_dataset)
    with tf.name_scope('test'):
      tf_test_dataset = tf.constant(test_dataset)
  
  # Feed Forward Layer 1 (hidden layer with ReLU activation and dropout)
  with tf.name_scope('ff1'):
    with tf.name_scope('weights'):
        weights1 = tf.Variable(
            tf.truncated_normal([image_size * image_size, hidden_size]))
        variable_summaries(weights1)
    with tf.name_scope('biases'):
        biases1 = tf.Variable(tf.zeros([hidden_size]))
        variable_summaries(biases1)
    with tf.name_scope('linear'):
        logits1 = tf.matmul(tf_train_dataset, weights1) + biases1
        variable_summaries(logits1)
    with tf.name_scope('activation'):
        act1 = tf.nn.relu(logits1)
        variable_summaries(act1)
    with tf.name_scope('dropout'):
        # Only the training path goes through dropout; the validation and test
        # predictions below are built directly from the weights.
        drop1 = tf.nn.dropout(act1, keep_prob=keep_prob)
        variable_summaries(drop1)
    
  # Feed Forward Layer 2 (output layer, produces the class logits)
  with tf.name_scope('ff2'):
    with tf.name_scope('weights'):
        weights2 = tf.Variable(
            tf.truncated_normal([hidden_size, num_labels]))
        variable_summaries(weights2)
    with tf.name_scope('biases'):
        biases2 = tf.Variable(tf.zeros([num_labels]))
        variable_summaries(biases2)
    with tf.name_scope('linear'):
        logits2 = tf.matmul(drop1, weights2) + biases2
        variable_summaries(logits2)
  
  with tf.name_scope('error'):
    diff = tf.nn.softmax_cross_entropy_with_logits(logits=logits2, labels=tf_train_labels)
    # The scalar L2 penalty is added to every per-example cross entropy before
    # averaging, so the value logged as 'cross_entropy' includes the penalty.
    loss = tf.reduce_mean(diff + w*(tf.nn.l2_loss(weights1) + tf.nn.l2_loss(weights2)))
    tf.summary.scalar('cross_entropy', loss)
    
  # Optimizer.
  # We are going to find the minimum of this loss using gradient descent.
  with tf.name_scope('opt'):
    optimizer = tf.train.GradientDescentOptimizer(start_learning_rate).minimize(loss)
  
  # Predictions for the training, validation, and test data.
  # These are not part of training, but merely here so that we can report
  # accuracy figures as we train.
  with tf.name_scope('out'):
    # NOTE(review): train_prediction is built from logits2, which passes through
    # drop1, so the reported minibatch accuracy is measured with dropout active.
    train_prediction = tf.nn.softmax(logits2)
    valid_prediction = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(tf_valid_dataset, weights1) + biases1), weights2) + biases2)
    test_prediction = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(tf_test_dataset, weights1) + biases1), weights2) + biases2)

In [53]:
# Train the dropout network on the truncated training set.
run_batch_graph(
    graph,
    save_layers=[weights1, biases1, weights2, biases2],
    num_steps=1000,
    earliest_stop=-1,  # negative value switches early stopping off
    stopping_threshold=10,
    batch_size=batch_size,
    summary_path="./summary/regularization/dropout/")

Initialized
Step 0: Train: 13.28125% Valid: 24.69%
Step 100: Train: 96.875% Valid: 76.1%
Step 200: Train: 93.75% Valid: 75.3%
Step 300: Train: 99.21875% Valid: 76.08%
Step 400: Train: 99.21875% Valid: 76.85%
Step 500: Train: 100.0% Valid: 76.37%
Step 600: Train: 100.0% Valid: 76.46%
Step 700: Train: 100.0% Valid: 76.38%
Step 800: Train: 100.0% Valid: 76.59%
Step 900: Train: 100.0% Valid: 76.33%
Step 999: Test: 84.31%


Now let's restore the full training set.

In [54]:
# Restore full dataset: point the training arrays back at the saved pair.
train_dataset = save_training[0]
train_labels = save_training[1]
print("Restored!")

Restored!


---
Problem 4
---------

Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is [97.1%](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html?showComment=1391023266211#c8758720086795711595).

One avenue you can explore is to add multiple layers.

Another one is to use learning rate decay:

    global_step = tf.Variable(0)  # count the number of steps taken.
    learning_rate = tf.train.exponential_decay(0.5, global_step, ...)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
 
 ---


In [7]:
# REF: https://www.tensorflow.org/get_started/summaries_and_tensorboard
def variable_summaries(var):
  """Record mean, stddev, max and min of `var` for TensorBoard.

  This redefines (and therefore shadows) the `variable_summaries` defined
  earlier in the notebook; this version does not attach a histogram summary.
  All ops are created under a 'summaries' name scope. Nothing is returned.
  """
  with tf.name_scope('summaries'):
    avg = tf.reduce_mean(var)
    tf.summary.scalar('mean', avg)
    with tf.name_scope('stddev'):
      squared_deviation = tf.square(var - avg)
      spread = tf.sqrt(tf.reduce_mean(squared_deviation))
    tf.summary.scalar('stddev', spread)
    largest = tf.reduce_max(var)
    tf.summary.scalar('max', largest)
    smallest = tf.reduce_min(var)
    tf.summary.scalar('min', smallest)

In [45]:
def ff_layer(in_size, out_size, prev_layer, name):
    """Build one linear (fully connected) layer under the name scope `name`.

    Args:
        in_size: number of columns of `prev_layer`.
        out_size: number of units produced by this layer.
        prev_layer: tensor multiplied by the layer's weight matrix.
        name: name scope that groups the layer's ops.

    Returns:
        (logits, weights, biases): the pre-activation output and the two
        variables that were created. No activation is applied here.
    """
    # Standard deviation of the initial weights shrinks with the fan-in.
    init_stddev = np.sqrt(2.0/in_size)
    with tf.name_scope(name):
        with tf.name_scope('weights'):
            initial_weights = tf.truncated_normal([in_size, out_size],stddev=init_stddev)
            layer_weights = tf.Variable(initial_weights)
            variable_summaries(layer_weights)
        with tf.name_scope('biases'):
            layer_biases = tf.Variable(tf.zeros([out_size]))
            variable_summaries(layer_biases)
        with tf.name_scope('linear'):
            pre_activation = tf.matmul(prev_layer, layer_weights) + layer_biases
            variable_summaries(pre_activation)
    return pre_activation, layer_weights, layer_biases

In [51]:
keep_prob = .9
start_learning_rate = .5
decay_steps, decay_rate = 100, .9
w = 0.001  # The l2 loss rates for weights
batch_size = 128

# Each hidden layer is half as wide as the one before it.
hidden_size1 = 1024
hidden_size2 = hidden_size1//2
hidden_size3 = hidden_size2//2
hidden_size4 = hidden_size3//2
hidden_size5 = hidden_size4//2

graph = tf.Graph()
with graph.as_default():
  # Record the hyperparameters of this run as scalar summaries.
  tf.summary.scalar('image_size', image_size)
  tf.summary.scalar('batch_size', batch_size)
  tf.summary.scalar('num_labels', num_labels)
  tf.summary.scalar('w',w)
  tf.summary.scalar('keep_prob',keep_prob)
  tf.summary.scalar('decay_steps',decay_steps)
  tf.summary.scalar('decay_rate',decay_rate)

  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  with tf.name_scope('data'):
    with tf.name_scope('train'):
      tf_train_dataset = tf.placeholder(tf.float32,
                                        shape=(batch_size, image_size * image_size))
      tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    with tf.name_scope('valid'):
      tf_valid_dataset = tf.constant(valid_dataset)
    with tf.name_scope('test'):
      tf_test_dataset = tf.constant(test_dataset)
  
  # Feed Forward Layer 1
  logits1, weights1, biases1 = ff_layer(in_size=int(image_size**2.), out_size=hidden_size1, prev_layer=tf_train_dataset, name="ff1")
  with tf.name_scope('RELU1'):
    act1 = tf.nn.relu(logits1)
  with tf.name_scope('dropout1'):
    drop1 = tf.nn.dropout(act1, keep_prob=keep_prob)
        
  # Feed Forward Layer 2
  logits2, weights2, biases2 = ff_layer(in_size=hidden_size1, out_size=hidden_size2, prev_layer=drop1, name="ff2")
  with tf.name_scope('RELU2'):
    act2 = tf.nn.relu(logits2)
  with tf.name_scope('dropout2'):
    drop2 = tf.nn.dropout(act2, keep_prob=keep_prob)
  
  # Feed Forward Layer 3
  logits3, weights3, biases3 = ff_layer(in_size=hidden_size2, out_size=hidden_size3, prev_layer=drop2, name="ff3")
  with tf.name_scope('RELU3'):
    act3 = tf.nn.relu(logits3)
  with tf.name_scope('dropout3'):
    drop3 = tf.nn.dropout(act3, keep_prob=keep_prob)
  
  # Feed Forward Layer 4
  logits4, weights4, biases4 = ff_layer(in_size=hidden_size3, out_size=hidden_size4, prev_layer=drop3, name="ff4")
  with tf.name_scope('RELU4'):
    act4 = tf.nn.relu(logits4)
  with tf.name_scope('dropout4'):
    drop4 = tf.nn.dropout(act4, keep_prob=keep_prob)
  
  # Feed Forward Layer 5 (output layer: class logits, no activation or dropout)
  logits5, weights5, biases5 = ff_layer(in_size=hidden_size4, out_size=num_labels, prev_layer=drop4, name="ff5")
        
  with tf.name_scope('error'):
    diff = tf.nn.softmax_cross_entropy_with_logits(logits=logits5, labels=tf_train_labels)
    w1 = tf.nn.l2_loss(weights1)
    w2 = tf.nn.l2_loss(weights2)
    w3 = tf.nn.l2_loss(weights3)
    w4 = tf.nn.l2_loss(weights4)
    w5 = tf.nn.l2_loss(weights5)
    # The scalar L2 penalty is added to every per-example cross entropy before
    # averaging, so the value logged as 'cross_entropy' includes the penalty.
    loss = tf.reduce_mean(diff + w*(w1+w2+w3+w4+w5))
    tf.summary.scalar('cross_entropy', loss)
    tf.summary.scalar('l2_w1', w1)
    tf.summary.scalar('l2_w2', w2)
    tf.summary.scalar('l2_w3', w3)
    tf.summary.scalar('l2_w4', w4)
    tf.summary.scalar('l2_w5', w5)
    
    
  # Optimizer.
  # We are going to find the minimum of this loss using gradient descent.
  with tf.name_scope('opt'):
    global_step = tf.Variable(0)  # count the number of steps taken.
    learning_rate = tf.train.exponential_decay(start_learning_rate, global_step, decay_steps, decay_rate)
    tf.summary.scalar('learning_rate', learning_rate)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
  
  # Predictions for the training, validation, and test data.
  # These are not part of training, but merely here so that we can report
  # accuracy figures as we train.
  with tf.name_scope('out'):
    with tf.name_scope("train_prediction"):
        # NOTE(review): logits5 is downstream of the dropout layers, so the
        # reported minibatch accuracy is measured with dropout active.
        train_prediction = tf.nn.softmax(logits5)
    
    # The evaluation paths reuse the trained variables without dropout. The
    # output layer must stay linear, exactly like logits5 on the training
    # path: applying ReLU to the final logits would clamp every negative logit
    # to zero before the softmax and evaluate a different model than the one
    # being trained.
    with tf.name_scope("valid_prediction"):
        v1 = tf.nn.relu(tf.matmul(tf_valid_dataset, weights1) + biases1)
        v2 = tf.nn.relu(tf.matmul(v1, weights2) + biases2)
        v3 = tf.nn.relu(tf.matmul(v2, weights3) + biases3)
        v4 = tf.nn.relu(tf.matmul(v3, weights4) + biases4)
        v5 = tf.matmul(v4, weights5) + biases5
        valid_prediction = tf.nn.softmax(v5)
    
    with tf.name_scope("test_prediction"):
        t1 = tf.nn.relu(tf.matmul(tf_test_dataset, weights1) + biases1)
        t2 = tf.nn.relu(tf.matmul(t1, weights2) + biases2)
        t3 = tf.nn.relu(tf.matmul(t2, weights3) + biases3)
        t4 = tf.nn.relu(tf.matmul(t3, weights4) + biases4)
        t5 = tf.matmul(t4, weights5) + biases5
        test_prediction = tf.nn.softmax(t5)

In [53]:
# Train the five-layer network and print the decayed learning rate as it goes.
deep_layers = [weights1, biases1,
               weights2, biases2,
               weights3, biases3,
               weights4, biases4,
               weights5, biases5]
run_batch_graph(
    graph,
    save_layers=deep_layers,
    num_steps=1400,
    earliest_stop=-1,  # negative value switches early stopping off
    stopping_threshold=8,
    batch_size=batch_size,
    summary_path="./summary/regularization/lrdecay/",
    report_learning_rate=True)

Initialized
Step 0: Train: 12.5% Valid: 24.65% Lr: 0.5
Step 140: Train: 86.71875% Valid: 82.7% Lr: 0.4314291477203369
Step 280: Train: 88.28125% Valid: 84.4% Lr: 0.37226223945617676
Step 420: Train: 87.5% Valid: 84.98% Lr: 0.321209579706192
Step 560: Train: 81.25% Valid: 85.65% Lr: 0.27715837955474854
Step 700: Train: 84.375% Valid: 85.77% Lr: 0.23914840817451477
Step 840: Train: 89.0625% Valid: 85.85% Lr: 0.20635119080543518
Step 980: Train: 89.0625% Valid: 86.33% Lr: 0.17805184423923492
Step 1120: Train: 89.84375% Valid: 86.72% Lr: 0.15363352000713348
Step 1260: Train: 91.40625% Valid: 86.95% Lr: 0.13256394863128662
Step 1399: Test: 93.05%
