In [1]:
import argparse
import copy
import os
import shutil
import sys
import time

import pandas as pd
import tensorflow as tf

import dataset
# from models.conv2_dense2_dropout import Model
from models.dense3 import Model

from helpers.gpu_utils import validate_batch_size_for_multi_gpu
from helpers.history import history
from helpers.softmax_cross_entropy_trainer import create_model_fn

### Get the history and the runtime context 

In [16]:
# Capture the runtime context: a human-readable local timestamp and the
# current user name, then greet the user with the TensorFlow version in use.
# time.asctime() with no argument formats the current local time.
localtime = time.asctime()

# USER is not defined on every platform (Windows uses USERNAME, and some
# service environments define neither), so fall back instead of raising KeyError.
user = os.environ.get('USER') or os.environ.get('USERNAME', 'unknown')

print("\n\n")
print("Welcome, %s, it's %s, and you'll be working with Tensorflow version %s" % (user, localtime, tf.__version__))
print("\n")

# Bare last expression so the notebook renders the last ten rows of
# `history.experiments`.
history.experiments.tail(10)




Welcome, wgiersche, it's Sun Apr 29 18:06:58 2018, and you'll be working with Tensorflow version 1.8.0




Unnamed: 0,batch_size,data_dir,model_dir,multi_gpu,train_epochs,user,timestamp,localtime
0,128,/var/ellie/data/mnist_fashion,/tmp/mnist_model,False,10,wgiersche,1525010000.0,Sun Apr 29 15:50:38 2018
1,128,/var/ellie/data/mnist_fashion,/tmp/mnist_model,False,10,wgiersche,1525010000.0,Sun Apr 29 15:50:38 2018
2,64,/var/ellie/data/mnist_fashion,/tmp/mnist_model,False,5,wgiersche,1525012000.0,Sun Apr 29 16:24:48 2018
3,64,/var/ellie/data/mnist_fashion,/tmp/mnist_model,False,10,wgiersche,1525012000.0,Sun Apr 29 16:33:03 2018
4,64,/var/ellie/data/mnist_fashion,/tmp/mnist_model,False,10,wgiersche,1525017000.0,Sun Apr 29 17:50:42 2018
5,64,/var/ellie/data/mnist_fashion,/tmp/mnist_model,False,10,wgiersche,1525017000.0,Sun Apr 29 17:50:42 2018
6,64,/var/ellie/data/mnist_fashion,/tmp/mnist_model,False,10,wgiersche,1525018000.0,Sun Apr 29 18:05:46 2018


### Create a new hyper-parameter record 

In [5]:
# Creating a new hyper-parameter record from a history entry
# NOTE(review): `history.copy_from_record` is a project helper not visible
# here; presumably it returns an independent copy of history row 2 so the
# assignments below do not alter the stored history -- confirm.
FLAGS = history.copy_from_record(2)

# Choose new parameters
FLAGS.train_epochs = 10
FLAGS.batch_size = 64
# Bare last expression so the notebook displays the resulting record.
FLAGS

batch_size                                 64
data_dir        /var/ellie/data/mnist_fashion
model_dir                    /tmp/mnist_model
multi_gpu                               False
train_epochs                               10
user                                wgiersche
timestamp                         1.52502e+09
localtime            Sun Apr 29 18:05:46 2018
Name: 2, dtype: object

### Get to work!

In [6]:
# Start from an empty model directory so the Estimator does not resume from
# checkpoints left by a previous experiment. Remove the directory the
# Estimator will actually use (FLAGS.model_dir) rather than a hard-coded path
# that can silently drift out of sync with the hyper-parameter record.
if not FLAGS.model_dir or os.path.abspath(FLAGS.model_dir) == os.path.abspath(os.sep):
    # An empty or root path here would make the recursive delete catastrophic.
    raise ValueError('Refusing to delete model_dir=%r' % (FLAGS.model_dir,))
# ignore_errors=True keeps the `rm -rf` behaviour of not failing when the
# directory does not exist yet.
shutil.rmtree(FLAGS.model_dir, ignore_errors=True)

In [7]:
# Adam optimizer constructed with no arguments, i.e. TensorFlow's default
# hyper-parameters for it.
optimizer = tf.train.AdamOptimizer()

In [8]:
# Build the Estimator model_fn from a Model factory and the optimizer.
# NOTE(review): `create_model_fn` is a project helper not shown here;
# presumably it invokes the factory with the Estimator's `params` -- confirm.
model_function = create_model_fn(lambda params: Model(params), optimizer)

In [9]:
# Multi-GPU training takes two steps: (1) wrap the model_fn for replication,
# done here, and (2) wrap the optimizer, which is expected to happen inside
# the model_fn itself when the optimizer is defined.
# NOTE(review): this cell rebinds `model_function`, so re-running it with
# multi_gpu enabled wraps the already-wrapped function again.
if FLAGS.multi_gpu:
    validate_batch_size_for_multi_gpu(FLAGS.batch_size)
    model_function = tf.contrib.estimator.replicate_model_fn(
        model_function,
        loss_reduction=tf.losses.Reduction.MEAN,
    )

# Pick the tensor layout from the TensorFlow build: channels-first when the
# build has CUDA support, channels-last otherwise.
if tf.test.is_built_with_cuda():
    data_format = 'channels_first'
else:
    data_format = 'channels_last'

In [10]:
# Parameters handed to the model_fn through the Estimator's `params` argument.
estimator_params = {
    'data_format': data_format,
    'multi_gpu': FLAGS.multi_gpu,
}

# The Estimator ties the model_fn to the directory where checkpoints and
# summaries are written.
mnist_classifier = tf.estimator.Estimator(
    model_fn=model_function,
    model_dir=FLAGS.model_dir,
    params=estimator_params,
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/mnist_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x11f72f4e0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


##### ```input_fn``` functions are factories for ```Dataset```s

In [11]:
def train_input_fn():
    """Build the training input pipeline.

    Loads the training dataset from ``FLAGS.data_dir``, caches it, shuffles
    it with a 50000-element buffer, groups it into batches of
    ``FLAGS.batch_size`` and repeats the result ``FLAGS.train_epochs`` times.

    Returns:
        The dataset object produced by the final ``repeat`` call.
    """
    examples = dataset.training_dataset(FLAGS.data_dir)
    shuffled = examples.cache().shuffle(buffer_size=50000)
    batched = shuffled.batch(FLAGS.batch_size)
    return batched.repeat(FLAGS.train_epochs)

In [12]:
def eval_input_fn():
    """Build the evaluation input.

    Loads the test dataset from ``FLAGS.data_dir``, groups it into batches of
    ``FLAGS.batch_size`` and returns the ``get_next()`` result of a one-shot
    iterator over those batches.
    """
    batched = dataset.test_dataset(FLAGS.data_dir).batch(FLAGS.batch_size)
    iterator = batched.make_one_shot_iterator()
    return iterator.get_next()

In [13]:
# Map the label shown in the log to the name of the graph tensor to report.
tensors_to_log = dict(train_accuracy='train_accuracy')

# Log the tensors above once every 1000 training iterations.
logging_hook = tf.train.LoggingTensorHook(
    every_n_iter=1000,
    tensors=tensors_to_log,
)

### Report the new hyper-parameters and run the training 

In [14]:
# Record the hyper-parameters of this experiment in the history before the
# training run starts.
history.report_experiment(FLAGS)

# train() returns the estimator itself; the trailing semicolon keeps the
# notebook from echoing its repr as cell output below the training log.
mnist_classifier.train(input_fn=train_input_fn, hooks=[logging_hook]);

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/mnist_model/model.ckpt.
INFO:tensorflow:train_accuracy = 0.078125
INFO:tensorflow:loss = 2.3288713, step = 1
INFO:tensorflow:global_step/sec: 110.376
INFO:tensorflow:loss = 0.48031345, step = 101 (0.906 sec)
INFO:tensorflow:global_step/sec: 161.638
INFO:tensorflow:loss = 0.60577035, step = 201 (0.619 sec)
INFO:tensorflow:global_step/sec: 260.733
INFO:tensorflow:loss = 0.530182, step = 301 (0.384 sec)
INFO:tensorflow:global_step/sec: 258.371
INFO:tensorflow:loss = 0.6784529, step = 401 (0.387 sec)
INFO:tensorflow:global_step/sec: 259.081
INFO:tensorflow:loss = 0.29508072, step = 501 (0.386 sec)
INFO:tensorflow:global_step/sec: 262.322
INFO:tensorflow:loss = 0.34913617, step = 601 (0.381 sec)
INFO:tensorf

<tensorflow.python.estimator.estimator.Estimator at 0x11f72f6d8>

In [15]:
# Evaluate the trained classifier on the evaluation input and print the
# returned metrics.
eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
evaluation_report = 'Evaluation results:\n\t%s' % eval_results
print(evaluation_report)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-04-29-16:06:47
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/mnist_model/model.ckpt-9380
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-04-29-16:06:48
INFO:tensorflow:Saving dict for global step 9380: accuracy = 0.8857, global_step = 9380, loss = 0.3311808
Evaluation results:
	{'accuracy': 0.8857, 'loss': 0.3311808, 'global_step': 9380}
