In [1]:
import argparse
import sys
import os
import time
import copy

import tensorflow as tf
import pandas as pd

import dataset
# from models.conv2_dense2_dropout import Model
from models.dense3 import Model

from helpers.history import history
from helpers.gpu_utils import validate_batch_size_for_multi_gpu
from helpers.softmax_cross_entropy_trainer import create_model_fn

### Get the history and the runtime context 

In [12]:
localtime = time.asctime(time.localtime(time.time()))
user = os.environ['USERNAME']
print("\n\n")
print("Welcome, %s, it's %s, and you'll be working with Tensorflow version %s" % (user, localtime, tf.__version__))
print("\n")
history.experiments.tail(10)




Welcome, wgi, it's Mon Apr 30 13:49:32 2018, and you'll be working with Tensorflow version 1.8.0




Unnamed: 0,batch_size,data_dir,model_dir,multi_gpu,train_epochs,user,timestamp,localtime,accuracy,steps
0,128,/var/ellie/data/mnist_fashion,/tmp/mnist_model,False,10,wgiersche,1525010000.0,Sun Apr 29 15:50:38 2018,,
1,128,/var/ellie/data/mnist_fashion,/tmp/mnist_model,False,10,wgiersche,1525010000.0,Sun Apr 29 15:50:38 2018,,
2,64,/var/ellie/data/mnist_fashion,/tmp/mnist_model,False,5,wgiersche,1525012000.0,Sun Apr 29 16:24:48 2018,,
3,64,/var/ellie/data/mnist_fashion,/tmp/mnist_model,False,10,wgiersche,1525012000.0,Sun Apr 29 16:33:03 2018,,
4,64,/var/ellie/data/mnist_fashion,/tmp/mnist_model,False,10,wgiersche,1525017000.0,Sun Apr 29 17:50:42 2018,,
5,64,/var/ellie/data/mnist_fashion,/tmp/mnist_model,False,10,wgiersche,1525017000.0,Sun Apr 29 17:50:42 2018,,
6,64,/var/ellie/data/mnist_fashion,/tmp/mnist_model,False,10,wgiersche,1525018000.0,Sun Apr 29 18:05:46 2018,,
7,256,/var/ellie/data/mnist_fashion,C:\tmp\mnist_model,False,6,wgi,1525089000.0,Mon Apr 30 13:48:20 2018,0.9801,1410.0


### Create a new hyper-parameter record 

In [13]:
!ls /var/ellie/data/mnist_fashion

ls: cannot access '/var/ellie/data/mnist_fashion': No such file or directory


In [3]:
# Creating a new hyper-parameter record from a history entry
FLAGS = history.copy_from_record(2)

# Choose new parameters
FLAGS.train_epochs = 6
FLAGS.batch_size = 256
FLAGS.model_dir='C:\\tmp\\mnist_model'
FLAGS

batch_size                                256
data_dir        /var/ellie/data/mnist_fashion
model_dir                  C:\tmp\mnist_model
multi_gpu                               False
train_epochs                                6
user                                      wgi
timestamp                         1.52509e+09
localtime            Mon Apr 30 13:48:20 2018
accuracy                                  NaN
steps                                     NaN
Name: 2, dtype: object

### Get to work!

In [4]:
# For the sake of this tutorial, we always start from scratch
!rm -rf C:\tmp\mnist_model

In [5]:
model_function = create_model_fn(
    lambda params: Model(params), 
    tf.train.AdamOptimizer())

In [6]:
if FLAGS.multi_gpu:
    validate_batch_size_for_multi_gpu(FLAGS.batch_size)

    # There are two steps required if using multi-GPU: (1) wrap the model_fn,
    # and (2) wrap the optimizer. The first happens here, and (2) happens
    # in the model_fn itself when the optimizer is defined.
    model_function = tf.contrib.estimator.replicate_model_fn(
        model_function, loss_reduction=tf.losses.Reduction.MEAN)

data_format = ('channels_first' if tf.test.is_built_with_cuda() else 'channels_last')

In [7]:
mnist_classifier = tf.estimator.Estimator(
    model_fn=model_function,
    model_dir=FLAGS.model_dir,
    params={
        'data_format': data_format,
        'multi_gpu': FLAGS.multi_gpu
    })

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\tmp\\mnist_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000024D529EE6A0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


##### ```input_fn``` functions are a factories for ```DataSet```s

In [8]:
def train_input_fn():
    ds = dataset.training_dataset(FLAGS.data_dir)
    ds = ds.cache().shuffle(buffer_size=50000).\
        batch(FLAGS.batch_size).\
        repeat(FLAGS.train_epochs)
    return ds

In [9]:
def eval_input_fn():
    return dataset.test_dataset(FLAGS.data_dir).batch(
        FLAGS.batch_size).make_one_shot_iterator().get_next()

In [10]:
tensors_to_log = {'train_accuracy': 'train_accuracy'}
logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=1000)

### Run the training and report the new experiment's hyper-parameters

In [11]:
# train
mnist_classifier.train(input_fn=train_input_fn, hooks=[logging_hook])
# evaluate 
eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)

# report
FLAGS.accuracy = eval_results['accuracy']
FLAGS.steps = eval_results['global_step']
history.report_experiment(FLAGS)

print('Evaluation results:\n\t%s' % eval_results)
FLAGS

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into C:\tmp\mnist_model\model.ckpt.
INFO:tensorflow:train_accuracy = 0.09375
INFO:tensorflow:loss = 2.3211517, step = 1
INFO:tensorflow:global_step/sec: 64.3006
INFO:tensorflow:loss = 0.16576661, step = 101 (1.555 sec)
INFO:tensorflow:global_step/sec: 80.4761
INFO:tensorflow:loss = 0.12136151, step = 201 (1.243 sec)
INFO:tensorflow:global_step/sec: 79.7191
INFO:tensorflow:loss = 0.07683261, step = 301 (1.255 sec)
INFO:tensorflow:global_step/sec: 75.49
INFO:tensorflow:loss = 0.06109284, step = 401 (1.325 sec)
INFO:tensorflow:global_step/sec: 79.4091
INFO:tensorflow:loss = 0.043292683, step = 501 (1.259 sec)
INFO:tensorflow:global_step/sec: 80.7338
INFO:tensorflow:loss = 0.027377024, step = 601 (1.238 sec)
INFO:ten

batch_size                                256
data_dir        /var/ellie/data/mnist_fashion
model_dir                  C:\tmp\mnist_model
multi_gpu                               False
train_epochs                                6
user                                      wgi
timestamp                         1.52509e+09
localtime            Mon Apr 30 13:48:20 2018
accuracy                               0.9801
steps                                    1410
Name: 2, dtype: object

In [None]:
FLAGS