In [1]:
import argparse
import copy
import os
import shutil
import sys
import time

import tensorflow as tf
import pandas as pd
import numpy as np

import dataset
#from models.conv2_dense2_dropout import Model
from models.dense3 import Model

from helpers.os_utils import os_info
from helpers.history import ExpHistory
from helpers.estimator_utils import create_model_fn, split_datasource

### Get the history and the runtime context 

In [14]:
tf.logging.set_verbosity(tf.logging.INFO)

# The experiment history object is built from the name of the history file.
HIST_FILE_NAME = 'experiment_history.csv'
history = ExpHistory(HIST_FILE_NAME)

# Greeting: current local time, user name (USER, else USERNAME, else
# 'anonymous') and the TensorFlow version in use.
localtime = time.asctime()
fallback_user = os.environ.get('USERNAME', 'anonymous')
user = os.environ.get('USER', fallback_user)
print("\n\n")
print("Welcome, %s, it's %s, and you'll be working with Tensorflow version %s" % (user, localtime, tf.__version__))

# Runtime description as reported by os_info().
rt = os_info()
this_os, this_node, this_machine, this_cuda = (
    rt[key] for key in ('os', 'node', 'machine', 'cuda'))
print("Your current runtime: \n  node: %s, \n  os: %s, \n  machine: %s, \n  cuda: %s" % (this_node, this_os, this_machine, this_cuda))
print("\n")

# Columns shown in the overview below. Not shown (previously commented out):
# 'os', 'machine', 'model_dir', 'user', 'time_stamp'.
columns = [
    'node', 'cuda', 'multi_gpu',
    'batch_size', 'data_dir', 'train_epochs',
    'localtime', 'steps', 'accuracy', 'duration',
]

# Display the ten most recent experiment records.
history.experiments.tail(10)[columns]




Welcome, wgiersche, it's Tue May  1 23:23:21 2018, and you'll be working with Tensorflow version 1.8.0
Your current runtime: 
  node: wolfgangs-mac-pro.home, 
  os: Darwin-15.6.0-x86_64-i386-64bit, 
  machine: x86_64, 
  cuda: False




Unnamed: 0,node,cuda,multi_gpu,batch_size,data_dir,train_epochs,localtime,steps,accuracy,duration
24,wolfgangs-mac-pro.home,0,0,64,/var/ellie/data/mnist_fashion,2,Tue May 1 15:28:29 2018,1876.0,0.8536,12.0
25,wolfgangs-mac-pro.home,0,0,64,/var/ellie/data/mnist_fashion,2,Tue May 1 22:02:23 2018,1876.0,0.872,12.0
26,wolfgangs-mac-pro.home,0,0,64,/var/ellie/data/mnist_fashion,2,Tue May 1 22:11:26 2018,1694.0,0.864,14.0
27,wolfgangs-mac-pro.home,0,0,64,/var/ellie/data/mnist_fashion,2,Tue May 1 22:11:26 2018,3387.0,0.8725,14.0
28,wolfgangs-mac-pro.home,0,0,64,/var/ellie/data/mnist_fashion,2,Tue May 1 22:17:57 2018,1693.0,0.8541,14.0
29,wolfgangs-mac-pro.home,0,0,64,/var/ellie/data/mnist_fashion,2,Tue May 1 22:23:53 2018,1693.0,0.869825,15.0
30,wolfgangs-mac-pro.home,0,0,64,/var/ellie/data/mnist_fashion,2,Tue May 1 22:27:11 2018,1782.0,0.871333,15.0
31,wolfgangs-mac-pro.home,0,0,64,/var/ellie/data/mnist_fashion,2,Tue May 1 22:34:13 2018,1782.0,0.872,15.0
32,wolfgangs-mac-pro.home,0,0,64,/var/ellie/data/mnist_fashion,2,Tue May 1 22:34:13 2018,3564.0,0.875333,16.0
33,wolfgangs-mac-pro.home,0,0,64,/var/ellie/data/mnist_fashion,2,Tue May 1 22:34:13 2018,5346.0,0.894,15.0


### Want to start with the most recent record from this platform?

In [3]:
# Ask the history for a suggested hyper-parameter record ...
hparams = history.suggest_from_history()
# ... NOTE(review): the suggestion is immediately replaced by a copy of
# record 18, so only this second assignment determines `hparams`.
# Confirm which of the two starting points is intended.
hparams = history.copy_from_record(18)
hparams

node                     wolfgangs-mac-pro.home
os              Darwin-15.6.0-x86_64-i386-64bit
machine                                  x86_64
cuda                                          0
multi_gpu                                     0
batch_size                                  256
data_dir                  /var/ellie/data/mnist
model_dir                      /tmp/mnist_model
train_epochs                                 12
user                                  wgiersche
timestamp                            1525206853
localtime              Tue May  1 22:34:13 2018
accuracy                                 0.9797
steps                                      2820
duration                                     30
Name: 18, dtype: object

### Use as new hyper-parameter record, with adaptations 

In [4]:
# Data set selector passed to dataset.training_dataset() by the input functions.
DATA_SET = 'FASHION'

# Adapt the copied record for this experiment.
for field, value in (
        ('data_dir', '/var/ellie/data/mnist_fashion'),
        ('train_epochs', 2),
        ('batch_size', 64),
        ('multi_gpu', False),
):
    setattr(hparams, field, value)

hparams

node                     wolfgangs-mac-pro.home
os              Darwin-15.6.0-x86_64-i386-64bit
machine                                  x86_64
cuda                                          0
multi_gpu                                 False
batch_size                                   64
data_dir          /var/ellie/data/mnist_fashion
model_dir                      /tmp/mnist_model
train_epochs                                  2
user                                  wgiersche
timestamp                            1525206853
localtime              Tue May  1 22:34:13 2018
accuracy                                 0.9797
steps                                      2820
duration                                     30
Name: 18, dtype: object

# Get to work!

In [5]:
# For the sake of this tutorial, we always start from scratch: remove the
# checkpoints left behind by earlier runs. The directory is read from hparams
# so that it is always the same one the Estimator is created with
# (model_dir=hparams.model_dir), rather than a hard-coded path.
# ignore_errors=True keeps this a silent no-op when the directory does not
# exist yet, matching the former `rm -rf`, and works on non-POSIX systems too.
shutil.rmtree(hparams.model_dir, ignore_errors=True)

### The model function constructs the computational graphs for training, eval and test
Note that the actual construction takes place within the Estimator. Thus, none of the graph-constructing code should be called explicitly by the API client. Otherwise, the Estimator will complain that the parts constructed before it builds its own graph don't belong to the same graph.

In [6]:
# Ingredients handed to create_model_fn: a factory that builds the Model from
# the Estimator's params, the optimizer, the loss function and the
# hyper-parameter record.
optimizer = tf.train.AdamOptimizer()
loss_fn = tf.losses.sparse_softmax_cross_entropy

model_function = create_model_fn(
    lambda params: Model(params), optimizer, loss_fn, hparams)

The best-performing data format differs between CPU and GPU computations: `channels_first` is chosen when TensorFlow is built with CUDA, `channels_last` otherwise.

In [7]:
# Pick the image layout according to whether this TensorFlow build has CUDA support.
if tf.test.is_built_with_cuda():
    data_format = 'channels_first'
else:
    data_format = 'channels_last'

### The Estimator is the centerpiece of TensorFlow's new API

In [8]:
# Settings passed through the Estimator as `params`.
estimator_params = {
    'data_format': data_format,
    'multi_gpu': hparams.multi_gpu,
}

# The Estimator stores its checkpoints and summaries in hparams.model_dir.
mnist_classifier = tf.estimator.Estimator(
    model_fn=model_function,
    model_dir=hparams.model_dir,
    params=estimator_params)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/mnist_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x117f274e0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


##### ```input_fn``` functions are factories for ```Dataset```s

### Split the training dataset into training and evaluation sets

In [9]:
# Arguments handed to split_datasource() by both input functions. They are
# defined once so that the training and the evaluation split can never be
# computed from diverging values.
# NOTE(review): presumably the total number of training examples and the
# fraction of them used for training -- confirm against split_datasource.
TRAINING_SET_SIZE = 60000
TRAINING_FRACTION = 0.95

# Shuffle buffer derived from the two values above (60000 * 0.95 = 57000)
# instead of being maintained by hand.
SHUFFLE_BUFFER_SIZE = int(round(TRAINING_SET_SIZE * TRAINING_FRACTION))


def _split_training_data():
    """Load the training data and return split_datasource()'s pair of datasets.

    The first element is used for training, the second for evaluation.
    """
    full_ds = dataset.training_dataset(hparams.data_dir, DATA_SET)
    return split_datasource(full_ds, TRAINING_SET_SIZE, TRAINING_FRACTION)


def train_input_fn():
    """Build the training dataset for the Estimator.

    Returns:
        The training part of the split, cached, shuffled, repeated
        hparams.train_epochs times and batched with hparams.batch_size.
    """
    train_ds, _ = _split_training_data()
    return (train_ds
            .cache()
            .shuffle(buffer_size=SHUFFLE_BUFFER_SIZE)
            .repeat(hparams.train_epochs)
            .batch(hparams.batch_size))


def eval_input_fn():
    """Build the evaluation dataset for the Estimator.

    Returns:
        The evaluation part of the split, batched with hparams.batch_size.
    """
    _, eval_ds = _split_training_data()
    return eval_ds.batch(hparams.batch_size)

### Logging hooks

In [10]:
# Label -> tensor name of the values to be logged during training.
tensors_to_log = dict(train_accuracy='train_accuracy')

# Hook that logs the tensors above every 1000 iterations.
logging_hook = tf.train.LoggingTensorHook(
    tensors=tensors_to_log,
    every_n_iter=1000,
)

### Run the training and report the new hyper-parameters 

In [13]:
# Train, measuring the wall-clock time of the training call only
start_time = time.time()
mnist_classifier.train(input_fn=train_input_fn, hooks=[logging_hook])
duration = time.time() - start_time

# Evaluate on the held-out part of the training data
eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)

# Store the outcome in the hyper-parameter record.
# NOTE(review): `timestamp` and `localtime` of the record are not updated
# here; confirm whether report_experiment is expected to refresh them.
for field, value in (
        ('accuracy', eval_results['accuracy']),
        ('steps', eval_results['global_step']),
        ('duration', int(duration)),
):
    setattr(hparams, field, value)

# Report!
history.report_experiment(hparams)

print('Evaluation results:\n\t%s' % eval_results)
hparams

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/mnist_model/model.ckpt-3564
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 3565 into /tmp/mnist_model/model.ckpt.
INFO:tensorflow:train_accuracy = 0.9375
INFO:tensorflow:loss = 0.23161556, step = 3565
INFO:tensorflow:global_step/sec: 209.31
INFO:tensorflow:loss = 0.44792962, step = 3665 (0.478 sec)
INFO:tensorflow:global_step/sec: 261.985
INFO:tensorflow:loss = 0.36448893, step = 3765 (0.382 sec)
INFO:tensorflow:global_step/sec: 263.071
INFO:tensorflow:loss = 0.26584047, step = 3865 (0.380 sec)
INFO:tensorflow:global_step/sec: 263.675
INFO:tensorflow:loss = 0.3439101, step = 3965 (0.379 sec)
INFO:tensorflow:global_step/sec: 260.221
INFO:tensorflow:loss = 0.31222665, step = 4065 (0.385 sec)
INFO:tensorflow:global_st

node                     wolfgangs-mac-pro.home
os              Darwin-15.6.0-x86_64-i386-64bit
machine                                  x86_64
cuda                                          0
multi_gpu                                 False
batch_size                                   64
data_dir          /var/ellie/data/mnist_fashion
model_dir                      /tmp/mnist_model
train_epochs                                  2
user                                  wgiersche
timestamp                            1525206853
localtime              Tue May  1 22:34:13 2018
accuracy                                  0.894
steps                                      5346
duration                                     15
Name: 18, dtype: object