In [1]:
import argparse
import copy
import os
import shutil
import sys
import time

import tensorflow as tf
import pandas as pd

import dataset
# from models.conv2_dense2_dropout import Model
from models.dense3 import Model

from helpers.os_utils import os_info
from helpers.history import ExpHistory
#from helpers.gpu_utils import validate_batch_size_for_multi_gpu
from helpers.softmax_cross_entropy_trainer import create_model_fn

### Get the history and the runtime context 

In [2]:
# Experiment history, loaded through the project's ExpHistory helper.
HIST_FILE_NAME = 'experiment_history.csv'
history = ExpHistory(HIST_FILE_NAME)

# Who is running the notebook (USER, then USERNAME, then a placeholder) and when.
user = os.environ.get('USER', os.environ.get('USERNAME', 'anonymous'))
localtime = time.asctime(time.localtime())

print("\n\n")
print(
    "Welcome, %s, it's %s, and you'll be working with Tensorflow version %s"
    % (user, localtime, tf.__version__)
)

# Runtime description as reported by the project's os_info() helper.
rt = os_info()
this_node, this_os, this_machine, this_cuda = (
    rt[key] for key in ('node', 'os', 'machine', 'cuda')
)
print(
    "Your current runtime: \n  node: %s, \n  os: %s, \n  machine: %s, \n  cuda: %s"
    % (this_node, this_os, this_machine, this_cuda)
)
print("\n")

# Columns shown in the overview below. Left out on purpose to keep the table
# narrow: 'os', 'machine', 'data_dir', 'model_dir', 'user', 'time_stamp'.
columns = [
    'node', 'cuda', 'multi_gpu', 'batch_size', 'train_epochs',
    'localtime', 'steps', 'accuracy', 'duration',
]

# Last expression of the cell: display the ten most recent history records.
history.experiments.tail(10)[columns]




Welcome, wgiersche, it's Tue May  1 12:53:25 2018, and you'll be working with Tensorflow version 1.8.0
Your current runtime: 
  node: scylla, 
  os: Linux-4.13.0-39-generic-x86_64-with-Ubuntu-16.04-xenial, 
  machine: x86_64, 
  cuda: True




Unnamed: 0,node,cuda,multi_gpu,batch_size,train_epochs,localtime,steps,accuracy,duration
4,wolfgangs-mac-pro.home,0,0,64,10,Mon Apr 30 17:28:43 2018,,,
5,MacPro,0,0,256,4,Tue May 1 06:36:35 2018,940.0,0.8801,
6,PC-16,1,1,256,40,Tue May 1 09:11:54 2018,9400.0,0.8918,
7,wolfgangs-mac-pro.home,0,0,256,4,Tue May 1 11:47:49 2018,940.0,0.877,
8,wolfgangs-mac-pro.home,0,0,256,4,Tue May 1 12:09:15 2018,1880.0,0.8825,
9,wolfgangs-mac-pro.home,0,0,256,4,Tue May 1 12:23:36 2018,940.0,0.8681,290.0
10,wolfgangs-mac-pro.home,0,0,256,4,Tue May 1 12:23:36 2018,1880.0,0.8762,645.0
11,wolfgangs-mac-pro.home,0,0,256,4,Tue May 1 12:23:36 2018,2820.0,0.8794,13.0
12,wolfgangs-mac-pro.home,0,0,256,4,Tue May 1 12:48:35 2018,940.0,0.8756,12.0
13,wolfgangs-mac-pro.home,0,0,256,4,Tue May 1 12:48:35 2018,1880.0,0.886,12.0


### Want to start with the most recent record from this platform?

In [3]:
# Start from the most recent history record for this platform.
# Alternative: clone a specific record instead, e.g.
#   hparams = history.copy_from_record(2)
hparams = history.last_experiment_from_here()
hparams

node                                                       scylla
os              Linux-4.13.0-39-generic-x86_64-with-Ubuntu-16....
machine                                                    x86_64
cuda                                                         True
multi_gpu                                                       0
batch_size                                                    256
data_dir                            /var/ellie/data/mnist_fashion
model_dir                                        /tmp/mnist_model
train_epochs                                                    4
user                                                    wgiersche
timestamp                                              1525172006
localtime                                Tue May  1 12:53:26 2018
accuracy                                                    0.886
steps                                                        1880
duration                                                       12
Name: 13, 

### Use as new hyper-parameter record, with adaptations 

In [4]:
# Adaptations applied on top of the record loaded from the history.
hparams.multi_gpu = False
hparams.batch_size = 256
hparams.train_epochs = 4

# Display the adapted record.
hparams

node                                                       scylla
os              Linux-4.13.0-39-generic-x86_64-with-Ubuntu-16....
machine                                                    x86_64
cuda                                                         True
multi_gpu                                                   False
batch_size                                                    256
data_dir                            /var/ellie/data/mnist_fashion
model_dir                                        /tmp/mnist_model
train_epochs                                                    4
user                                                    wgiersche
timestamp                                              1525172006
localtime                                Tue May  1 12:53:26 2018
accuracy                                                    0.886
steps                                                        1880
duration                                                       12
Name: 13, 

### Get to work!

In [5]:
# For the sake of this tutorial, we always start from scratch.
# Remove the directory that is later passed to the Estimator as model_dir
# (hparams.model_dir) instead of a hard-coded path, so the right checkpoints
# are cleared even when the history record points somewhere else.
# Done in Python rather than via `!rm -rf` so the cell does not need a POSIX shell.
# ignore_errors=True mirrors `rm -rf`: a missing directory is not an error.
shutil.rmtree(hparams.model_dir, ignore_errors=True)

In [6]:
# Combine the model factory, the optimizer and the hyper-parameter record into
# the function that is handed to the Estimator as model_fn.
model_function = create_model_fn(
    lambda params: Model(params), tf.train.AdamOptimizer(), hparams
)

In [7]:
# Choose the tensor layout depending on whether this TensorFlow build has CUDA support.
if tf.test.is_built_with_cuda():
    data_format = 'channels_first'
else:
    data_format = 'channels_last'

In [8]:
# Estimator built around model_function; it works out of hparams.model_dir and
# receives the layout and multi-GPU switch through its params dictionary.
mnist_classifier = tf.estimator.Estimator(
    model_fn=model_function,
    model_dir=hparams.model_dir,
    params=dict(data_format=data_format, multi_gpu=hparams.multi_gpu),
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_train_distribute': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f89672fb0d0>, '_evaluation_master': '', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': '/tmp/mnist_model', '_global_id_in_cluster': 0, '_save_summary_steps': 100}


##### ```input_fn``` functions are factories for ```Dataset```s

In [9]:
def train_input_fn():
    """Build the training input pipeline.

    Loads the training data from ``hparams.data_dir``, caches it, shuffles it
    with a 50000-element buffer, groups it into batches of
    ``hparams.batch_size`` and repeats the batched data
    ``hparams.train_epochs`` times.

    Returns:
        The object produced by the final ``repeat`` call of the pipeline.
    """
    examples = dataset.training_dataset(hparams.data_dir)
    shuffled = examples.cache().shuffle(buffer_size=50000)
    batched = shuffled.batch(hparams.batch_size)
    return batched.repeat(hparams.train_epochs)

In [10]:
def eval_input_fn():
    return dataset.test_dataset(hparams.data_dir).\
        batch(hparams.batch_size).\
        make_one_shot_iterator().get_next()

In [11]:
# Periodically log the tensor named 'train_accuracy' (under the same label)
# once every 1000 iterations.
tensors_to_log = dict(train_accuracy='train_accuracy')
logging_hook = tf.train.LoggingTensorHook(
    tensors=tensors_to_log,
    every_n_iter=1000,
)

### Run the training and report the new hyper-parameters 

In [13]:
# Train
# Only the train() call is timed; the evaluation below is not part of `duration`.
# NOTE: if model_dir still contains checkpoints (e.g. this cell is executed a
# second time without clearing it), training resumes from the latest checkpoint
# and the reported step count accumulates across runs.
start_time=time.time()
mnist_classifier.train(input_fn=train_input_fn, hooks=[logging_hook])
duration=time.time() - start_time

# Evaluate
eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
# Attach the outcome to the hyper-parameter record so results and settings are
# reported together.
hparams.accuracy = eval_results['accuracy']
hparams.steps = eval_results['global_step']
hparams.duration = int(duration)  # whole seconds; the fraction is truncated

# Report!
# Hand the completed record to the experiment history.
history.report_experiment(hparams)

print('Evaluation results:\n\t%s' % eval_results)
# Last expression of the cell: display the reported record.
hparams

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/mnist_model/model.ckpt-940
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 941 into /tmp/mnist_model/model.ckpt.
INFO:tensorflow:train_accuracy = 0.9140625
INFO:tensorflow:loss = 0.22301891, step = 940
INFO:tensorflow:global_step/sec: 149.807
INFO:tensorflow:loss = 0.2654615, step = 1040 (0.668 sec)
INFO:tensorflow:global_step/sec: 543.354
INFO:tensorflow:loss = 0.24799642, step = 1140 (0.184 sec)
INFO:tensorflow:global_step/sec: 473.406
INFO:tensorflow:loss = 0.28721142, step = 1240 (0.211 sec)
INFO:tensorflow:global_step/sec: 537.779
INFO:tensorflow:loss = 0.22506666, step = 1340 (0.186 sec)
INFO:tensorflow:global_step/sec: 498.378
INFO:tensorflow:loss = 0.2845953, step = 1440 (0.201 sec)
INFO:tensorflow:global_st

node                                                       scylla
os              Linux-4.13.0-39-generic-x86_64-with-Ubuntu-16....
machine                                                    x86_64
cuda                                                         True
multi_gpu                                                   False
batch_size                                                    256
data_dir                            /var/ellie/data/mnist_fashion
model_dir                                        /tmp/mnist_model
train_epochs                                                    4
user                                                    wgiersche
timestamp                                              1525172006
localtime                                Tue May  1 12:53:26 2018
accuracy                                                   0.8841
steps                                                        1880
duration                                                        4
Name: 13, 