In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from datetime import datetime
from train import *
from loss import *
from accuracy import *
from model import *

import cifar_input
import os.path
import time
import numpy as np
import tensorflow as tf

In [2]:
# --- Run configuration (read by the training cell below) ---

# Directory handed to the monitored session as checkpoint_dir; the logger
# hook also writes log.txt and training_data.csv into it.
train_dir = 'cifar10_vgg_model2/'

# Number of examples per batch requested from the input pipeline.
batch_size = 128

# Step at which StopAtStepHook ends the training loop.
max_steps = 500000

# Forwarded to tf.ConfigProto(log_device_placement=...).
log_device_placement = False

In [3]:
# Build the whole training graph inside a fresh tf.Graph so that re-running
# this cell does not add ops to a graph left over from a previous execution.
# NOTE(review): inference, loss, accuracy and train are not defined in this
# notebook; they presumably come from the star imports of model, loss,
# accuracy and train in the first cell -- confirm.
with tf.Graph().as_default():
    # Step counter tensor; it is passed to train() below.
    global_step = tf.contrib.framework.get_or_create_global_step()

    # Input pipeline: a batch of images and the matching labels.
    # Arguments as used here: dataset name ('cifar10'; the original comment
    # also mentions 'cifar100'), glob pattern of the data files, batch size,
    # and mode ('train').
    images, labels = cifar_input.build_input('cifar10', '../../cifar/cifar10/data_batch*', batch_size, 'train')

    # Model forward pass. NUM_CLASSES=10 for CIFAR-10; the original comment
    # says to use NUM_CLASSES=100 for CIFAR-100.
    logits = inference(images, NUM_CLASSES=10)

    # Loss ("error") and accuracy tensors; both are fetched by the logger hook.
    losses = loss(logits, labels)
    accuracies = accuracy(logits, labels)

    # Training op, built from the loss only (accuracy is just monitored).
    train_op = train(losses, global_step, batch_size)
    
    
    # SessionRunHook that logs loss and accuracy every 10 steps.
    class _LoggerHook(tf.train.SessionRunHook):
        """Session hook that periodically logs loss, accuracy and step time.

        Every ``_LOG_EVERY`` steps the hook prints a status line, appends the
        same line to ``log.txt`` and appends ``step,loss,accuracy`` to
        ``training_data.csv``; both files live in ``train_dir``.

        The step number is a counter local to this hook (starting at 0 for
        each run of the cell), not the value of the ``global_step`` tensor.
        NOTE(review): when training resumes from a checkpoint in ``train_dir``
        the logged step will therefore presumably differ from the checkpoint
        step -- confirm whether that is intended.
        """

        # Log once every this many calls to run().
        _LOG_EVERY = 10
        _LOG_FORMAT = ("{0}: step {1}, error = {2:.4f}, accuracy = {3:.4f}. "
                       "({4:.3f} sec/step)\n")

        def _append(self, filename, text):
            """Append ``text`` to ``filename`` inside ``train_dir``.

            Text mode is used so that writing ``str`` works on both Python 2
            and Python 3, and the ``with`` block closes the file even if the
            write fails.
            """
            with open(os.path.join(train_dir, filename), 'a') as handle:
                handle.write(text)

        def begin(self):
            """Reset the step counter and prepare the output files.

            Creates ``train_dir`` if it is missing, truncates
            ``training_data.csv`` and appends a run header to ``log.txt``.
            """
            self._step = -1
            if not os.path.exists(train_dir):
                os.makedirs(train_dir)
            # Start the CSV from scratch for this run; log.txt is kept and
            # only extended.
            with open(os.path.join(train_dir, 'training_data.csv'), 'w'):
                pass
            self._append('log.txt', '\n\n==== Run ===\nInfo: VGG\n')

        def before_run(self, run_context):
            """Advance the step counter, start the timer and request fetches.

            Returns:
                ``tf.train.SessionRunArgs`` asking for the loss and accuracy
                tensors in addition to whatever the caller runs.
            """
            self._step += 1
            self._start_time = time.time()
            return tf.train.SessionRunArgs([losses, accuracies])

        def after_run(self, run_context, run_values):
            """Log loss, accuracy and duration of the step that just ran.

            Args:
                run_context: unused.
                run_values: its ``results`` holds the loss and accuracy values
                    requested in ``before_run``, in that order.
            """
            duration = time.time() - self._start_time
            if self._step % self._LOG_EVERY != 0:
                return

            loss_value = run_values.results[0]
            accuracy_value = run_values.results[1]

            # Format once so the console and log.txt carry the same timestamp.
            message = self._LOG_FORMAT.format(
                datetime.now(), self._step, loss_value, accuracy_value,
                float(duration))
            print(message)
            self._append('log.txt', message)
            self._append('training_data.csv', '{0},{1},{2}\n'.format(
                self._step, loss_value, accuracy_value))
            
    # Hooks attached to the monitored session: stop at max_steps, watch the
    # loss tensor for NaN, and write the periodic log.
    training_hooks = [
        tf.train.StopAtStepHook(last_step=max_steps),
        tf.train.NanTensorHook(losses),
        _LoggerHook(),
    ]
    session_config = tf.ConfigProto(log_device_placement=log_device_placement)

    # Checkpoints go to train_dir every 30 seconds.
    # NOTE(review): the recorded run ended with a ResourceExhaustedError
    # raised by the checkpoint save op (save/SaveV2_1); if that was caused by
    # a full disk, a larger save_checkpoint_secs may help -- confirm.
    with tf.train.MonitoredTrainingSession(
            checkpoint_dir=train_dir,
            hooks=training_hooks,
            save_checkpoint_secs=30,
            config=session_config) as mon_sess:
        # Run training steps until one of the hooks requests a stop.
        while not mon_sess.should_stop():
            mon_sess.run(train_op)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into cifar10_vgg_model2/model.ckpt.
2017-03-16 13:35:52.853589: step 0, error = 4.8358, accuracy = 0.0859. (3.472 sec/step)

2017-03-16 13:35:55.684678: step 10, error = 5.1074, accuracy = 0.2031. (0.336 sec/step)

2017-03-16 13:35:59.168293: step 20, error = 5.4176, accuracy = 0.1562. (0.333 sec/step)

2017-03-16 13:36:03.397333: step 30, error = 5.1448, accuracy = 0.1641. (0.438 sec/step)

2017-03-16 13:36:07.534616: step 40, error = 4.9131, accuracy = 0.1953. (0.434 sec/step)

2017-03-16 13:36:11.655467: step 50, error = 4.9711, accuracy = 0.2266. (0.398 sec/step)

2017-03-16 13:36:15.932149: step 60, error = 4.6127, accuracy = 0.2812. (0.436 sec/step)

2017-03-16 13:36:20.097891: step 70, error = 4.5922, accuracy = 0.2891. (0.358 sec/step)

INFO:tensorflow:Saving checkpoints for 78 into cifar10_vgg_model2/model.ckpt.
2017-03-16 13:36:23.644583: step 80, error = 4.7797, accuracy = 0.2812. (0.217 sec

ResourceExhaustedError: cifar10_vgg_model2/model.ckpt-28172_temp_37ef64c60f354a74a63a9b5eb379e40f/part-00001-of-00002.data-00000-of-00001.tempstate8362418674437790013
	 [[Node: save/SaveV2_1 = SaveV2[dtypes=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](save/ShardedFilename_1, save/SaveV2_1/tensor_names, save/SaveV2_1/shape_and_slices, conv1_1/biases, conv1_1/biases/ExponentialMovingAverage, conv1_1/weights, conv1_1/weights/ExponentialMovingAverage, conv1_2/biases, conv1_2/biases/ExponentialMovingAverage, conv1_2/weights, conv1_2/weights/ExponentialMovingAverage, conv2_1/biases, conv2_1/biases/ExponentialMovingAverage, conv2_1/weights, conv2_1/weights/ExponentialMovingAverage, conv2_2/biases, conv2_2/biases/ExponentialMovingAverage, conv2_2/weights, conv2_2/weights/ExponentialMovingAverage, conv3_1/biases, conv3_1/biases/ExponentialMovingAverage, conv3_1/weights, conv3_1/weights/ExponentialMovingAverage, conv3_2/biases, conv3_2/biases/ExponentialMovingAverage, conv3_2/weights, conv3_2/weights/ExponentialMovingAverage, conv3_3/biases, conv3_3/biases/ExponentialMovingAverage, conv3_3/weights, conv3_3/weights/ExponentialMovingAverage, fc_3/biases, fc_3/biases/ExponentialMovingAverage, fc_3/weights, fc_3/weights/ExponentialMovingAverage, fc_4/biases, fc_4/biases/ExponentialMovingAverage, fc_4/weights, fc_4/weights/ExponentialMovingAverage, softmax_linear/biases, softmax_linear/biases/ExponentialMovingAverage, softmax_linear/weights, softmax_linear/weights/ExponentialMovingAverage)]]

Caused by op u'save/SaveV2_1', defined at:
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/runpy.py", line 174, in _run_module_as_main
    "__main__", fname, loader, pkg_name)
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/runpy.py", line 72, in _run_code
    exec code in run_globals
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/ipykernel/kernelapp.py", line 474, in start
    ioloop.IOLoop.instance().start()
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/tornado/ioloop.py", line 887, in start
    handler_func(fd_obj, events)
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 390, in execute_request
    user_expressions, allow_stdin)
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/ipykernel/zmqshell.py", line 501, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-3-93ec2fa589b1>", line 61, in <module>
    log_device_placement=log_device_placement)) as mon_sess:
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 315, in MonitoredTrainingSession
    return MonitoredSession(session_creator=session_creator, hooks=all_hooks)
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 601, in __init__
    session_creator, hooks, should_recover=True)
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 434, in __init__
    self._sess = _RecoverableSession(self._coordinated_creator)
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 767, in __init__
    _WrappedSession.__init__(self, self._create_session())
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 772, in _create_session
    return self._sess_creator.create_session()
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 494, in create_session
    self.tf_sess = self._session_creator.create_session()
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 366, in create_session
    self._scaffold.finalize()
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 180, in finalize
    lambda: training_saver.Saver(sharded=True, allow_empty=True,
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 232, in get_or_default
    op = default_constructor()
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 181, in <lambda>
    write_version=saver_pb2.SaverDef.V2))
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 1051, in __init__
    self.build()
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 1081, in build
    restore_sequentially=self._restore_sequentially)
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 669, in build
    save_tensor = self._AddShardedSaveOps(filename_tensor, per_device)
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 356, in _AddShardedSaveOps
    return self._AddShardedSaveOpsForV2(filename_tensor, per_device)
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 330, in _AddShardedSaveOpsForV2
    sharded_saves.append(self._AddSaveOps(sharded_filename, saveables))
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 271, in _AddSaveOps
    save = self.save_op(filename_tensor, saveables)
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 214, in save_op
    tensors)
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/tensorflow/python/ops/gen_io_ops.py", line 779, in save_v2
    tensors=tensors, name=name)
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 763, in apply_op
    op_def=op_def)
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 2395, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/home/ubuntu/miniconda2/envs/cifar/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1264, in __init__
    self._traceback = _extract_stack()

ResourceExhaustedError (see above for traceback): cifar10_vgg_model2/model.ckpt-28172_temp_37ef64c60f354a74a63a9b5eb379e40f/part-00001-of-00002.data-00000-of-00001.tempstate8362418674437790013
	 [[Node: save/SaveV2_1 = SaveV2[dtypes=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](save/ShardedFilename_1, save/SaveV2_1/tensor_names, save/SaveV2_1/shape_and_slices, conv1_1/biases, conv1_1/biases/ExponentialMovingAverage, conv1_1/weights, conv1_1/weights/ExponentialMovingAverage, conv1_2/biases, conv1_2/biases/ExponentialMovingAverage, conv1_2/weights, conv1_2/weights/ExponentialMovingAverage, conv2_1/biases, conv2_1/biases/ExponentialMovingAverage, conv2_1/weights, conv2_1/weights/ExponentialMovingAverage, conv2_2/biases, conv2_2/biases/ExponentialMovingAverage, conv2_2/weights, conv2_2/weights/ExponentialMovingAverage, conv3_1/biases, conv3_1/biases/ExponentialMovingAverage, conv3_1/weights, conv3_1/weights/ExponentialMovingAverage, conv3_2/biases, conv3_2/biases/ExponentialMovingAverage, conv3_2/weights, conv3_2/weights/ExponentialMovingAverage, conv3_3/biases, conv3_3/biases/ExponentialMovingAverage, conv3_3/weights, conv3_3/weights/ExponentialMovingAverage, fc_3/biases, fc_3/biases/ExponentialMovingAverage, fc_3/weights, fc_3/weights/ExponentialMovingAverage, fc_4/biases, fc_4/biases/ExponentialMovingAverage, fc_4/weights, fc_4/weights/ExponentialMovingAverage, softmax_linear/biases, softmax_linear/biases/ExponentialMovingAverage, softmax_linear/weights, softmax_linear/weights/ExponentialMovingAverage)]]
