In [1]:
%load_ext autoreload
%autoreload 2

import os
import time

import numpy
import tensorflow as tf

import tf_func.layers as L

import tf_func.margin_loss as lmargin


In [2]:
from tf_func.mnist import inputs, unlabeled_inputs
from tf_func.models import mlp3 as model

In [3]:
# Process-wide TensorFlow / CUDA environment settings for this notebook.
os.environ.update({
    "TF_CPP_MIN_LOG_LEVEL": "2",      # TensorFlow C++ log verbosity
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",  # enumerate GPUs by PCI bus id
    "CUDA_VISIBLE_DEVICES": "1",      # expose only this GPU index to the process
})

In [4]:
def build_training_graph(method, x, y, ul_x, lr, mom):
    """Build the training loss and the Adam update op for one training method.

    Relies on the module-level ``model_ins`` callable for the forward pass.

    Args:
        method: One of 'vat', 'vatent', 'baseline' or 'margin'; selects the
            extra loss term added to the supervised loss.
        x: Labeled input batch fed to the model.
        y: Labels for ``x``; passed to ``lmargin.large_margin`` as
            ``one_hot_labels``.
        ul_x: Unlabeled input batch; only used by the non-baseline methods.
        lr: Learning rate handed to ``tf.train.AdamOptimizer``.
        mom: Value handed to ``tf.train.AdamOptimizer`` as ``beta1``.

    Returns:
        Tuple ``(loss, train_op, global_step)``.

    Raises:
        NotImplementedError: If ``method`` is not one of the supported names.
    """
    supported_methods = ('vat', 'vatent', 'baseline', 'margin')
    if method not in supported_methods:
        # Reject a bad method before any variable is created, so a failed call
        # does not leave half-built variables behind in the current scope.
        raise NotImplementedError(
            "Unknown method {!r}; expected one of {}".format(method, supported_methods))

    global_step = tf.get_variable(
        name="global_step",
        shape=[],
        dtype=tf.float32,
        initializer=tf.constant_initializer(0.0),
        trainable=False,
    )
    logit, endpoints = model_ins(x, is_training=True, update_batch_stats=True, stochastic=False)
    # The supervised term is a large-margin loss over the input and the 'fc1'
    # endpoint; the variable keeps its historical `nll_loss` name.
    # nll_loss = L.ce_loss(logit, y)
    layers_list = [x, endpoints['fc1']]
    nll_loss = lmargin.large_margin(logits=logit, one_hot_labels=y, layers_list=layers_list)

    # Everything below reuses the variables created by the forward pass above.
    with tf.variable_scope(tf.get_variable_scope(), reuse=True):
        if method == 'baseline':
            additional_loss = 0
        else:
            # NOTE(review): `vat` is not imported in any visible cell; these
            # branches raise NameError unless it is imported elsewhere.
            ul_logit, ul_endpoints = vat.forward(model_ins, ul_x, is_training=True, update_batch_stats=False)
            if method == 'margin':
                ul_layers_list = [ul_x]  # + [ul_endpoints[name] for name in ul_endpoints]
                # Use the model's own predictions on unlabeled data as labels.
                pred = tf.argmax(ul_logit, 1)
                # Depth is hard-coded to 10 -- presumably the number of classes; confirm.
                pseudo_labels = tf.one_hot(pred, 10)
                additional_loss = lmargin.large_margin(
                    logits=ul_logit, one_hot_labels=pseudo_labels, layers_list=ul_layers_list)
            else:
                # 'vat' and 'vatent' share the virtual adversarial loss.
                additional_loss = vat.virtual_adversarial_loss(model_ins, ul_x, ul_logit)
                if method == 'vatent':
                    additional_loss = additional_loss + L.entropy_y_x(ul_logit)
        loss = nll_loss + additional_loss

    opt = tf.train.AdamOptimizer(learning_rate=lr, beta1=mom)
    tvars = tf.trainable_variables()
    grads_and_vars = opt.compute_gradients(loss, tvars)
    train_op = opt.apply_gradients(grads_and_vars, global_step=global_step)
    return loss, train_op, global_step


def build_eval_graph(x, y, ul_x):
    """Build the evaluation metrics for a batch.

    Runs the module-level ``model_ins`` with ``is_training=False`` and
    ``update_batch_stats=False`` and returns a dict with two entries:
    'NLL' (``L.ce_loss``) and 'Acc' (``L.accuracy``).

    ``ul_x`` is currently unused: the adversarial (AT) and virtual adversarial
    (VAT) evaluation losses that consumed it are disabled.

    Side effect: marks the current variable scope as reusing variables.
    """
    logit, _ = model_ins(x, is_training=False, update_batch_stats=False)
    losses = {
        'NLL': L.ce_loss(logit, y),
        'Acc': L.accuracy(logit, y),
    }
    tf.get_variable_scope().reuse_variables()
    return losses

In [5]:
# Labeled training batches.
# `validation` must be the boolean False, as in the two calls below: the
# string "False" is non-empty and therefore truthy.
images, labels = inputs(batch_size=100,
                        train=True,
                        validation=False,
                        shuffle=True)
# Unlabeled batches for the semi-supervised loss terms.
ul_images = unlabeled_inputs(batch_size=250,
                         validation=False,
                         shuffle=True)

# Test-set batches used by the evaluation graph.
images_eval_test, labels_eval_test = inputs(batch_size=100,
                                            train=False,
                                            validation=False,
                                            shuffle=True)

In [6]:
# Instantiate `model` (tf_func.models.mlp3). build_training_graph and
# build_eval_graph read this module-level `model_ins`, so this cell must run
# before either of them is called.
# NOTE(review): the meaning of the `None` argument is defined in tf_func.models -- confirm.
model_ins = model(None)

In [7]:
# Pin graph construction to GPU:0 as seen by this process.
with tf.device("/device:GPU:0"):
    # Scalar hyper-parameters supplied through feed dicts at run time.
    lr = tf.placeholder(dtype=tf.float32, shape=[], name="learning_rate")
    mom = tf.placeholder(dtype=tf.float32, shape=[], name="momentum")

    with tf.variable_scope("CNN") as scope:
        # Training graph first: it creates the variables under "CNN".
        loss, train_op, global_step = build_training_graph(
            method="baseline",
            x=images,
            y=labels,
            ul_x=ul_images,
            lr=lr,
            mom=mom,
        )
        # The evaluation graph must share those variables, not create new ones.
        scope.reuse_variables()
        losses_eval_test = build_eval_graph(
            x=images_eval_test,
            y=labels_eval_test,
            ul_x=images_eval_test,
        )

    init_op = tf.global_variables_initializer()

In [10]:
# NOTE: tf.train.Supervisor is deprecated; TensorFlow recommends
# tf.train.MonitoredTrainingSession instead (see the warning this cell emits).
logdir = ""
saver = tf.train.Saver(var_list=tf.global_variables())
sv = tf.train.Supervisor(
    # Session / checkpoint management
    is_chief=True,
    logdir=logdir,
    saver=saver,
    save_model_secs=150,
    recovery_wait_secs=0,
    # Initialization: the placeholders need values while init_op is run
    init_op=init_op,
    init_feed_dict={lr: 0.001, mom: 0.9},
    global_step=global_step,
    # No automatic summaries
    summary_op=None,
    summary_writer=None,
)

Instructions for updating:
Please switch to tf.train.MonitoredTrainingSession


In [12]:
# Session config: grow GPU memory on demand, with the per-process fraction set to 0.3.
config = tf.ConfigProto(
    gpu_options=tf.GPUOptions(
        allow_growth=True,
        per_process_gpu_memory_fraction=0.3,
    )
)

In [13]:
# Train for FLAGS.num_epochs epochs, evaluating on the test set every
# FLAGS.eval_freq epochs and after the final epoch.
# NOTE(review): FLAGS, learning_rate, mom1 and NUM_EVAL_EXAMPLES are not defined
# in any visible cell; they must be defined earlier in the notebook.
with sv.managed_session(config=config) as sess:
    for ep in range(FLAGS.num_epochs):
        if sv.should_stop():
            break

        feed_dict = {lr: learning_rate, mom: mom1}

        sum_loss = 0
        start = time.time()
        for _ in range(FLAGS.num_iter_per_epoch):
            _, batch_loss = sess.run([train_op, loss], feed_dict=feed_dict)
            sum_loss += batch_loss
        end = time.time()
        # The value printed is the mean of `loss`, the full training objective
        # returned by build_training_graph; the label text is kept as-is.
        print("Epoch:", ep, "CE_loss_train:", sum_loss / FLAGS.num_iter_per_epoch, "elapsed_time:", end - start)

        if (ep + 1) % FLAGS.eval_freq == 0 or ep + 1 == FLAGS.num_epochs:
            # Eval on test data
            # Use the number of batches actually run as the divisor, so the
            # reported mean is exact even when the example count is not a
            # multiple of the batch size. Always run at least one batch.
            # NOTE(review): the eval pipeline is built with batch_size=100;
            # confirm FLAGS.eval_batch_size matches.
            n_eval_iters = max(1, int(NUM_EVAL_EXAMPLES / FLAGS.eval_batch_size))
            act_values_dict = {key: 0 for key in losses_eval_test}
            for _ in range(n_eval_iters):
                # Fetch the dict itself: a dict_values view is not a valid
                # fetch on Python 3, and a dict fetch returns values keyed by
                # name, so no ordering assumption is needed.
                act_values = sess.run(losses_eval_test)
                for key, value in act_values.items():
                    act_values_dict[key] += value
            for key, value in act_values_dict.items():
                print("test-" + key, value / n_eval_iters)

sv.stop()

INFO:tensorflow:Error reported to Coordinator: <class 'tensorflow.python.framework.errors_impl.ResourceExhaustedError'>, OOM when allocating tensor with shape[784,1200] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[Node: CNN/fc1_W/Initializer/truncated_normal/TruncatedNormal = TruncatedNormal[T=DT_INT32, _class=["loc:@CNN/fc1_W/Assign"], dtype=DT_FLOAT, seed=87654321, seed2=92975, _device="/job:localhost/replica:0/task:0/device:GPU:0"](CNN/fc1_W/Initializer/truncated_normal/shape)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


Caused by op 'CNN/fc1_W/Initializer/truncated_normal/TruncatedNormal', defined at:
  File "/usr/lib64/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib64/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/xyang2/software/dllib3/lib/

ResourceExhaustedError: OOM when allocating tensor with shape[784,1200] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[Node: CNN/fc1_W/Initializer/truncated_normal/TruncatedNormal = TruncatedNormal[T=DT_INT32, _class=["loc:@CNN/fc1_W/Assign"], dtype=DT_FLOAT, seed=87654321, seed2=92975, _device="/job:localhost/replica:0/task:0/device:GPU:0"](CNN/fc1_W/Initializer/truncated_normal/shape)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


Caused by op 'CNN/fc1_W/Initializer/truncated_normal/TruncatedNormal', defined at:
  File "/usr/lib64/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib64/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 497, in start
    self.io_loop.start()
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 132, in start
    self.asyncio_loop.run_forever()
  File "/usr/lib64/python3.6/asyncio/base_events.py", line 421, in run_forever
    self._run_once()
  File "/usr/lib64/python3.6/asyncio/base_events.py", line 1431, in _run_once
    handle._run()
  File "/usr/lib64/python3.6/asyncio/events.py", line 145, in _run
    self._callback(*self._args)
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/tornado/ioloop.py", line 758, in _run_callback
    ret = callback()
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/tornado/stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 536, in <lambda>
    self.io_loop.add_callback(lambda : self._handle_events(self.socket, 0))
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/tornado/stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2662, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2785, in _run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2901, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2961, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-7-4a0991f20396>", line 6, in <module>
    loss, train_op, global_step = build_training_graph("baseline", images, labels, ul_images, lr, mom)
  File "<ipython-input-4-1a2fd3ba1f38>", line 9, in build_training_graph
    logit, endpoints = model_ins(x, is_training=True, update_batch_stats=True, stochastic=False)
  File "/home/xyang2/project/other/DeepLearningCMP/tf_func/models/mlp.py", line 112, in logit
    h = L.fc(h, 784, 1200, seed=rng.randint(123456), name='fc1')
  File "/home/xyang2/project/other/DeepLearningCMP/tf_func/layers.py", line 80, in fc
    initializer=weights_initializer)
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 1467, in get_variable
    aggregation=aggregation)
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 1217, in get_variable
    aggregation=aggregation)
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 527, in get_variable
    aggregation=aggregation)
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 481, in _true_getter
    aggregation=aggregation)
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 903, in _get_single_variable
    aggregation=aggregation)
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 2443, in variable
    aggregation=aggregation)
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 2425, in <lambda>
    previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs)
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 2406, in default_variable_creator
    constraint=constraint)
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/tensorflow/python/ops/variables.py", line 259, in __init__
    constraint=constraint)
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/tensorflow/python/ops/variables.py", line 368, in _init_from_args
    initial_value(), name="initial_value", dtype=dtype)
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py", line 885, in <lambda>
    shape.as_list(), dtype=dtype, partition_info=partition_info)
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/tensorflow/contrib/layers/python/layers/initializers.py", line 150, in _initializer
    seed=seed)
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/tensorflow/python/ops/random_ops.py", line 174, in truncated_normal
    shape_tensor, dtype, seed=seed1, seed2=seed2)
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/tensorflow/python/ops/gen_random_ops.py", line 908, in truncated_normal
    name=name)
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py", line 454, in new_func
    return func(*args, **kwargs)
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3155, in create_op
    op_def=op_def)
  File "/home/xyang2/software/dllib3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1717, in __init__
    self._traceback = tf_stack.extract_stack()

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[784,1200] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[Node: CNN/fc1_W/Initializer/truncated_normal/TruncatedNormal = TruncatedNormal[T=DT_INT32, _class=["loc:@CNN/fc1_W/Assign"], dtype=DT_FLOAT, seed=87654321, seed2=92975, _device="/job:localhost/replica:0/task:0/device:GPU:0"](CNN/fc1_W/Initializer/truncated_normal/shape)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

