In [None]:
import time
import tensorflow as tf

In [None]:
# Stand-in for a real data source (e.g. records read from the file system
# that would otherwise be pushed into the graph through a feed_dict).
# The tensor represents one batch: 128 samples with 1024 data points each,
# drawn from a normal distribution with mean 0 and standard deviation 1.
x_input_data = tf.random_normal(
    shape=[128, 1024],
    mean=0,
    stddev=1,
)


In [None]:
# This is the queue and dequeue part

# We build a FIFOQueue inside the graph 
# 1. A queue object 'q' is created to hold the data (capacity: 5 elements, i.e. 5 batches here)
#    'q' is empty until we run a session and feed the data to it

# 2. We need an operation that will actually fill the queue with our data
#    q.enqueue pushes its argument into 'q' as a single element (here: one full batch).
#    By contrast, "enqueue_many" would slice "x_input_data" along the 0th dimension to make multiple queue elements

# 3. To leverage multi-threading we create a "QueueRunner"
#    that will handle the "enqueue_op" outside of the main thread
#    We don't need much parallelism here, so we will use only 1 thread
#    (without step 3, everything simply runs on the main thread)
# 4. We need a dequeue op to get the next elements in the queue following the FIFO policy.

with tf.variable_scope("queue"):
    # FIFO queue holding at most 5 elements; each element is one full batch.
    # Declaring the element shape gives the dequeued tensor a static shape,
    # so shape mismatches downstream surface at graph-construction time
    # instead of at run time.
    q = tf.FIFOQueue(
        capacity=5,
        dtypes=tf.float32,
        shapes=[x_input_data.get_shape()],
    )
    # "enqueue" (as opposed to "enqueue_many") stores the whole tensor as ONE
    # queue element, i.e. the 128 records are loaded as a single batch.
    enqueue_op = q.enqueue(x_input_data)

    # A QueueRunner executes the enqueue op outside of the main thread.
    # One enqueue op per thread; a single thread is plenty here.
    numberOfThreads = 1
    qr = tf.train.QueueRunner(q, [enqueue_op] * numberOfThreads)
    # The runner must be registered in the QUEUE_RUNNERS collection,
    # otherwise tf.train.start_queue_runners() will not start it.
    tf.train.add_queue_runner(qr)

    # The dequeued tensor plays the role of the former input placeholder,
    # but is wired directly to the data source inside the graph.
    # NOTE: the name shadows the Python builtin `input`; it is kept because
    # later cells reference it.
    input = q.dequeue()

    # Labels are computed inside the graph as well: 1 when the sum of a
    # sample's data points is positive, 0 otherwise (one label per sample).
    y_true = tf.cast(tf.reduce_sum(input, axis=1, keep_dims=True) > 0, tf.int32)


In [None]:
# Small model: two fully connected layers with a ReLU in between
# (1024 inputs -> 1024 hidden units -> 1 output logit).

with tf.variable_scope('FullyConnected'):
    # Hidden layer: affine transform followed by ReLU.
    w = tf.get_variable(
        name='w',
        shape=[1024, 1024],
        initializer=tf.random_normal_initializer(stddev=1e-1),
    )
    b = tf.get_variable(
        name='b',
        shape=[1024],
        initializer=tf.constant_initializer(0.1),
    )
    y = tf.nn.relu(tf.matmul(input, w) + b)

    # Output layer: a single logit per sample, stored in `z`.
    w2 = tf.get_variable(
        name='w2',
        shape=[1024, 1],
        initializer=tf.random_normal_initializer(stddev=1e-1),
    )
    b2 = tf.get_variable(
        name='b2',
        shape=[1],
        initializer=tf.constant_initializer(0.1),
    )
    z = tf.matmul(y, w2) + b2

In [None]:
# Loss, accuracy and optimizer

with tf.variable_scope('Loss'):
    # labels/logits are passed by keyword: the first positional slot of
    # sigmoid_cross_entropy_with_logits is a sentinel that exists only to
    # prevent positional calls, so it should not be filled in by hand.
    losses = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=tf.cast(y_true, tf.float32),
        logits=z,
    )
    # Scalar training objective: mean of the per-sample losses.
    loss_op = tf.reduce_mean(losses)

with tf.variable_scope('Accuracy'):
    # A positive logit corresponds to a sigmoid output above 0.5 -> class 1.
    y_pred = tf.cast(z > 0, tf.int32)
    accuracy = tf.reduce_mean(tf.cast(tf.equal(y_pred, y_true), tf.float32))
    # tf.Print passes the value through unchanged and logs it as a side
    # effect each time the op is evaluated. It writes to standard error, so
    # in a notebook the message shows up in the server console rather than
    # in the cell output.
    accuracy = tf.Print(accuracy, data=[accuracy], message="accuracy:")

# Training op: Adam with a learning rate of 1e-2 minimising the mean loss.
adam = tf.train.AdamOptimizer(1e-2)
train_op = adam.minimize(loss_op, name="train_op")

In [None]:
startTime = time.time()
with tf.Session() as sess:
    # Initialise all variables before running any op that reads them.
    sess.run(tf.global_variables_initializer())

    # The coordinator lets the main thread ask the queue-runner threads to
    # stop and then wait for them.
    coord = tf.train.Coordinator()
    # Start every runner registered in tf.GraphKeys.QUEUE_RUNNERS. This has to
    # happen before evaluating anything that dequeues; otherwise the dequeue
    # blocks on an empty queue and the main thread hangs.
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    try:
        # From here on the QueueRunner keeps calling the enqueue op in its
        # own thread, so no feed_dict is needed for any of the runs below.

        # Accuracy before training. The fetched value is printed explicitly
        # so that it is visible in the notebook output.
        print('accuracy before training: %f' % sess.run(accuracy))

        # Training loop: no Python-side batching and no feed_dict.
        for i in range(5000):
            _, loss = sess.run([train_op, loss_op])

            # Report the loss every 500 iterations.
            if i % 500 == 0:
                print('iter:%d - loss:%f' % (i, loss))

        # Accuracy after training.
        print('accuracy after training: %f' % sess.run(accuracy))
    finally:
        # Always shut the child threads down, even if training raised;
        # otherwise they would keep running after the session is closed.
        coord.request_stop()
        # Wait for the child threads to finish before leaving the session.
        coord.join(threads)

print("Time taken: %f" % (time.time() - startTime))