In [1]:
import math
import os

import numpy as np
import pandas as pd
import tensorflow as tf

from alpha_preprocessing import generate_batch, data_generator

In [2]:
# ---- Network dimensions ----
ENCODER_INPUT_SIZE = 4096  # size of the last axis of the encoder_inputs placeholder
HIDDEN_LAYER_SIZE = 1024   # units in every LSTM cell and in the attention layers
EMBEDDING_SIZE = 1024      # width of the feature projection and of the word-embedding table
NUM_OF_LAYER = 1           # number of stacked cells in each MultiRNNCell

# ---- Special vocabulary ids used by the greedy inference helper ----
BOS, EOS = 0, 1

# ---- Values fed at training time ----
BATCH_SIZE = 100  # fed to the batch_size placeholder
KEEP_PROB = 0.7   # fed to the keep_prob placeholder (1.0 is fed at test time)


In [3]:
# Load and encode the training split. data_generator returns a 10-tuple that is
# unpacked into the notebook-level names used by every later cell.
# NOTE(review): the meaning of the trailing positional argument `2` is not
# visible from this notebook - confirm against alpha_preprocessing.
(
    X,
    y_inputs,
    y_targets,
    caption_id_to_feature_id,
    word_idx,
    idx_word,
    num_of_words,
    max_length,
    sequence_lengths,
    video_id,
) = data_generator(
    './data/training_data',
    './data/training_label.json',
    2,
)

start loading data
start encoding
start convert to npy
Done data generation!


# Define the model

In [4]:
# Reset the default graph, then open an interactive session on it.
tf.reset_default_graph()

# The session is configured with a per-process GPU memory fraction of 1.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=1)
session_config = tf.ConfigProto(gpu_options=gpu_options)
sess = tf.InteractiveSession(config=session_config)

with tf.name_scope('input'):
    # Encoder features: two dynamic leading axes, ENCODER_INPUT_SIZE values on the last axis.
    encoder_inputs = tf.placeholder(dtype=tf.float32, shape=(None, None, ENCODER_INPUT_SIZE))

    # Token ids fed to the decoder and the token ids it is trained to emit.
    decoder_inputs = tf.placeholder(dtype=tf.int32, shape=(None, None))
    decoder_targets = tf.placeholder(dtype=tf.int32, shape=(None, None))

    # sequence_length drives the loss mask; sequence_length_fake is what the
    # training helper is given as its sequence length.
    sequence_length = tf.placeholder(dtype=tf.int32, shape=(None,))
    sequence_length_fake = tf.placeholder(dtype=tf.int32, shape=(None,))

    # Scalar controls fed on every run.
    sampling_prob = tf.placeholder(dtype=tf.float32, shape=())
    batch_size = tf.placeholder(dtype=tf.int32, shape=())
    keep_prob = tf.placeholder(dtype=tf.float32, shape=())

# Define Encoder 

In [5]:
with tf.name_scope('encoder'):
    # Dense projection of the input features down to EMBEDDING_SIZE.
    encoder_inputs_embedded = tf.layers.dense(inputs=encoder_inputs, units=EMBEDDING_SIZE)

    # Forward stack: NUM_OF_LAYER LSTM cells. keep_prob is DropoutWrapper's
    # second positional parameter, i.e. the keep probability on the cell inputs.
    encoder_cell_fw = tf.contrib.rnn.MultiRNNCell([
        tf.nn.rnn_cell.DropoutWrapper(
            tf.contrib.rnn.LSTMCell(HIDDEN_LAYER_SIZE),
            input_keep_prob=keep_prob,
        )
        for _ in range(NUM_OF_LAYER)
    ])

    # Backward stack, built the same way with its own cells.
    encoder_cell_bw = tf.contrib.rnn.MultiRNNCell([
        tf.nn.rnn_cell.DropoutWrapper(
            tf.contrib.rnn.LSTMCell(HIDDEN_LAYER_SIZE),
            input_keep_prob=keep_prob,
        )
        for _ in range(NUM_OF_LAYER)
    ])

    # Run both directions over the projected features.
    encoder_outputs, encoder_state = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=encoder_cell_fw,
        cell_bw=encoder_cell_bw,
        inputs=encoder_inputs_embedded,
        dtype=tf.float32,
    )

    # Join the (forward, backward) output pair along the last axis.
    encoder_outputs = tf.concat(encoder_outputs, axis=2)

# Define Decoder for training

In [6]:
with tf.name_scope('training_decoder'):
    # Recurrent core: NUM_OF_LAYER LSTM cells, each wrapped with input dropout.
    decoder_cell = tf.contrib.rnn.MultiRNNCell([
        tf.nn.rnn_cell.DropoutWrapper(
            tf.contrib.rnn.LSTMCell(HIDDEN_LAYER_SIZE),
            input_keep_prob=keep_prob,
        )
        for _ in range(NUM_OF_LAYER)
    ])

    # Word-embedding table with one row per vocabulary entry, drawn uniformly
    # between -1 and 1, and the embedded form of the decoder input ids.
    initial_embeddings = tf.random_uniform([num_of_words, EMBEDDING_SIZE], minval=-1.0, maxval=1.0)
    embeddings = tf.Variable(initial_embeddings, dtype=tf.float32)
    decoder_inputs_embedded = tf.nn.embedding_lookup(params=embeddings, ids=decoder_inputs)

    # Bahdanau attention over the concatenated encoder outputs.
    attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
        num_units=HIDDEN_LAYER_SIZE,
        memory=encoder_outputs,
    )
    attn_cell = tf.contrib.seq2seq.AttentionWrapper(
        cell=decoder_cell,
        attention_mechanism=attention_mechanism,
        attention_layer_size=HIDDEN_LAYER_SIZE,
    )

    # Project every step's output to one score per vocabulary word.
    out_cell = tf.contrib.rnn.OutputProjectionWrapper(attn_cell, output_size=num_of_words)

    # Scheduled sampling: the helper is given sampling_prob as its sampling
    # probability and sequence_length_fake as its sequence length.
    training_helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
        inputs=decoder_inputs_embedded,
        sequence_length=sequence_length_fake,
        embedding=embeddings,
        sampling_probability=sampling_prob,
    )

    # The decoder starts from an all-zero state; the encoder's final state is not used.
    training_initial_state = out_cell.zero_state(batch_size=batch_size, dtype=tf.float32)
    training_decoder = tf.contrib.seq2seq.BasicDecoder(
        cell=out_cell,
        helper=training_helper,
        initial_state=training_initial_state,
    )

    # Unroll the decoder over time.
    training_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
        decoder=training_decoder,
        impute_finished=True,
    )

# Define Decoder for inference

In [7]:
with tf.variable_scope('inference_decoder', reuse=True):
    # Every sequence in the batch starts from the BOS id; the helper's
    # end_token is EOS.
    start_tokens = tf.fill([batch_size], BOS)
    inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
        embedding=embeddings,
        start_tokens=start_tokens,
        end_token=EOS,
    )

    # Same out_cell object as the training decoder, again starting from a zero state.
    inference_initial_state = out_cell.zero_state(batch_size=batch_size, dtype=tf.float32)
    inference_decoder = tf.contrib.seq2seq.BasicDecoder(
        cell=out_cell,
        helper=inference_helper,
        initial_state=inference_initial_state,
    )

    # max_length is a plain Python value here, so the step cap is fixed when
    # the graph is built.
    inference_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
        decoder=inference_decoder,
        impute_finished=True,
        maximum_iterations=max_length,
    )

# Define the training logits

In [8]:
# Per-step vocabulary scores produced by the training decoder.
training_logits = tf.identity(input=training_outputs.rnn_output, name='logits')

# Per-step vocabulary scores produced by the greedy inference decoder.
# NOTE(review): this requests the same op name ('logits') as the line above, so
# the two ops cannot both end up with exactly that name - confirm before
# looking either tensor up by name.
pred_output = tf.identity(input=inference_outputs.rnn_output, name='logits')

# Float mask built from sequence_length with max_length columns; used as the
# loss weights.
masks = tf.sequence_mask(
    lengths=sequence_length,
    maxlen=max_length,
    dtype=tf.float32,
    name='mask',
)

In [9]:
# Sanity check of the graph built so far. A bare expression is only displayed
# when it is the last line of a cell, so both values are printed explicitly.
print(training_logits)
print(max_length)

41


In [10]:
# Loss, optimiser and clipped update step
with tf.name_scope("optimization"):
    # Sequence cross-entropy between the training logits and the targets,
    # weighted per position by the mask.
    cost = tf.contrib.seq2seq.sequence_loss(
        logits=training_logits,
        targets=decoder_targets,
        weights=masks,
    )

    # Adam with a fixed learning rate.
    optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)

    # Clip every gradient element to [-1, 1]; pairs whose gradient is None are dropped.
    gradients = optimizer.compute_gradients(cost)
    capped_gradients = []
    for grad, var in gradients:
        if grad is None:
            continue
        clipped = tf.clip_by_value(grad, clip_value_min=-1., clip_value_max=1.)
        capped_gradients.append((clipped, var))
    train_op = optimizer.apply_gradients(capped_gradients)

    # Register the loss as a scalar summary.
    tf.summary.scalar('loss', cost)
    

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


# TensorBoard log storage

In [11]:
# Root directory for the event files; train and test get separate sub-directories.
summaries_dir = './log'
train_log_dir = summaries_dir + '/train'
test_log_dir = summaries_dir + '/test'

# Single op that evaluates every summary registered so far.
merged = tf.summary.merge_all()

# Only the training writer is given the session graph.
train_writer = tf.summary.FileWriter(logdir=train_log_dir, graph=sess.graph)
test_writer = tf.summary.FileWriter(logdir=test_log_dir)

# Training

In [12]:
# Training loop.
#
# Each epoch runs len(X) // BATCH_SIZE optimisation steps, then prints the
# greedy decode of the first caption in the last batch next to its ground truth.
# The merged summary is written to train_writer on every step so the loss curve
# actually shows up in TensorBoard.
sess.run(tf.global_variables_initializer())

epoc = 90
num_batches = len(X) // BATCH_SIZE
if num_batches == 0:
    # Without at least one full batch there is no loss or prediction to report.
    raise ValueError(
        "need at least BATCH_SIZE ({0}) training examples, got {1}".format(BATCH_SIZE, len(X)))

# The training helper is always asked to decode max_length steps so that the
# logits line up with the max_length-wide loss mask.
fake_max_sequence = np.array([max_length] * BATCH_SIZE)

for i in range(epoc):
    # Scheduled-sampling probability: starts at 0.2 and grows linearly with the
    # epoch index, capped at 1.0.
    sample_prob_input = min(float(i) / epoc + 0.2, 1.0)
    for j in range(num_batches):
        X_batch, y_inputs_batch, y_targets_batch, sequence_length_batch = generate_batch(
            X, y_inputs, y_targets, caption_id_to_feature_id, word_idx,
            sequence_lengths, BATCH_SIZE)
        feed = {
            encoder_inputs: X_batch,
            decoder_inputs: y_inputs_batch,
            decoder_targets: y_targets_batch,
            sequence_length: sequence_length_batch,
            sequence_length_fake: fake_max_sequence,
            sampling_prob: sample_prob_input,
            batch_size: BATCH_SIZE,
            keep_prob: KEEP_PROB,
        }

        fetches = {'train': train_op, 'loss': cost, 'summary': merged}
        # The greedy decode is only reported once per epoch, so it is fetched
        # on the last batch only instead of being recomputed on every step.
        if j == num_batches - 1:
            fetches['prediction'] = pred_output

        results = sess.run(fetches, feed_dict=feed)
        loss = results['loss']
        train_writer.add_summary(results['summary'], i * num_batches + j)

    # Note: this prediction was computed with training dropout (KEEP_PROB) active.
    prediction = results['prediction']
    train_writer.flush()
    print( [ idx_word[idx] for idx in np.argmax(prediction[0], axis = 1) ])
    print( 'truth:', [ idx_word[y_targets_batch[0,k]] for k in range(max_length)])
    print("epoch {0}: loss : {1}".format(i, loss))

['a', 'man', 'is', 'EOS', 'BOS', 'BOS', 'BOS']
truth: ['a', 'woman', 'peels', 'an', 'apple', 'EOS', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']
epoch 0: loss : 4.3283796310424805
['a', 'man', 'is', 'playing', 'a', 'bowl', 'EOS']
truth: ['someone', 'is', 'UWK', 'two', 'cameras', 'EOS', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']
epoch 1: loss : 3.9544951915740967
['a', 'man', 'is', 'playing', 'a', 'EOS', 'BOS']
truth: ['a', 'woman', 'is', 'singing', 'a', 'song', 'EOS', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD',

KeyboardInterrupt: 

In [13]:
# Display the array fed as sequence_length_fake during training: BATCH_SIZE
# copies of max_length.
fake_max_sequence

array([41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
       41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
       41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
       41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
       41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
       41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41])

In [14]:
# Add ops to save and restore all the variables, then write a checkpoint of
# the current session.
saver = tf.train.Saver()

checkpoint_path = "./tmp/model_lstm_with_drop_0.7_layer_1_miore.ckpt"
# Saver.save does not create a missing parent directory, so make sure ./tmp
# exists before writing (needed on a fresh checkout).
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

save_path = saver.save(sess, checkpoint_path)
print("Model saved in path: %s" % save_path)

Model saved in path: ./tmp/model_lstm_with_drop_0.7_layer_1_miore.ckpt


In [15]:
# Load the variable values stored at save_path (set by the save cell) back
# into the live session, using a freshly built Saver.
saver = tf.train.Saver()
saver.restore(sess=sess, save_path=save_path)


INFO:tensorflow:Restoring parameters from ./tmp/model_lstm_with_drop_0.7_layer_1_miore.ckpt


In [16]:
# Load and encode the test split. Only the features, the padded length, the
# sequence lengths and the video ids are kept; the other seven return values
# are discarded.
# NOTE(review): this rebinds max_length, sequence_lengths and video_id, which
# the training-data cell also assigns - re-running earlier cells after this one
# will see the test-split values.
(
    X_test,
    _, _, _, _, _, _,
    max_length,
    sequence_lengths,
    video_id,
) = data_generator(
    './data/testing_data',
    './data/testing_label.json',
    2,
)

start loading data
start encoding
start convert to npy
Done data generation!


In [17]:
# Caption the whole test set in one greedy-decoding pass and export the result.
# keep_prob is fed as 1.0 so dropout is disabled at inference time.
fake_max_sequence = np.array([max_length] * X_test.shape[0])
print(X_test.shape[0])
prediction = sess.run(pred_output, feed_dict = {encoder_inputs : X_test, sequence_length : sequence_lengths,
                                               sequence_length_fake : fake_max_sequence, sampling_prob : 1.0,
                                               batch_size : len(X_test), keep_prob : 1.0} )

# Logits -> most likely token id per step -> word.
prediction = [ [ idx_word[idx] for idx in np.argmax(prediction[i], axis = 1) ] for i in range(X_test.shape[0]) ]
# Drop the control tokens. NOTE(review): steps after EOS presumably decode to
# id 0 (BOS) because of impute_finished=True, which is why BOS is filtered too.
prediction = [[ word for word in pred if word not in ('EOS', 'BOS', 'UWK')] for pred in prediction]
prediction = [ " ".join(data) for data in prediction]
print(prediction)

# The CSV is written without a header, so the column order is the file format:
# pin it explicitly (id first, caption second) instead of relying on dict ordering.
df = pd.DataFrame({'id' : video_id, 'caption': prediction}, columns = ['id', 'caption'])
df.to_csv('result_drop_0.7_layer_1.csv', index = False, header = False)

100
['a man is the a a', 'a man is a a a', 'a man is a a a', 'a woman is the a', 'a woman is on a a on', 'a guinea are ate', 'a girl is drinking her a a', 'baby are are playing', 'a man is doing a', 'a man is the a a a a', 'a lady is the water', 'a man is a a a', 'a man is on a a', 'a little is is from a', 'a woman is the of', 'a man is running on a', 'a man is a a a a a', 'a woman is on a a', 'a man is a a on', 'a woman is up a', 'a man is up a', 'a man is a a', 'a man is playing a', 'a woman is the garlic', 'a man is up a a a', 'a man is on a a', 'a man is the a on a', 'a baby is up a a', 'a little is is out the hands', 'a woman is swinging', 'a woman is the into a a', 'a woman is cracking eggs into a bowl', 'a man is a on a', 'a man is dancing a a stage', 'a woman is wrapping a block of tofu in paper', 'a man is writing on a stage', 'a man is a a a', 'a man is a a', 'a man is running the a', 'a lady is slicing meat', 'a man is a a a a a', 'a is puts adding into a a', 'a man is a on 