In [6]:
%matplotlib inline

In [7]:
import os

# Expose only GPU 0 to this process. This is set here, before the cell that
# imports TensorFlow, so the setting is already in place when TF starts up.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [8]:
import tensorflow as tf
from tensorflow.python.layers.core import Dense
from keras.preprocessing.text import text_to_word_sequence
import numpy as np
import pandas as pd
import collections
import pickle

In [9]:
# Where the test split lives: a text file listing the video ids, and a
# directory holding one '<id>.npy' feature file per video.
TEST_ID_DIR = 'MLDS_hw2_1_data/testing_id.txt'
TEST_VIDEO_DIR = 'MLDS_hw2_1_data/testing_data/feat/'

# Filled with the per-video feature arrays by the loading cell further down.
X_test = []

In [10]:
# The id file has no header row; every line becomes one entry of column 'id'.
test_id = pd.read_csv(
    TEST_ID_DIR,
    names=['id'],
    header=None,
)

### Read the extracted test-video features into `X_test` (inference only, so no labels are loaded)


In [11]:
# Load the pre-extracted feature array of every test video, in the order the
# ids appear in test_id, and stack them into a single array.
#
# The list is built inside this one expression instead of appending to an
# X_test created in an earlier cell: that made the cell non-idempotent, because
# after the first run X_test is an ndarray and a re-run failed on `.append`.
X_test = np.array([
    np.load(TEST_VIDEO_DIR + video_id + '.npy')
    for video_id in test_id.id
])

### Caption preprocessing (add buffer tokens to sentence and convert sentence to numbers)

In [12]:
## Load the saved vocabulary: word_dict holds the 'word_to_idx' mapping used
## below and the 'idx_to_word' mapping used later to turn predictions into text.
# NOTE(review): pickle.load can execute arbitrary code; only open a pickle file
# that was produced by a trusted run of this project.
with open("word_dict_schedule.pkl","rb") as f:
    word_dict = pickle.load(f)

# Number of entries in word_to_idx; build_model uses it for the embedding
# matrix height and for the width of the output projection.
vocab_size = len(word_dict['word_to_idx'])

### Build Model

In [13]:
# Model / run settings passed to build_model.
# NOTE(review): the layer sizes presumably have to equal the ones used when the
# restored checkpoint was trained -- confirm against the training notebook.
# batch_size is forwarded to build_model; nm_epochs is not referenced by the
# code in this notebook.
batch_size, nm_epochs = 50, 10
input_embedding_size = 128
encoder_hidden_units = decoder_hidden_units = 256

In [14]:
# Clear the default graph before the placeholders and model are defined, so
# re-running the cells below builds on an empty graph rather than on top of
# whatever a previous execution left behind.
tf.reset_default_graph()

In [15]:
# Graph inputs. build_model and the decoding loop read these by name.

# Video features: any batch size, each example an 80 x 4096 float array
# (presumably 80 frames of 4096-d features -- confirm against the extractor).
encoder_inputs = tf.placeholder(tf.float32, shape=(None, 80, 4096))

# Token ids fed to the decoder and the ids it is scored against; both are
# (batch, time) with neither dimension fixed.
decoder_inputs = tf.placeholder(tf.int32, shape=(None, None))
decoder_targets = tf.placeholder(tf.int32, shape=(None, None))

# One length per example; its maximum caps the number of decoding steps.
target_seq_len = tf.placeholder(tf.int32, shape=(None,))

# Scalar sampling probability handed to ScheduledEmbeddingTrainingHelper.
sampling_prob = tf.placeholder(tf.float32, shape=())

In [16]:
def _sequence_length(row, eos_id):
    """Return the position of the first `eos_id` in `row`.

    Raises:
        ValueError: if `row` does not contain `eos_id`.
    """
    try:
        return list(row).index(eos_id)
    except ValueError:
        raise ValueError(
            "target row contains no end token (id {})".format(eos_id))


def next_batch(source, target, batch_size, eos_id=2):
    """Yield shuffled mini-batches of (source, target, sequence lengths).

    `source` and `target` are shuffled with one shared permutation so that
    row i of each still belongs together, then cut into consecutive chunks of
    exactly `batch_size` rows. Rows left over after the last full chunk are
    not yielded.

    Args:
        source: Sequence/array of input examples, indexed along axis 0.
        target: Sequence/array of target id rows, one per source example.
        batch_size: Number of examples per yielded batch.
        eos_id: Token id that marks the end of a target row. Defaults to 2,
            the value that was previously hard-coded here.

    Yields:
        Tuples `(source_batch, target_batch, seqlen_batch)` of numpy arrays,
        where `seqlen_batch[k]` is the index of the first `eos_id` in
        `target_batch[k]`.

    Raises:
        ValueError: if `source` and `target` differ in length, or if a target
            row has no `eos_id`.
    """
    source = np.array(source)
    target = np.array(target)
    if len(source) != len(target):
        # Previously this either silently dropped source rows or failed with
        # an IndexError, depending on which side was longer.
        raise ValueError(
            "source and target must have the same length, got {} and {}".format(
                len(source), len(target)))

    # One permutation for both arrays keeps each (source, target) pair aligned.
    shuffle_indices = np.random.permutation(np.arange(len(target)))
    source = source[shuffle_indices]
    target = target[shuffle_indices]

    for batch_i in range(0, len(source) // batch_size):
        start_i = batch_i * batch_size
        source_batch = source[start_i:start_i + batch_size]
        target_batch = target[start_i:start_i + batch_size]
        seqlen_batch = [_sequence_length(row, eos_id) for row in target_batch]

        yield np.array(source_batch), np.array(target_batch), np.array(seqlen_batch)

In [17]:
def build_model(batch_size, input_embedding_size, encoder_hidden_units, decoder_hidden_units):
    """Assemble the LSTM encoder-decoder graph on the notebook-level placeholders.

    Besides its arguments, this reads `vocab_size`, `encoder_inputs`,
    `decoder_inputs`, `target_seq_len` and `sampling_prob` from the enclosing
    notebook scope.

    NOTE(review): the statement order below determines the auto-generated
    variable names; presumably they must stay as-is to match the checkpoint
    restored later -- confirm before reordering anything.

    Args:
        batch_size: Accepted but not used inside this function.
        input_embedding_size: Width of each row of the token embedding matrix.
        encoder_hidden_units: Unit count of the encoder LSTMCell.
        decoder_hidden_units: Unit count of the decoder LSTMCell.

    Returns:
        A 4-tuple `(encoder_final_state, decoder_final_state, decoder_logits,
        maximum_iterations)`: the final state of the encoder RNN, the final
        state returned by `dynamic_decode`, the decoder's `rnn_output` (which
        has passed through the `Dense(vocab_size)` output layer), and the
        maximum of `target_seq_len` used to cap the number of decoding steps.
    """
    # Token embedding matrix, initialised uniformly in [-1, 1); it is used both
    # to embed the fed decoder inputs and by the helper below.
    embeddings = tf.Variable(tf.random_uniform([vocab_size, input_embedding_size], -1.0, 1.0), dtype=tf.float32)
    decoder_inputs_embedded = tf.nn.embedding_lookup(embeddings, decoder_inputs)

    with tf.variable_scope('encoder', reuse=tf.AUTO_REUSE):
        encoder_cell = tf.contrib.rnn.LSTMCell(encoder_hidden_units)

        # Only the final state is used afterwards; encoder_outputs is not read.
        encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(
            encoder_cell, encoder_inputs, dtype=tf.float32
        )
        
    with tf.variable_scope('decoder', reuse=tf.AUTO_REUSE):
        # Scheduled-sampling helper driven by the sampling_prob placeholder.
        helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
            decoder_inputs_embedded,
            target_seq_len,
            embedding=embeddings,
            sampling_probability=sampling_prob)
        
        # Projects each decoder output to one logit per vocabulary entry.
        output_layer = Dense(vocab_size)
        decoder_cell = tf.contrib.rnn.LSTMCell(decoder_hidden_units)
        
        # The decoder is initialised with the encoder's final state.
        # NOTE(review): presumably this requires encoder_hidden_units ==
        # decoder_hidden_units -- confirm.
        decoder = tf.contrib.seq2seq.BasicDecoder(
            decoder_cell,
            helper,
            encoder_final_state,
            output_layer=output_layer)
        
        # Never decode more steps than the longest sequence length fed in.
        maximum_iterations = tf.reduce_max(target_seq_len)
        
        decoder_outputs, decoder_final_state, seq_len = tf.contrib.seq2seq.dynamic_decode(
                                                        decoder, output_time_major=False,
                                                        impute_finished=True,
                                                        maximum_iterations=maximum_iterations)

    decoder_logits = tf.identity(decoder_outputs.rnn_output)
    
    return encoder_final_state, decoder_final_state, decoder_logits, maximum_iterations

In [18]:
def train_neural_network(max_decode_steps=44, start_id=1, end_id=2):
    """Restore the saved model and greedily decode a caption for every test video.

    Despite its name (kept so existing calls keep working) this function does
    no training. It builds the graph with `build_model`, restores
    "models/lstm_model_schedule.ckpt", and for each array in the notebook-level
    `X_test` runs the encoder once and then steps the decoder one token at a
    time, feeding each predicted id back in as the next input.

    No loss or optimizer is built here. They are not needed for decoding, and
    creating an Adam optimizer before the Saver added optimizer variables that
    the Saver then also required to be present in the checkpoint.

    Reads from notebook scope: `X_test`, `batch_size`, `input_embedding_size`,
    `encoder_hidden_units`, `decoder_hidden_units` and the placeholders
    `encoder_inputs`, `decoder_inputs`, `target_seq_len`, `sampling_prob`.

    Args:
        max_decode_steps: Upper bound on tokens generated per video. Defaults
            to 44, the value previously hard-coded.
        start_id: Token id fed as the first decoder input. Defaults to 1
            (presumably the begin-of-sentence id -- confirm against word_dict).
        end_id: Token id that stops decoding; it is not included in the
            output. Defaults to 2.

    Returns:
        A list with one entry per test video, each a list of predicted token
        ids (numpy integers) in generation order.
    """
    encoder_final_state, decoder_final_state, decoder_logits, _ = build_model(
        batch_size, input_embedding_size, encoder_hidden_units, decoder_hidden_units)
    # Greedy choice: the highest-scoring vocabulary entry at every step.
    decoder_prediction = tf.argmax(decoder_logits, 2)

    saver = tf.train.Saver()

    config = tf.ConfigProto()
    # Allocate GPU memory on demand instead of reserving it all up front.
    config.gpu_options.allow_growth = True

    final_preds = []
    with tf.Session(config=config) as sess:
        saver.restore(sess, "models/lstm_model_schedule.ckpt")

        for x_test in X_test:
            preds = []
            # Integer dtype to match the int32 decoder_inputs placeholder.
            current_pred = np.full([1, 1], start_id, dtype=np.int32)
            # The encoder expects a leading batch axis; decode one video at a time.
            x_test = np.expand_dims(x_test, axis=0)
            state = sess.run(encoder_final_state, feed_dict={encoder_inputs: x_test})

            for _ in range(max_decode_steps):
                # Feeding encoder_final_state overrides the decoder's initial
                # state with the state carried over from the previous step.
                # target_seq_len of [1] makes the decoder run exactly one step,
                # and sampling_prob 0.0 makes it consume the token fed in.
                feed_dict = {decoder_inputs: current_pred, encoder_final_state: state,
                             sampling_prob: 0.0, target_seq_len: [1]}
                current_pred, state = sess.run(
                    [decoder_prediction, decoder_final_state], feed_dict=feed_dict)

                token = current_pred[0][0]
                if token == end_id:
                    break
                preds.append(token)
                current_pred = current_pred.reshape(-1, 1)
            final_preds.append(preds)

    return final_preds

In [19]:
# Despite the function's name this call only runs inference: it restores the
# saved checkpoint and returns one list of predicted token ids per test video.
predictions = train_neural_network()

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


INFO:tensorflow:Restoring parameters from models/lstm_model_schedule.ckpt


In [None]:
# Preview the first few predicted id sequences instead of dumping every one.
predictions[:5]

In [21]:
# Turn every predicted id sequence back into its list of words.
text = [
    [word_dict['idx_to_word'][token_id] for token_id in caption_ids]
    for caption_ids in predictions
]

In [None]:
# Preview the first few decoded captions instead of dumping every one.
text[:5]

In [23]:
# Write the decoded captions to the output file

In [24]:
# One "<video id>,<caption words joined by spaces>" line per test video,
# pairing ids and captions in testing_id order.
with open('output_schedule_inference.txt', 'w') as f:
    f.writelines(
        '{},{}\n'.format(video_id, ' '.join(words))
        for video_id, words in zip(test_id.id, text)
    )