In [1]:
%matplotlib inline

In [2]:
import os 
# Expose only GPU index 1 to CUDA. This is set before TensorFlow is imported
# in the next cell so that the restriction is in place when it initialises.
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [3]:
import tensorflow as tf
from keras.preprocessing.text import text_to_word_sequence
import numpy as np
import pandas as pd
import collections
import pickle

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [4]:
# --- Dataset locations (paths are relative to the working directory) ---
# Per-video feature arrays (<video id>.npy files)
TRAIN_VIDEO_DIR = 'MLDS_hw2_1_data/training_data/feat/'
TEST_VIDEO_DIR = 'MLDS_hw2_1_data/testing_data/feat/'
# Caption annotations (JSON files)
TRAIN_LABEL_DIR = 'MLDS_hw2_1_data/training_label.json'
TEST_LABEL_DIR = 'MLDS_hw2_1_data/testing_label.json'
# Lists of video ids, one id per line
TRAIN_ID_DIR = 'MLDS_hw2_1_data/training_id.txt'
TEST_ID_DIR = 'MLDS_hw2_1_data/testing_id.txt'

# --- Containers that the loading cells below fill in ---
X_train, X_test = [], []
y_train, y_test = [], []

### Read the extracted video features into X and the caption labels into y


In [5]:
# Caption annotations: one DataFrame per split, parsed from the JSON label files.
train_label, test_label = (
    pd.read_json(label_path) for label_path in (TRAIN_LABEL_DIR, TEST_LABEL_DIR)
)
# Video ids: the id files have no header row, so the single column is named 'id'.
train_id, test_id = (
    pd.read_csv(id_path, header=None, names=['id'])
    for id_path in (TRAIN_ID_DIR, TEST_ID_DIR)
)

In [6]:
# Build the training arrays from scratch inside this cell so that it can be
# re-run safely. Appending to lists created in an earlier cell broke on the
# second run, because X_train has already been rebound to an ndarray.

# One feature array per training video, stacked along a new leading axis.
X_train = np.array([np.load(TRAIN_VIDEO_DIR + v + '.npy') for v in train_id.id])

# Captions of each video, matched to the ids by row position.
# NOTE(review): this assumes training_label.json lists videos in the same
# order as training_id.txt -- confirm, or look captions up by id instead.
captions_per_video = [train_label.loc[i, 'caption'] for i in range(len(train_id))]

# Keep the first 5 captions of every video and transpose, so that y_train[k]
# holds the k-th caption of every video (zip stops at the shortest list).
y_train = list(map(list, zip(*[captions[:5] for captions in captions_per_video])))
# Tokenise every caption into a list of lower-cased words.
y_train = [[text_to_word_sequence(s) for s in lst] for lst in y_train]

In [7]:
# Load one feature array per test video and stack them. X_test is built in a
# single expression so that the cell can be re-run safely (appending to a
# list created in an earlier cell failed once X_test had become an ndarray).
X_test = np.array([np.load(TEST_VIDEO_DIR + v + '.npy') for v in test_id.id])

### Caption preprocessing (add buffer tokens to sentence and convert sentence to numbers)

In [8]:
# Reserved tokens. The position in this list is the token's integer id,
# i.e. <PAD>=0, <BOS>=1, <EOS>=2, <UNK>=3.
BUFFER_TOKENS = [
    '<PAD>',
    '<BOS>',
    '<EOS>',
    '<UNK>',
]

In [9]:
## Build the vocabulary from all tokenised training captions
# Flatten caption sets -> captions -> words into one long list of words.
list_of_all_words = [
    word
    for caption_set in y_train
    for caption in caption_set
    for word in caption
]
counter = collections.Counter()
counter.update(list_of_all_words)
# Keep only the words that occur more than 3 times (word -> frequency).
vocab = dict((word, count) for word, count in counter.items() if count >= 4)

In [10]:
## Build the word <-> index lookup tables
# Only the words are needed from here on; their order defines their ids.
vocab = list(vocab)

# Reserved tokens take the lowest ids, vocabulary words follow directly after.
idx_to_word = dict(enumerate(BUFFER_TOKENS + vocab))
word_to_idx = {token: idx for idx, token in idx_to_word.items()}

# Bundle both directions so they can be saved and reused together.
word_dict = {
    'idx_to_word': idx_to_word,
    'word_to_idx': word_to_idx,
}
vocab_size = len(word_to_idx)

In [11]:
# Persist both lookup tables so that predicted ids can be decoded later
# without rebuilding the vocabulary.
with open("word_dict.pkl", "wb") as dict_file:
    pickle.dump(word_dict, dict_file)

In [12]:
# Convert sentences into integer encodings.
# Every encoded sentence has exactly `padding_len` entries: <BOS>, at most
# `padding_len - 2` words, <EOS>, then <PAD> up to the fixed length.
def _convert_sentence_to_numbers(s, padding_len=44, oov_to_unk=False):
    """Convert a sentence s (a list of words) to a list of numbers using word_to_idx.

    Args:
        s: Sequence of word strings.
        padding_len: Exact length of the returned list (default 44).
        oov_to_unk: When False (default, the original behaviour) words missing
            from ``word_to_idx`` are dropped. When True they are encoded as
            the ``<UNK>`` id instead.

    Returns:
        list[int]: ``[<BOS>, word ids..., <EOS>, <PAD>...]`` with exactly
        ``padding_len`` entries.
    """
    UNK_IDX = BUFFER_TOKENS.index('<UNK>')
    PAD_IDX = BUFFER_TOKENS.index('<PAD>')
    START_TOKEN = BUFFER_TOKENS.index('<BOS>')
    END_IDX = BUFFER_TOKENS.index('<EOS>')

    if oov_to_unk:
        word_ids = [word_to_idx.get(w, UNK_IDX) for w in s]
    else:
        word_ids = [word_to_idx[w] for w in s if w in word_to_idx]

    # Reserve two slots for <BOS>/<EOS>. Without this truncation an over-long
    # caption would yield a row longer than padding_len and make the batch
    # arrays ragged.
    word_ids = word_ids[:max(padding_len - 2, 0)]

    s_encoded = [START_TOKEN] + word_ids + [END_IDX]
    s_encoded += [PAD_IDX] * (padding_len - len(s_encoded))
    return s_encoded

In [13]:
# Encode every caption of every caption set as a fixed-length list of ids.
Y_train = [
    list(map(_convert_sentence_to_numbers, caption_set))
    for caption_set in y_train
]

### Build Model

In [14]:
# Optimisation settings
nm_epochs = 10
batch_size = 50

# Model dimensions
input_embedding_size = 128
encoder_hidden_units = decoder_hidden_units = 256

# Number of full batches in one pass over the training videos
# (used to average the per-batch loss).
train_num_batches_per_epoch = len(X_train) // batch_size

In [15]:
# Start from an empty default graph, so that re-running the graph-building
# cells below does not pile new ops on top of a previously built graph.
tf.reset_default_graph()

In [16]:
# Graph inputs; the leading dimension of each is the (variable) batch size.
# Video features: fixed trailing shape (80, 4096) per example.
encoder_inputs = tf.placeholder(tf.float32, shape=[None, 80, 4096])
# Word ids fed to the decoder, and the word ids it should predict.
decoder_inputs = tf.placeholder(tf.int32, shape=[None, None])
decoder_targets = tf.placeholder(tf.int32, shape=[None, None])
# Number of decoder steps to run for each example in the batch.
target_seq_len = tf.placeholder(tf.int32, shape=[None])

In [17]:
def next_batch(source, target, batch_size, eos_idx=2):
    """Yield shuffled mini-batches of (features, captions, caption lengths).

    Rows of ``source`` and ``target`` are paired by position and visited in a
    fresh random order on every call. Only full batches are produced; a
    trailing remainder smaller than ``batch_size`` is dropped.

    Args:
        source: Array-like of per-example inputs.
        target: Array-like of encoded captions, one row per example.
        batch_size: Number of examples per yielded batch.
        eos_idx: Token id that marks the end of a caption (default 2). Its
            first position in a row is reported as that row's length.

    Yields:
        Tuple ``(source_batch, target_batch, seqlen_batch)`` of numpy arrays.

    Raises:
        ValueError: If a target row does not contain ``eos_idx``.
    """
    # asarray does not copy inputs that are already ndarrays (np.array would
    # duplicate the whole feature tensor on every call).
    source = np.asarray(source)
    target = np.asarray(target)
    shuffle_indices = np.random.permutation(len(target))

    for batch_i in range(len(shuffle_indices) // batch_size):
        start_i = batch_i * batch_size
        # Gather only the rows of this batch instead of materialising a
        # shuffled copy of the full arrays up front.
        batch_indices = shuffle_indices[start_i:start_i + batch_size]
        source_batch = source[batch_indices]
        target_batch = target[batch_indices]
        seqlen_batch = np.array([list(row).index(eos_idx) for row in target_batch])

        yield source_batch, target_batch, seqlen_batch

In [18]:
def build_model(batch_size, input_embedding_size, encoder_hidden_units, decoder_hidden_units):
    """Assemble the encoder-decoder graph on top of the module-level placeholders.

    An encoder LSTM reads ``encoder_inputs``. Its final state is the initial
    state of a decoder LSTM, which consumes the embedded ``decoder_inputs``
    for ``target_seq_len`` steps per example. A linear layer maps every
    decoder output to ``vocab_size`` logits.

    Args:
        batch_size: Not used when building the graph.
        input_embedding_size: Width of each word-embedding vector.
        encoder_hidden_units: Number of units of the encoder LSTM cell.
        decoder_hidden_units: Number of units of the decoder LSTM cell.
            NOTE(review): the decoder starts from the encoder state, so this
            presumably has to equal ``encoder_hidden_units`` -- confirm.

    Returns:
        Tuple ``(encoder_final_state, decoder_final_state, decoder_logits)``.
    """
    # Word embeddings, initialised uniformly in [-1, 1).
    embedding_init = tf.random_uniform(
        [vocab_size, input_embedding_size], minval=-1.0, maxval=1.0
    )
    embedding_matrix = tf.Variable(embedding_init, dtype=tf.float32)
    embedded_decoder_inputs = tf.nn.embedding_lookup(embedding_matrix, decoder_inputs)

    with tf.variable_scope('encoder', reuse=tf.AUTO_REUSE):
        enc_cell = tf.contrib.rnn.LSTMCell(encoder_hidden_units)
        _enc_outputs, enc_state = tf.nn.dynamic_rnn(
            cell=enc_cell,
            inputs=encoder_inputs,
            dtype=tf.float32,
        )

    with tf.variable_scope('decoder', reuse=tf.AUTO_REUSE):
        dec_cell = tf.contrib.rnn.LSTMCell(decoder_hidden_units)
        dec_outputs, dec_state = tf.nn.dynamic_rnn(
            cell=dec_cell,
            inputs=embedded_decoder_inputs,
            sequence_length=target_seq_len,
            initial_state=enc_state,
            dtype=tf.float32,
        )

    # Project every decoder output onto the vocabulary.
    logits = tf.contrib.layers.linear(dec_outputs, vocab_size)

    return enc_state, dec_state, logits

In [19]:
def train_neural_network():
    """Build the graph, train it, save a checkpoint and caption the test videos.

    Training visits each caption set in ``Y_train`` in turn and runs
    ``nm_epochs`` epochs on it before moving to the next set. After training
    the variables are saved to ``models/lstm_model_10.ckpt``. Every video in
    ``X_test`` is then decoded greedily, one word at a time, starting from
    ``<BOS>`` and stopping at ``<EOS>`` or after 44 steps.

    Returns:
        list[list[int]]: Predicted word ids per test video (``<EOS>`` excluded).
    """
    # Look the special-token ids up once instead of hard-coding 1 and 2.
    bos_idx = BUFFER_TOKENS.index('<BOS>')
    eos_idx = BUFFER_TOKENS.index('<EOS>')
    max_decode_steps = 44  # same value as the caption padding length
    checkpoint_path = "models/lstm_model_10.ckpt"

    # Saver.save() does not create missing parent directories. Create the
    # directory before training so a finished run is not lost at save time.
    checkpoint_dir = os.path.dirname(checkpoint_path)
    if checkpoint_dir and not os.path.isdir(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    final_preds = []
    encoder_final_state, decoder_final_state, decoder_logits = build_model(
        batch_size, input_embedding_size, encoder_hidden_units, decoder_hidden_units)
    decoder_prediction = tf.argmax(decoder_logits, 2)

    # The sparse op takes integer class ids directly. It computes the same
    # value as the dense op on one-hot labels, without the one-hot tensor.
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=decoder_targets,
        logits=decoder_logits,
    )

    # NOTE(review): the mean runs over every time step, including positions
    # past each caption's <EOS>. Consider masking with
    # tf.sequence_mask(target_seq_len) if padding should not affect the loss.
    loss = tf.reduce_mean(cross_entropy)
    optimizer = tf.train.AdamOptimizer().minimize(loss)

    saver = tf.train.Saver()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

        for train_y in Y_train:
            for epoch in range(nm_epochs):
                training_loss = 0.0
                for epoch_x, epoch_y, epoch_seqlen in next_batch(X_train, train_y, batch_size):
                    # Teacher forcing: inputs are the caption without its last
                    # token, targets are the caption shifted left by one.
                    feed_dict = {
                        encoder_inputs: epoch_x,
                        decoder_inputs: epoch_y[:, :-1],
                        decoder_targets: epoch_y[:, 1:],
                        target_seq_len: epoch_seqlen,
                    }
                    _, c = sess.run([optimizer, loss], feed_dict=feed_dict)
                    training_loss += c / train_num_batches_per_epoch
                print('Epoch {} training loss: {}'.format(str(epoch+1)+'/'+str(nm_epochs), training_loss))

        saver.save(sess, checkpoint_path)

        for x_test in X_test:
            preds = []
            # Encode the video once, then decode one step at a time, feeding
            # the previous word and LSTM state back in.
            state = sess.run(
                encoder_final_state,
                feed_dict={encoder_inputs: np.expand_dims(x_test, axis=0)},
            )
            current_pred = np.full((1, 1), bos_idx, dtype=np.int32)

            for _ in range(max_decode_steps):
                feed_dict = {
                    decoder_inputs: current_pred,
                    encoder_final_state: state,
                    target_seq_len: [1],
                }
                current_pred, state = sess.run(
                    [decoder_prediction, decoder_final_state], feed_dict=feed_dict)
                next_id = int(current_pred[0][0])
                if next_id == eos_idx:
                    break
                preds.append(next_id)
            final_preds.append(preds)

    return final_preds

In [None]:
# Train the model, save the checkpoint and collect the predicted word ids
# for every test video (one list of ids per video).
predictions = train_neural_network()

In [None]:
# Preview the first few predictions only, rather than dumping the whole list.
predictions[:5]

In [22]:
# Decode every predicted id sequence back into its words.
text = [
    [word_dict['idx_to_word'][token_id] for token_id in caption_ids]
    for caption_ids in predictions
]

In [None]:
# Preview the first few decoded captions only, rather than dumping the whole list.
text[:5]

In [24]:
# Write the predicted captions to the output file, one "<video id>,<caption>" line per test video

In [25]:
# One line per test video: the video id, a comma, then the caption words
# joined by single spaces.
with open('output_10.txt', 'w') as f:
    f.writelines(
        '{},{}\n'.format(video_id, ' '.join(words))
        for video_id, words in zip(test_id.id, text)
    )