## Prepare the inputs

- encoder_input_data: 2D array of shape `(num_images * 5, 512)`
- decoder_input_data: 3D array of shape `(num_captions, max_words_in_sentence, num_words)`
- decoder_output_data: same as decoder_input_data but offset by one timestep. decoder_target_data[:, t, :] will be the same as decoder_input_data[:, t + 1, :]

## Image Preprocessing

obtain bottleneck features

In [1]:
import numpy as np

In [2]:
# Since there are 5 captions per image, duplicate the bottleneck features
def duplicate_bottleneck_features(features):
    num_captions = 5 # 5 stands for number of captions per image
    num_rows = features.shape[0] * num_captions 

    features_dup = np.zeros((num_rows, features.shape[1]))
    for i, image in enumerate(features):
        for j in range(num_captions):
            features_dup[i*num_captions + j] = image
    return features_dup    

In [3]:
bottleneck_features = np.load('bottleneck_features/Flicker8k_bottleneck_features_VGG16_avgpooling.npz')
bottleneck_features_train = bottleneck_features["train"]
bottleneck_features_validation = bottleneck_features["validation"]
bottleneck_features_test = bottleneck_features["test"]

bottleneck_features_train_dup = duplicate_bottleneck_features(bottleneck_features_train)
bottleneck_features_validation_dup = duplicate_bottleneck_features(bottleneck_features_validation)
bottleneck_features_test_dup = duplicate_bottleneck_features(bottleneck_features_test)

In [4]:
print(bottleneck_features_train_dup.shape)
print(bottleneck_features_validation_dup.shape)
print(bottleneck_features_test_dup.shape)

(30000, 512)
(5000, 512)
(5000, 512)


## Word Embedding

In [9]:
from caption_utils import *

train_fns_list, dev_fns_list, test_fns_list = load_split_lists()
del train_fns_list[-1]
del dev_fns_list[-1]
del test_fns_list[-1]

train_captions_raw, dev_captions_raw, test_captions_raw = get_caption_split()
vocab = create_vocab(train_captions_raw)
token2idx, idx2token = vocab_to_index(vocab)     
captions_data = (train_captions_raw.copy(), dev_captions_raw.copy(), test_captions_raw.copy())
train_captions, dev_captions, test_captions = process_captions(captions_data, token2idx)
del train_captions['']
del dev_captions['']
del test_captions['']

## Caption Preprocessing


In [5]:
captions_onehot = np.load('preprocessed_captions/Flicker8k_onehot_2530_words.npz')
train_captions_onehot = captions_onehot["train"]
validation_captions_onehot = captions_onehot["validation"]
test_captions_onehot = captions_onehot["test"]

In [6]:
print(train_captions_onehot.shape)
print(validation_captions_onehot.shape)
print(test_captions_onehot.shape)

(30000, 37, 2531)
(5000, 37, 2531)
(5000, 37, 2531)


## Build Model

In [7]:
from __future__ import print_function

from keras.models import Model
from keras.layers import Input, LSTM, Dense
import numpy as np

batch_size = 64  # Batch size for training.
epochs = 5  # Number of epochs to train for.
latent_dim = 300  # Latent dimensionality of the encoding space.
num_samples = bottleneck_features_train_dup.shape[0]

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [33]:
input_images = bottleneck_features_train_dup
target_words = train_captions_onehot

In [30]:
input_words = []
for dataset in captions_data:
    for filename in dataset:
        for caption in dataset[filename]:
            input_words.extend(caption)
            
input_words = sorted(input_words)
target_words = sorted(input_words)
num_encoder_tokens = len(vocab)
num_decoder_tokens = len(vocab)
max_encoder_seq_length = train_captions_onehot.shape[1]
max_decoder_seq_length = train_captions_onehot.shape[1]

print('Number of samples:', len(input_words))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

Number of samples: 433458
Number of unique input tokens: 2530
Number of unique output tokens: 2530
Max sequence length for inputs: 37
Max sequence length for outputs: 37


In [48]:
#input_token_index = dict(
#    [(word, i) for i, word in enumerate(input_words)])
#target_token_index = dict(
#    [(word, i) for i, word in enumerate(target_words)])

encoder_input_data = bottleneck_features_train_dup
decoder_input_data = train_captions_onehot
decoder_target_data = train_captions_onehot
# Change decoder_target_data: decoder_target_data is ahead of decoder_input_data by one timestep
for i, caption in enumerate(decoder_input_data):
    for j, word in enumerate(caption):
        if j > 0:
            decoder_target_data[i][j-1] = decoder_input_data[i][j]

In [56]:
# needs to be true
(decoder_target_data[0][0] == decoder_input_data[0][1]).all()

False

# Remove lines below

In [None]:
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
#    for t, char in enumerate(input_text):
#        encoder_input_data[i, t, input_token_index[char]] = 1.
    for t, char in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, target_token_index[char]] = 1.
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.