## Prepare the inputs

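- encoder_input_data (saved as `*_encoder_output`): 2D array of shape `(num_images * 5, 512)` — each image's bottleneck feature vector repeated once for each of its 5 captions
- decoder_input_data (saved as `*_decoder_input`): 2D integer array of shape `(num_captions, max_words_in_sentence - 1)` holding the word index at each timestep (the one-hot captions with the last timestep dropped, then collapsed with `argmax`)
- decoder_target_data (saved as `*_decoder_target`): 3D one-hot array of shape `(num_captions, max_words_in_sentence, num_words)` holding the full caption. Note: in the usual seq2seq convention decoder_target_data[:, t, :] is the same as decoder_input_data[:, t + 1, :], but here the target is not shifted — it simply has one more timestep than the decoder input (TODO: confirm this is what the training code expects)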

## Image Preprocessing

Load the precomputed bottleneck features and repeat each image's feature vector once per caption.

In [1]:
import numpy as np

In [2]:
# Since there are 5 captions per image, duplicate the bottleneck features
def duplicate_bottleneck_features(features, num_captions=5):
    """Repeat every image's feature row once per caption.

    Row ``i`` of ``features`` is copied into rows ``i * num_captions`` through
    ``(i + 1) * num_captions - 1`` of the result, so the copies belonging to
    one image stay contiguous and the images keep their original order.

    Args:
        features: array-like with one row of bottleneck features per image.
        num_captions: number of captions per image, i.e. how many times each
            row is repeated. Defaults to 5.

    Returns:
        A new float64 array with ``features.shape[0] * num_captions`` rows.

    Raises:
        ValueError: if ``num_captions`` is negative.
    """
    if num_captions < 0:
        raise ValueError("num_captions must be non-negative, got %r" % (num_captions,))

    # The previous loop-based version filled an np.zeros buffer, so its result
    # was always float64; keep that dtype so callers see the same output.
    features = np.asarray(features, dtype=np.float64)

    # np.repeat along axis 0 repeats each row in place (a, a, ..., b, b, ...),
    # which is exactly the layout the nested loops produced.
    return np.repeat(features, num_captions, axis=0)

In [3]:
# np.load on an .npz archive returns a lazy NpzFile that keeps the file open,
# so read the three splits inside a `with` block to close the handle afterwards.
with np.load('bottleneck_features/Flicker8k_bottleneck_features_VGG16_avgpooling.npz') as bottleneck_features:
    bottleneck_features_train = bottleneck_features["train"]
    bottleneck_features_validation = bottleneck_features["validation"]
    bottleneck_features_test = bottleneck_features["test"]

# Repeat each image's features once per caption so that feature rows and
# caption rows can be paired one-to-one.
bottleneck_features_train_dup = duplicate_bottleneck_features(bottleneck_features_train)
bottleneck_features_validation_dup = duplicate_bottleneck_features(bottleneck_features_validation)
bottleneck_features_test_dup = duplicate_bottleneck_features(bottleneck_features_test)

In [4]:
# Show the shape of the duplicated features for the train, validation and test splits.
for dup_features in (
    bottleneck_features_train_dup,
    bottleneck_features_validation_dup,
    bottleneck_features_test_dup,
):
    print(dup_features.shape)

(30000, 512)
(5000, 512)
(5000, 512)


## Word Embedding

In [5]:
from caption_utils import *
# Per-split lists returned by caption_utils (presumably image file names — confirm there).
train_fns_list, dev_fns_list, test_fns_list = load_split_lists()

# Raw captions for each split; the vocabulary is built from the training captions only.
train_captions_raw, dev_captions_raw, test_captions_raw = get_caption_split()
vocab = create_vocab(train_captions_raw)
token2idx, idx2token = vocab_to_index(vocab)

# process_captions receives copies rather than the raw caption objects themselves
# (presumably because it modifies its input — confirm in caption_utils).
captions_data = tuple(
    raw_captions.copy()
    for raw_captions in (train_captions_raw, dev_captions_raw, test_captions_raw)
)
train_captions, dev_captions, test_captions = process_captions(captions_data, token2idx)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [6]:
# Vocabulary size; the same number is embedded in the one-hot captions file name.
vocab_size = len(vocab)
print(vocab_size)

2531


## Caption Preprocessing


In [7]:
# The archive name embeds the vocabulary size, so it must match the vocab built above.
captions_path = 'preprocessed_captions/Flicker8k_onehot_' + str(len(vocab)) + '_words.npz'

# NpzFile loads arrays lazily and keeps the file open; the `with` block closes it.
# Each split is converted as soon as it is read so the raw arrays of all three
# splits are never held at once, and copy=False skips the extra copy when the
# stored array is already float32.
with np.load(captions_path) as captions_onehot:
    train_captions_onehot = captions_onehot["train"].astype(np.float32, copy=False)
    validation_captions_onehot = captions_onehot["validation"].astype(np.float32, copy=False)
    test_captions_onehot = captions_onehot["test"].astype(np.float32, copy=False)

In [8]:
# Show the one-hot caption shape for the train, validation and test splits.
for onehot_split in (
    train_captions_onehot,
    validation_captions_onehot,
    test_captions_onehot,
):
    print(onehot_split.shape)

(30000, 39, 2531)
(5000, 39, 2531)
(5000, 39, 2531)


In [9]:
# Print the dtype of each split on one line to confirm the float32 conversion.
print(*(
    onehot_split.dtype
    for onehot_split in (train_captions_onehot, validation_captions_onehot, test_captions_onehot)
))

float32 float32 float32


In [10]:
def captions_onehot_split(captions_onehot):
    """Split one-hot captions into decoder input and decoder target.

    Args:
        captions_onehot: array indexed as (caption, timestep, word).

    Returns:
        A ``(decoder_input, decoder_target)`` tuple of views into
        ``captions_onehot``: the input drops the last timestep, the target
        keeps every timestep.
    """
    # NOTE(review): the target is not shifted by one step relative to the input;
    # it has one more timestep than the input. Confirm the training code expects this.
    decoder_input = captions_onehot[:, :-1, :]
    decoder_target = captions_onehot[:, :, :]
    return decoder_input, decoder_target

### Captions -> decoder input and target data (train, validation, test)

In [11]:
# Split every set of one-hot captions into its (decoder input, decoder target) pair.
(
    (train_decoder_input, train_decoder_target),
    (validation_decoder_input, validation_decoder_target),
    (test_decoder_input, test_decoder_target),
) = (
    captions_onehot_split(onehot_split)
    for onehot_split in (train_captions_onehot, validation_captions_onehot, test_captions_onehot)
)

In [12]:
# The duplicated bottleneck features act as the encoder output; store them as float32.
train_encoder_output = bottleneck_features_train_dup.astype("float32")
validation_encoder_output = bottleneck_features_validation_dup.astype("float32")
test_encoder_output = bottleneck_features_test_dup.astype("float32")

In [13]:
# Collapse the one-hot decoder inputs to integer word indices (argmax over the word axis).
# The indices are derived from the one-hot captions instead of from the
# *_decoder_input names themselves, so re-running this cell gives the same
# result rather than reducing the already-reduced arrays a second time.
train_decoder_input = np.argmax(captions_onehot_split(train_captions_onehot)[0], axis=-1)
validation_decoder_input = np.argmax(captions_onehot_split(validation_captions_onehot)[0], axis=-1)
test_decoder_input = np.argmax(captions_onehot_split(test_captions_onehot)[0], axis=-1)

In [15]:
# Summarise shape and dtype of the three training arrays before saving them.
for label, array in (
    ("Decoder Input", train_decoder_input),
    ("Decoder Target", train_decoder_target),
    ("Encoder Output", train_encoder_output),
):
    print(label, array.shape, array.dtype)

Decoder Input (30000, 38) int64
Decoder Target (30000, 39, 2531) float32
Encoder Output (30000, 512) float32


# Saving Final Data to be used for Training

In [17]:
# Gather every array under the key it will have inside the archive.
final_arrays = {
    'train_encoder_output': train_encoder_output,
    'train_decoder_input': train_decoder_input,
    'train_decoder_target': train_decoder_target,
    'validation_encoder_output': validation_encoder_output,
    'validation_decoder_input': validation_decoder_input,
    'validation_decoder_target': validation_decoder_target,
    'test_encoder_output': test_encoder_output,
    'test_decoder_input': test_decoder_input,
    'test_decoder_target': test_decoder_target,
}

# np.savez appends the ".npz" extension because the name does not already end with it.
np.savez('train_dev_test', **final_arrays)