## Caption Preprocessing

### Word Embedding

In [1]:
import os

from caption_utils import *

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
# Load the train/dev/test split definitions. The returned sequences are later
# indexed and iterated, and their elements are used as keys into the caption
# dictionaries (presumably image filenames -- confirm in caption_utils).
train_fns_list, dev_fns_list, test_fns_list = load_split_lists()

# Raw captions per split; each maps a key from the lists above to a list of
# caption strings.
train_captions_raw, dev_captions_raw, test_captions_raw = get_caption_split()
# The vocabulary is built from the training captions only.
vocab = create_vocab(train_captions_raw)
# Two lookup tables derived from the vocabulary (token -> index, index -> token,
# judging by the names).
token2idx, idx2token = vocab_to_index(vocab)     
# Pass copies so the *_raw dictionaries stay bound to the original objects.
# NOTE(review): .copy() is presumably a shallow copy; if process_captions mutates
# the inner caption lists, the raw data is still affected -- confirm.
captions_data = (train_captions_raw.copy(), dev_captions_raw.copy(), test_captions_raw.copy())
# Presumably tokenizes each caption and maps tokens through token2idx -- confirm.
train_captions, dev_captions, test_captions = process_captions(captions_data, token2idx)
print(len(vocab))

2531


In [3]:
# Both lookup tables must have exactly one entry per vocabulary item.
assert len(token2idx) == len(vocab) and len(idx2token) == len(vocab)

### One-hot encoding

In [4]:
from keras.utils import to_categorical

def one_hot_encode(caption_dictionary, num_classes=None):
    """One-hot encode every caption in a caption dictionary.

    Parameters
    ----------
    caption_dictionary : dict
        Maps each key (an image filename in this notebook) to a list of
        captions. Each caption is passed as-is to
        ``keras.utils.to_categorical``, so it must be a sequence of integer
        class indices smaller than ``num_classes``.
    num_classes : int, optional
        Width of each one-hot vector. Defaults to ``len(vocab)``, the size of
        the notebook-level vocabulary, which matches the previous behaviour.

    Returns
    -------
    dict
        A new dictionary with the same keys; each value is a list holding the
        ``to_categorical`` result for the corresponding caption, in the
        original order. The input dictionary is not modified.
    """
    if num_classes is None:
        # Fall back to the global vocabulary so existing calls keep working.
        num_classes = len(vocab)

    return {
        filename: [to_categorical(caption, num_classes=num_classes)
                   for caption in captions]
        for filename, captions in caption_dictionary.items()
    }

In [5]:
# One-hot encode the three splits in train, dev, test order.
train_captions_onehot, dev_captions_onehot, test_captions_onehot = map(
    one_hot_encode, (train_captions, dev_captions, test_captions))

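Reshape the one-hot encoded captions into fixed-size arrays of shape `(number of captions, max_words_in_sentence, vocabulary size)`. Captions shorter than the longest caption are padded with all-zero rows.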

In [6]:
# Collect the length of every caption, visiting the splits in
# train, dev, test order so the resulting list order is stable.
caption_lengths = []
for split_captions in (train_captions, dev_captions, test_captions):
    for image_captions in split_captions.values():
        caption_lengths.extend(len(tokens) for tokens in image_captions)

# The longest caption across all splits sets the padded sequence length.
max_words_in_sentence = max(caption_lengths)

print("There are {} number of captions in total.".format(len(caption_lengths)))
print("The maximum words in a sentence is {}".format(max_words_in_sentence))

There are 40000 number of captions in total.
The maximum words in a sentence is 37


## 1. Training captions

In [7]:
num_words = len(vocab)
num_captions_per_image = 5 # 5 stands for number of captions per image
total_captions = len(train_captions) * num_captions_per_image
print("There are {} distinct words in captions".format(num_words))
print("Thera are {} captions in training set".format(total_captions))

# Allocate the boolean tensor directly. np.zeros defaults to float64, so
# np.zeros(shape).astype(bool) would first build an array eight times larger
# than the final one and then copy it.
train_captions_onehot_processed = np.zeros(
    (total_captions, max_words_in_sentence, num_words), dtype=bool)
print(train_captions_onehot_processed.shape)

# Row i*num_captions_per_image + j holds caption j of image i. Only the first
# len(caption) word slots are written; the remaining slots stay all-zero and
# act as padding. This layout assumes every image has exactly
# num_captions_per_image captions.
for i, filename in enumerate(train_fns_list):
    for j, caption in enumerate(train_captions_onehot[filename]):
        train_captions_onehot_processed[i*num_captions_per_image + j, :len(caption)] = caption

# Sanity check: each one-hot row contributes exactly one True, so the number of
# True entries in a caption's slice must equal that caption's word count.
Check_word_lengths = [
    train_captions_onehot_processed[i*num_captions_per_image + j].sum()
    == len(train_captions[train_fns_list[i]][j])
    for i in range(len(train_captions))
    for j in range(num_captions_per_image)
]

assert sum(Check_word_lengths) == total_captions

There are 2531 distinct words in captions
Thera are 30000 captions in training set
(30000, 37, 2531)


## 2. Validation captions

In [8]:
num_words = len(vocab)
num_captions_per_image = 5 # 5 stands for number of captions per image
total_captions = len(dev_captions) * num_captions_per_image
print("There are {} distinct words in captions".format(num_words))
print("Thera are {} captions in validation set".format(total_captions))

# Allocate the boolean tensor directly. np.zeros defaults to float64, so
# np.zeros(shape).astype(bool) would first build an array eight times larger
# than the final one and then copy it.
dev_captions_onehot_processed = np.zeros(
    (total_captions, max_words_in_sentence, num_words), dtype=bool)
print(dev_captions_onehot_processed.shape)

# Row i*num_captions_per_image + j holds caption j of image i. Only the first
# len(caption) word slots are written; the remaining slots stay all-zero and
# act as padding. This layout assumes every image has exactly
# num_captions_per_image captions.
for i, filename in enumerate(dev_fns_list):
    for j, caption in enumerate(dev_captions_onehot[filename]):
        dev_captions_onehot_processed[i*num_captions_per_image + j, :len(caption)] = caption

# Sanity check: each one-hot row contributes exactly one True, so the number of
# True entries in a caption's slice must equal that caption's word count.
Check_word_lengths = [
    dev_captions_onehot_processed[i*num_captions_per_image + j].sum()
    == len(dev_captions[dev_fns_list[i]][j])
    for i in range(len(dev_captions))
    for j in range(num_captions_per_image)
]

assert sum(Check_word_lengths) == total_captions

There are 2531 distinct words in captions
Thera are 5000 captions in validation set
(5000, 37, 2531)


## 3. Test captions

In [9]:
num_words = len(vocab)
num_captions_per_image = 5 # 5 stands for number of captions per image
total_captions = len(test_captions) * num_captions_per_image
print("There are {} distinct words in captions".format(num_words))
print("Thera are {} captions in test set".format(total_captions))

# Allocate the boolean tensor directly. np.zeros defaults to float64, so
# np.zeros(shape).astype(bool) would first build an array eight times larger
# than the final one and then copy it.
test_captions_onehot_processed = np.zeros(
    (total_captions, max_words_in_sentence, num_words), dtype=bool)
print(test_captions_onehot_processed.shape)

# Row i*num_captions_per_image + j holds caption j of image i. Only the first
# len(caption) word slots are written; the remaining slots stay all-zero and
# act as padding. This layout assumes every image has exactly
# num_captions_per_image captions.
for i, filename in enumerate(test_fns_list):
    for j, caption in enumerate(test_captions_onehot[filename]):
        test_captions_onehot_processed[i*num_captions_per_image + j, :len(caption)] = caption

# Sanity check: each one-hot row contributes exactly one True, so the number of
# True entries in a caption's slice must equal that caption's word count.
Check_word_lengths = [
    test_captions_onehot_processed[i*num_captions_per_image + j].sum()
    == len(test_captions[test_fns_list[i]][j])
    for i in range(len(test_captions))
    for j in range(num_captions_per_image)
]

assert sum(Check_word_lengths) == total_captions

There are 2531 distinct words in captions
Thera are 5000 captions in test set
(5000, 37, 2531)


In [10]:
# Make sure the output directory exists; otherwise np.savez raises only after
# all of the preprocessing above has already been done.
os.makedirs('preprocessed_captions', exist_ok=True)

# Store the three padded one-hot tensors in a single archive under the keys
# 'train', 'test' and 'validation'. np.savez appends the '.npz' extension.
np.savez('preprocessed_captions/Flicker8k_onehot_'+str(len(vocab))+'_words',
        train=train_captions_onehot_processed,
        test=test_captions_onehot_processed,
        validation=dev_captions_onehot_processed)

In [11]:
# Eyeball check on the first training image: decode its first processed
# caption back to text and show the raw caption below it for comparison.
first_image = train_fns_list[0]
print(onehot_to_caption(idx2token, train_captions[first_image][0]))
print(train_captions_raw[first_image][0])

a black dog is running after a white dog in the snow
A black dog is running after a white dog in the snow .
