### Understanding the data

In [None]:
import json
from IPython.display import JSON
import pprint
from collections import defaultdict
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing import image
from keras.preprocessing import sequence as keras_seq
from keras.preprocessing.text import Tokenizer
from keras.applications.inception_v3 import preprocess_input, InceptionV3
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, RepeatVector, Embedding, LSTM, TimeDistributed, Input, Concatenate
from keras.optimizers import  Adam
from keras.callbacks import ModelCheckpoint
import os
from time import time
import pickle
from operator import attrgetter
import math
import tensorflow as tf
from copy import copy

In [None]:
# Read every record of the caption file, one list entry per line, with
# surrounding whitespace (including the newline) removed.
with open('flickr8k/Flickr8k.token.txt', 'r') as token_file:
    captions = [raw_line.strip() for raw_line in token_file]

In [None]:
# Number of caption records that were read.
print(len(captions))

### Ground truth image descriptions

In [None]:
def get_gt_image_descriptions(captions):
    """Group ground-truth captions by image identifier.

    Each record is expected to look like ``<image>#<n><TAB><caption>``. The
    trailing ``#<n>`` caption counter is removed so that all captions of one
    image end up under the same key.

    Args:
        captions: Iterable of raw caption records (strings).

    Returns:
        A ``defaultdict(list)`` mapping image identifier to the list of its
        captions, in the order they were encountered.
    """
    descriptions = defaultdict(list)
    for cap in captions:
        elems = cap.split('\t')
        if len(elems) < 2:
            # Blank or malformed record (no tab separator): nothing to store.
            continue
        # Drop the '#<n>' suffix whatever its width instead of blindly
        # slicing off the last two characters.
        fn = elems[0].rsplit('#', 1)[0]
        descriptions[fn].append(elems[1])
    return descriptions

In [None]:
# Group the raw records by image, then print all captions of one randomly
# chosen image as a sanity check.
descriptions = get_gt_image_descriptions(captions)
sample_image_id = np.random.choice(list(descriptions.keys()))
print('\n'.join(descriptions[sample_image_id]))

In [None]:
# Number of distinct image identifiers that have captions.
print(len(descriptions))

### Data cleaning

In [None]:
import string

In [None]:
# Display the characters that clean_descriptions() removes from captions.
string.punctuation

In [None]:
def clean_descriptions(descriptions):
    """Strip punctuation from every caption, in place.

    Each caption is split on whitespace, every character listed in
    ``string.punctuation`` is deleted from each token, and the surviving
    tokens are re-joined with single spaces. Letter case is left untouched
    and tokens containing digits are kept.

    Args:
        descriptions: Mapping of image identifier to a list of caption
            strings. The lists are modified in place.

    Returns:
        The same mapping object that was passed in.
    """
    table = str.maketrans('', '', string.punctuation)
    for desc_list in descriptions.values():
        for i, desc in enumerate(desc_list):
            stripped = (w.translate(table) for w in desc.split())
            # A token made only of punctuation (e.g. the final '.') becomes
            # '' after translation; drop it so the caption does not end up
            # with doubled or trailing spaces.
            desc_list[i] = ' '.join(w for w in stripped if w)
    return descriptions

In [None]:
# NOTE: this rebinds the name `clean_descriptions` from the function to the
# dict it returns, so this cell cannot be run a second time without first
# re-running the cell that defines the function.
clean_descriptions = clean_descriptions(descriptions)

In [None]:
# Number of images in the cleaned mapping.
len(clean_descriptions)

### Save Descriptions

In [None]:
def save_descriptions(clean_descriptions, filename):
    """Write every caption to *filename*, one ``<image id> <caption>`` per line.

    Args:
        clean_descriptions: Mapping of image identifier to a list of caption
            strings.
        filename: Path of the text file to create (overwritten if present).
    """
    with open(filename, 'w') as f:
        for key, desc_list in clean_descriptions.items():
            for desc in desc_list:
                # write() emits the string in one call; writelines() on a
                # plain string would iterate it character by character.
                f.write(key + ' ' + desc + '\n')

In [None]:
# Persist the cleaned captions to a text file in the working directory.
save_descriptions(clean_descriptions, 'descriptions.txt')

### Train/Test/Dev images

In [None]:
# Text files listing the images that belong to each dataset split.
TRAIN_IMAGES_FILE = 'flickr8k/Flickr_8k.trainImages.txt'
TEST_IMAGES_FILE = 'flickr8k/Flickr_8k.testImages.txt'
DEV_IMAGES_FILE = 'flickr8k/Flickr_8k.devImages.txt'

In [None]:
def get_images_list(images_file):
    """Return the lines of *images_file* with surrounding whitespace removed.

    Args:
        images_file: Path to a text file with one entry per line.

    Returns:
        A list with one stripped string per line of the file, in file order.
    """
    with open(images_file, 'r') as listing:
        return [entry.strip() for entry in listing]

In [None]:
# Load the image lists of the three splits and report their sizes.
train_images = get_images_list(TRAIN_IMAGES_FILE)
test_images = get_images_list(TEST_IMAGES_FILE)
dev_images = get_images_list(DEV_IMAGES_FILE)
print('Num. of train images:', len(train_images))
print('Num. of test images:', len(test_images))
print('Num. of dev images:', len(dev_images))

In [None]:
# Show the first entry of each split to check the identifier format.
print(train_images[0], test_images[0], dev_images[0])

### Train and Test Split

In [None]:
# End-of-sentence marker appended to every caption before tokenization.
EOS_TOKEN = 'zeosz'

In [None]:
def extract_clean_descriptions(images, clean_descriptions):
    """Select the captions of *images* and terminate each with EOS_TOKEN.

    Args:
        images: Iterable of image identifiers to keep.
        clean_descriptions: Mapping of image identifier to caption list.

    Returns:
        A dict holding, for every identifier of *images* that is present in
        *clean_descriptions*, a new list of its captions with
        ``' ' + EOS_TOKEN`` appended. Identifiers without captions are
        skipped.
    """
    return {
        img: [caption + ' ' + EOS_TOKEN for caption in clean_descriptions[img]]
        for img in images
        if img in clean_descriptions
    }

In [None]:
# EOS-terminated captions for the train and test splits.
train_descriptions = extract_clean_descriptions(train_images, clean_descriptions)
test_descriptions = extract_clean_descriptions(test_images, clean_descriptions)

In [None]:
# Number of images (not captions) that have descriptions in each split.
print('Descriptions: train={}'.format(len(train_descriptions)))
print('Descriptions: test={}'.format(len(test_descriptions)))

In [None]:
# Pick one random image identifier from each split for later inspection.
train_sample_image_id = np.random.choice(list(train_descriptions.keys()))
test_sample_image_id = np.random.choice(list(test_descriptions.keys()))
print('train sample image id:', train_sample_image_id)
print('test sample image id:', test_sample_image_id)

In [None]:
# Show the EOS-terminated captions of the two sampled images.
print('<train sample description>:', train_descriptions[train_sample_image_id])
print('<test sample description>:', test_descriptions[test_sample_image_id])

In [None]:
# Longest training caption, measured in space-separated pieces (the EOS
# token is included); 0 when there are no captions at all.
max_train_desc_len = max(
    (
        len(caption.split(' '))
        for caption_list in train_descriptions.values()
        for caption in caption_list
    ),
    default=0,
)
print(max_train_desc_len)

### Create Vocabulary

In [None]:
def build_tokenizer(train_descriptions):
    """Fit a Keras ``Tokenizer`` (default settings) on all training captions.

    Args:
        train_descriptions: Mapping of image identifier to caption list.

    Returns:
        The fitted ``Tokenizer``.
    """
    corpus = []
    for caption_list in train_descriptions.values():
        corpus.extend(caption_list)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(corpus)
    return tokenizer

In [None]:
# The vocabulary is built from the training captions only.
tokenizer = build_tokenizer(train_descriptions)

In [None]:
# Inverse vocabulary: token id -> word (used to turn predictions into text).
word_of = {i: w for w, i in tokenizer.word_index.items()}

In [None]:
# Dump the whole id -> word table for inspection.
print(word_of)

In [None]:
# index 0 is reserved for padding
# Largest token id assigned by the tokenizer = number of real words.
vocab_size_no_pad = max(tokenizer.index_word)
# One extra slot for the padding id 0.
vocab_size_with_pad = vocab_size_no_pad + 1

In [None]:
# Report both vocabulary sizes.
print('vocabulary size(no padding)', vocab_size_no_pad)
print('vocabulary size(with padding):', vocab_size_with_pad)

### Image embeddings

In [None]:
from keras.applications.inception_v3 import InceptionV3
from keras.models import Model

In [None]:
# ImageNet-pretrained InceptionV3 without its classification head, with
# average pooling on the output, used here as the image encoder.
image_model = InceptionV3(include_top=False, weights='imagenet', pooling='avg', input_shape=(299, 299, 3))

In [None]:
# Shared width of the image embedding, the word embeddings and the LSTM units.
embed_dim = 300

In [None]:
# Freeze every InceptionV3 layer.
for layer in image_model.layers:
    layer.trainable = False
# Normalize the CNN output, then project it to embed_dim so it can be
# concatenated with the word embeddings.
dense_input = BatchNormalization(axis=-1)(image_model.output)
image_dense = Dense(units=embed_dim)(dense_input)
# Add a timestep dimension to match LSTM
image_embedding = RepeatVector(1)(image_dense)
# The CNN's own input tensor doubles as the image input of the full model.
image_input = image_model.input

In [None]:
# Inspect the symbolic tensors of the image branch.
print(image_input)
print(dense_input)
print(image_dense)
print(image_embedding)

### Word embeddings

In [None]:
# Second entry of the CNN output shape (presumably the feature width after
# pooling); only printed here.
feat_dim = image_model.output_shape[1]
print('embed_dim {}, feat_dim {}'.format(embed_dim, feat_dim))

In [None]:
# Variable-length sequence of token ids.
sentence_input = Input(shape=[None])
# input_dim is vocab_size_no_pad because preprocess_caption_batch() shifts
# every non-zero token id down by one before feeding it to the model.
word_embedding = Embedding(input_dim=vocab_size_no_pad, output_dim=embed_dim)(sentence_input)

In [None]:
# Inspect the symbolic tensors of the text branch.
print(sentence_input)
print(word_embedding)

### Use pretrained word vectors

In [None]:
def load_word_vectors(word_vectors_path):
    """Load pretrained word vectors from a whitespace-separated text file.

    Every non-blank line is expected to hold a word followed by the
    components of its vector.

    Args:
        word_vectors_path: Path to the vectors file (read as UTF-8).

    Returns:
        A dict mapping each word to a 1-D ``float32`` numpy array.
    """
    embeddings = {}
    # The encoding is stated explicitly so non-ASCII words decode the same
    # way on every platform instead of depending on the locale default.
    with open(word_vectors_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. at end of file): no word to index.
                continue
            embeddings[values[0]] = np.asarray(values[1:], dtype=np.float32)
    return embeddings

In [None]:
# Choose the GloVe 6B file whose dimensionality equals embed_dim: each vector
# is later copied into an embed_dim-wide row of the embedding matrix, which
# the previously hard-coded 200d file cannot satisfy while embed_dim is 300.
pretrained_word_vectors = load_word_vectors('glove/glove.6B.{}d.txt'.format(embed_dim))

In [None]:
# Number of distinct words for which a pretrained vector was loaded.
print('Num. of word vectors:', len(pretrained_word_vectors))

In [None]:
def build_word_embedding_matrix(tokenizer, pretrained_word_vectors, vocab_size_no_pad, embed_dim):
    """Assemble the initial embedding weights from pretrained vectors.

    Args:
        tokenizer: Object exposing ``word_index`` (word -> 1-based token id).
        pretrained_word_vectors: Mapping of word to vector.
        vocab_size_no_pad: Number of rows of the matrix.
        embed_dim: Number of columns of the matrix.

    Returns:
        A ``(vocab_size_no_pad, embed_dim)`` array in which row ``id - 1``
        holds the pretrained vector of the word with that id; rows of words
        without a pretrained vector stay zero.
    """
    matrix = np.zeros((vocab_size_no_pad, embed_dim))
    for word, token_id in tokenizer.word_index.items():
        vector = pretrained_word_vectors.get(word)
        if vector is None:
            # Word unknown to the pretrained set: keep its all-zero row.
            continue
        matrix[token_id - 1] = vector
    return matrix

In [None]:
# Initial weights for the word-embedding layer, one row per vocabulary word.
word_embedding_matrix = build_word_embedding_matrix(tokenizer, pretrained_word_vectors, vocab_size_no_pad, embed_dim)
print(word_embedding_matrix.shape)

### Encoder CNN and Decoder LSTM

In [None]:
# Prepend the image embedding to the word embeddings along the time axis, so
# the image acts as the first element of the sequence.
seq_input = Concatenate(axis=1)([image_embedding, word_embedding])

In [None]:
# Inspect the two tensors that were concatenated.
print(image_embedding)
print(word_embedding)

In [None]:
# `input_` is the running tensor that the LSTM stack is built on.
input_ = seq_input
print(input_)

In [None]:
# Decoder hyper-parameters.
lstm_layers = 3
lstm_output_size = embed_dim
dropout_rate = 0.22
# Stack lstm_layers blocks of BatchNormalization -> LSTM; every LSTM returns
# the full sequence so the next block (and the output layer) sees each step.
for _ in range(lstm_layers):
    input_ = BatchNormalization(axis=-1)(input_)
    lstm_out = LSTM(
        units=lstm_output_size, return_sequences=True, dropout=dropout_rate, recurrent_dropout=dropout_rate
    )(input_)
    input_ = lstm_out
# Per-timestep projection to vocabulary scores; no activation is applied, so
# the outputs are logits.
seq_output = TimeDistributed(Dense(units=vocab_size_no_pad))(lstm_out)

In [None]:
# Full captioning model: (image, token ids) -> per-timestep vocabulary logits.
model_im2txt = Model(inputs=[image_input, sentence_input], outputs=seq_output)

In [None]:
# Print the layer-by-layer structure of the model.
model_im2txt.summary()

#### Set weights from pretrained word vectors

In [None]:
# Find the word-embedding layer by type instead of a hard-coded position
# such as layers[-9], which silently points at another layer as soon as the
# number of LSTM blocks changes. Assumes the model holds exactly one
# Embedding layer (the one applied to sentence_input).
embedding_layer = next(
    layer for layer in model_im2txt.layers if isinstance(layer, Embedding)
)
# Initialize it with the pretrained vectors and keep them fixed in training.
embedding_layer.set_weights([word_embedding_matrix])
embedding_layer.trainable = False

#### Loss function and Metric

In [None]:
def categorical_crossentropy_from_logits(y_true, y_pred):
    """Softmax cross-entropy between one-hot targets and raw logits.

    The final timestep of both tensors is dropped before the loss is
    computed.

    Args:
        y_true: One-hot targets, indexed as (batch, time, vocabulary).
        y_pred: Logits with the same layout.

    Returns:
        The tensor produced by ``tf.nn.softmax_cross_entropy_with_logits``.
    """
    # Discard the last timestep
    labels = y_true[:, :-1, :]
    logits = y_pred[:, :-1, :]
    return tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits)

In [None]:
def categorical_accuracy_with_variable_timestep(y_true, y_pred):
    """Word-level accuracy that ignores padded timesteps.

    Args:
        y_true: One-hot targets, indexed as (batch, time, vocabulary). Rows
            that are entirely zero are treated as padding.
        y_pred: Model scores with the same layout.

    Returns:
        Scalar float32 tensor: the fraction of non-padding timesteps whose
        arg-max prediction equals the arg-max target.
    """
    # Discard the last timestep
    y_true = y_true[:, :-1, :]
    # The slice is spelled exactly like the one above (the original read
    # '[:, :-1 :]', which only worked by accident of slice syntax).
    y_pred = y_pred[:, :-1, :]
    # Flatten the timestep dimension
    shape = tf.shape(y_true)
    y_true = tf.reshape(y_true, [-1, shape[-1]])
    y_pred = tf.reshape(y_pred, [-1, shape[-1]])
    # Discard rows that are all zeros as they represent padding words
    is_zero_y_true = tf.equal(y_true, 0)
    is_zero_row_y_true = tf.reduce_all(is_zero_y_true, axis=-1)
    is_real_row = tf.logical_not(is_zero_row_y_true)
    y_true = tf.boolean_mask(y_true, is_real_row)
    y_pred = tf.boolean_mask(y_pred, is_real_row)

    hits = tf.equal(tf.argmax(y_true, axis=1), tf.argmax(y_pred, axis=1))
    accuracy = tf.reduce_mean(tf.cast(hits, dtype=tf.float32))
    return accuracy

#### Model compilation

In [None]:
# Compile with Adam and the custom loss / metric defined in this notebook.
learning_rate = 0.00051
model_im2txt.compile(optimizer=Adam(lr=learning_rate), loss=categorical_crossentropy_from_logits, metrics=[categorical_accuracy_with_variable_timestep])

### Training

In [None]:
# Directory that holds the image files.
image_dir = 'flickr8k/images/'

In [None]:
from collections import namedtuple

In [None]:
# One training/test example: an image (file name and path), one of its
# captions, and the full list of captions of that image.
Datum = namedtuple('Datum', ['img_filename', 'img_path', 'caption_txt', 'all_captions_txt'])

In [None]:
def build_datum_list(clean_desc_keys, descriptions, image_dir):
    """Expand image identifiers into one ``Datum`` per (image, caption) pair.

    Args:
        clean_desc_keys: Iterable of image identifiers; each must be a key
            of *descriptions*.
        descriptions: Mapping of image identifier to its list of captions.
        image_dir: Directory containing the image files.

    Returns:
        A list of ``Datum`` tuples; every datum of one image shares the same
        ``all_captions_txt`` list object.
    """
    datum_list = []
    for k in clean_desc_keys:
        # Identifiers may already carry the extension (e.g. when they come
        # straight from the split files); appending it again would produce
        # a non-existent '<name>.jpg.jpg'.
        img_filename = k if k.endswith('.jpg') else k + '.jpg'
        img_path = os.path.join(image_dir, img_filename)
        all_captions_txt = descriptions[k]
        datum_list.extend(
            Datum(
                img_filename=img_filename,
                img_path=img_path,
                caption_txt=desc,
                all_captions_txt=all_captions_txt,
            )
            for desc in all_captions_txt
        )
    return datum_list

In [None]:
# `train_clean_desc_keys` is not assigned anywhere earlier in the notebook,
# so it is derived here from the training captions themselves.
train_clean_desc_keys = list(train_descriptions)
# One Datum per (image, caption) pair of the training split.
train_datum_list = build_datum_list(train_clean_desc_keys, train_descriptions, image_dir)

In [None]:
# Inspect the first training example.
print(train_datum_list[0])

#### Build batch generator

In [None]:
# Number of (image, caption) pairs per training batch.
batch_size = 32

In [None]:
def preprocess_an_image(img_path):
    """Load one image and prepare it for InceptionV3.

    Args:
        img_path: Path of the image file.

    Returns:
        The image as an array resized to 299x299 and passed through
        ``inception_v3.preprocess_input``.
    """
    # RGB format. target_size is (height, width); the channel count is not
    # part of it.
    img = image.load_img(img_path, target_size=(299, 299))
    img_array = image.img_to_array(img)
    # Use inception_v3.preprocess_input()
    img_array = preprocess_input(img_array)
    return img_array

In [None]:
def preprocess_images(imgs_path):
    """Lazily apply ``preprocess_an_image`` to every path of *imgs_path*.

    Returns:
        An iterator; no image is loaded until it is consumed.
    """
    return (preprocess_an_image(path) for path in imgs_path)

In [None]:
def encode_captions(tokenizer, captions_txt):
    """Convert caption strings to token-id sequences.

    Args:
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        captions_txt: Iterable of caption strings.

    Returns:
        Whatever ``tokenizer.texts_to_sequences`` returns for the captions.
    """
    encoded = tokenizer.texts_to_sequences(captions_txt)
    return encoded

In [None]:
def preprocess_img_batch(img_batch):
    """Materialize an iterable of image arrays into one stacked numpy array."""
    images = [img for img in img_batch]
    return np.array(images)

In [None]:
def preprocess_caption_batch(caption_batch, tokenizer):
    """Turn encoded captions into the model's text input and its targets.

    Args:
        caption_batch: List of token-id sequences of possibly different
            lengths.
        tokenizer: Fitted tokenizer providing ``sequences_to_matrix``.

    Returns:
        A tuple ``(captions_input, captions_output)``:
        ``captions_input`` is the post-padded id matrix with every non-zero
        id decreased by one; ``captions_output`` is the per-timestep matrix
        built by ``sequences_to_matrix`` for the captions padded by one
        extra step, with column 0 (the padding id) removed.
    """
    # captions must have same length within a batch
    padded = keras_seq.pad_sequences(caption_batch, padding='post')
    # One extra trailing step: the model prepends the image to the word
    # sequence, so its output is one timestep longer than the text input.
    padded_plus_one = keras_seq.pad_sequences(
        padded, maxlen=padded.shape[-1] + 1, padding='post'
    )
    # Each timestep is encoded on its own -> (batch_size, seq_len, vocab_size)
    per_step_matrix = np.array(
        [
            tokenizer.sequences_to_matrix(steps)
            for steps in padded_plus_one[..., np.newaxis]
        ],
        dtype='int',
    )
    # Remove column 0, i.e. the padding id.
    captions_output = per_step_matrix[:, :, 1:]
    # Shift real ids down by one; padding stays 0 and therefore shares index
    # 0 with the word whose original id was 1.
    captions_input = np.where(padded > 0, padded - 1, padded)

    return captions_input, captions_output

In [None]:
def preprocess_batch(datum_batch, tokenizer):
    """Convert a list of ``Datum`` into one ``(X, y)`` pair for Keras.

    Args:
        datum_batch: Sequence of ``Datum`` tuples.
        tokenizer: Fitted tokenizer used to encode the captions.

    Returns:
        ``X = [image array, caption id matrix]`` and ``y`` = the target
        tensor produced by ``preprocess_caption_batch``.
    """
    paths = (datum.img_path for datum in datum_batch)
    texts = (datum.caption_txt for datum in datum_batch)
    # Images stay lazy until they are stacked below; captions are encoded
    # first.
    lazy_images = preprocess_images(paths)
    encoded = encode_captions(tokenizer, texts)
    img_input = preprocess_img_batch(lazy_images)
    captions_input, captions_output = preprocess_caption_batch(encoded, tokenizer)
    return [img_input, captions_input], captions_output

In [None]:
def batch_generator(datum_list, batch_size, tokenizer):
    """Endlessly yield preprocessed batches from a datum list.

    Works for both ``train_datum_list`` and ``test_datum_list``. A shallow
    copy of *datum_list* is reshuffled before every pass; each pass yields
    batches of *batch_size* datums followed by one smaller batch when the
    list length is not a multiple of *batch_size*.

    Yields:
        ``(X, y)`` pairs as returned by ``preprocess_batch``.
    """
    shuffled = copy(datum_list)
    while True:
        np.random.shuffle(shuffled)
        pending = []
        for datum in shuffled:
            pending.append(datum)
            if len(pending) < batch_size:
                continue
            yield preprocess_batch(pending, tokenizer)
            pending = []
        # Left-over datums of this pass form a final, smaller batch.
        if pending:
            yield preprocess_batch(pending, tokenizer)

In [None]:
# NOTE(review): this counts images (keys of train_descriptions), whereas the
# batch generator iterates train_datum_list, which holds one entry per
# caption. Steps derived from this value therefore cover only part of the
# datum list per epoch; confirm whether len(train_datum_list) was intended.
training_set_size = len(train_descriptions)
print(training_set_size)

In [None]:
# Batches per epoch, rounded up.
training_steps = int(math.ceil(1. * training_set_size / batch_size))
print(training_steps)

In [None]:
# Number of training epochs and size of the generator queue used by Keras.
epochs = 100
max_q_size = 10

In [None]:
def training_set(train_datum_list, batch_size, tokenizer):
    """Yield training batches forever; thin wrapper over ``batch_generator``."""
    yield from batch_generator(train_datum_list, batch_size, tokenizer)

In [None]:
# Checkpoint file name pattern: epoch number, training loss and accuracy.
fpath = 'weights.epoch{epoch:02d}-loss{loss:.2f}-acc{categorical_accuracy_with_variable_timestep:.2f}-.hdf5'
# Weights are saved after every epoch (save_best_only=False). The monitored
# quantity is the training loss because training runs without validation
# data, so no 'val_loss' is ever reported.
cp_cb = ModelCheckpoint(filepath=fpath, save_best_only=False, monitor='loss')

In [None]:
# Train from the endless batch generator; cp_cb writes the checkpoints.
model_im2txt.fit_generator(generator=training_set(train_datum_list, batch_size, tokenizer), steps_per_epoch=training_steps, epochs=epochs, max_queue_size=max_q_size, verbose=1, callbacks=[cp_cb])

### Predict

In [None]:
# Restore weights from a saved checkpoint file (presumably one written by the
# training run above; the name follows the fpath pattern).
model_im2txt.load_weights('weights.epoch96-loss1.74-acc0.34-.hdf5')

In [None]:
# The sampled identifier may already end in '.jpg' (it is taken from the
# keys of test_descriptions); only add the extension when it is missing so
# the path never becomes '<name>.jpg.jpg'.
test_img_filename = (
    test_sample_image_id
    if test_sample_image_id.endswith('.jpg')
    else test_sample_image_id + '.jpg'
)
test_img_path = os.path.join(image_dir, test_img_filename)

In [None]:
# Load the sampled test image at its original size for display.
test_img = image.load_img(test_img_path)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Display the sampled test image.
plt.imshow(test_img)

In [None]:
# `test_clean_desc_keys` is not assigned anywhere earlier in the notebook,
# so it is derived here from the test captions themselves.
test_clean_desc_keys = list(test_descriptions)
# One Datum per (image, caption) pair of the test split.
test_datum_list = build_datum_list(test_clean_desc_keys, test_descriptions, image_dir)

In [None]:
# Inspect the first test example.
print(test_datum_list[0])

In [None]:
def test_set(test_datum_list, batch_size, tokenizer):
    """Yield test batches forever; thin wrapper over ``batch_generator``."""
    yield from batch_generator(test_datum_list, batch_size, tokenizer)

In [None]:
def decode_captions(captions_pred):
    """Translate per-timestep vocabulary scores into words.

    Args:
        captions_pred: Array indexed as (batch, time, vocabulary); works for
            model outputs and for the one-hot targets alike.

    Returns:
        A list with one list of words per batch item. The final timestep is
        dropped and every remaining timestep is decoded through the
        module-level ``word_of`` table.
    """
    # discard the last word, then take the best-scoring index per timestep
    label_encoded = captions_pred[:, :-1, :].argmax(axis=-1)
    num_batches, num_words = label_encoded.shape

    # Real label = label in model + 1
    return [
        [word_of[label_encoded[caption_i, word_i] + 1] for word_i in range(num_words)]
        for caption_i in range(num_batches)
    ]

In [None]:
def basic_inference(X, model_im2txt, tokenizer, max_caption_length=20):
    """Run the model once on batch *X* and decode its output to words.

    Args:
        X: Model input batch, passed unchanged to ``predict_on_batch``.
        model_im2txt: The captioning model.
        tokenizer: Unused; kept for signature compatibility.
        max_caption_length: Unused; kept for signature compatibility.

    Returns:
        The list of word lists produced by ``decode_captions``.
    """
    return decode_captions(model_im2txt.predict_on_batch(X))

In [None]:
# Draw a single one-example batch from the test generator.
it = test_set(test_datum_list, 1, tokenizer)
batch_input, batch_output = next(it)

In [None]:
# Basic inference
captions_pred_str = basic_inference(batch_input, model_im2txt, tokenizer)
# First line: words decoded from the model output; second line: words
# decoded from the target tensor of the same batch, for comparison.
print(' '.join(captions_pred_str[0]))
print(' '.join(decode_captions(batch_output)[0]))

In [None]:
def beam_search_predict_on_batch(X, model_im2txt, tokenizer, max_caption_length=20):
    """Beam-search caption decoding for a batch (not implemented yet).

    Only the set-up is in place: the images are unpacked from *X* and the
    token id of ``EOS_TOKEN`` is looked up. The function currently returns
    ``None``.

    Args:
        X: Pair whose first element is the image batch array.
        model_im2txt: The captioning model (unused so far).
        tokenizer: Fitted tokenizer; must know ``EOS_TOKEN``.
        max_caption_length: Intended maximum caption length (unused so far).

    Raises:
        KeyError: If ``EOS_TOKEN`` is not in the tokenizer vocabulary.
    """
    imgs_input, _ = X
    batch_size = imgs_input.shape[0]
    # word_index is a dict (word -> id): it has to be subscripted; calling
    # it raised "TypeError: 'dict' object is not callable".
    EOS_ENCODED = tokenizer.word_index[EOS_TOKEN]
    # TODO: implement the beam search itself.
    return