### Understanding the data

In [None]:
import json
from IPython.display import JSON
import pprint
from collections import defaultdict
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing import image
from keras.preprocessing import sequence as keras_seq
from keras.preprocessing.text import Tokenizer
from keras.applications.inception_v3 import preprocess_input, InceptionV3
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, RepeatVector, Embedding, LSTM, TimeDistributed, Input, Concatenate
from keras.optimizers import  Adam
import os
from time import time
import pickle
from operator import attrgetter
import math
import tensorflow as tf

In [None]:
# Load the dataset metadata (image entries and their captions) into memory.
with open('flickr8k/dataset.json', 'r') as f:
    json_data = json.load(f)

In [None]:
# Inspect the top-level keys of the dataset metadata.
json_data.keys()

In [None]:
# Show the value stored under the 'dataset' key.
json_data['dataset']

In [None]:
# List the fields available on a single image entry.
json_data['images'][0].keys()

In [None]:
# Pretty-print the first image entry to see its full structure.
pprint.pprint(json_data['images'][0])

### Ground truth image descriptions

In [None]:
def get_gt_image_descriptions(json_data):
    """Collect the raw ground-truth captions of every image.

    Args:
        json_data: parsed dataset metadata. ``json_data['images']`` is a list
            of entries, each with a ``'filename'`` and a list of
            ``'sentences'`` whose text is stored under ``'raw'``.

    Returns:
        A ``defaultdict(list)`` mapping image id (the filename up to its
        first dot) to the list of raw captions. Images without sentences
        get no entry.
    """
    captions_by_image = defaultdict(list)
    for entry in json_data['images']:
        image_id, _, _ = entry['filename'].partition('.')
        raw_captions = [sentence['raw'] for sentence in entry['sentences']]
        # Only touch the mapping when there is something to add, so images
        # without captions do not show up as empty entries.
        if raw_captions:
            captions_by_image[image_id].extend(raw_captions)
    return captions_by_image

In [None]:
# Group the raw captions by image id, then print the captions of one
# randomly chosen image.
descriptions = get_gt_image_descriptions(json_data)
# sample_image_id is reused by later cells to follow the same image around.
sample_image_id = np.random.choice(list(descriptions.keys()))
print('\n'.join(descriptions[sample_image_id]))

### Data cleaning

In [None]:
import string

In [None]:
# Characters that clean_descriptions strips from every token.
string.punctuation

In [None]:
def clean_descriptions(descriptions):
    """Normalize every caption in place and return the same mapping.

    Each caption is split on whitespace and lower-cased, punctuation is
    stripped from every token, and tokens that are a single character or
    contain anything other than letters are dropped. The surviving tokens
    are joined back with single spaces.

    Args:
        descriptions: mapping of image id to a list of caption strings. The
            lists are modified in place.

    Returns:
        The ``descriptions`` object that was passed in.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)

    def _clean(caption):
        # lower-case and de-punctuate first: the length/alpha filters below
        # must see the token as it will be stored
        words = (token.lower().translate(strip_punctuation) for token in caption.split())
        return ' '.join(word for word in words if len(word) > 1 and word.isalpha())

    for captions in descriptions.values():
        # slice assignment keeps the very same list object
        captions[:] = [_clean(caption) for caption in captions]
    return descriptions

In [None]:
# NOTE(review): this rebinds the name `clean_descriptions` from the function
# to its return value, so re-running this cell raises TypeError (the mapping
# is not callable) until the function definition cell is executed again.
# The function returns its argument, so this is the same object as
# `descriptions`, cleaned in place.
clean_descriptions = clean_descriptions(descriptions)

In [None]:
# Captions of the image sampled earlier, after cleaning.
clean_descriptions[sample_image_id]

In [None]:
# Number of captions per image; the printed sum is the total caption count.
all_descriptions = [len(clean_descriptions[key]) for key in clean_descriptions.keys()]
print(sum(all_descriptions))

### Save Descriptions

In [None]:
def save_descriptions(clean_descriptions, filename):
    """Write every caption to ``filename``, one ``<image_id> <caption>`` per line.

    Args:
        clean_descriptions: mapping of image id to a list of caption strings.
        filename: path of the text file to create or overwrite.
    """
    # Build every record before opening the file so the file is only touched
    # once all lines could be formatted.
    records = [
        image_id + ' ' + caption
        for image_id, captions in clean_descriptions.items()
        for caption in captions
    ]
    with open(filename, 'w') as out:
        out.write(''.join(record + '\n' for record in records))

In [None]:
# Persist the cleaned captions to descriptions.txt.
save_descriptions(clean_descriptions, 'descriptions.txt')

### Train and Test Split

In [None]:
# Split image ids (not individual captions) into train and test sets, so all
# captions of one image end up on the same side.
# NOTE(review): no random_state is passed, so the split is presumably
# different on every run — confirm whether reproducibility is needed.
train_clean_desc_keys, test_clean_desc_keys = train_test_split(list(clean_descriptions.keys()))
print('train size:', len(train_clean_desc_keys))
print('test size:', len(test_clean_desc_keys))

In [None]:
def load_train_clean_descriptions(train_clean_desc_keys, filename):
    """Load the captions of the train images from a descriptions file.

    Every line of ``filename`` is expected to look like
    ``<image_id> <caption words...>``. Blank lines are ignored.

    Args:
        train_clean_desc_keys: iterable of image ids that belong to the
            train split.
        filename: path of the descriptions file to read.

    Returns:
        A dict mapping each train image id found in the file to the list of
        its captions, each with the end-of-sequence marker ``' zeosz'``
        appended.
    """
    # Build the lookup once: membership in a set is O(1), whereas testing
    # against the original list on every line made the scan quadratic.
    train_ids = set(train_clean_desc_keys)
    train_clean_descriptions = {}
    with open(filename, 'r') as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # blank line: there is no image id to read
                continue
            image_id, image_desc = tokens[0], tokens[1:]
            # skip images not in the train set
            if image_id not in train_ids:
                continue
            # append the end token only; no start token is added to the text
            desc = ' '.join(image_desc) + ' zeosz'
            train_clean_descriptions.setdefault(image_id, []).append(desc)
    return train_clean_descriptions

In [None]:
# Reload from disk only the captions whose image id is in the train split.
train_descriptions = load_train_clean_descriptions(train_clean_desc_keys, 'descriptions.txt')

In [None]:
# Number of image ids (not captions) present in the train descriptions.
print('Descriptions: train={}'.format(len(train_descriptions)))

In [None]:
# sample_image_id was drawn from all images, so it can belong to the test
# split and be absent from train_descriptions; avoid a KeyError in that case.
print(train_descriptions.get(sample_image_id, 'sample image is not in the train split'))

### Create Vocabulary

In [None]:
def build_tokenizer(train_descriptions):
    """Fit a Keras ``Tokenizer`` on every caption in ``train_descriptions``.

    Args:
        train_descriptions: mapping of image id to a list of caption strings.

    Returns:
        The fitted ``Tokenizer``.
    """
    # flatten the per-image caption lists into one corpus
    corpus = []
    for captions in train_descriptions.values():
        corpus.extend(captions)
    fitted = Tokenizer()
    fitted.fit_on_texts(corpus)
    return fitted

In [None]:
# Fit the tokenizer on the train captions only.
tokenizer = build_tokenizer(train_descriptions)

In [None]:
# index 0 is reserved for padding, so the vocabulary size is the largest
# word index known to the tokenizer plus one
vocab_size = max(tokenizer.index_word)  + 1

In [None]:
# Report the vocabulary size (padding index included).
print('vocabulary size:', vocab_size)

### Image embeddings

In [None]:
from keras.applications.inception_v3 import InceptionV3
from keras.models import Model

In [None]:
# ImageNet-pretrained InceptionV3 without its classification head
# (include_top=False); pooling='avg' requests average pooling of the final
# feature maps, and the input is fixed to 299x299 RGB images.
image_model = InceptionV3(include_top=False, weights='imagenet', pooling='avg', input_shape=(299, 299, 3))

In [None]:
# Size shared by the image embedding, the word embeddings and the LSTM
# outputs.
embed_dim = 200

In [None]:
# Freeze every layer of the pretrained CNN.
for layer in image_model.layers:
    layer.trainable = False
# Normalize the CNN features, then project them to embed_dim.
dense_input = BatchNormalization(axis=-1)(image_model.output)
image_dense = Dense(units=embed_dim)(dense_input)
# Add a timestep dimension to match LSTM
image_embedding = RepeatVector(1)(image_dense)
# The CNN's own input tensor is reused as the image input of the full model.
image_input = image_model.input

In [None]:
# Inspect the tensors of the image branch.
print(image_input)
print(dense_input)
print(image_dense)
print(image_embedding)

### Word embeddings

In [None]:
# Second dimension of the CNN output shape, printed next to the embedding
# size.
feat_dim = image_model.output_shape[1]
print('embed_dim {}, feat_dim {}'.format(embed_dim, feat_dim))

In [None]:
# Variable-length sequence of word indices (shape=[None]), mapped by the
# embedding layer to embed_dim-sized vectors.
sentence_input = Input(shape=[None])
word_embedding = Embedding(input_dim=vocab_size, output_dim=embed_dim)(sentence_input)

In [None]:
# Inspect the tensors of the text branch.
print(sentence_input)
print(word_embedding)

### Use pretrained word vectors

In [None]:
# load pretrained word vectors
def load_word_vectors(word_vectors_path):
    """Load pretrained word vectors from a whitespace-separated text file.

    Every non-blank line is expected to hold a word followed by the
    components of its vector.

    Args:
        word_vectors_path: path of the word-vector text file.

    Returns:
        A dict mapping each word to a ``float32`` NumPy vector.
    """
    embeddings = {}
    # Read as UTF-8 explicitly: the vocabulary contains non-ASCII words, and
    # the platform default encoding is not UTF-8 everywhere.
    with open(word_vectors_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # blank line (e.g. a trailing newline at the end of the file)
                continue
            word = values[0]
            vector = np.asarray(values[1:], dtype=np.float32)
            embeddings[word] = vector
    return embeddings

In [None]:
# Load the GloVe vectors; the file name indicates 200-dimensional vectors,
# which matches embed_dim.
pretrained_word_vectors = load_word_vectors('glove/glove.6B.200d.txt')

In [None]:
# Number of words that have a pretrained vector.
print('Num. of word vectors:', len(pretrained_word_vectors))

In [None]:
def build_word_embedding_matrix(tokenizer, pretrained_word_vectors, vocab_size, embed_dim):
    """Build the initial embedding matrix from pretrained word vectors.

    Args:
        tokenizer: object whose ``word_index`` maps each word to its row.
        pretrained_word_vectors: mapping of word to vector.
        vocab_size: number of rows of the matrix.
        embed_dim: number of columns of the matrix.

    Returns:
        A ``(vocab_size, embed_dim)`` array. Row ``i`` holds the pretrained
        vector of the word with index ``i``; rows of words without a
        pretrained vector stay all-zero.

    NOTE(review): ``preprocess_caption_batch`` feeds word indices decremented
    by one, while rows here are keyed by the tokenizer's own index — confirm
    that the two agree.
    """
    matrix = np.zeros((vocab_size, embed_dim))
    for token, row in tokenizer.word_index.items():
        vector = pretrained_word_vectors.get(token)
        if vector is None:
            # no pretrained vector for this word: leave its row at zero
            continue
        matrix[row] = vector
    return matrix

In [None]:
# Build the embedding matrix for the vocabulary and check its shape.
word_embedding_matrix = build_word_embedding_matrix(tokenizer, pretrained_word_vectors, vocab_size, embed_dim)
print(word_embedding_matrix.shape)

### Encoder CNN and Decoder LSTM

In [None]:
# Concatenate along axis 1 with the image embedding first, so the image
# comes before the word embeddings in the sequence.
seq_input = Concatenate(axis=1)([image_embedding, word_embedding])

In [None]:
# Inspect the two tensors that were concatenated.
print(image_embedding)
print(word_embedding)

In [None]:
# Running input of the LSTM stack; it is rebound after every layer when the
# stack is built.
input_ = seq_input
print(input_)

In [None]:
# Stack of LSTM layers, each preceded by batch normalization. Every layer
# returns the full sequence (return_sequences=True), so the next layer and
# the final projection receive every timestep.
lstm_layers = 3
lstm_output_size = embed_dim
# the same rate is used for input dropout and recurrent dropout
dropout_rate = 0.22
for _ in range(lstm_layers):
    input_ = BatchNormalization(axis=-1)(input_)
    lstm_out = LSTM(
        units=lstm_output_size, return_sequences=True, dropout=dropout_rate, recurrent_dropout=dropout_rate
    )(input_)
    input_ = lstm_out
# Per-timestep projection to vocab_size scores; no activation is set on the
# Dense layer. NOTE: lstm_out is only bound inside the loop, so lstm_layers
# must be at least 1.
seq_output = TimeDistributed(Dense(units=vocab_size))(lstm_out)

In [None]:
# Full captioning model: image and word-index sequence in, per-timestep
# vocabulary scores out.
model_im2txt = Model(inputs=[image_input, sentence_input], outputs=seq_output)

In [None]:
# Print the layer-by-layer summary of the model.
model_im2txt.summary()

#### Set weights from pretrained word vectors

In [None]:
# Locate the word-embedding layer by type instead of a hard-coded position
# (layers[-9]): a fixed index silently points at a different layer as soon
# as the architecture changes, e.g. with another number of LSTM layers.
embedding_layer = next(layer for layer in model_im2txt.layers if isinstance(layer, Embedding))
# Initialize it with the pretrained vectors and keep them fixed in training.
embedding_layer.set_weights([word_embedding_matrix])
embedding_layer.trainable = False

#### Loss function

In [None]:
def categorical_crossentropy_from_logits(y_true, y_pred):
    """Softmax cross-entropy on raw scores, ignoring the last timestep.

    Both tensors are indexed as ``[:, :-1, :]``, i.e. they are rank-3 and
    the final position along axis 1 is dropped before the loss is computed.

    Args:
        y_true: target distribution per timestep.
        y_pred: unnormalized scores (logits) per timestep.

    Returns:
        The result of ``tf.nn.softmax_cross_entropy_with_logits_v2``, with no
        further reduction applied by this function.
    """
    return tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=y_true[:, :-1, :],
        logits=y_pred[:, :-1, :],
    )

#### Model compilation

In [None]:
# Compile with Adam and the custom logits-based cross-entropy loss.
learning_rate = 0.00051
model_im2txt.compile(optimizer=Adam(lr=learning_rate), loss=categorical_crossentropy_from_logits)

### Training

In [None]:
# Directory that image file names are joined onto to build image paths.
image_dir = 'flickr8k/images/'

In [None]:
from collections import namedtuple

In [None]:
# One training example: an image (file name and path) paired with a single
# caption; all captions of that image are carried along as well.
Datum = namedtuple('Datum', ['img_filename', 'img_path', 'caption_txt', 'all_captions_txt'])

In [None]:
def build_train_datum_list(train_clean_desc_keys, train_descriptions, image_dir):
    """Expand the train split into one ``Datum`` per (image, caption) pair.

    Args:
        train_clean_desc_keys: image ids of the train split.
        train_descriptions: mapping of image id to its list of captions;
            every id in ``train_clean_desc_keys`` must be present.
        image_dir: directory the ``<image_id>.jpg`` files are joined onto.

    Returns:
        A list of ``Datum`` tuples, grouped by image in key order.
    """
    examples = []
    for image_id in train_clean_desc_keys:
        filename = image_id + '.jpg'
        path = os.path.join(image_dir, filename)
        captions = train_descriptions[image_id]
        # every example of an image shares the same list of all its captions
        examples.extend(
            Datum(img_filename=filename, img_path=path, caption_txt=caption, all_captions_txt=captions)
            for caption in captions
        )
    return examples

In [None]:
# Flatten the train split into one example per (image, caption) pair.
train_datum_list = build_train_datum_list(train_clean_desc_keys, train_descriptions, image_dir)

In [None]:
# Inspect the first training example.
print(train_datum_list[0])

#### Build batch generator

In [None]:
# Number of (image, caption) examples per training batch.
batch_size = 32

In [None]:
def preprocess_an_image(img_path):
    """Load one image and prepare it as InceptionV3 input.

    Args:
        img_path: path of the image file.

    Returns:
        The image as an array resized to 299x299 and passed through
        ``inception_v3.preprocess_input``.
    """
    # target_size is (height, width); the channel count is not part of it
    img = image.load_img(img_path, target_size=(299, 299))
    img_array = image.img_to_array(img)
    # Use inception_v3.preprocess_input()
    img_array = preprocess_input(img_array)
    return img_array

In [None]:
def preprocess_images(imgs_path):
    """Lazily preprocess every image path with ``preprocess_an_image``.

    Args:
        imgs_path: iterable of image paths.

    Returns:
        A one-shot iterator of preprocessed image arrays; no image is loaded
        until the iterator is consumed.
    """
    return (preprocess_an_image(path) for path in imgs_path)

In [None]:
def encode_captions(tokenizer, captions_txt):
    """Encode caption strings with ``tokenizer.texts_to_sequences``.

    Args:
        tokenizer: object providing ``texts_to_sequences``.
        captions_txt: iterable of caption strings.

    Returns:
        Whatever ``texts_to_sequences`` returns for the captions.
    """
    encoded = tokenizer.texts_to_sequences(captions_txt)
    return encoded

In [None]:
def preprocess_img_batch(img_batch):
    """Materialize an iterable of image arrays into a single NumPy array.

    Args:
        img_batch: iterable (possibly lazy) of image arrays.

    Returns:
        ``np.array`` built from the list of all images in ``img_batch``.
    """
    images = [img for img in img_batch]
    return np.array(images)

In [None]:
def preprocess_caption_batch(caption_batch, tokenizer):
    """Turn a batch of encoded captions into model inputs and one-hot targets.

    Args:
        caption_batch: sequences of word indices, one per caption.
        tokenizer: object whose ``sequences_to_matrix`` builds the one-hot
            rows.

    Returns:
        ``(captions_input, captions_output)``: the padded captions with every
        non-zero index decremented by one, and the one-hot targets with
        column 0 removed. The targets are built from the captions padded to
        one extra position, so they carry one more timestep than the inputs.
    """
    # captions must have same length within a batch
    captions = keras_seq.pad_sequences(caption_batch, padding='post')
    # add 1 (first word is the image)
    captions_ext1 = keras_seq.pad_sequences(captions, maxlen=captions.shape[-1] + 1, padding='post')
    # one hot sequence (batch_size, seq_len, vocab_size): each caption is
    # handed to sequences_to_matrix as a column of single-index sequences
    captions_one_hot = map(tokenizer.sequences_to_matrix, np.expand_dims(captions_ext1, axis=-1))
    captions_one_hot = np.array(list(captions_one_hot), dtype='int')
    # except index 0 (i.e. remove padding index)
    # NOTE(review): this leaves one column fewer than sequences_to_matrix
    # produced — confirm it matches the width of the model output.
    captions_one_hot_shifted = captions_one_hot[:, :, 1:]
    # index - 1
    # NOTE(review): index 1 becomes 0 here, the same value used for padding —
    # confirm this is intended.
    captions_decreased = captions.copy()
    captions_decreased[captions_decreased > 0] -= 1
    
    captions_input = captions_decreased
    captions_output = captions_one_hot_shifted
    
    return captions_input, captions_output

In [None]:
def preprocess_batch(datum_batch, tokenizer):
    """Convert a list of ``Datum`` examples into one training batch.

    Args:
        datum_batch: examples exposing ``img_path`` and ``caption_txt``.
        tokenizer: tokenizer used to encode the captions and to build the
            one-hot targets.

    Returns:
        ``(X, y)`` where ``X`` is ``[img_input, captions_input]`` and ``y``
        is the one-hot caption target.
    """
    imgs_path = map(attrgetter('img_path'), datum_batch)
    captions_txt = map(attrgetter('caption_txt'), datum_batch)
    img_batch = preprocess_images(imgs_path)
    caption_batch = encode_captions(tokenizer, captions_txt)
    img_input = preprocess_img_batch(img_batch)
    # preprocess_caption_batch needs the tokenizer to build the one-hot
    # targets; calling it with the captions alone raised a TypeError
    captions_input, captions_output = preprocess_caption_batch(caption_batch, tokenizer)
    X, y = [img_input, captions_input], captions_output
    return X, y

In [None]:
# generate batch from train_datum_list
def batch_generator(train_datum_list, batch_size, tokenizer=None):
    """Yield preprocessed training batches forever, reshuffling every pass.

    Args:
        train_datum_list: examples to draw from; the list itself is never
            reordered.
        batch_size: number of examples per batch. The last batch of a pass
            holds the remaining examples and may be smaller.
        tokenizer: tokenizer handed to ``preprocess_batch``. When omitted,
            the module-level ``tokenizer`` is used so that two-argument
            calls keep working.

    Yields:
        ``(X, y)`` batches as returned by ``preprocess_batch``. Nothing is
        yielded when ``train_datum_list`` is empty.

    Raises:
        ValueError: on first iteration, if no tokenizer was passed and no
            module-level ``tokenizer`` exists.
    """
    if tokenizer is None:
        # the parameter shadows the module-level name, so look it up explicitly
        tokenizer = globals().get('tokenizer')
        if tokenizer is None:
            raise ValueError('batch_generator needs a tokenizer')
    # shallow copy so shuffling does not reorder the caller's list
    datum_list = list(train_datum_list)
    if not datum_list:
        # without this guard the loop below would spin forever, never yielding
        return
    while True:
        np.random.shuffle(datum_list)
        datum_batch = []
        for datum in datum_list:
            datum_batch.append(datum)
            if len(datum_batch) >= batch_size:
                yield preprocess_batch(datum_batch, tokenizer)
                datum_batch = []
        # leftover examples that did not fill a whole batch
        if datum_batch:
            yield preprocess_batch(datum_batch, tokenizer)