### Understanding the data

In [None]:
import json
from IPython.display import JSON
import pprint
from collections import defaultdict
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing import image
from keras.applications.inception_v3 import preprocess_input, InceptionV3
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, RepeatVector, Embedding, LSTM, TimeDistributed, Input, Concatenate
from keras.optimizers import  Adam
import os
from time import time
import pickle
from operator import attrgetter
from functools import partial

In [None]:
# JSON text is UTF-8 by specification; state it explicitly so that loading the
# dataset does not depend on the platform's default locale encoding.
with open('flickr8k/dataset.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)

In [None]:
json_data.keys()

In [None]:
json_data['dataset']

In [None]:
json_data['images'][0].keys()

In [None]:
pprint.pprint(json_data['images'][0])

### Ground truth image descriptions

In [None]:
def get_gt_image_descriptions(json_data):
    """Group the raw ground-truth captions by image id.

    The image id is the part of an entry's ``'filename'`` that precedes its
    first ``'.'``. Captions keep the order in which they appear.

    Args:
        json_data: Parsed dataset holding an ``'images'`` list; every entry
            has a ``'filename'`` and a ``'sentences'`` list of dicts, each
            with a ``'raw'`` caption.

    Returns:
        A ``defaultdict(list)`` mapping image id -> list of raw captions.
        Entries without sentences do not create a key.
    """
    captions_by_image = defaultdict(list)
    for entry in json_data['images']:
        image_id = entry['filename'].partition('.')[0]
        raw_captions = [sentence['raw'] for sentence in entry['sentences']]
        if raw_captions:
            captions_by_image[image_id].extend(raw_captions)
    return captions_by_image

In [None]:
descriptions = get_gt_image_descriptions(json_data)
# Pick one image id at random and show all of its captions, one per line.
sample_image_id = np.random.choice(list(descriptions))
print(*descriptions[sample_image_id], sep='\n')

### Data cleaning

In [None]:
import string

In [None]:
string.punctuation

In [None]:
def clean_descriptions(descriptions):
    """Normalise every caption of ``descriptions`` in place.

    Each caption is split on whitespace, lower-cased and stripped of ASCII
    punctuation. Tokens that are then shorter than two characters, or that
    are not purely alphabetic, are dropped; the remaining tokens are joined
    with single spaces.

    Args:
        descriptions: Mapping of image id -> list of caption strings. The
            lists are modified in place.

    Returns:
        The same ``descriptions`` mapping that was passed in.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)

    def _clean(caption):
        words = (
            token.lower().translate(strip_punctuation)
            for token in caption.split()
        )
        # Drops every single-character leftover (e.g. 'a', 's') as well as
        # any token containing digits or other non-letters.
        return ' '.join(w for w in words if len(w) > 1 and w.isalpha())

    for captions in descriptions.values():
        captions[:] = [_clean(caption) for caption in captions]
    return descriptions

In [None]:
# NOTE: this rebinds the name `clean_descriptions` from the function to the
# dict it returns, so running this cell a second time fails (a dict is not
# callable) until the function definition cell is executed again.
# The function cleans in place and returns its argument, so after this line
# `clean_descriptions` and `descriptions` refer to the same object.
clean_descriptions = clean_descriptions(descriptions)

In [None]:
clean_descriptions[sample_image_id]

In [None]:
# Number of captions stored for each image, then the total over all images.
all_descriptions = [len(desc_list) for desc_list in clean_descriptions.values()]
print(sum(all_descriptions))

### Save Descriptions

In [None]:
def save_descriptions(clean_descriptions, filename):
    """Write all captions to ``filename``, one ``"<image id> <caption>"`` per line.

    Args:
        clean_descriptions: Mapping of image id -> list of caption strings.
        filename: Path of the text file to create or overwrite.

    Returns:
        None. The file ends with a newline after every caption line.
    """
    with open(filename, 'w') as f:
        # writelines expects an iterable of complete lines; the previous code
        # passed single strings, which made it iterate character by character.
        f.writelines(
            key + ' ' + desc + '\n'
            for key, desc_list in clean_descriptions.items()
            for desc in desc_list
        )

In [None]:
save_descriptions(clean_descriptions, 'descriptions.txt')

### Train and Test Split

In [None]:
# Seed the split: the encoded-feature pickles further down are cached on disk
# and keyed by image id, so an unseeded split would pair a fresh train/test
# partition with stale cached features on the next run.
train_clean_desc_keys, test_clean_desc_keys = train_test_split(
    list(clean_descriptions.keys()), random_state=42
)
print('train size:', len(train_clean_desc_keys))
print('test size:', len(test_clean_desc_keys))

In [None]:
def load_train_clean_descriptions(train_clean_desc_keys, filename):
    """Load the captions of the training images and wrap them in sequence markers.

    Every non-blank line of ``filename`` is expected to look like
    ``"<image id> <token> <token> ..."``. Lines whose image id is not in
    ``train_clean_desc_keys`` are ignored.

    Args:
        train_clean_desc_keys: Iterable of image ids that form the train set.
        filename: Path of the descriptions file to read.

    Returns:
        Dict mapping image id -> list of captions, each rewritten as
        ``"startseq <tokens> endseq"`` with single spaces between tokens.
    """
    # Build the lookup once: membership on a list is O(n) for every line.
    train_keys = set(train_clean_desc_keys)
    train_clean_descriptions = {}
    with open(filename, 'r') as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # Blank line: there is no image id to read.
                continue
            image_id, image_desc = tokens[0], tokens[1:]
            # skip images not in the train set
            if image_id not in train_keys:
                continue
            # Joining the token list avoids a double space when the caption
            # is empty.
            desc = ' '.join(['startseq'] + image_desc + ['endseq'])
            train_clean_descriptions.setdefault(image_id, []).append(desc)
    return train_clean_descriptions

In [None]:
train_descriptions = load_train_clean_descriptions(train_clean_desc_keys, 'descriptions.txt')

In [None]:
print('Descriptions: train={}'.format(len(train_descriptions)))

In [None]:
# sample_image_id was drawn from the whole dataset, so it may have landed in
# the test split; show an arbitrary training image in that case instead of
# failing with KeyError.
if sample_image_id in train_descriptions:
    print(train_descriptions[sample_image_id])
else:
    print(train_descriptions[next(iter(train_descriptions))])

### Create Vocabulary

In [None]:
def create_vocabulary(clean_descriptions):
    """Collect the unique whitespace-separated tokens of all captions.

    Args:
        clean_descriptions: Mapping of image id -> list of caption strings.

    Returns:
        A ``set`` with every distinct token; empty when there are no captions.
    """
    vocabulary = set()
    # Plain loops: the previous list comprehension was used only for its side
    # effect and built a throwaway list of None values.
    for desc_list in clean_descriptions.values():
        for desc in desc_list:
            vocabulary.update(desc.split())
    return vocabulary

In [None]:
vocabulary = create_vocabulary(train_descriptions)

In [None]:
print('vocabulary size:', len(vocabulary))

In [None]:
vocab_size = len(vocabulary) + 1

### Feature Vector Extraction

In [None]:
from keras.applications.inception_v3 import InceptionV3
from keras.models import Model

In [None]:
image_model = InceptionV3(include_top=False, weights='imagenet', pooling='avg', input_shape=(299, 299, 3))

In [None]:
def preprocess(image_path):
    """Load an image file and turn it into a one-image batch for the CNN.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The result of ``preprocess_input`` applied to the image array with a
        leading batch axis of length 1.
    """
    # Inception v3 expects 299x299 inputs, so resize while loading.
    pil_img = image.load_img(image_path, target_size=(299, 299))
    pixels = image.img_to_array(pil_img)
    # Prepend the batch axis, then apply the Inception v3 input scaling.
    batch = pixels[np.newaxis, ...]
    return preprocess_input(batch)

def encode(img):
    """Return the CNN feature vector of one image.

    Args:
        img: Path of the image file (it is passed straight to ``preprocess``).

    Returns:
        1-D array holding the ``image_model`` prediction for that image.
    """
    batch = preprocess(img)
    features = image_model.predict(batch)
    # The batch holds a single image; drop the batch axis to get a flat vector.
    return features.reshape(features.shape[1])

In [None]:
image_dir = 'flickr8k/images/'

In [None]:
encoded_train_images_filename = 'encoded_train_images.pkl'
if not os.path.exists(encoded_train_images_filename):
    # No cache yet: run every training image through the CNN once.
    start = time()
    encoding_train = {}
    for base_img_fn in train_clean_desc_keys:
        img_fn = base_img_fn + '.jpg'
        image_file_path = os.path.abspath(os.path.join(image_dir, img_fn))
        if not os.path.exists(image_file_path):
            print('Not found image:', image_file_path)
            continue
        encoding_train[base_img_fn] = encode(image_file_path)
    print('encoding time for train:', time() - start)

    # Persist the features so that the `else` branch has a cache to load on
    # the next run; previously this file was never written.
    with open(encoded_train_images_filename, 'wb') as f:
        pickle.dump(encoding_train, f)
    # Both branches must define train_image_feats: later cells read it.
    train_image_feats = encoding_train
else:
    with open(encoded_train_images_filename, 'rb') as f:
        train_image_feats = pickle.load(f)

In [None]:
len(train_image_feats)

In [None]:
encoded_test_images_filename = 'encoded_test_images.pkl'
if not os.path.exists(encoded_test_images_filename):
    # No cache yet: run every test image through the CNN once.
    start = time()
    encoding_test = {}
    for i, base_img_fn in enumerate(test_clean_desc_keys):
        img_fn = base_img_fn + '.jpg'
        image_file_path = os.path.abspath(os.path.join(image_dir, img_fn))
        if not os.path.exists(image_file_path):
            print('Not found image:', image_file_path)
            continue
        print('{}: {}'.format(i, img_fn))
        encoding_test[base_img_fn] = encode(image_file_path)
    print('encoding time for test:', time() - start)

    with open(encoded_test_images_filename, "wb") as f:
        pickle.dump(encoding_test, f)
    # Both branches must define test_image_feats: later cells read it, and it
    # was previously set only when the cache already existed.
    test_image_feats = encoding_test
else:
    with open(encoded_test_images_filename, 'rb') as f:
        test_image_feats = pickle.load(f)

In [None]:
len(test_image_feats)

### Create Corpus

In [None]:
word_to_id = {}
id_to_word = {}

# Assign ids in sorted order. A set of strings has no stable iteration order
# across interpreter sessions, so iterating it directly would give every word
# a different id after each restart and would not match weights trained in an
# earlier session. set() also removes duplicates, which makes a membership
# check unnecessary.
for new_id, word in enumerate(sorted(set(vocabulary))):
    word_to_id[word] = new_id
    id_to_word[new_id] = word

In [None]:
# Reserve one extra id, placed right after the last real word, for the
# padding token '0'.
max_id = len(word_to_id)
id_to_word[max_id] = '0'
word_to_id['0'] = max_id

In [None]:
print(len(word_to_id))
print(len(id_to_word))

In [None]:
def max_length(clean_descriptions):
    """Return the token count of the longest caption among training images.

    Only image ids listed in the module-level ``train_clean_desc_keys`` are
    considered; token counts come from whitespace splitting.

    Args:
        clean_descriptions: Mapping of image id -> list of caption strings.

    Returns:
        The largest number of tokens in any considered caption, or 0 when
        there is none.
    """
    # Build the lookup once: membership on a list is O(n) for every key.
    train_keys = set(train_clean_desc_keys)
    longest = 0
    for key, desc_list in clean_descriptions.items():
        if key not in train_keys:
            continue
        # default=0 keeps an image with an empty caption list from raising.
        longest_here = max((len(desc.split()) for desc in desc_list), default=0)
        longest = max(longest, longest_here)
    return longest

In [None]:
max_len = max_length(train_descriptions)
print(max_len)

### Word Embeddings

In [None]:
embed_dim = 200
feat_dim = image_model.output_shape[1]
print('embed_dim {}, feat_dim {}'.format(embed_dim, feat_dim))

In [None]:
# load pretrained word vectors
def load_word_vectors(word_vectors_path):
    """Read pretrained word vectors from a whitespace-separated text file.

    Each non-blank line is expected to hold a word followed by its vector
    components.

    Args:
        word_vectors_path: Path of the vectors file.

    Returns:
        Dict mapping word -> 1-D ``np.float32`` array.
    """
    embeddings = {}
    # Read as UTF-8 explicitly: the vocabulary may contain non-ASCII words
    # and the platform default encoding is not guaranteed to decode them.
    with open(word_vectors_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline): nothing to parse.
                continue
            word, coefficients = values[0], values[1:]
            embeddings[word] = np.asarray(coefficients, dtype=np.float32)
    return embeddings

In [None]:
embeddings = load_word_vectors('glove/glove.6B.200d.txt')

In [None]:
print('Num. of word vectors:', len(embeddings))

In [None]:
embeddings.get('aaaaaa') is None

In [None]:
# Row i receives the pretrained vector of the word whose id is i; words that
# have no pretrained vector keep their all-zero row.
# NOTE(review): the padding token '0' is looked up like any other word, so if
# the pretrained file has an entry for '0' the padding row will not be zero.
# Confirm that this is intended.
word_embedding_matrix = np.zeros((vocab_size, embed_dim))
for word, i in word_to_id.items():
    pretrained = embeddings.get(word)
    if pretrained is None:
        continue
    word_embedding_matrix[i] = pretrained

In [None]:
print(word_embedding_matrix.shape)

### Encoder CNN and Decoder LSTM

In [None]:
# Mark every layer of the pretrained CNN as non-trainable.
for layer in image_model.layers:
    layer.trainable = False
# Batch-normalise the CNN output, then project it to embed_dim units so it has
# the same width as the word embeddings.
dense_input = BatchNormalization(axis=-1)(image_model.output)
image_dense = Dense(units=embed_dim)(dense_input)
# Add a timestep dimension to match LSTM
image_embedding = RepeatVector(1)(image_dense)
# The captioning model takes its image input from the CNN's own input tensor.
image_input = image_model.input

In [None]:
print(image_input)
print(dense_input)
print(image_dense)
print(image_embedding)

In [None]:
# Sequence of word ids; shape=[None] leaves the sequence length unspecified.
sentence_input = Input(shape=[None])
# Maps each word id to an embed_dim-wide vector. The pretrained weights are
# assigned to this layer later, after the model has been built.
# NOTE(review): no masking of padded positions is configured here — confirm
# how padding is meant to be handled.
word_embedding = Embedding(input_dim=vocab_size, output_dim=embed_dim)(sentence_input)

In [None]:
print(sentence_input)
print(word_embedding)

In [None]:
# Join along the timestep axis with the image embedding first, so the image
# becomes the first step of the sequence fed to the LSTM stack.
seq_input = Concatenate(axis=1)([image_embedding, word_embedding])

In [None]:
print(image_embedding)
print(word_embedding)

In [None]:
input_ = seq_input
print(input_)

In [None]:
# Stack `lstm_layers` blocks of BatchNormalization -> LSTM. Each LSTM returns
# the full sequence so the next block (and the final per-timestep classifier)
# receives one vector per timestep.
lstm_layers = 3
lstm_output_size = embed_dim
dropout_rate = 0.22
for _ in range(lstm_layers):
    input_ = BatchNormalization(axis=-1)(input_)
    lstm_out = LSTM(
        units=lstm_output_size, return_sequences=True, dropout=dropout_rate, recurrent_dropout=dropout_rate
    )(input_)
    # The output of this block is the input of the next one.
    input_ = lstm_out
# One vocab_size-wide Dense applied independently at every timestep.
# NOTE(review): no activation is passed to this Dense, so it presumably emits
# raw scores rather than probabilities — confirm this matches the loss used
# at compile time.
seq_output = TimeDistributed(Dense(units=vocab_size))(lstm_out)

In [None]:
model_im2txt = Model(inputs=[image_input, sentence_input], outputs=seq_output)

In [None]:
model_im2txt.summary()

In [None]:
# Locate the word-embedding layer by type rather than by position: an index
# such as layers[-9] points at a different layer as soon as the number of
# layers after it changes (for example when lstm_layers is modified).
embedding_layer = next(
    layer for layer in model_im2txt.layers if isinstance(layer, Embedding)
)
# Load the pretrained word vectors and keep them fixed during training.
embedding_layer.set_weights([word_embedding_matrix])
embedding_layer.trainable = False

In [None]:
# NOTE(review): the model's final TimeDistributed(Dense(...)) is built without
# an activation, while the string loss 'categorical_crossentropy' presumably
# expects probabilities. Confirm whether a softmax output or a from-logits
# variant of the loss is intended.
learning_rate = 0.00051
model_im2txt.compile(optimizer=Adam(lr=learning_rate), loss='categorical_crossentropy')

### Training

In [None]:
from collections import namedtuple

In [None]:
# One training example: an image (file name and path) paired with one of its
# captions, plus the full list of captions that belong to the same image.
Datum = namedtuple(
    'Datum',
    'img_filename img_path caption_txt all_captions_txt',
)

In [None]:
def build_train_datum_list(train_clean_desc_keys, train_descriptions):
    """Create one ``Datum`` per (training image, caption) pair.

    Args:
        train_clean_desc_keys: Iterable of training image ids.
        train_descriptions: Mapping of image id -> list of captions; it must
            contain every id in ``train_clean_desc_keys``.

    Returns:
        List of ``Datum`` in key order, then caption order. The image path is
        the module-level ``image_dir`` joined with ``"<id>.jpg"``, and every
        ``Datum`` of an image shares that image's caption list.
    """
    def _datums_for(image_id):
        filename = image_id + '.jpg'
        path = os.path.join(image_dir, filename)
        captions = train_descriptions[image_id]
        return [
            Datum(
                img_filename=filename,
                img_path=path,
                caption_txt=caption,
                all_captions_txt=captions,
            )
            for caption in captions
        ]

    return [
        datum
        for image_id in train_clean_desc_keys
        for datum in _datums_for(image_id)
    ]

In [None]:
train_datum_list = build_train_datum_list(train_clean_desc_keys, train_descriptions)

In [None]:
print(train_datum_list[0])

In [None]:
batch_size = 32

In [None]:
def preprocess_an_image(img_path):
    """Load one image file as a preprocessed array for the CNN.

    Args:
        img_path: Path of the image file to load.

    Returns:
        The result of ``preprocess_input`` applied to the image array
        (no batch axis is added).
    """
    # target_size is (height, width); the channel count does not belong in it.
    # This now matches the call in `preprocess`.
    img = image.load_img(img_path, target_size=(299, 299))
    img_array = image.img_to_array(img)
    return preprocess_input(img_array)

In [None]:
def preprocess_images(imgs_path):
    """Lazily preprocess several image files.

    Args:
        imgs_path: Iterable of image file paths.

    Returns:
        A single-pass iterator that yields ``preprocess_an_image(path)`` for
        each path, in order; nothing is loaded until it is consumed.
    """
    return (preprocess_an_image(path) for path in imgs_path)