### Understanding the data

In [None]:
import json
from IPython.display import JSON
import pprint
from collections import defaultdict
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing import image
from keras.preprocessing import sequence as keras_seq
from keras.preprocessing.text import Tokenizer
from keras.applications.inception_v3 import preprocess_input, InceptionV3
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, RepeatVector, Embedding, LSTM, TimeDistributed, Input, Concatenate
from keras.optimizers import  Adam
from keras.callbacks import ModelCheckpoint
import os
from time import time
import pickle
from operator import attrgetter
import math
import tensorflow as tf
from copy import copy

In [None]:
# Read every raw caption record (one per line) from the Flickr8k token file,
# trimming surrounding whitespace from each line.
captions = []
with open('flickr8k/Flickr8k.token.txt', 'r') as f:
    captions.extend(raw_line.strip() for raw_line in f)

In [None]:
# Sanity check: number of caption lines read from the token file.
print(len(captions))

### Ground truth image descriptions

In [None]:
def get_gt_image_descriptions(captions):
    """Group ground-truth captions by image file name.

    Each entry of *captions* is expected to look like
    ``<image file>#<caption index><TAB><caption text>``.

    Args:
        captions: Iterable of caption lines.

    Returns:
        A ``defaultdict(list)`` mapping the image file name (without the
        ``#<caption index>`` suffix) to its caption texts, in input order.
        Lines that contain no tab (blank or malformed lines) are skipped.
    """
    descriptions = defaultdict(list)
    for cap in captions:
        # Split on the first tab only, so a tab inside the caption text
        # does not truncate the caption.
        image_ref, tab, text = cap.partition('\t')
        if not tab:
            # Blank line or line without a caption part: nothing to record.
            continue
        # Drop the "#<index>" suffix whatever its width ("#0", "#10", ...);
        # a reference without '#' is used unchanged.
        name, hash_sep, _index = image_ref.rpartition('#')
        fn = name if hash_sep else image_ref
        descriptions[fn].append(text)
    return descriptions

In [None]:
# Build the image -> captions mapping, then show all captions of one
# randomly chosen image, one caption per line.
descriptions = get_gt_image_descriptions(captions)
sample_image_id = np.random.choice(list(descriptions))
print(*descriptions[sample_image_id], sep='\n')

In [None]:
# Number of distinct images that have at least one caption.
print(len(descriptions))

### Data cleaning

In [None]:
import string

In [None]:
# Display the set of characters that clean_descriptions() deletes from captions.
string.punctuation

In [None]:
def clean_descriptions(descriptions):
    """Strip punctuation from every caption, in place.

    Each caption is split on whitespace, every ``string.punctuation``
    character is deleted from each token, and the surviving tokens are
    re-joined with single spaces.  Tokens that consisted only of punctuation
    (for example a trailing ``.``) are dropped, so a cleaned caption never
    has leading, trailing or doubled spaces that would later be counted as
    an empty-string "word".  Letter case, single-letter words and tokens
    containing digits are left untouched.

    Args:
        descriptions: Mapping of image id to a list of caption strings.
            The lists are modified in place.

    Returns:
        The same *descriptions* mapping that was passed in.
    """
    table = str.maketrans('', '', string.punctuation)
    for desc_list in descriptions.values():
        # Overwrite element by element so the caller's list objects are kept.
        for i, desc in enumerate(desc_list):
            tokens = (w.translate(table) for w in desc.split())
            # Skip tokens that became empty once punctuation was removed.
            desc_list[i] = ' '.join(t for t in tokens if t)
    return descriptions

In [None]:
# clean_descriptions() edits the caption lists in place and returns its
# argument, so `cleaned_descriptions` and `descriptions` are the same object.
cleaned_descriptions = clean_descriptions(descriptions)

In [None]:
# Number of images in the cleaned mapping (displayed as the cell result).
len(cleaned_descriptions)

### Save Descriptions

In [None]:
def save_descriptions(cleaned_descriptions, filename):
    """Write all captions to *filename*, one ``<image id> <caption>`` per line.

    Args:
        cleaned_descriptions: Mapping of image id to a list of caption strings.
        filename: Path of the text file to create or overwrite.
    """
    # Build every output line before opening the file, so a bad entry fails
    # before an existing file is truncated.
    lines = [
        key + ' ' + desc + '\n'
        for key, desc_list in cleaned_descriptions.items()
        for desc in desc_list
    ]
    with open(filename, 'w') as f:
        # writelines() expects an iterable of strings; give it whole lines
        # in one call rather than a single string at a time.
        f.writelines(lines)

In [None]:
# Persist the cleaned captions to descriptions.txt in the working directory.
save_descriptions(cleaned_descriptions, 'descriptions.txt')

### Train/Test/Dev images

In [None]:
# Paths of the text files that list the image names of each dataset split;
# they are read with get_images_list().
TRAIN_IMAGES_FILE = 'flickr8k/Flickr_8k.trainImages.txt'
TEST_IMAGES_FILE = 'flickr8k/Flickr_8k.testImages.txt'
DEV_IMAGES_FILE = 'flickr8k/Flickr_8k.devImages.txt'

In [None]:
def get_images_list(images_file):
    """Read image names from *images_file*, one per line.

    Surrounding whitespace is stripped from every line.  Blank lines are
    skipped so they do not end up as empty image names or inflate the count.

    Args:
        images_file: Path of a text file with one image name per line.

    Returns:
        List of image names in file order.
    """
    with open(images_file, 'r') as f:
        stripped_lines = (line.strip() for line in f)
        return [name for name in stripped_lines if name]

In [None]:
# Load the image names of the three dataset splits.
train_images = get_images_list(TRAIN_IMAGES_FILE)
test_images = get_images_list(TEST_IMAGES_FILE)
dev_images = get_images_list(DEV_IMAGES_FILE)

# Report how many images each split contains.
print('Num. of train images: {}'.format(len(train_images)))
print('Num. of test images: {}'.format(len(test_images)))
print('Num. of dev images: {}'.format(len(dev_images)))

In [None]:
# Peek at the first image name of each split.
print(train_images[0], test_images[0], dev_images[0])

### Extract image features

In [None]:
# Instantiate InceptionV3 with its ImageNet weights.
# NOTE(review): image_encoder is not referenced again in this part of the
# notebook; the features used below are loaded from pickle files. Presumably
# they were produced with this model in an earlier run -- confirm.
image_encoder = InceptionV3(weights='imagenet')

In [None]:
# Load the stored training-image features from disk.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
with open('train_img_feats.pickle', 'rb') as f:
    train_img_feats = pickle.load(f)

In [None]:
# Sanity checks: number of entries in train_img_feats, the first training
# image name, and the length of the feature entry stored under that name.
print(len(train_img_feats.keys()))
print(train_images[0])
print(len(train_img_feats[train_images[0]]))

In [None]:
# Load the stored test-image features from disk.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
with open('test_img_feats.pickle', 'rb') as f:
    test_img_feats = pickle.load(f)

In [None]:
# Sanity checks: number of entries in test_img_feats, the first test image
# name, and the length of the feature entry stored under that name.
print(len(test_img_feats.keys()))
print(test_images[0])
print(len(test_img_feats[test_images[0]]))

### Train and Test Split

In [None]:
# Start-of-sequence marker that extract_clean_descriptions() puts in front of
# every caption.
# NOTE(review): presumably spelled this way so it cannot collide with a real
# caption word -- confirm.
SOS_TOKEN = 'zsosz'

In [None]:
# End-of-sequence marker that extract_clean_descriptions() appends to every
# caption.
# NOTE(review): presumably spelled this way so it cannot collide with a real
# caption word -- confirm.
EOS_TOKEN = 'zeosz'

In [None]:
def extract_clean_descriptions(images, cleaned_descriptions):
    """Select the captions of *images* and wrap each one in sequence markers.

    Images that have no entry in *cleaned_descriptions* are skipped.  Every
    selected caption becomes ``SOS_TOKEN + ' ' + caption + ' ' + EOS_TOKEN``.

    Args:
        images: Iterable of image ids to keep.
        cleaned_descriptions: Mapping of image id to a list of captions.

    Returns:
        A new dict mapping each kept image id to its list of wrapped captions.
    """
    return {
        image_id: [
            SOS_TOKEN + ' ' + caption + ' ' + EOS_TOKEN
            for caption in cleaned_descriptions[image_id]
        ]
        for image_id in images
        if image_id in cleaned_descriptions
    }

In [None]:
# Marker-wrapped captions for the train and test splits (dev is not extracted here).
train_descriptions = extract_clean_descriptions(train_images, cleaned_descriptions)
test_descriptions = extract_clean_descriptions(test_images, cleaned_descriptions)

In [None]:
# Number of images (not individual captions) with descriptions in each split.
print('Descriptions: train={}'.format(len(train_descriptions)))
print('Descriptions: test={}'.format(len(test_descriptions)))

In [None]:
# Pick one random image id from each split for a spot check.
train_sample_image_id = np.random.choice(list(train_descriptions))
test_sample_image_id = np.random.choice(list(test_descriptions))
print('train sample image id: {}'.format(train_sample_image_id))
print('test sample image id: {}'.format(test_sample_image_id))

In [None]:
# Show the marker-wrapped captions of the two sampled images.
print('<train sample description>:', train_descriptions[train_sample_image_id])
print('<test sample description>:', test_descriptions[test_sample_image_id])

In [None]:
# Length of the longest training caption, counted in space-separated tokens
# (the start/end markers are part of each caption); 0 when there are none.
max_train_desc_len = max(
    (
        len(caption.split(' '))
        for image_captions in train_descriptions.values()
        for caption in image_captions
    ),
    default=0,
)
print(max_train_desc_len)

### Create Vocabulary

In [None]:
# Vocabulary: every distinct space-separated token that occurs in the
# training captions.
train_words = {
    token
    for image_captions in train_descriptions.values()
    for caption in image_captions
    for token in caption.split(' ')
}
print(len(train_words))

In [None]:
# Token -> integer index, numbered in the iteration order of the train_words set.
# NOTE(review): the iteration order of a set of strings is not stable across
# interpreter runs (string hashes are randomized unless PYTHONHASHSEED is
# fixed), so these indices can change between sessions; consider
# enumerating sorted(train_words) here and for idx2word together.
word2idx = {val : idx for idx, val in enumerate(train_words)}

In [None]:
# Integer index -> token, numbered in the iteration order of the train_words set.
idx2word = dict(enumerate(train_words))

In [None]:
# Spot-check both lookup tables.
# 'zsosz' is the literal value of SOS_TOKEN.
print(word2idx['zsosz'])
# NOTE(review): 6254 is a hard-coded index; idx2word only has keys
# 0..len(train_words)-1, so this raises KeyError for a smaller vocabulary.
print(idx2word[6254])