# Imports
The following packages are imported:


In [None]:
import tensorflow as tf
from tensorflow import keras
import tensorflow_hub as hub
from tensorflow.keras.layers import *

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import IPython

from PIL import Image
from PIL import ImageColor
from PIL import ImageDraw
from PIL import ImageFont
from PIL import ImageOps


import tempfile
from six.moves.urllib.request import urlopen
from six import BytesIO

import os
import pathlib

# For measuring the inference time.
import time

# Check available GPU devices.
# NOTE(review): gpu_device_name() is expected to return a single device name
# (an empty string when no GPU is visible) — confirm against the TF docs.
print("The following GPU devices are available: %s" % tf.test.gpu_device_name())

# Data
- data_dir is the path to the images and the `results.csv`
- image_dir is the path to the folder that contains only the images
- csv_file is the path to the `results.csv` file

In [None]:
# Root folder of the dataset: it holds the image folder and results.csv.
data_dir = '../input/flickr-image-dataset/flickr30k_images'
# Folder that contains the image files themselves.
image_dir = '/'.join((data_dir, 'flickr30k_images'))
# Caption table, one row per (image, caption) pair.
csv_file = '/'.join((data_dir, 'results.csv'))

Here we read the csv file as a dataframe and make some observations from it.
For a quick EDA we are going to 
- check the shape of the dataframe
- check the names of the columns
- count the unique image names

In [None]:
# Load the caption table; the file uses '|' as its field separator.
df = pd.read_csv(csv_file, sep='|')

# Quick EDA: overall shape, column labels and the number of distinct images.
print(f'[INFO] The shape of dataframe: {df.shape}')
print(f'[INFO] The columns in the dataframe: {df.columns}')
print(f'[INFO] Unique image names: {df["image_name"].nunique(dropna=False)}')

A quick observation: the dataframe has `158915` rows but only `31783` unique image names, so image names are repeated. On further inspection we will see that each image has 5 captions attached to it ($31783 \times 5 = 158915$).

While looking into the dataframe I found that row `19999` has a malformed entry, so I overwrite the caption in that row manually.

In [None]:
# Give the three columns fixed names and drop the per-image caption counter,
# which is not used anywhere else.
df.columns = ['image_name', 'comment_number', 'comment']
del df['comment_number']

# Under scrutiny I had found that row 19999 had a messed up entry.
# Use .loc for the write: the chained form df['comment'][19999] = ... assigns
# through an intermediate Series, which raises SettingWithCopyWarning and is a
# silent no-op when pandas copy-on-write is enabled.
df.loc[19999, 'comment'] = ' A dog runs across the grass .'

# Image names now hold the full path of each image file
df['image_name'] = image_dir + '/' + df['image_name']

# Wrap every caption as "<start> caption <end>" so the decoder can learn where
# a sentence begins and ends.
df['comment'] = "<start> " + df['comment'] + " <end>"

In [None]:
# Shuffle the dataframe.
# A fixed random_state keeps the positional train/val/test split identical
# across notebook runs; without it every run produces a different split.
# NOTE(review): rows are shuffled per caption, so the five captions of one
# image can land in different splits — confirm this leakage is acceptable.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()

In [None]:
# Total number of caption rows and the 70/10/20 row counts derived from it.
SIZE = df.shape[0]

train_size, val_size, test_size = (
    int(fraction * SIZE) for fraction in (0.7, 0.1, 0.2)
)

train_size, val_size, test_size

In [None]:
# Change `index` to inspect a different (caption, image) pair.
index = 10000

# Pull both fields of the chosen row in one lookup.
image_name, comment = df.loc[index, ['image_name', 'comment']]

print(comment)

IPython.display.Image(filename=image_name)

# Text Handling
- Define the size of the vocab, which is `5000`.
- Initialize the Tokenizer class, which
    - standardizes the text (all to lower case)
    - filters out punctuation
    - splits the text
    - creates the vocabulary (`<start>`, `<end>` and `<unk>` are defined)

In [None]:
# Choose the top 5000 words from the vocabulary
top_k = 5000
# The filter list leaves out '<' and '>' on purpose so the <start>/<end>
# markers survive tokenisation. It is a raw string because it contains a
# literal backslash: in a normal literal `\]` is an invalid escape sequence
# (the characters are the same, but Python warns about it).
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=top_k,
    oov_token="<unk>",
    filters=r'!"#$%&()*+.,-/:;=?@[\]^_`{|}~',
)

Here we fit the `tokenizer` object on the captions. This builds (or updates) the vocabulary held by the `tokenizer` object.

Note that the fitted vocabulary does not start from `0`: in both dictionaries (`word_index` and `index_word`) the smallest index is `1`.

In [None]:
# Build the vocabulary from every caption in the dataframe (all splits, since
# the train/val/test slicing happens only later).
tokenizer.fit_on_texts(df['comment'])

In [None]:
# Sanity check: a word should map to an index that maps back to the same word.
def check_vocab(word):
    """Print the vocabulary index of ``word`` and the word stored at that index.

    Both lookups go through the module-level ``tokenizer``; a word missing
    from ``tokenizer.word_index`` raises ``KeyError``.
    """
    idx = tokenizer.word_index[word]
    print(f"The index of the word: {idx}")
    round_trip = tokenizer.index_word[idx]
    print(f"Index {idx} is word {round_trip}")


check_vocab("<end>")

Here we reserve index `0` for the `<pad>` token. The sentences are padded with it later so that all of them have the same length.

In [None]:
# Register '<pad>' under index 0 in both lookup tables, so that index 0 can be
# mapped back to a printable token.
tokenizer.word_index['<pad>'] = 0
tokenizer.index_word[0] = '<pad>'

In [None]:
# Download the pretrained GloVe 6B word vectors and unpack the archive quietly.
# NOTE(review): re-running this cell fetches the archive again — confirm that
# is acceptable, or guard the download.
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

In [None]:
# Width of the pretrained GloVe vectors; it also selects which file is read.
EMBEDDING_DIM = 100

# The original built this with os.path.join(expanduser("~"), "/kaggle/..."),
# but joining onto an absolute path discards the home directory, so the result
# was always this absolute path. Spell it out directly.
path_to_glove_file = f"/kaggle/working/glove.6B.{EMBEDDING_DIM}d.txt"

# word -> vector lookup parsed from the text file ("word v1 v2 ... vN" per line).
embeddings_index = {}
# Explicit encoding: the platform default may fail on the non-ASCII tokens.
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))


# One row per vocabulary index, filled from GloVe where a vector exists.
word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index), EMBEDDING_DIM))
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        # No pretrained vector for this token: its row stays all-zeros.
        continue
    embedding_matrix[row] = vector

In [None]:
# Create the tokenized vectors.
# Despite the name, train_seqs covers every caption in df; the train/val/test
# slicing is applied afterwards.
train_seqs = tokenizer.texts_to_sequences(df['comment'])

In [None]:
# Pad each vector to the max_length of the captions
# If you do not provide a max_length value, pad_sequences calculates it automatically
# padding='post' appends the padding after the caption tokens.
cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post')

In [None]:
# Expected to be (number of captions, padded caption length) — confirm on output.
cap_vector.shape

In [None]:
# Cut the padded captions at the two split boundaries: the first train_size
# rows are train, the next val_size rows are validation, the rest is test.
train_cap, val_cap, test_cap = np.split(
    cap_vector, [train_size, train_size + val_size]
)

train_cap.shape, val_cap.shape, test_cap.shape

In [None]:
# Wrap each caption split in a tf.data.Dataset that yields one caption per element.
cap_train_ds, cap_val_ds, cap_test_ds = map(
    tf.data.Dataset.from_tensor_slices, (train_cap, val_cap, test_cap)
)

cap_train_ds, cap_val_ds, cap_test_ds

# Image Handling
- Load the image
- decode jpeg
- resize
- standardize

In [None]:
def load_img(image_path):
    """Read a JPEG file and turn it into an InceptionV3-ready image tensor.

    Args:
        image_path: path of a JPEG file (string or scalar string tensor).

    Returns:
        The image resized to 299x299 with 3 channels and passed through
        InceptionV3's ``preprocess_input``.
    """
    # Raw, still-encoded file contents.
    raw_bytes = tf.io.read_file(image_path)
    # channels=3 forces RGB output; without it a grayscale JPEG decodes to a
    # single channel, which can neither be batched with RGB images nor fed
    # to InceptionV3.
    img = tf.image.decode_jpeg(raw_bytes, channels=3)
    # Resize the image to the height and width used throughout this notebook.
    img = tf.image.resize(img, (299, 299))
    # Apply the pixel scaling InceptionV3 was trained with.
    img = tf.keras.applications.inception_v3.preprocess_input(img)
    return img

In [None]:
# Image paths in dataframe order, cut at the same boundaries as the captions
# so that row i of every image split matches row i of the caption split.
img_name = df['image_name'].to_numpy()

train_img, val_img, test_img = np.split(
    img_name, [train_size, train_size + val_size]
)

train_img.shape, val_img.shape, test_img.shape

In [None]:
# Turn each list of paths into a dataset of decoded, preprocessed images.
# num_parallel_calls lets tf.data decode several files concurrently instead of
# one at a time; map() still yields elements in input order by default, which
# the later zip with the caption datasets relies on.
img_train_ds = tf.data.Dataset.from_tensor_slices(train_img).map(
    load_img, num_parallel_calls=tf.data.experimental.AUTOTUNE)
img_val_ds = tf.data.Dataset.from_tensor_slices(val_img).map(
    load_img, num_parallel_calls=tf.data.experimental.AUTOTUNE)
img_test_ds = tf.data.Dataset.from_tensor_slices(test_img).map(
    load_img, num_parallel_calls=tf.data.experimental.AUTOTUNE)

img_train_ds, img_val_ds, img_test_ds

# Joint Dataset
We have the individual datasets with us. We need to zip the img and the cap dataset now.

In [None]:
# Pair every image with its caption, element by element, for each split.
train_ds, val_ds, test_ds = (
    tf.data.Dataset.zip(pair)
    for pair in (
        (img_train_ds, cap_train_ds),
        (img_val_ds, cap_val_ds),
        (img_test_ds, cap_test_ds),
    )
)

In [None]:
# Cache, shuffle, batch and prefetch the datasets
AUTOTUNE = tf.data.experimental.AUTOTUNE
BATCH_SIZE = 256
SHUFFLE_BUFFER = 42  # shuffle *buffer size* in elements, not a random seed

# cache() comes before shuffle() so that cached elements are re-shuffled on
# every pass; shuffling before caching would freeze the first pass's order.
# NOTE(review): cache() without a filename keeps the decoded images in memory —
# confirm the split fits, or pass a cache file path.
train_ds = (
    train_ds
    .cache()
    .shuffle(SHUFFLE_BUFFER)
    .batch(BATCH_SIZE)
    .prefetch(buffer_size=AUTOTUNE)
)
# Evaluation splits gain nothing from shuffling, so they are only batched,
# cached and prefetched.
val_ds = val_ds.batch(BATCH_SIZE).cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.batch(BATCH_SIZE).cache().prefetch(buffer_size=AUTOTUNE)

Sanity check for the division of datasets

In [None]:
# Inspect one test batch: tensor shapes, the first image and its caption
# decoded back into words (padding shows up as '<pad>').
for img, cap in test_ds.take(1):
    print(img.shape)
    print(cap.shape)
    plt.imshow(img[0])
    for token_id in cap[0].numpy():
        print(tokenizer.index_word[token_id], end=' ')

## Model
### Show (Encoder)
- InceptionV3: This will act like the feature extractor
- Use an FC layer to extract the features of the image
- The features are fed to the RNN as the first time step of its input sequence

### Tell (Decoder)
- The image features are the first input step; the hidden state starts from zeros
- The text is embedded
- Usage of a GRU followed by dense layers to produce logits over the vocab
- Loss with captions

In [None]:
# Some global variables
# Embedding width; it must equal the width of the GloVe vectors loaded earlier.
EMBEDDING_DIM = 100
# Misspelled alias kept so the cells that still use the old name keep working.
EMBEDDIN_DIM = EMBEDDING_DIM
# Size of the decoder's output layer; matches the tokenizer's top_k.
VOCAB_SIZE = 5000
# Number of hidden units in the decoder GRU.
UNITS_RNN = 256

In [None]:
class CNN_Encoder(tf.keras.Model):
    """Image encoder: frozen InceptionV3, global average pooling, dense projection.

    Args:
        embedding_dim: number of units of the final dense layer, i.e. the size
            of the feature vector produced for each image.
    """

    def __init__(self, embedding_dim):
        super().__init__()
        self.embedding_dim = embedding_dim

    def build(self, input_shape):
        # ImageNet-pretrained backbone without its classification head; its
        # weights are frozen so only the projection layer is trained.
        self.inception = tf.keras.applications.InceptionV3(
            weights='imagenet',
            include_top=False,
        )
        self.inception.trainable = False
        self.gap = GlobalAveragePooling2D()
        self.fc = Dense(self.embedding_dim, activation='sigmoid')

    def call(self, x):
        # backbone feature map -> pooled vector -> embedding_dim-wide feature
        pooled = self.gap(self.inception(x))
        return self.fc(pooled)

In [None]:
# Smoke-test the encoder on a single training batch and report the output shape.
encoder = CNN_Encoder(EMBEDDIN_DIM)
for image, caption in train_ds.take(1):  # take(1) yields at most one batch
    print(encoder(image).shape)

In [None]:
class RNN_Decoder(tf.keras.Model):
    """GRU caption decoder: embedded sequence in, per-step vocabulary logits out.

    Args:
        embedding_dim: stored on the instance; note that the embedding layer
            itself is sized from the module-level ``EMBEDDING_DIM``.
        units: number of GRU units (also the width of the first dense layer).
        vocab_size: width of the output layer, i.e. number of logits per step.
    """

    def __init__(self, embedding_dim, units, vocab_size):
        super().__init__()
        self.units = units
        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size
        # Initialised from the pretrained matrix and fine-tuned during training.
        # Relies on the module-level word_index / EMBEDDING_DIM / embedding_matrix.
        self.embedding = Embedding(len(word_index),
                                   EMBEDDING_DIM,
                                   weights=[embedding_matrix],
                                   input_length=80,
                                   trainable=True)

    def build(self, input_shape):
        # return_sequences: one output per time step;
        # return_state: also hand back the final hidden state.
        self.gru = GRU(units=self.units,
                       return_sequences=True,
                       return_state=True)
        self.fc1 = Dense(self.units)
        self.fc2 = Dense(self.vocab_size)

    def call(self, x, initial_zero=False):
        """Run the GRU over ``x`` and project every step to vocabulary logits.

        Args:
            x: already-embedded input sequence, (batch, steps, features).
            initial_zero: when True, pass an explicit all-zero initial state
                to the GRU instead of leaving the state unspecified.

        Returns:
            ``(logits, state)`` — logits for every time step and the final
            GRU hidden state of shape (batch, units).
        """
        if initial_zero:
            # Use self, not the global `decoder`: the class must not depend on
            # a module-level instance existing (or being this instance).
            initial_state = self.reset_state(batch_size=x.shape[0])
            output, state = self.gru(inputs=x,
                                     initial_state=initial_state)
        else:
            output, state = self.gru(inputs=x)
        # output holds one `units`-wide vector per time step
        x = self.fc1(output)
        x = self.fc2(x)

        return x, state

    def embed(self, x):
        """Map integer token ids to their embedding vectors."""
        return self.embedding(x)

    def reset_state(self, batch_size):
        """Return an all-zero hidden state of shape (batch_size, units)."""
        return tf.zeros((batch_size, self.units))

In [None]:
# Smoke-test the decoder: image feature followed by the embedded caption goes
# in, per-step logits and the final GRU state come out.
decoder = RNN_Decoder(units=UNITS_RNN,
                      vocab_size=VOCAB_SIZE,
                      embedding_dim=EMBEDDIN_DIM)
for image, caption in train_ds.take(1):
    # Add a time axis so the feature can act as the first input step.
    features = tf.expand_dims(encoder(image), axis=1)  # (batch, 1, EMBEDDIN_DIM)
    em_words = decoder.embed(caption)
    x = tf.concat([features, em_words], axis=1)
    print(x.shape)
    predictions, state = decoder(x, True)
    print(predictions.shape, state.shape, sep='\n')

In [None]:
# Re-create both models so training starts from new instances rather than from
# the ones exercised in the smoke tests above.
encoder = CNN_Encoder(EMBEDDIN_DIM)
decoder = RNN_Decoder(embedding_dim=EMBEDDIN_DIM,
                      units=UNITS_RNN,
                      vocab_size=VOCAB_SIZE)

We use `Adam` as the optimizer.

The loss is `SparseCategoricalCrossentropy`, because it would be inefficient to use one-hot encodings as the ground truth. We also use a mask to ignore the `<pad>` positions so that the sequence model does not learn to overfit on them.

In [None]:
optimizer = tf.keras.optimizers.Adam()
# reduction='none' keeps one loss value per token so padding can be masked out
# before averaging; from_logits=True because the decoder outputs raw scores.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')


def loss_function(real, pred):
    """Mean cross-entropy over the non-padding tokens of a batch.

    Args:
        real: integer token ids, with 0 marking padding.
        pred: logits with one trailing vocabulary axis more than ``real``.

    Returns:
        Scalar loss: the summed per-token loss of the real (non-pad) tokens
        divided by the number of real tokens.
    """
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    # Normalise by the number of real tokens. Averaging over every position
    # (padding included) would scale the loss by each batch's share of padding.
    # The floor of 1 guards against a batch that is all padding.
    num_tokens = tf.maximum(tf.reduce_sum(mask), 1.0)
    return tf.reduce_sum(loss_) / num_tokens


In [None]:
@tf.function
def train_step(img_tensor, target):
    """Run one optimisation step on a batch and return its loss.

    Args:
        img_tensor: batch of preprocessed images.
        target: batch of padded caption token ids, (batch, caption_length).

    Returns:
        The scalar loss of this batch.
    """
    with tf.GradientTape() as tape:
        # Give the image feature a time axis so it becomes the first input step.
        img_feature = encoder(img_tensor)[:, tf.newaxis, :]
        decoder_input = tf.concat([img_feature, decoder.embed(target)], axis=1)
        predictions, _ = decoder(decoder_input, True)

        # Step i of the input (token i-1) must predict token i: drop the
        # prediction made from the image step and the one after the last token.
        loss = loss_function(target[:, 1:], predictions[:, 1:-1, :])

    variables = encoder.trainable_variables + decoder.trainable_variables
    optimizer.apply_gradients(zip(tape.gradient(loss, variables), variables))

    return loss

In [None]:
# Loss history, one entry appended per epoch by the training loop; kept in its
# own cell so re-running the training cell extends the history.
loss_plot = []

In [None]:
EPOCHS = 50
# Only this many batches are drawn from train_ds per epoch.
STEPS_PER_EPOCH = 20

for epoch in range(EPOCHS):
    total_loss = 0.0
    num_batches = 0

    for (batch, (img_tensor, target)) in enumerate(train_ds.take(STEPS_PER_EPOCH)):
        loss = train_step(img_tensor, target)
        total_loss += float(loss)
        num_batches += 1
        if batch % 10 == 0:
            print (f'Epoch {epoch} Batch {batch} Loss {loss.numpy():.4f}')

    # Average over the batches that actually ran (the dataset may hold fewer
    # than STEPS_PER_EPOCH) and store a plain float so plotting needs no tensors.
    epoch_loss = total_loss / max(num_batches, 1)
    # Store the epoch mean, not the last batch's loss, to plot later.
    loss_plot.append(epoch_loss)
    print(f'Loss: {epoch_loss:.4f}')

In [None]:
# Plot the recorded loss history, one point per completed epoch.
plt.plot(loss_plot)

## Inference

In [None]:
# Grab the first batch of the test split; only its first element is used below.
img, cap = next(iter(test_ds.take(1)))

img[0].shape, cap[0].shape

In [None]:
# Keep only the first sample of the batch, with a leading batch axis of size 1.
img = img[0][tf.newaxis, ...]
cap = cap[0][tf.newaxis, ...]

img.shape, cap.shape

In [None]:
# Encode the image and add a time axis so it can be fed to the decoder as a
# one-step sequence.
feature = tf.expand_dims(encoder(img),1) # (1, 1, EMBEDDIN_DIM)

feature.shape

In [None]:
# For the image
prediction, _ = decoder(feature, True)
print(prediction.shape)

In [None]:
# Feed the '<start>' token as a (1, 1) batch and sample the first caption word.
word = tf.reshape(tokenizer.word_index['<start>'], shape=(1,1))
em_words = decoder.embed(word)
print(em_words.shape)

# NOTE(review): no state from the image step is passed in here, so this
# prediction does not depend on the image — confirm whether that is intended.
prediction, _ = decoder(em_words)
# Sample (rather than argmax) one token id from the logits of the single step.
idx = tf.random.categorical(tf.squeeze(prediction,1), 1)[0][0].numpy()
word = tokenizer.index_word[idx]
print(word)

In [None]:
# Generate a caption for `img`, carrying the GRU state from step to step so
# that every sampled word depends on the image and on the words before it.
# (The original loop called decoder(em_words) on each step and threw the
# state away, so the words were sampled independently of the image.)
MAX_WORDS = 22  # hard stop for captions that never emit '<end>'

# Prime the GRU with the image feature — the same first input it sees in training.
_, state = decoder(feature, True)

word = '<start>'
for _ in range(MAX_WORDS):
    word_int = tf.reshape(tokenizer.word_index[word], shape=(1, 1))
    em_words = decoder.embed(word_int)
    # Drive the decoder's layers directly: decoder.call() has no way to accept
    # a previous state, which is what has to be threaded through here.
    output, state = decoder.gru(em_words, initial_state=state)
    prediction = decoder.fc2(decoder.fc1(output))
    idx = tf.random.categorical(tf.squeeze(prediction, 1), 1)[0][0].numpy()
    word = tokenizer.index_word[idx]
    if word == '<end>':
        break
    print(word, end=" ")

# Show the image that was actually captioned. The original plotted `image`,
# a leftover training batch from an earlier cell.
plt.imshow(img[0])
plt.show()