# Image Captioning

### CSE 575: Statistical Machine Learning

#### Under the guidance of 
### Prof. Guoliang Xue



#### Authors: Ramulu Reddy Challa, Akhilesh Reddy Eppa, Nagarjuna Vemuri, Vrushabh Jambhulkar, Mohammed Sauban Mussaddique

### Note: Only run the cells needed

#### Imports

In [None]:
import os
import re
import numpy
import matplotlib.pyplot as plot
from collections import Counter
import pickle
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications.inception_v3 import preprocess_input as process_input_inception
from tensorflow.keras.applications.vgg16 import preprocess_input as process_input_vgg
from tensorflow.keras import Model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras import layers
from tensorflow.keras import backend
from tensorflow.keras.models import load_model
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu

#### Constants

In [None]:
# Project root, resolved against the current working directory.
BASE_PATH = os.path.abspath("drive/My Drive/Colab Notebooks/image-captioning")

#Source: https://www.kaggle.com/hsankesara/flickr-image-dataset
#Source: http://academictorrents.com/details/9dea07ba660a722ae1008c4c8afdd303b6f6e53b
# change this if using Flickr30k dataset
IMAGES_PATH = os.path.join(BASE_PATH, "data/Flickr8k/Flicker8k_Dataset/")
TRAIN_IMAGES_PATH = os.path.join(BASE_PATH, "data/Flickr8k/Flickr8k_text/Flickr_8k.trainImages.txt")
TEST_IMAGES_PATH = os.path.join(BASE_PATH, "data/Flickr8k/Flickr8k_text/Flickr_8k.testImages.txt")
RAW_CAPTION_PATH = os.path.join(BASE_PATH, "data/Flickr8k/Flickr8k_text/Flickr8k.token.txt")
VOCABULARY_PATH = os.path.join(BASE_PATH, "data/Flickr8k/vocabulary.pkl")

# Source: https://github.com/stanfordnlp/GloVe
GLOVE_WORD_EMBEDDINGS_PATH_100D = os.path.join(BASE_PATH, "data/glove/glove.6B.100d.txt")
GLOVE_WORD_EMBEDDINGS_PATH_200D = os.path.join(BASE_PATH, "data/glove/glove.6B.200d.txt")
GLOVE_WORD_EMBEDDINGS_PATH_300D = os.path.join(BASE_PATH, "data/glove/glove.6B.300d.txt")

# Paths to save files for future use (change names to reflect the models or datasets used)
PROCESSED_CAPTIONS_PATH = os.path.join(BASE_PATH, "data/preprocessed_captions.pkl")
WORD_TO_INDEX_PATH = os.path.abspath("../data/word_to_index.pkl")
# Must differ from WORD_TO_INDEX_PATH: both mappings are pickled one after the
# other, so a shared file name would leave only the second mapping on disk.
INDEX_TO_WORD_PATH = os.path.abspath("../data/index_to_word.pkl")
ENCODINGS_PATH = os.path.join(BASE_PATH, "data/encodings.pkl")
G_WORD_EMBEDDINGS_PATH_100D = os.path.join(BASE_PATH, "data/Flickr8k/g_word_embeddings_100d.pkl")
G_WORD_EMBEDDINGS_PATH_200D = os.path.join(BASE_PATH, "data/Flickr8k/g_word_embeddings_200d.pkl")
G_WORD_EMBEDDINGS_PATH_300D = os.path.join(BASE_PATH, "data/Flickr8k/g_word_embeddings_300d.pkl")
TEST_RESULTS_PATH = os.path.join(BASE_PATH, "res-inception-lstm-all-glove.pkl")


# Paths to load the saved models from (change names to reflect the models or datasets used)
MODEL_PATH = os.path.join(BASE_PATH, "inception-lstm-flickr8k-glove300d.h5")
MODEL_HISTORY_PATH = os.path.join(BASE_PATH, "inception-lstm-flickr8k-glove300d-history.pkl")

# Words occurring fewer than MIN_WORD_FREQ times are left out of the vocabulary.
MIN_WORD_FREQ = 8

# Active GloVe source file and the matching location of the vocabulary-aligned
# embedding matrix. Keep the two in sync when switching dimensionality.
GLOVE_WORD_EMBEDDINGS_PATH = GLOVE_WORD_EMBEDDINGS_PATH_100D
WORD_EMBEDDINGS_PATH = G_WORD_EMBEDDINGS_PATH_100D

# Data Preprocessing

## Data Cleaning

#### Extracting captions from Flickr8k dataset

In [None]:
# Parse the raw caption file into {image_name: [caption, ...]}.
# Every line is split on its first "#<digits>" marker: the text before the
# marker is the image name, the text after it is the caption.
caption_marker = re.compile(r"#\d+")  # raw string: "\d" is not a valid str escape

captions = {}
with open(RAW_CAPTION_PATH, "r") as file:
    for line in file:
        # maxsplit=1 keeps captions intact when they contain "#<n>" themselves.
        splits = caption_marker.split(line, maxsplit=1)
        if len(splits) < 2:
            # Blank or malformed line without a marker: nothing to extract.
            continue
        image_name = splits[0]
        # Collapse tabs, newlines and repeated spaces into single spaces.
        # Deleting them outright would glue neighbouring words together.
        caption = " ".join(splits[1].split())
        captions.setdefault(image_name, []).append(caption)
captions

#### Extracting captions from the Flickr30k dataset (run this instead of the Flickr8k cell above, after pointing the dataset paths at Flickr30k)

In [None]:
# Parse a "|"-separated caption file into {image_name: [caption, ...]},
# keeping only images that are actually present in IMAGES_PATH.
all_image_names = set(os.listdir(IMAGES_PATH))

captions = {}
with open(RAW_CAPTION_PATH, "r") as file:
    file.readline()  # discard the first line (presumably a header row)
    for line in file:
        splits = line.split("|")
        if len(splits) != 3:
            # Only lines with exactly three fields are used.
            continue
        image_name = splits[0].strip()
        if image_name not in all_image_names:
            continue
        # Collapse tabs, newlines and repeated spaces into single spaces.
        # Deleting them outright would glue neighbouring words together.
        caption = " ".join(splits[2].split())
        captions.setdefault(image_name, []).append(caption)
# captions

#### Sample display of images along with their captions

In [None]:
# Show a few hand-picked images, each followed by its captions.
sample_names = (
    '1007129816_e794419615.jpg',
    '1002674143_1b742ab4b8.jpg',
    '1022454428_b6b660a67b.jpg',
)
for sample_name in sample_names:
    pixels = plot.imread(os.path.join(IMAGES_PATH, sample_name))
    plot.imshow(pixels)
    plot.axis('off')
    plot.show()
    # One caption per line, then a blank separator before the next image.
    print("\n".join(captions[sample_name]))
    print("\n")

> Convert to lowercase <br> Remove words containing special characters or digits <br> Remove punctuation and single-letter words (except 'a', which is kept)

In [None]:
def wordFilter(word):
    """Tell whether ``word`` should be kept in a cleaned caption.

    A word is kept when it consists solely of lowercase ASCII letters and is
    either longer than one character or is exactly ``"a"``.
    """
    only_lowercase_letters = re.match(r"^[a-z]+$", word) is not None
    acceptable_length = len(word) != 1 or word == "a"
    return only_lowercase_letters and acceptable_length

In [None]:
# Clean every caption in place and gather the surviving words for counting.
words = []
for image_captions in captions.values():
    for position, raw_caption in enumerate(image_captions):
        kept_words = [
            token for token in raw_caption.lower().split(" ") if wordFilter(token)
        ]
        # Store the cleaned text wrapped in the sequence boundary markers.
        image_captions[position] = '<start> ' + ' '.join(kept_words) + ' <end>'
        words.extend(kept_words)
# Occurrence count of every kept word across all captions.
word_freq = Counter(words)

In [None]:
# Inspect the output of the cleaning step. A notebook cell renders only its
# last bare expression, so `captions` is what gets shown here.
word_freq
captions

> Remove words that occur fewer than `MIN_WORD_FREQ` times (currently 8)

In [None]:
# Frequent words plus the special markers, in sorted order, with the padding
# token pinned to position 0.
frequent_words = [
    token for token, occurrences in word_freq.items()
    if occurrences >= MIN_WORD_FREQ
]
vocabulary = sorted(frequent_words + ['<start>', '<end>', '<unk>'])
vocabulary.insert(0, '<pad>')
print(len(vocabulary))
vocabulary

#### Saving captions...

In [None]:
# Pickle the cleaned captions for future use.
with open(PROCESSED_CAPTIONS_PATH, "wb") as captions_file:
    pickle.dump(captions, captions_file)

#### Saving vocabulary...

In [None]:
# Pickle the vocabulary list for future use.
with open(VOCABULARY_PATH, "wb") as vocabulary_file:
    pickle.dump(vocabulary, vocabulary_file)

## Data Transformation (on captions)

#### Finding the length of the longest caption

In [None]:
# Longest caption measured in space-separated tokens.
max_length = 0
for image_captions in captions.values():
    longest_here = max(len(text.split(" ")) for text in image_captions)
    if longest_here > max_length:
        max_length = longest_here
max_length

#### Word to index mapping and Index to word mapping

In [None]:
# A word's index is its position in `vocabulary`, so '<pad>' maps to 0.
word_to_index = {token: position for position, token in enumerate(vocabulary)}
index_to_word = dict(enumerate(vocabulary))

print(word_to_index)
index_to_word

#### Saving word-to-index and index-to-word mappings

In [None]:
# Pickle both lookup tables, each to its own configured path.
for mapping, target_path in (
    (word_to_index, WORD_TO_INDEX_PATH),
    (index_to_word, INDEX_TO_WORD_PATH),
):
    with open(target_path, "wb") as mapping_file:
        pickle.dump(mapping, mapping_file)

## Extract Image Encodings

#### Loading training examples from Flickr_8k.trainImages.txt

In [None]:
# The split file lists one image file name per line.
with open(TRAIN_IMAGES_PATH, "r") as split_file:
    split_listing = split_file.read()
train_images = split_listing.strip().split("\n")
print(len(train_images))
train_images

#### Building the inception model

In [None]:
%%time
# Reset the Keras backend state before building a new model.
backend.clear_session()
# InceptionV3 initialised with its ImageNet weights.
inceptionv3 = InceptionV3(weights="imagenet")
input_layer = inceptionv3.layers[0].input
# Take the tensor that feeds the last layer as output, i.e. the network up to
# (but excluding) its final layer. Presumably a 2048-d vector, matching the
# Input(shape=(2048,)) of the caption model - TODO confirm.
output_layer = inceptionv3.layers[-1].input
inception_model = Model(inputs=input_layer, outputs=output_layer)

#### Building the vgg model

In [None]:
%%time
# Reset the Keras backend state before building a new model.
backend.clear_session()
# VGG16 initialised with its ImageNet weights.
vgg16 = VGG16(weights="imagenet")
input_layer = vgg16.layers[0].input
# Take the tensor that feeds the last layer as output, i.e. the network up to
# (but excluding) its final layer.
# NOTE(review): the caption model declares its image input as shape (2048,);
# confirm that matches the size of this feature vector before training on it.
output_layer = vgg16.layers[-1].input
vgg_model = Model(inputs=input_layer, outputs=output_layer)

#### Extracting encodings of the training examples (Inception model)

In [None]:
def get_encodings(image_path):
    """Return the Inception feature vector of the image at ``image_path``.

    The image is loaded at 299x299, converted to an array, preprocessed with
    the InceptionV3 preprocessing function and run through
    ``inception_model`` as a batch of one.

    Args:
        image_path: Path of the image file to encode.

    Returns:
        The model output for that single image (first row of the prediction).
    """
    img = image.load_img(image_path, target_size=(299, 299))
    img = image.img_to_array(img)
    # The file imports the Inception preprocessing under this alias; a bare
    # `preprocess_input` is never defined and would raise NameError.
    img = process_input_inception(img)
    # Add the leading batch axis expected by predict().
    img = numpy.expand_dims(img, axis=0)
    return inception_model.predict(img)[0]

#### Extracting encodings of the training examples (VGG model)

In [None]:
def get_encodings(image_path):
    """Return the VGG16 feature vector of the image at ``image_path``.

    The image is loaded at 224x224, converted to an array, preprocessed with
    the VGG16 preprocessing function and run through ``vgg_model`` as a batch
    of one.

    Args:
        image_path: Path of the image file to encode.

    Returns:
        The model output for that single image (first row of the prediction).
    """
    img = image.load_img(image_path, target_size=(224, 224))
    img = image.img_to_array(img)
    # The file imports the VGG16 preprocessing under this alias; a bare
    # `preprocess_input` is never defined and would raise NameError.
    img = process_input_vgg(img)
    # Add the leading batch axis expected by predict().
    img = numpy.expand_dims(img, axis=0)
    return vgg_model.predict(img)[0]

In [None]:
%%time
# Encode every training image (not just the first one) and key the feature
# vectors by image file name.
encodings = {}
for train_image in tqdm(train_images, desc="Encoded", position=0):
    encodings[train_image] = get_encodings(os.path.join(IMAGES_PATH, train_image))

# Print a summary instead of dumping thousands of arrays into the cell output.
print(len(encodings))
if encodings:
    print(encodings[train_image].shape)

#### Saving encodings...

In [None]:
# Pickle the image encodings for future use.
with open(ENCODINGS_PATH, "wb") as encodings_file:
    pickle.dump(encodings, encodings_file)

## Extracting Word Embeddings

In [None]:
%%time
# Read the GloVe text file into {word: float32 vector}.
# Each line holds a word followed by its space-separated vector components.
_word_embeddings = {}
with open(GLOVE_WORD_EMBEDDINGS_PATH, "r", encoding="utf-8") as file:
    # Stream line by line; the file is too large to justify readlines().
    for line in tqdm(file, desc="Extracted", position=0):
        splits = line.rstrip().split(" ")
        if len(splits) < 2:
            # Blank line or a word without components: nothing to store.
            continue
        # Store real floats rather than an array of strings whose last entry
        # still carries the line's trailing newline.
        _word_embeddings[splits[0]] = numpy.asarray(splits[1:], dtype="float32")
print(len(_word_embeddings))
_word_embeddings['the'].shape

In [None]:
# Build the embedding matrix aligned with the vocabulary: row i holds the GloVe
# vector of the word whose index in word_to_index is i. Words without a GloVe
# vector keep an all-zero row.
# Take the vector width from the loaded vectors so the 200d/300d files work too.
embedding_dim = len(next(iter(_word_embeddings.values()))) if _word_embeddings else 100
word_embeddings = numpy.zeros((len(vocabulary), embedding_dim))
for word, index in word_to_index.items():
    if index == 0:
        # Index 0 is the padding token; its row stays zero.
        continue
    if word in _word_embeddings:
        # Row index must equal the word index. Writing to index-1 would hand
        # every word the embedding of its successor in the vocabulary.
        word_embeddings[index] = _word_embeddings[word]
print(word_embeddings)

#### Saving embeddings matrix...

In [None]:
# Pickle the vocabulary-aligned embedding matrix for future use.
with open(WORD_EMBEDDINGS_PATH, "wb") as embeddings_file:
    pickle.dump(word_embeddings, embeddings_file)

# Training

#### Data Generator function (for Stochastic Gradient Descent)

In [None]:
def data_generator(batch_size, captions, image_encodings, word_to_index, vocabulary_length, max_caption_length):
    """Yield training batches for next-word prediction, forever.

    Every caption is turned into one sample per word after the first: the
    input is the caption prefix (zero-padded at the end to
    ``max_caption_length``) and the target is the one-hot vector of the word
    that follows the prefix. A batch is emitted after ``batch_size`` images
    have been processed, so it holds the samples of all their captions.

    Args:
        batch_size: Number of images per batch (not number of samples).
        captions: Mapping of image name to a list of caption strings.
        image_encodings: Mapping of image name to its encoding vector.
        word_to_index: Mapping of word to integer index; must contain
            ``'<unk>'`` if any caption word is missing from it.
        vocabulary_length: Width of the one-hot target vectors.
        max_caption_length: Length the prefixes are padded to.

    Yields:
        ``((image_encodings, input_sequences), output_words)`` as numpy arrays
        with one row per sample.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 or ``captions`` is
            empty; either would loop forever without yielding a batch.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if not captions:
        raise ValueError("captions must contain at least one image")

    count = 0
    _image_encodings = []
    input_sequence_vectors = []
    output_word_vectors = []
    while True:
        for img, _captions in captions.items():
            for caption in _captions:
                # Out-of-vocabulary words are mapped to the '<unk>' index.
                vectorized_caption = [
                    word_to_index[word if word in word_to_index else '<unk>']
                    for word in caption.split(" ")
                ]
                sample_count = len(vectorized_caption) - 1
                if sample_count < 1:
                    # A single-token caption has no next word to predict.
                    continue

                # Pad all prefixes of this caption in one call.
                prefixes = [vectorized_caption[:end] for end in range(1, sample_count + 1)]
                padded_prefixes = sequence.pad_sequences(prefixes, maxlen=max_caption_length, dtype='int32', padding='post', value=0)

                # One-hot rows for the word following each prefix.
                one_hot_targets = numpy.zeros((sample_count, vocabulary_length))
                one_hot_targets[numpy.arange(sample_count), vectorized_caption[1:]] = 1

                _image_encodings.extend([image_encodings[img]] * sample_count)
                input_sequence_vectors.extend(padded_prefixes)
                output_word_vectors.extend(one_hot_targets)
            count += 1
            if count == batch_size:
                yield ((numpy.array(_image_encodings), numpy.array(input_sequence_vectors)), numpy.array(output_word_vectors))

                count = 0
                _image_encodings = []
                input_sequence_vectors = []
                output_word_vectors = []

#### Construction of the model

In [None]:
%%time
# Reset the Keras backend state before building the caption model.
backend.clear_session()

# Image branch: 2048-d image encoding -> dropout -> 256-d dense projection.
input_img = layers.Input(shape=(2048,))
dropout = layers.Dropout(0.5)(input_img)
dense = layers.Dense(256, activation='relu')(dropout)
model_img = dense

# Text branch: padded word indices -> embedding -> dropout -> LSTM(256).
# Both embedding sizes come from the loaded matrix so the layer always matches
# the weights that are set on it afterwards, whatever GloVe width is in use.
vocabulary_size, embedding_dim = word_embeddings.shape
input_txt = layers.Input(shape=(max_length, ))
# mask_zero=True makes index 0 (the padding value) a masked position.
embedding = layers.Embedding(vocabulary_size, embedding_dim, mask_zero=True)(input_txt)
dropout = layers.Dropout(0.5)(embedding)
lstm = layers.LSTM(256)(dropout)
model_txt = lstm

# Merge both 256-d branches by element-wise addition, then predict a
# distribution over the vocabulary for the next word.
layer_merge = layers.add([model_img, model_txt])
dense = layers.Dense(256, activation='relu')(layer_merge)
output = layers.Dense(vocabulary_size, activation='softmax')(dense)

model = Model(inputs=[input_img, input_txt], outputs=output)

In [None]:
# Print the layer-by-layer summary of the caption model.
model.summary()

In [None]:
# Display the model's layer objects in order.
model.layers

#### Using the embeddings extracted from a pretrained GloVe model as weights on the embedding layer in our model

In [None]:
# Find the embedding layer by type instead of by position: the order of
# model.layers is an implementation detail and a hard-coded index breaks
# silently as soon as the architecture changes.
embedding_layer = next(
    layer for layer in model.layers if isinstance(layer, layers.Embedding)
)
# Load the pretrained GloVe matrix and freeze it so training leaves it as is.
embedding_layer.set_weights([word_embeddings])
embedding_layer.trainable = False

#### Compiling the model

In [None]:
# Categorical cross-entropy against the one-hot next-word targets, optimised
# with Adam.
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# Start from an empty mapping, then replace it with the pickled encodings.
image_encodings = {}
with open(ENCODINGS_PATH, "rb") as encodings_file:
    image_encodings = pickle.load(encodings_file)

In [None]:
# Restrict the caption dictionary to images listed in the training split.
train_captions = {
    name: captions[name]
    for name in train_images
    if name in captions
}
print(len(train_captions))
train_captions

#### Training...

In [None]:
%%time
# Number of images (not samples) the generator packs into one batch.
batch_size = 10
epochs = 200

train_generator = data_generator(
    batch_size=batch_size,
    captions=train_captions,
    image_encodings=image_encodings,
    word_to_index=word_to_index,
    vocabulary_length=len(vocabulary),
    max_caption_length=max_length,
)
# Model.fit consumes generators directly; fit_generator is deprecated.
# The generator iterates train_captions, so that is what defines the number of
# batches per epoch (not the number of stored encodings).
history = model.fit(
    train_generator,
    steps_per_epoch=len(train_captions) // batch_size,
    epochs=epochs,
    verbose=1,
)

# Persist the trained model and its per-epoch training history.
model.save(MODEL_PATH)
with open(MODEL_HISTORY_PATH, "wb") as file:
    pickle.dump(history.history, file)

### Loading...

In [None]:
# Restore the pickled artifacts and the trained model from disk.
with open(ENCODINGS_PATH, "rb") as encodings_file:
    encodings = pickle.load(encodings_file)

with open(PROCESSED_CAPTIONS_PATH, "rb") as captions_file:
    captions = pickle.load(captions_file)

with open(WORD_EMBEDDINGS_PATH, "rb") as embeddings_file:
    word_embeddings = pickle.load(embeddings_file)

model = load_model(MODEL_PATH)

#### Beam Search

In [None]:
def beamSearch(test_image_path, beam_index, model):
    """Generate a caption for an image using beam search.

    Starting from ``'<start>'``, the ``beam_index`` best partial captions are
    kept at every step. A caption that has produced ``'<end>'`` is finished
    and is carried forward unchanged instead of being extended. The search
    stops when every kept caption is finished or one of them has reached
    ``max_length`` tokens.

    Captions are ranked by the sum of the log-probabilities of their words
    (i.e. by the product of the probabilities). No length normalisation is
    applied.

    Relies on the module-level ``word_to_index``, ``index_to_word``,
    ``max_length`` and ``get_encodings``.

    Args:
        test_image_path: Path of the image to caption.
        beam_index: Number of partial captions kept per step (beam width).
        model: Caption model taking ``[image_encodings, padded_sequences]``.

    Returns:
        The best caption as a space-joined string, starting with
        ``'<start>'`` and ending with ``'<end>'`` when the model produced it.

    Raises:
        ValueError: If ``beam_index`` is smaller than 1.
    """
    if beam_index < 1:
        raise ValueError("beam_index must be at least 1")

    start_token = word_to_index['<start>']
    end_token = word_to_index['<end>']
    # Encode the image once; the leading axis is the batch of one.
    image_batch = numpy.array([get_encodings(test_image_path)])

    # Each hypothesis is [token indices, cumulative log-probability].
    input_sequence = [[[start_token], 0.0]]
    while True:
        top_words = []
        for tokens, score in input_sequence:
            if tokens[-1] == end_token:
                # Finished caption: keep it, but do not predict past '<end>'.
                top_words.append([tokens, score])
                continue

            input_sequence_vector = sequence.pad_sequences([tokens], maxlen=max_length, padding="post")
            prediction = model.predict([image_batch, numpy.array(input_sequence_vector)])[0]
            # Indices of the beam_index most probable next words.
            top_predictions = numpy.argsort(prediction)[-beam_index:]

            for word in top_predictions:
                # Small epsilon guards against log(0).
                word_score = float(numpy.log(prediction[word] + 1e-12))
                top_words.append([tokens + [int(word)], score + word_score])

        input_sequence = sorted(top_words, reverse=True, key=lambda x: x[1])[:beam_index]

        all_finished = all(tokens[-1] == end_token for tokens, _ in input_sequence)
        longest = max(len(tokens) for tokens, _ in input_sequence)
        if all_finished or longest >= max_length:
            break

    # Translate the best hypothesis back to words, stopping at '<end>'.
    predicted_caption = []
    for token in input_sequence[0][0]:
        if token == end_token:
            predicted_caption.append('<end>')
            break
        predicted_caption.append(index_to_word[token])

    return " ".join(predicted_caption)

In [None]:
# The split file lists one image file name per line.
with open(TEST_IMAGES_PATH, "r") as split_file:
    split_listing = split_file.read()
test_images = split_listing.strip().split("\n")
print(len(test_images))
# test_images

In [None]:
def refactor(s):
    """Strip the special caption markers from ``s``.

    Every occurrence of ``<start>``, ``<end>`` and ``<unk>`` is deleted and
    surrounding whitespace is trimmed. Whitespace between the remaining words
    is left untouched.
    """
    for marker in ("<start>", "<end>", "<unk>"):
        s = s.replace(marker, "")
    return s.strip()

#### BLEU Score

In [None]:
%%time
# Evaluate on the first 25 test images only.
eval_images = test_images[:25]
beam_indexes = [1, 2, 3, 4, 5, 6, 7]
progress = tqdm(total=len(eval_images), desc="Tested", position=0, leave=True)

# res[i][j][k] is the BLEU score of image i under model j with beam width
# beam_indexes[k].
# NOTE(review): the last axis has 9 slots but only 7 beam widths are used, so
# the trailing two stay zero. Shape kept as is for anything reading the file.
res = numpy.zeros((len(eval_images), 3, 9), dtype='float32')
for i, test_image in enumerate(eval_images):
    test_image_path = os.path.join(IMAGES_PATH, test_image)

    # Reference captions do not depend on model or beam width: build them once.
    # sentence_bleu expects token lists; raw strings would be scored
    # character by character.
    references = [refactor(caption).split() for caption in captions[test_image]]

    # NOTE(review): model_100d / model_200d / model_300d are not created in the
    # visible cells - presumably loaded elsewhere; confirm before running.
    # The loop variable is not called `model`, so the global model stays bound.
    for j, caption_model in enumerate([model_100d, model_200d, model_300d]):
        for k, beam_index in enumerate(beam_indexes):
            hypothesis = refactor(beamSearch(test_image_path, beam_index, caption_model)).split()
            res[i][j][k] = sentence_bleu(references, hypothesis)

    # One tick per image, matching the bar's total.
    progress.update(1)
progress.close()
res

In [None]:
# Pickle the BLEU score array for future use.
with open(TEST_RESULTS_PATH, "wb") as results_file:
    pickle.dump(res, results_file)