In [1]:
'''
Sanjay Singh
san.singhsanjay@gmail.com
April-2021
To make inference - Image Captioning
'''

'\nSanjay Singh\nsan.singhsanjay@gmail.com\nApril-2021\nTo make inference - Image Captioning\n'

In [2]:
# Mount Google Drive into the Colab filesystem; every path used later in this
# notebook lives under the "/content/gdrive" mount point.
# The google.colab package is only importable inside a Colab runtime.
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
# packages
import numpy as np
import pandas as pd
import cv2
import os
from keras.models import Model
from keras.applications.inception_v3 import InceptionV3
from keras.preprocessing import image
from keras.applications.inception_v3 import preprocess_input
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import load_model
from IPython.display import Image
from tqdm.notebook import tqdm

In [4]:
# constants
# Embedding size; not referenced by the cells visible in this notebook.
# NOTE(review): presumably mirrors the value used at training time - confirm.
EMBEDDING_DIM = 200
# Geometry every test image is resized to before it is fed to the
# InceptionV3 feature extractor (width, height, colour channels).
IMG_WIDTH, IMG_HEIGHT, IMG_CHANNEL = 299, 299, 3

In [5]:
# function to make prediction of captions
def predict_caption(model, max_caption_length, wordtoix, ixtoword, img_feat):
    """Greedily decode a caption for a single image.

    Starting from the 'startseq' token, the partial caption is encoded with
    ``wordtoix``, padded to ``max_caption_length`` and passed to ``model``
    together with ``img_feat``; the highest-scoring index is mapped back to a
    word through ``ixtoword``. Decoding stops when 'endseq' is predicted or
    after ``max_caption_length`` steps.

    Args:
        model: object with ``predict([img_feat, sequence], verbose=0)`` that
            returns scores over word indices.
        max_caption_length: maximum number of decoding steps; also the length
            the input sequence is padded to.
        wordtoix: mapping word -> integer index.
        ixtoword: mapping integer index -> word.
        img_feat: image feature batch forwarded unchanged to ``model``.

    Returns:
        The generated caption as a space-separated string, without the
        'startseq' / 'endseq' markers.

    Raises:
        KeyError: if the predicted index is missing from ``ixtoword``.
    """
    start_token, end_token = 'startseq', 'endseq'
    words = [start_token]
    for _ in range(max_caption_length):
        # Words that are not in the vocabulary are skipped when encoding.
        sequence = [wordtoix[w] for w in words if w in wordtoix]
        sequence = pad_sequences([sequence], maxlen=max_caption_length)
        y_hat = model.predict([img_feat, sequence], verbose=0)
        y_hat = np.argmax(y_hat)
        word = ixtoword[y_hat]
        if word == end_token:
            break
        words.append(word)
    # Only the start marker is dropped here. The end marker is never appended,
    # so a caption that hits the length limit without 'endseq' keeps its last
    # real word (slicing with [1:-1] used to discard it).
    return ' '.join(words[1:])

In [6]:
# paths
# Single project root on the mounted drive: change this one value to point the
# whole notebook at a different location. All derived paths below evaluate to
# exactly the same strings as before.
PROJECT_DIR = "/content/gdrive/MyDrive/Flickr8k_ImageCaptioning"
PROCESSED_DATA_DIR = PROJECT_DIR + "/processed_data"
TRAINED_MODELS_DIR = PROJECT_DIR + "/output/trained_models"

# inputs produced by preprocessing / training
vocabulary_path = PROCESSED_DATA_DIR + "/vocabulary.txt"
max_caption_length_path = PROCESSED_DATA_DIR + "/max_caption_length.txt"
wordtoix_path = TRAINED_MODELS_DIR + "/wordtoix.csv"
ixtoword_path = TRAINED_MODELS_DIR + "/ixtoword.csv"
embedding_matrix_path = TRAINED_MODELS_DIR + "/embedding_matrix.csv"
trained_model_path = TRAINED_MODELS_DIR + "/model_19.h5"
# directories: the trailing "/" is kept because later cells append file names
# to these strings directly
test_image_path = PROJECT_DIR + "/test_image/val_images/"
target_path = PROJECT_DIR + "/output/test_image_generated_captions/"

In [7]:
# reading vocabulary
# One entry per line of the file; surrounding whitespace and the newline are
# stripped. The context manager closes the file even if reading fails.
with open(vocabulary_path, 'r') as f_ptr:
    vocabulary = [line.strip() for line in f_ptr]
print("Completed reading vocabulary file")

Completed reading vocabulary file


In [8]:
# read wordtoix and ixtoword
# Both files are read without a header row. keep_default_na=False stops pandas
# from converting vocabulary words that look like missing-value markers
# (e.g. "null", "nan", "na") into float NaN, which would silently corrupt the
# word <-> index lookup built from these frames.
wordtoix = pd.read_csv(wordtoix_path, header=None, keep_default_na=False)
ixtoword = pd.read_csv(ixtoword_path, header=None, keep_default_na=False)
print("Completed reading wordtoix.csv and ixtoword.csv")

Completed reading wordtoix.csv and ixtoword.csv


In [9]:
# converting wordtoix and ixtoword to dictionary from dataframe
# Each row of the two-column frame becomes one key -> value pair.
# The same names are rebound from DataFrame to dict, so the isinstance guards
# keep this cell safe to re-run: without them a second execution would call
# dict() on the bound method dict.values and raise TypeError.
if not isinstance(wordtoix, dict):
    wordtoix = dict(wordtoix.values)
if not isinstance(ixtoword, dict):
    ixtoword = dict(ixtoword.values)
print("Successfully converted wordtoix and ixtoword from dataframe to dictionary")

Successfully converted wordtoix and ixtoword from dataframe to dictionary


In [10]:
# getting vocabulary size
# One extra slot is counted on top of the entries in wordtoix, for index 0.
# NOTE(review): presumably 0 is the padding index (pad_sequences pads with 0
# by default) and word indices start at 1 - confirm against the training script.
vocab_size = len(wordtoix) + 1 # 1 is added for '0'
print("Vocabulary Size: ", vocab_size)

Vocabulary Size:  1652


In [11]:
# extract maximum length of caption - saved by script_preprocessing.py
# Only the first line is needed; it is parsed as "<label>:<integer>", taking
# the text after the first colon. The context manager closes the file promptly.
with open(max_caption_length_path, 'r') as f_ptr:
    first_line = f_ptr.readline()
max_caption_length = int(first_line.split(":")[1].strip())
print("Maximum caption length: ", max_caption_length)

Maximum caption length:  28


In [12]:
# NOTE: the embedding-matrix load below is disabled - it is wrapped in a string
# literal, so this cell only evaluates (and echoes) that string. No later cell
# visible in this notebook reads embedding_matrix.
'''# loading embedding_matrix
embedding_matrix = pd.read_csv(embedding_matrix_path, header=None)
print("Successfully loading embedding_matrix, its shape: ", embedding_matrix.shape)'''

'# loading embedding_matrix\nembedding_matrix = pd.read_csv(embedding_matrix_path, header=None)\nprint("Successfully loading embedding_matrix, its shape: ", embedding_matrix.shape)'

In [13]:
# loading InceptionV3 model
# Build the network with ImageNet weights, then wrap it in a new Model whose
# output is taken from the second-to-last layer instead of the final one.
# That output is what the caption loop uses as the image feature.
inception_full = InceptionV3(weights='imagenet')
model_inception = Model(
    inputs=inception_full.input,
    outputs=inception_full.layers[-2].output,
)

In [14]:
# extracting test images from test image directory
# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and therefore the row order of the written captions file) reproducible.
test_image_names = sorted(os.listdir(test_image_path))

In [15]:
# loading trained model for image captioning
# `model` is the network handed to predict_caption in the caption loop, where it
# is called with [image_features, padded_word_sequence] as its inputs.
model = load_model(trained_model_path)
print("Successfully loaded trained model")

Successfully loaded trained model


In [16]:
# generating captions
# captions[k] is the caption for test_image_names[k]. The writer cell relies on
# that positional alignment, so an entry that cannot be decoded as an image
# contributes an empty caption instead of being dropped.
captions = list()
for image_name in tqdm(test_image_names):
    # reading test image
    test_img = cv2.imread(os.path.join(test_image_path, image_name))
    if test_img is None:
        # cv2.imread signals failure by returning None (no exception), e.g. for
        # sub-directories or non-image files in the folder.
        print("Skipping unreadable image: " + image_name)
        captions.append("")
        continue
    # OpenCV decodes to BGR channel order; convert to RGB before preprocessing
    test_img = cv2.cvtColor(test_img, cv2.COLOR_BGR2RGB)
    # cv2.resize takes the target size as (width, height)
    test_img = cv2.resize(test_img, (IMG_WIDTH, IMG_HEIGHT))
    test_img = preprocess_input(test_img)
    # add the batch axis -> (1, IMG_HEIGHT, IMG_WIDTH, channels)
    test_img_array = np.expand_dims(test_img, axis=0)
    # generating bottleneck feature for image
    img_feat = model_inception.predict(test_img_array, verbose=0)
    # prediction caption
    caption = predict_caption(model, max_caption_length, wordtoix, ixtoword, img_feat)
    captions.append(caption)

HBox(children=(FloatProgress(value=0.0, max=1091.0), HTML(value='')))




In [17]:
print("Writing generated captions")
# writing all captions in a text file, one "<image file name>#<caption>" per line
# Create the output directory if it is missing so open() cannot fail on it.
os.makedirs(target_path, exist_ok=True)
# The context manager flushes and closes the file even if a write raises.
with open(os.path.join(target_path, "captions.txt"), "w") as f_ptr:
    for image_name, caption in zip(test_image_names, captions):
        f_ptr.write(image_name + "#" + caption + "\n")
print("Successfully written all image captions")

Writing generated captions
Successfully written all image captions
