In [1]:
import os
from sklearn.utils import shuffle
import tensorflow as tf
from tqdm import tqdm
import numpy as np

In [2]:
# Let TensorFlow grow its GPU memory allocation on demand instead of reserving
# everything up front. Iterating over the device list (rather than indexing
# [0]) keeps the notebook runnable on CPU-only machines, where the list is
# empty, and applies the same setting to every visible GPU.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

In [3]:
# Load the whole caption/image annotation file into memory as a single string
token_file_path = 'Dataset/Flickr30k/Flickr30k.token.txt'
with open(token_file_path, mode='r') as token_file:
    annotations = token_file.read()

In [4]:
# Absolute path to the image folder. The trailing slash is intentional:
# image file names are appended to it directly.
PATH = os.path.abspath('.') + '/Dataset/Flickr30k/flickr30k-images/'

# Parallel lists: all_captions[i] is a caption of the image at all_img_name_vector[i]
all_captions = []
all_img_name_vector = []

# Each line of the annotation file holds one caption-image pair
for annot in annotations.split("\n"):
    # Tokenise the line once. Empty AND whitespace-only lines produce no
    # tokens and are skipped (a whitespace-only line used to raise IndexError).
    tokens = annot.split()
    if not tokens:
        continue
    # The first token identifies the image, the remaining tokens are the caption
    image_tag, caption_words = tokens[0], tokens[1:]
    # Wrap the caption in <start> / <end> tokens
    caption = "<start> " + ' '.join(caption_words) + " <end>"
    # Drop the "#<caption number>" suffix to get the image file name
    image_id = image_tag.split('#')[0]
    # Turn the image file name into a full image path
    full_image_path = PATH + image_id

    all_img_name_vector.append(full_image_path)
    all_captions.append(caption)

In [5]:
# Sanity check: number of parsed captions, plus the first image path and caption
(len(all_captions), all_img_name_vector[0], all_captions[0])

(158915,
 '/home/shailesh/Projects/mytf2/Flickr30k_notebooks/Dataset/Flickr30k/flickr30k-images/1000092795.jpg',
 '<start> Two young guys with shaggy hair look at their hands while hanging out in the yard . <end>')

In [6]:
# Image loader that produces inputs for the pretrained InceptionV3 model
def load_image(image_path):
    """Read a JPEG file and preprocess it for the Inception-V3 model.

    input:
        image_path ::= string, path of the JPEG file to load
    returns:
        img ::= image tensor decoded with 3 channels, resized to 299 x 299
                and passed through the Inception-V3 `preprocess_input`
        image_path ::= string, the same path that was passed in
    """
    raw_bytes = tf.io.read_file(image_path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, (299, 299))
    prepared = tf.keras.applications.inception_v3.preprocess_input(resized)
    return prepared, image_path

In [7]:
# Build the Inception-V3 network used for image feature extraction:
# ImageNet weights, without the top classification layers
image_model = tf.keras.applications.InceptionV3(weights='imagenet', include_top=False)

# The extractor maps the network input to the output of its last layer
new_input = image_model.input
hidden_layer = image_model.layers[-1].output

image_features_extract_model = tf.keras.Model(inputs=new_input, outputs=hidden_layer)

In [8]:
## TODO: Implement sharding of the images to speed up image feature caching

# Cache the extracted image features on disk so they can be reused while
# training the model

# Number of images pushed through the feature extractor per step
FEATURE_BATCH_SIZE = 8

# Get unique images (sorted so the processing order is deterministic)
encode_train = sorted(set(all_img_name_vector))

# Create a Dataset object to iterate over all the image paths
image_dataset = tf.data.Dataset.from_tensor_slices(encode_train)
# Load and preprocess the images in parallel, then group them into batches
image_dataset = image_dataset.map(
    load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE
).batch(FEATURE_BATCH_SIZE)

# Ceiling division: lets tqdm show a real progress bar instead of a bare counter
num_batches = -(-len(encode_train) // FEATURE_BATCH_SIZE)

# Output directories that already exist, so makedirs runs once per directory
created_feature_dirs = set()

# Iterate over the images and store the respective features as numpy arrays
for img, path in tqdm(image_dataset, total=num_batches):
    batch_features = image_features_extract_model(img) # output shape = (?, 8, 8, 2048)
    # Flatten the two spatial axes: (?, 8, 8, 2048) -> (?, 64, 2048)
    batch_features = tf.reshape(batch_features, (batch_features.shape[0], -1, batch_features.shape[3]))

    for bf, p in zip(batch_features, path):
        # Get the image path back as a Python string
        image_file = p.numpy().decode("utf-8")
        # Features go into an "Image_Features" directory that replaces the
        # image's parent directory in the path
        feature_dir = os.path.join(os.path.dirname(os.path.dirname(image_file)), "Image_Features")
        if feature_dir not in created_feature_dirs:
            # np.save does not create missing directories, so do it here
            os.makedirs(feature_dir, exist_ok=True)
            created_feature_dirs.add(feature_dir)
        path_of_feature = os.path.join(feature_dir, os.path.basename(image_file))
        # Save the features for later use (np.save appends ".npy" to the name)
        np.save(path_of_feature, bf.numpy())

3973it [12:29,  5.30it/s]


In [6]:
# Prepare the train/dev split: de-duplicate the image paths, sort them so the
# starting order is fixed, then shuffle with a fixed seed
unique_image_paths = sorted(set(all_img_name_vector))
encode_train = shuffle(unique_image_paths, random_state=7)

In [None]:
# The last 2k shuffled images form the dev set; this is the index where it starts
(len(encode_train) - 2000)

In [36]:
# Write the file names of the training images, one per line.
# The training set is everything except the last DEV_SET_SIZE shuffled images;
# slicing relative to the end avoids hard-coding the dataset size.
DEV_SET_SIZE = 2000

# `with` guarantees the file is flushed and closed even if a write fails
with open("Flickr_30k.trainImages.txt", 'w') as train_file:
    for image_path in encode_train[:-DEV_SET_SIZE]:
        # Keep only the file name, dropping the directory part of the path
        train_file.write(os.path.basename(image_path) + '\n')

In [38]:
# Write the file names of the dev images, one per line.
# The dev set is the last DEV_SET_SIZE shuffled images; slicing relative to
# the end avoids hard-coding the dataset size.
DEV_SET_SIZE = 2000

# `with` guarantees the file is flushed and closed even if a write fails
with open("Flickr_30k.devImages.txt", 'w') as dev_file:
    for image_path in encode_train[-DEV_SET_SIZE:]:
        # Keep only the file name, dropping the directory part of the path
        dev_file.write(os.path.basename(image_path) + '\n')