In [1]:
import os
import pickle
import numpy as np
import matplotlib.pyplot as plt
import math

In [2]:
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Add, Reshape, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model

from nltk.translate.bleu_score import corpus_bleu

In [3]:
from dataset.utils import load_coco, load_image, print_progress_bar
from models.vgg19 import load_vgg19

# Load Data

In [4]:
# Folder containing the dataset; image filenames and the processed-data cache are resolved relative to it
data_dir = 'dataset/'

In [5]:
# Read the three data splits and the two category lookups from the pickled COCO file.
# NOTE(review): the meaning of the 'captions' argument is defined inside load_coco
# (dataset.utils), which is not visible here — presumably it selects caption labels; confirm.
train_data, val_data, test_data, category_id, id_category = load_coco(
    os.path.join(data_dir, 'coco_raw.pickle'), 'captions'
)

In [6]:
# Each split unpacks into exactly two items: the images and their captions
train_images, train_captions = train_data  # training split
val_images, val_captions = val_data  # validation split
test_images, test_captions = test_data  # test split

In [7]:
# Number of categories, taken as the number of entries in id_category
num_classes = len(id_category)
num_classes

80

In [8]:
# Number of entries in the training split
num_images_train = len(train_images)
num_images_train

19324

In [9]:
# Number of entries in the validation split
num_images_val = len(val_images)
num_images_val

2415

# Load Pre-Trained Image Model

In [10]:
# Load the pre-trained feature extractor model.
# process_images reads its input/output shapes and calls predict() on it;
# the per-image output is what gets stored as the transfer-values.
feature_model = load_vgg19()

### Process Images

In [15]:
def process_images(feature_model, data_dir, filenames, batch_size):
    """
    Compute the transfer-values of the given image files with the
    pre-trained feature-model.

    The images are loaded and pushed through the model in batches so
    that only one batch of pixel data is held in memory at a time.

    Args:
        feature_model: Model exposing ``input``, ``output`` and
            ``predict``.
        data_dir: Directory that every entry of ``filenames`` is
            joined onto.
        filenames: Sequence of image paths relative to ``data_dir``.
        batch_size: Maximum number of images per ``predict`` call;
            must be at least 1.

    Returns:
        float32 array of shape ``(len(filenames), feature_dim)`` in
        which row ``i`` holds the model output for ``filenames[i]``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """

    # A batch size of 0 would never advance through the files and the
    # loop below would never terminate, so reject it up front.
    if batch_size < 1:
        raise ValueError(
            'batch_size must be at least 1, got {}'.format(batch_size)
        )

    num_images = len(filenames)
    img_size = K.int_shape(feature_model.input)[1:3]    # Expected input size of the pre-trained network
    feature_dim = K.int_shape(feature_model.output)[1]  # Length of one transfer-value vector

    # Pre-allocate input-batch-array for images. It is reused for every
    # batch and never needs to be larger than the number of images.
    image_batch = np.zeros(
        shape=(min(batch_size, num_images),) + img_size + (3,),
        dtype=np.float32
    )

    # Pre-allocate output-array for transfer-values.
    feature_transfer_values = np.zeros(
        shape=(num_images, feature_dim),
        dtype=np.float32
    )

    print_progress_bar(0, num_images)  # Initial call to print 0% progress

    for start_index in range(0, num_images, batch_size):
        # The last batch may be shorter than batch_size.
        end_index = min(start_index + batch_size, num_images)
        current_batch_size = end_index - start_index

        # Load all the images in the batch.
        for i, filename in enumerate(filenames[start_index:end_index]):
            path = os.path.join(data_dir, filename)
            image_batch[i] = load_image(path, size=img_size)

        # Use the pre-trained model to process the images. Only the
        # first current_batch_size rows are valid; the rest of the
        # buffer may still hold images from the previous batch.
        feature_transfer_values_batch = feature_model.predict(
            image_batch[:current_batch_size]
        )

        # Save the transfer-values in the pre-allocated array.
        feature_transfer_values[start_index:end_index] = (
            feature_transfer_values_batch[:current_batch_size]
        )

        print_progress_bar(end_index, num_images)  # Update Progress Bar

    print()
    return feature_transfer_values

In [18]:
def process_data(feature_model, data_dir, data_type, filenames, captions, batch_size):
    """
    Return transfer-values, filenames and captions for one data split,
    reusing an on-disk cache when one exists.

    The cache lives in ``<data_dir>/processed_lda_data`` and consists
    of three pickle files per ``data_type``. If all three exist they
    are loaded and returned, and the ``filenames`` and ``captions``
    arguments are ignored in favour of the cached ones. Otherwise the
    images are processed with ``process_images`` and the results are
    written to the cache.

    Args:
        feature_model: Model forwarded to ``process_images``.
        data_dir: Directory holding the images and the cache folder.
        data_type: Name of the split; used in the log message and in
            the cache file names.
        filenames: Image paths relative to ``data_dir``.
        captions: Captions belonging to ``filenames``.
        batch_size: Batch size forwarded to ``process_images``.

    Returns:
        Tuple ``(feature_obj, filenames, captions)``.
    """
    print('Processing {0} images in {1}-set ...'.format(len(filenames), data_type))

    # Paths for the cache-files.
    cache_path_dir = os.path.join(data_dir, 'processed_lda_data')
    feature_cache_path = os.path.join(
        cache_path_dir, 'feature_transfer_values_{}.pkl'.format(data_type)
    )
    images_cache_path = os.path.join(
        cache_path_dir, 'images_{}.pkl'.format(data_type)
    )
    captions_cache_path = os.path.join(
        cache_path_dir, 'captions_{}.pkl'.format(data_type)
    )

    # Make sure the directory for the processed data exists. makedirs
    # also creates missing parents, and exist_ok avoids a crash if
    # something else creates the directory between the check and here.
    if not os.path.exists(cache_path_dir):
        os.makedirs(cache_path_dir, exist_ok=True)
        print('Directory created:', cache_path_dir)

    # The cache is only used when all three files are present;
    # otherwise everything is recomputed and rewritten.
    cache_paths = (feature_cache_path, images_cache_path, captions_cache_path)
    if all(os.path.exists(path) for path in cache_paths):
        # NOTE: pickle.load can execute arbitrary code. Only point
        # data_dir at cache files that were produced by this function.
        with open(feature_cache_path, mode='rb') as file:
            feature_obj = pickle.load(file)
        with open(images_cache_path, mode='rb') as file:
            filenames = pickle.load(file)
        with open(captions_cache_path, mode='rb') as file:
            captions = pickle.load(file)
        print("Data loaded from cache-file.")
    else:
        feature_obj = process_images(
            feature_model, data_dir, filenames, batch_size
        )
        with open(feature_cache_path, mode='wb') as file:
            pickle.dump(feature_obj, file)
        with open(images_cache_path, mode='wb') as file:
            pickle.dump(filenames, file)
        with open(captions_cache_path, mode='wb') as file:
            pickle.dump(captions, file)
        print("Data saved to cache-file.")

    return feature_obj, filenames, captions

In [13]:
# Upper bound on the number of images sent through feature_model.predict in one call
process_batch_size = 128

In [19]:
%%time
# Training Data
feature_transfer_values_train, images_train, captions_train = process_data(
    feature_model, data_dir, 'train', train_images, train_captions, process_batch_size
)
print("feature shape:", feature_transfer_values_train.shape)

Processing 19324 images in train-set ...
Data loaded from cache-file.
feature shape: (19324, 4096)
CPU times: user 108 ms, sys: 345 ms, total: 453 ms
Wall time: 450 ms


In [20]:
%%time
# Validation Data
feature_transfer_values_val, images_val, captions_val = process_data(
    feature_model, data_dir, 'val', val_images, val_captions, process_batch_size
)
print("feature shape:", feature_transfer_values_val.shape)

Processing 2415 images in val-set ...
Progress: |██████████████████████████████████████████████████| 100.0% Complete

Data saved to cache-file.
feature shape: (2415, 4096)
CPU times: user 15min 1s, sys: 8.57 s, total: 15min 9s
Wall time: 2min 26s


In [21]:
%%time
# Test Data
feature_transfer_values_test, images_test, captions_test = process_data(
    feature_model, data_dir, 'test', test_images, test_captions, process_batch_size
)
print("feature shape:", feature_transfer_values_test.shape)

Processing 2417 images in test-set ...
Progress: |██████████████████████████████████████████████████| 100.0% Complete

Data saved to cache-file.
feature shape: (2417, 4096)
CPU times: user 14min 33s, sys: 8.93 s, total: 14min 42s
Wall time: 2min 23s


# Tokenizer

In [22]:
# Marker words that mark_captions puts at the start and at the end of every caption
mark_start = 'startseq'
mark_end = 'endseq'

In [23]:
def mark_captions(captions_list):
    """ Wrap every caption in the start and the end marker, keeping the nesting """
    marked_groups = []
    for group in captions_list:
        # One output list per input list, captions in their original order
        marked_groups.append(
            [' '.join((mark_start, text, mark_end)) for text in group]
        )

    return marked_groups

In [24]:
# Wrap every caption of the training and validation splits in the start/end markers
captions_train_marked = mark_captions(captions_train)  # training
captions_val_marked = mark_captions(captions_val)  # validation
print('Marked captions:')
captions_train_marked[0]  # marked captions of the first training entry

Marked captions:


['startseq Children playing and standing on a small tennis court. endseq',
 'startseq A group of children standing on a tennis court. endseq',
 'startseq Children learning to play tennis with instructors at an outdoor tennis court. endseq',
 'startseq Several children gathered on a tennis court learning how today tennis endseq',
 'startseq A group of children are gathered around playing tennis. endseq']

In [25]:
def flatten(captions_list):
    """ Collapse the nested caption lists into one flat list, preserving order """
    flat = []
    for group in captions_list:
        flat.extend(group)

    return flat

In [26]:
# One flat list of all marked training captions
captions_train_flat = flatten(captions_train_marked)

Create the tokenizer

In [27]:
%%time
# Build the vocabulary from the marked training captions only;
# the validation captions are not used for fitting
tokenizer = Tokenizer()
tokenizer.fit_on_texts(captions_train_flat)

CPU times: user 1.94 s, sys: 7.96 ms, total: 1.95 s
Wall time: 1.95 s


In [28]:
# Look up the integer token that the fitted tokenizer assigned to the start marker
token_start = tokenizer.word_index[mark_start]
token_start

2

In [29]:
# Look up the integer token that the fitted tokenizer assigned to the end marker
token_end = tokenizer.word_index[mark_end]
token_end

3

In [30]:
# Number of words in the vocabulary.
# +1 because the Keras Tokenizer numbers words from 1; index 0 is reserved and never assigned to a word.
vocab_size = len(tokenizer.word_index) + 1
vocab_size

8632

In [31]:
# Max length of each caption
# NOTE(review): presumably counted in tokens; where this limit is applied lies outside
# this section — confirm whether the start/end markers count toward it
max_tokens = 16

# LDA