In [1]:
import os
import pickle
import sys
import numpy as np
import matplotlib.pyplot as plt
import math

In [2]:
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Add, Reshape, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model

from nltk.translate.bleu_score import corpus_bleu

In [3]:
from dataset.utils import load_coco, load_image, print_progress_bar
from models.vgg19 import load_vgg19

# Load Data

In [4]:
# Folder containing the dataset
data_dir = 'dataset'

In [5]:
train_data, val_data, test_data, category_id, id_category = load_coco(
    os.path.join(data_dir, 'coco_raw.pickle'), 'captions'
)

In [6]:
train_images, train_captions = train_data  # Load training data
val_images, val_captions = val_data  # Load validation data
test_images, test_captions = test_data  # Load test data

In [7]:
num_classes = len(id_category)
num_classes

80

In [8]:
num_images_train = len(train_images)
num_images_train

19324

In [9]:
num_images_val = len(val_images)
num_images_val

2415

# Load Pre-Trained Image Model

In [10]:
# Load the pre-trained feature extractor model
feature_model = load_vgg19()

In [11]:
def load_data(data_type, data_dir):
    """Load the cached features, images and captions of one dataset split.

    Three pickle files are read from ``data_dir``:
    ``feature_transfer_values_<data_type>.pkl``, ``images_<data_type>.pkl``
    and ``captions_<data_type>.pkl``.

    @params:
    :data_type -- name of the split used in the file names (e.g. 'train')
    :data_dir -- directory that contains the cache files

    @return:
    :(feature_obj, images, captions) -- the three unpickled objects

    @raises:
    :FileNotFoundError -- if any of the three cache files is missing
    """
    # Path for the cache-files.
    feature_cache_path = os.path.join(
        data_dir, 'feature_transfer_values_{}.pkl'.format(data_type)
    )
    images_cache_path = os.path.join(
        data_dir, 'images_{}.pkl'.format(data_type)
    )
    captions_cache_path = os.path.join(
        data_dir, 'captions_{}.pkl'.format(data_type)
    )
    cache_paths = [feature_cache_path, images_cache_path, captions_cache_path]

    # Raise a regular exception instead of calling sys.exit(): SystemExit
    # hides which file is missing and is not caught by `except Exception`.
    missing_paths = [path for path in cache_paths if not os.path.exists(path)]
    if missing_paths:
        raise FileNotFoundError(
            'processed {} data does not exist. Missing: {}'.format(
                data_type, ', '.join(missing_paths)
            )
        )

    loaded = []
    for path in cache_paths:
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open(path, mode='rb') as file:
            loaded.append(pickle.load(file))
    feature_obj, images, captions = loaded

    print('{} data loaded from cache-file.'.format(data_type))
    return feature_obj, images, captions

In [12]:
def load_topics(data_type, data_dir):
    """Load the cached topic values of one dataset split.

    Reads ``topics_<data_type>.pkl`` from ``data_dir``.

    @params:
    :data_type -- name of the split used in the file name (e.g. 'train')
    :data_dir -- directory that contains the cache file

    @return:
    :topics -- the unpickled object

    @raises:
    :FileNotFoundError -- if the cache file does not exist
    """
    topics_cache_path = os.path.join(
        data_dir, 'topics_{}.pkl'.format(data_type)
    )

    # Raise a regular exception instead of calling sys.exit(): SystemExit
    # is not caught by `except Exception` and gives no traceback by default
    # in a notebook.
    if not os.path.exists(topics_cache_path):
        raise FileNotFoundError('{} does not exist.'.format(topics_cache_path))

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(topics_cache_path, mode='rb') as file:
        topics = pickle.load(file)

    print('{} data topics loaded from cache-file.'.format(data_type))
    return topics

In [13]:
processed_data_dir = os.path.join(data_dir, 'processed_lda_data')

In [14]:
%%time
# Training data
feature_transfer_values_train, images_train, captions_train = load_data('train', processed_data_dir)
topics_train = load_topics('train', processed_data_dir)

train data loaded from cache-file.
train data topics loaded from cache-file.
CPU times: user 108 ms, sys: 351 ms, total: 459 ms
Wall time: 458 ms


In [15]:
%%time
# Validation data
feature_transfer_values_val, images_val, captions_val = load_data('val', processed_data_dir)
topics_val = load_topics('val', processed_data_dir)

val data loaded from cache-file.
val data topics loaded from cache-file.
CPU times: user 27.9 ms, sys: 34 ms, total: 61.8 ms
Wall time: 59.1 ms


In [16]:
%%time
# Test data
feature_transfer_values_test, images_test, captions_test = load_data('test', processed_data_dir)

test data loaded from cache-file.
CPU times: user 9.51 ms, sys: 49.2 ms, total: 58.7 ms
Wall time: 57 ms


# Tokenizer

In [17]:
mark_start = 'startseq'
mark_end = 'endseq'

In [18]:
def mark_captions(captions_list):
    """Wrap every caption in the start and end marker words.

    `captions_list` is a list of caption groups (one group per image).
    The returned list has the same nesting; each caption is prefixed with
    the module-level `mark_start` and suffixed with `mark_end`, separated
    by single spaces.
    """
    marked_groups = []
    for captions in captions_list:
        marked_group = []
        for caption in captions:
            marked_group.append(' '.join((mark_start, caption, mark_end)))
        marked_groups.append(marked_group)

    return marked_groups

In [19]:
captions_train_marked = mark_captions(captions_train)  # training
captions_val_marked = mark_captions(captions_val)  # validation
print('Marked captions:')
captions_train_marked[0]

Marked captions:


['startseq A man in a blue shirt is playing tennis. endseq',
 'startseq Man running playing tennis with ball in air on court endseq',
 'startseq A man playing tennis with his tennis racket and his tennis ball. endseq',
 'startseq Male tennis player gets ready to hit the tennis ball. endseq',
 'startseq A male tennis player in action on the court. endseq']

In [20]:
def flatten(captions_list):
    """ Collapse the per-image caption groups into one flat list, keeping order """
    flat_captions = []
    for caption_group in captions_list:
        flat_captions.extend(caption_group)

    return flat_captions

In [21]:
captions_train_flat = flatten(captions_train_marked)

Create the tokenizer

In [22]:
%%time
tokenizer = Tokenizer()
tokenizer.fit_on_texts(captions_train_flat)

CPU times: user 1.91 s, sys: 0 ns, total: 1.91 s
Wall time: 1.91 s


In [23]:
# Get integer token for the start marker
token_start = tokenizer.word_index[mark_start]
token_start

2

In [24]:
# Get integer token for the end marker
token_end = tokenizer.word_index[mark_end]
token_end

3

In [25]:
# Numbers of words in the vocabulary
# + 1 is because of reserving padding (i.e. index zero)
vocab_size = len(tokenizer.word_index) + 1
vocab_size

8644

In [26]:
# Max length of each caption
max_tokens = 16

# Data Generator

In [27]:
def create_sequences(tokenizer, max_length, topic_transfer_value, feature_transfer_value, caption, vocab_size=None):
    """ Create the next-word training samples for one caption of one image

        For every position `idx` >= 1 of the tokenised caption one sample is
        produced: the tokens before `idx` (padded/truncated to `max_length`)
        as input and the one-hot encoding of token `idx` as output. The topic
        and feature values are repeated once per sample.

        @params:
        :tokenizer -- fitted tokenizer providing `texts_to_sequences` and `word_index`
        :max_length -- length of every padded input sequence
        :topic_transfer_value -- topic values of the image
        :feature_transfer_value -- feature values of the image
        :caption -- caption text to encode
        :vocab_size -- size of the one-hot output; defaults to
                       `len(tokenizer.word_index) + 1` (index zero is padding)

        @return:
        :(topic_values, feature_values, input_captions, output_captions) -- four
            lists of equal length, one entry per sample
    """
    if vocab_size is None:
        # Derive the size from the tokenizer instead of relying on a notebook global
        vocab_size = len(tokenizer.word_index) + 1

    topic_values, feature_values = [], []
    input_captions, output_captions = [], []
    integer_sequence = tokenizer.texts_to_sequences([caption])[0]  # encode the sequence

    for idx in range(1, len(integer_sequence)):
        in_seq, out_seq = integer_sequence[:idx], integer_sequence[idx]  # split into input and output pair
        # Pad at the end, but truncate at the front: when the prefix is longer
        # than max_length the most recent tokens (the ones that actually
        # precede the target word) must be kept. Truncating at the end would
        # feed the same first max_length tokens for every later target.
        in_seq = pad_sequences([in_seq], maxlen=max_length, padding='post', truncating='pre')[0]
        out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]  # encode output sequence

        # store
        topic_values.append(topic_transfer_value)
        feature_values.append(feature_transfer_value)
        input_captions.append(in_seq)
        output_captions.append(out_seq)

    return topic_values, feature_values, input_captions, output_captions

In [28]:
def batch_generator(topic_transfer_values, feature_transfer_values, captions_list, tokenizer, num_images, batch_size, max_length, vocab_size):
    """
    Endless generator of random training batches.

    For every batch, `batch_size` image indices are drawn uniformly from
    `range(num_images)` with `np.random.randint`, i.e. with replacement:
    an image can appear more than once in a batch or epoch, and some
    images may not be drawn at all within an epoch.

    For each drawn image one of its captions is picked at random and
    expanded by `create_sequences` into one sample per predicted word, so
    the number of rows in a batch is the total number of such samples and
    not `batch_size`.

    Yields a tuple `(x_data, y_data)` of dicts keyed by the model's input
    and output layer names. `vocab_size` is accepted for interface
    compatibility but is not referenced in this function.
    """

    while True:
        # Random image indices for this batch.
        image_indices = np.random.randint(num_images, size=batch_size)

        topics_batch, features_batch = [], []
        inputs_batch, targets_batch = [], []
        for image_idx in image_indices:
            image_topics = topic_transfer_values[image_idx]
            image_features = feature_transfer_values[image_idx]
            # Each image has several captions; use one of them at random.
            chosen_caption = np.random.choice(captions_list[image_idx])

            seq_topics, seq_features, seq_inputs, seq_targets = create_sequences(
                tokenizer, max_length, image_topics, image_features, chosen_caption
            )
            topics_batch += seq_topics
            features_batch += seq_features
            inputs_batch += seq_inputs
            targets_batch += seq_targets

        # The model has several inputs, so the arrays are passed in dicts
        # keyed by layer name to make sure each one reaches the right layer.
        x_data = dict(
            caption_input=np.array(inputs_batch),
            topic_input=np.array(topics_batch),
            feature_input=np.array(features_batch),
        )
        y_data = dict(caption_output=np.array(targets_batch))

        yield (x_data, y_data)

In [29]:
batch_size = 128

In [30]:
# Infinite batch generator over the training split
generator_train = batch_generator(
    topic_transfer_values=topics_train,
    feature_transfer_values=feature_transfer_values_train,
    captions_list=captions_train_marked,
    tokenizer=tokenizer,
    num_images=num_images_train,
    batch_size=batch_size,
    max_length=max_tokens,
    vocab_size=vocab_size
)

# Infinite batch generator over the validation split
generator_val = batch_generator(
    topic_transfer_values=topics_val,
    feature_transfer_values=feature_transfer_values_val,
    captions_list=captions_val_marked,
    tokenizer=tokenizer,
    num_images=num_images_val,
    batch_size=batch_size,
    max_length=max_tokens,
    vocab_size=vocab_size
)

# Draw one training batch so the following cells can inspect it
batch = next(generator_train)
batch_x, batch_y = batch[0], batch[1]

In [31]:
# Example of the topic-transfer-values for the first image in the batch
batch_x['topic_input'][0]

array([0.0016129 , 0.03387097, 0.0016129 , 0.01774194, 0.0016129 ,
       0.0016129 , 0.05      , 0.0016129 , 0.0016129 , 0.01774194,
       0.0016129 , 0.0016129 , 0.0016129 , 0.0016129 , 0.0016129 ,
       0.0016129 , 0.0016129 , 0.0016129 , 0.01774194, 0.05      ,
       0.0983871 , 0.0016129 , 0.0016129 , 0.0016129 , 0.01774194,
       0.0016129 , 0.0016129 , 0.0016129 , 0.0016129 , 0.0016129 ,
       0.0016129 , 0.0016129 , 0.0016129 , 0.0016129 , 0.0016129 ,
       0.01774194, 0.08225806, 0.0016129 , 0.0016129 , 0.0016129 ,
       0.35645161, 0.0016129 , 0.0016129 , 0.0016129 , 0.0016129 ,
       0.0016129 , 0.0016129 , 0.01774194, 0.0016129 , 0.0016129 ,
       0.0016129 , 0.0016129 , 0.06612903, 0.0016129 , 0.0016129 ,
       0.0016129 , 0.0016129 , 0.0016129 , 0.0016129 , 0.0016129 ,
       0.0016129 , 0.0016129 , 0.0016129 , 0.0016129 , 0.01774194,
       0.0016129 , 0.0016129 , 0.01774194, 0.0016129 , 0.0016129 ,
       0.0016129 , 0.0016129 , 0.0016129 , 0.0016129 , 0.00161

In [32]:
# Example of the feature-transfer-values for the first image in the batch
batch_x['feature_input'][0]

array([0.7923771 , 0.        , 0.41744143, ..., 0.        , 0.4082044 ,
       0.4943936 ], dtype=float32)

In [33]:
# Example of the token-sequence for the first image in the batch
# This is the input to the decoder-part of the neural network
batch_x['caption_input'][0]

array([2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [34]:
# This is the target for the first sample in the batch: the one-hot encoding
# (over the vocabulary) of the word that follows the input sequence above
batch_y['caption_output'][0]

array([0., 1., 0., ..., 0., 0., 0.], dtype=float32)

In [35]:
# Shape of the output
batch_y['caption_output'].shape

(1454, 8644)

## Steps Per Epoch

In [36]:
def calculate_steps_per_epoch(captions_list, batch_size):
    """ Number of whole batches of `batch_size` that fit into the total caption count

        `captions_list` holds one group of captions per image; the captions of
        all groups are counted and the result is truncated to an int.
    """
    captions_per_image = list(map(len, captions_list))
    total_num_captions = np.sum(captions_per_image)
    steps = total_num_captions / batch_size

    return int(steps)

In [37]:
steps_per_epoch_train = calculate_steps_per_epoch(captions_train_marked, batch_size)
steps_per_epoch_train

755

In [38]:
steps_per_epoch_val = calculate_steps_per_epoch(captions_val_marked, batch_size)
steps_per_epoch_val

94

# Create Model

### Create pre-trained Embedding Layer

In [39]:
def read_glove_vecs(glove_file):
    """ Read a GloVe text file into a word -> vector dictionary

        Every non-empty line is split on whitespace; the first field is the
        word and the remaining fields are parsed as float32 components.

        @params:
        :glove_file -- path of the GloVe text file

        @return:
        :word_to_vec_map -- dict mapping each word to a 1-D float32 numpy array
    """
    print('Creating word to vec map...')
    word_to_vec_map = {}
    # Read explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Skip blank lines (e.g. a trailing newline at end of file)
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float32)
    print('Done!')
    return word_to_vec_map

In [40]:
# load embeddings
word_to_vec_map = read_glove_vecs('{}/glove.6B.300d.txt'.format(data_dir))

Creating word to vec map...
Done!


In [41]:
# The start/end markers get random vectors with the same shape as the
# existing GloVe vectors (taken from the 'unk' entry), drawn from U(-1, 1)
size = word_to_vec_map['unk'].shape

word_to_vec_map.update({
    marker: np.random.uniform(low=-1.0, high=1.0, size=size)
    for marker in (mark_start, mark_end)
})

In [42]:
def create_embedding_layer(word_to_index, word_to_vec_map, num_words):
    """ Build a frozen Keras Embedding() layer initialised with pre-trained GloVe vectors
        @params:
        :word_to_index -- dictionary mapping each word to its integer index
        :word_to_vec_map -- dictionary mapping words to their GloVe vectors;
                            must contain an 'unk' entry, which is used for
                            every word that has no vector of its own
        :num_words -- number of words in the vocabulary; the layer gets
                      `num_words + 1` rows

        @return:
        :decoder_embedding -- Embedding layer named 'decoder_embedding' with its
                              weights set to the assembled matrix and
                              `trainable=False`
    """

    unknown_vector = word_to_vec_map['unk']
    embedding_dimensions = unknown_vector.shape[0]  # length of one word vector
    vocabulary_length = num_words + 1  # one extra row on top of num_words

    # Rows that are never assigned below stay all-zero
    embedding_matrix = np.zeros((vocabulary_length, embedding_dimensions))
    for word, index in word_to_index.items():
        # Fall back to the 'unk' vector for words without a GloVe entry
        embedding_matrix[index, :] = word_to_vec_map.get(word, unknown_vector)

    # trainable=False keeps the pre-trained vectors fixed during training
    decoder_embedding = Embedding(
        input_dim=vocabulary_length,
        output_dim=embedding_dimensions,
        trainable=False,
        name='decoder_embedding'
    )
    # The layer has to be built before its weights can be replaced
    decoder_embedding.build((None,))
    decoder_embedding.set_weights([embedding_matrix])

    return decoder_embedding

### Define some global values

In [43]:
# Internal state-sizes of LSTMs
state_size = 256

In [None]:
K.clear_session()

## Encode Images

In [44]:
# Input layer to receive the predictions from the feature model
feature_input = Input(
    shape=K.int_shape(feature_model.output)[1:], name='feature_input'
)

In [45]:
# Add a Dropout Layer
feature_net = Dropout(0.5)(feature_input)

In [46]:
# Project the feature transfer values to `state_size` units with a Dense layer
image_model_output = Dense(state_size, activation='relu', name='image_model_output')(feature_net)
image_model_output

<tf.Tensor 'image_model_output/Relu:0' shape=(?, 256) dtype=float32>

## Encode Captions

### Create model to encode captions

Define layers

In [48]:
# This inputs topic-transfer-values to the LSTM
topic_input = Input(
    shape=topics_train.shape[1:], name='topic_input'
)

In [49]:
# Input for token-sequences to the decoder
# The sequences are padded/truncated to a fixed length of `max_tokens` tokens
caption_input = Input(shape=(max_tokens,), name='caption_input')

In [50]:
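# Embedding-layer which converts sequences of integer-tokens to sequences of vectors
# NOTE: `vocab_size` already includes the padding index and create_embedding_layer
# adds one more, so the layer ends up with vocab_size + 1 rows (one unused row)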
caption_embedding = create_embedding_layer(tokenizer.word_index, word_to_vec_map, vocab_size)

In [51]:
# The LSTM layer for the input captions
caption_lstm = LSTM(state_size, name='caption_lstm')

Connect the layers

In [52]:
# Reshape the topic transfer values to 3D tensor in-order to feed it to the LSTM
topic_input_reshaped = Reshape(
    target_shape=(topics_train.shape[1:] + (1,))
)(topic_input)

In [53]:
# Feed topic to LSTM
_, initial_state_h0, initial_state_c0 = LSTM(
    state_size, return_state=True, name='topic_lstm'
)(topic_input_reshaped)

topic_lstm_states = [initial_state_h0, initial_state_c0]

In [54]:
net = caption_input  # Start the decoder-network with its input-layer
net = caption_embedding(net)  # Connect the embedding-layer
net = Dropout(0.5)(net)
caption_model_output = caption_lstm(net, initial_state=topic_lstm_states) # Connect the caption LSTM layer
caption_model_output

<tf.Tensor 'caption_lstm/TensorArrayReadV3:0' shape=(?, 256) dtype=float32>

## Merge the image and the caption model

In [55]:
# Element-wise add the outputs of both models (they have the same size, state_size)
merge_net = Add()([image_model_output, caption_model_output])

In [56]:
merge_net = Dense(state_size, activation='relu')(merge_net)
outputs = Dense(vocab_size, activation='softmax', name='caption_output')(merge_net)
outputs

<tf.Tensor 'caption_output/Softmax:0' shape=(?, 8644) dtype=float32>

# Create the Training Model

In [57]:
model = Model(
    inputs=[feature_input, topic_input, caption_input],
    outputs=outputs
)

In [58]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
caption_input (InputLayer)      (None, 16)           0                                            
__________________________________________________________________________________________________
topic_input (InputLayer)        (None, 80)           0                                            
__________________________________________________________________________________________________
feature_input (InputLayer)      (None, 4096)         0                                            
__________________________________________________________________________________________________
decoder_embedding (Embedding)   (None, 16, 300)      2593500     caption_input[0][0]              
__________________________________________________________________________________________________
reshape (R

## Compile the Training Model

In [59]:
model.compile(loss='categorical_crossentropy', optimizer='adam')

### Callback Functions

In [60]:
# Checkpoint file name template; Keras fills in {epoch} and {val_loss} on save
path_checkpoint = 'weights/caption-lda-weights.{epoch:02d}-{val_loss:.2f}.hdf5'

# Make sure the target directory exists before training starts, so the
# first checkpoint save cannot fail because 'weights/' is missing
os.makedirs(os.path.dirname(path_checkpoint), exist_ok=True)

# Save the model only when the validation loss improves
callback_checkpoint = ModelCheckpoint(
    filepath=path_checkpoint,
    monitor='val_loss',
    verbose=1,
    save_best_only=True
)

In [62]:
callback_tensorboard = TensorBoard(
    log_dir='./weights/caption-lda-logs/',
    histogram_freq=0,
    write_graph=True
)

In [63]:
callback_early_stop = EarlyStopping(monitor='val_loss', patience=8, verbose=1)

In [64]:
callback_reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=4, verbose=1, min_lr=0.0001)

In [65]:
callbacks = [callback_checkpoint, callback_tensorboard, callback_early_stop, callback_reduce_lr]

### Load Checkpoint

In [None]:
# Best effort: continue with freshly initialised weights when the
# checkpoint cannot be loaded, but report why
try:
    model.load_weights('weights/caption-lda-weights.20-2.25.adam.batch-128.lr-decay.hdf5')
except Exception as error:
    print("Error trying to load checkpoint.")
    print(error)
else:
    print('Weights loaded.')

### Visualize model

In [None]:
plot_model(model, to_file='caption_model_dropout.png', show_shapes=True, show_layer_names=True)

### Train the Model

In [None]:
%%time
# Train for up to 30 epochs. Each epoch draws `steps_per_epoch_train` batches
# from the infinite training generator and then evaluates on
# `steps_per_epoch_val` batches from the validation generator. The callbacks
# handle checkpointing, TensorBoard logging, early stopping and
# learning-rate reduction, all driven by `val_loss`.
# NOTE(review): `fit_generator` is deprecated in newer tf.keras releases in
# favour of `fit` -- confirm against the installed TensorFlow version.
model.fit_generator(
    generator=generator_train,
    steps_per_epoch=steps_per_epoch_train,
    epochs=30,
    callbacks=callbacks,
    validation_data=generator_val,
    validation_steps=steps_per_epoch_val
)

Epoch 1/30
Epoch 00001: val_loss improved from inf to 2.70062, saving model to weights/caption-lda-weights.01-2.70.hdf5
Epoch 2/30
Epoch 00002: val_loss improved from 2.70062 to 2.47448, saving model to weights/caption-lda-weights.02-2.47.hdf5
Epoch 3/30
Epoch 00003: val_loss improved from 2.47448 to 2.41824, saving model to weights/caption-lda-weights.03-2.42.hdf5
Epoch 4/30
Epoch 00004: val_loss improved from 2.41824 to 2.38259, saving model to weights/caption-lda-weights.04-2.38.hdf5
Epoch 5/30
Epoch 00005: val_loss improved from 2.38259 to 2.37128, saving model to weights/caption-lda-weights.05-2.37.hdf5
Epoch 6/30
Epoch 00006: val_loss improved from 2.37128 to 2.37124, saving model to weights/caption-lda-weights.06-2.37.hdf5
Epoch 7/30
Epoch 00007: val_loss did not improve from 2.37124
Epoch 8/30
Epoch 00008: val_loss did not improve from 2.37124
Epoch 9/30
Epoch 00009: val_loss did not improve from 2.37124

Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.00010000000474