In [1]:
import os
import pickle
import sys
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, GRU, Embedding, Add, Reshape, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model

from nltk.translate.bleu_score import corpus_bleu

In [3]:
from dataset.utils import load_coco, load_image, print_progress_bar

# Load Data

In [4]:
# Folder containing the dataset
data_dir = 'dataset/processed_all_data'

In [5]:
def load_data(data_type, data_dir):
    """Load the cached transfer-values and captions for one dataset split.

    Args:
        data_type: Name of the split; it is substituted into the cache
            file names (e.g. ``'train'``, ``'val'``, ``'test'``).
        data_dir: Directory that holds the ``*.pkl`` cache files.

    Returns:
        ``(features, topics, captions)`` when ``data_type`` is not ``'test'``
        and the topics cache file exists, otherwise ``(features, captions)``.
        Note that a non-test split whose topics file is missing therefore
        silently yields only two values.

    Raises:
        SystemExit: If the feature or the captions cache file is missing.
    """
    def _unpickle(path):
        # pickle can execute arbitrary code: only point this at trusted files.
        with open(path, mode='rb') as handle:
            return pickle.load(handle)

    feature_cache_path = os.path.join(
        data_dir, 'feature_transfer_values_{}.pkl'.format(data_type)
    )
    captions_cache_path = os.path.join(
        data_dir, 'captions_{}.pkl'.format(data_type)
    )

    # Both mandatory cache files must be present before anything is read.
    if not (os.path.exists(feature_cache_path) and os.path.exists(captions_cache_path)):
        sys.exit('processed {} data does not exist.'.format(data_type))

    feature_obj = _unpickle(feature_cache_path)
    captions = _unpickle(captions_cache_path)

    loaded = [feature_obj]
    if data_type != 'test':
        # Topics are optional and never looked up for the test split.
        topics_cache_path = os.path.join(
            data_dir, 'topics_{}.pkl'.format(data_type)
        )
        if os.path.exists(topics_cache_path):
            loaded.append(_unpickle(topics_cache_path))
    loaded.append(captions)

    print('{} data loaded from cache-file.'.format(data_type))
    return tuple(loaded)

In [6]:
# Load the cached, pre-processed splits.
# The train and val splits come with topic vectors; the test split does not.
features_train, topics_train, captions_train = load_data('train', data_dir)
features_val, topics_val, captions_val = load_data('val', data_dir)
features_test, captions_test = load_data('test', data_dir)

train data loaded from cache-file.
val data loaded from cache-file.
test data loaded from cache-file.


# Tokenizer

In [7]:
def mark_captions(captions_list, mark_start, mark_end):
    """Wrap every caption in a start and an end marker.

    Args:
        captions_list: List of caption groups (one list of strings per image).
        mark_start: Marker word placed before each caption.
        mark_end: Marker word placed after each caption.

    Returns:
        A new nested list with the same structure, where every caption reads
        ``"<mark_start> <caption> <mark_end>"``.
    """
    marked_groups = []
    for group in captions_list:
        marked_group = []
        for text in group:
            marked_group.append(' '.join((mark_start, text, mark_end)))
        marked_groups.append(marked_group)

    return marked_groups

In [8]:
def flatten(captions_list):
    """Collapse a list of caption groups into one flat list of captions.

    The order of the groups, and of the captions inside each group, is kept.
    """
    flat = []
    for group in captions_list:
        flat.extend(group)

    return flat

In [9]:
def create_tokenizer(captions_marked):
    """Fit a Keras ``Tokenizer`` on all marked captions.

    Args:
        captions_marked: List of caption groups (lists of marked strings).

    Returns:
        ``(tokenizer, vocab_size)`` where ``vocab_size`` is the number of
        entries in ``tokenizer.word_index`` plus one.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(flatten(captions_marked))
    return tokenizer, len(tokenizer.word_index) + 1

In [10]:
mark_start = 'startseq'
mark_end = 'endseq'

In [11]:
captions_train_marked = mark_captions(captions_train, mark_start, mark_end)  # training
captions_val_marked = mark_captions(captions_val, mark_start, mark_end)  # validation

print('Marked captions:')
captions_train_marked[0]

Marked captions:


['startseq A bird perched on the pavement and a lady seated near it endseq',
 'startseq A black bird perched on ledge next to a tree. endseq',
 'startseq A black bird sits on the corner of a wall. endseq',
 'startseq A gray bird stands on a rocky bench near some leaves. endseq',
 'startseq A bird sitting on a small cement bench looking out at something. endseq']

In [21]:
tokenizer, vocab_size = create_tokenizer(captions_train_marked)

In [22]:
# Cap the vocabulary: the tokenizer limit and the size of the one-hot output
# vectors must be the same number, so it is defined once and reused.
vocab_size = 1000
tokenizer.num_words = vocab_size

In [13]:
# Max length of each caption
max_tokens = 16

# Data Generator

In [23]:
def create_sequences(tokenizer, max_length, topic_transfer_value, feature_transfer_value, caption, vocab_size):
    """Expand one caption into next-word training samples for one image.

    The caption is encoded to integer tokens and split into
    (prefix, next token) pairs: the prefix is padded to ``max_length`` and the
    next token is one-hot encoded over ``vocab_size`` classes.  The topic and
    feature values of the image are repeated once per pair.

    Pairs are only generated while the prefix still fits into ``max_length``
    tokens.  With ``truncating='post'`` a longer prefix would be cut back to
    its first ``max_length`` tokens, so the network would be shown an input
    that does not end right before the target token.

    Args:
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        max_length: Length every input prefix is padded to (``None`` disables
            both the padding length and the prefix limit).
        topic_transfer_value: Topic vector of the image.
        feature_transfer_value: Feature vector of the image.
        caption: The (marked) caption string.
        vocab_size: Number of classes of the one-hot encoded output.

    Returns:
        ``(topic_values, feature_values, input_captions, output_captions)``,
        four lists of equal length with one entry per generated pair.
    """
    topic_values, feature_values = [], []
    input_captions, output_captions = [], []
    integer_sequence = tokenizer.texts_to_sequences([caption])[0]  # encode the sequence

    # A prefix of idx tokens fits as long as idx <= max_length.
    if max_length is None:
        stop = len(integer_sequence)
    else:
        stop = min(len(integer_sequence), max_length + 1)

    for idx in range(1, stop):
        in_seq, out_seq = integer_sequence[:idx], integer_sequence[idx]  # split into input and output pair
        in_seq = pad_sequences([in_seq], maxlen=max_length, padding='post', truncating='post')[0]  # pad input sequence
        out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]  # encode output sequence

        # store
        topic_values.append(topic_transfer_value)
        feature_values.append(feature_transfer_value)
        input_captions.append(in_seq)
        output_captions.append(out_seq)

    return topic_values, feature_values, input_captions, output_captions

In [24]:
def batch_generator(
    topic_transfer_values, feature_transfer_values, captions_list, tokenizer, num_images, batch_size, max_length, vocab_size
):
    """Endlessly yield random batches of training data.

    Each batch is built from ``batch_size`` randomly drawn image indices
    (drawn with replacement from ``range(num_images)``).  For every drawn
    image one of its captions is picked at random and expanded with
    ``create_sequences``, so the number of rows in a batch depends on the
    lengths of the chosen captions.

    Yields:
        ``(x_data, y_data)`` where ``x_data`` is a dict with the arrays
        ``'caption_input'``, ``'topic_input'`` and ``'feature_input'`` and
        ``y_data`` is a dict with the array ``'caption_output'``.
    """
    while True:
        # Random image indices for this batch.
        image_indices = np.random.randint(num_images, size=batch_size)

        batch_topics, batch_features = [], []
        batch_inputs, batch_targets = [], []
        for image_idx in image_indices:
            image_topics = topic_transfer_values[image_idx]
            image_features = feature_transfer_values[image_idx]
            # Every image has several captions; use one of them, chosen at random.
            chosen_caption = np.random.choice(captions_list[image_idx])

            topics, features, inputs, targets = create_sequences(
                tokenizer, max_length, image_topics, image_features, chosen_caption, vocab_size
            )
            batch_topics += topics
            batch_features += features
            batch_inputs += inputs
            batch_targets += targets

        # The dict keys name the model layers each array is fed to,
        # so the assignment does not depend on ordering.
        x_data = {
            'caption_input': np.array(batch_inputs),
            'topic_input': np.array(batch_topics),
            'feature_input': np.array(batch_features)
        }
        y_data = {'caption_output': np.array(batch_targets)}

        yield x_data, y_data

In [25]:
batch_size = 128

In [26]:
# Both generators share the tokenizer, batch size, caption length and
# vocabulary size; only the data split differs.

# training-dataset generator
generator_train = batch_generator(
    topic_transfer_values=topics_train,
    feature_transfer_values=features_train,
    captions_list=captions_train_marked,
    tokenizer=tokenizer,
    num_images=len(captions_train),
    batch_size=batch_size,
    max_length=max_tokens,
    vocab_size=vocab_size
)

# validation-dataset generator
generator_val = batch_generator(
    topic_transfer_values=topics_val,
    feature_transfer_values=features_val,
    captions_list=captions_val_marked,
    tokenizer=tokenizer,
    num_images=len(captions_val),
    batch_size=batch_size,
    max_length=max_tokens,
    vocab_size=vocab_size
)

# Draw one training batch to inspect in the cells below.
batch = next(generator_train)
batch_x, batch_y = batch

In [27]:
# Example of the topic-transfer-values for the first image in the batch
batch_x['topic_input'][0]

array([0.00185185, 0.09444444, 0.00185185, 0.02037037, 0.00185185,
       0.00185185, 0.00185185, 0.26111111, 0.00185185, 0.00185185,
       0.00185185, 0.00185185, 0.00185185, 0.00185185, 0.00185185,
       0.00185185, 0.00185185, 0.00185185, 0.00185185, 0.00185185,
       0.37222222, 0.00185185, 0.00185185, 0.00185185, 0.00185185,
       0.00185185, 0.00185185, 0.00185185, 0.00185185, 0.00185185,
       0.00185185, 0.00185185, 0.05740741, 0.00185185, 0.00185185,
       0.00185185, 0.00185185, 0.00185185, 0.00185185, 0.00185185,
       0.00185185, 0.00185185, 0.00185185, 0.00185185, 0.00185185,
       0.00185185, 0.00185185, 0.00185185, 0.00185185, 0.00185185,
       0.00185185, 0.00185185, 0.00185185, 0.00185185, 0.00185185,
       0.00185185, 0.00185185, 0.00185185, 0.00185185, 0.00185185,
       0.00185185, 0.00185185, 0.00185185, 0.00185185, 0.00185185,
       0.00185185, 0.00185185, 0.00185185, 0.00185185, 0.00185185,
       0.00185185, 0.00185185, 0.05740741, 0.00185185, 0.00185

In [28]:
# Example of the feature-transfer-values for the first image in the batch
batch_x['feature_input'][0]

array([0.4600898 , 0.        , 0.60919714, ..., 0.        , 0.53796935,
       0.27270582], dtype=float32)

In [29]:
# Example of the token-sequence for the first image in the batch
# This is the input to the decoder-part of the neural network
batch_x['caption_input'][0]

array([2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [30]:
# This is the target for the input above: the one-hot encoding (over the vocabulary)
# of the token that follows the input sequence, i.e. the word the decoder should predict next
batch_y['caption_output'][0]

array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [31]:
# Shape of the output
batch_y['caption_output'].shape

(1332, 1000)

## Steps Per Epoch

In [32]:
def calculate_steps_per_epoch(captions_list, batch_size):
    """Return how many batches make up one epoch.

    The total number of captions over all images is divided by ``batch_size``
    and truncated to an integer.

    Args:
        captions_list: List of caption groups (one list per image).
        batch_size: Number of images drawn per batch.
    """
    captions_per_image = list(map(len, captions_list))
    total_num_captions = np.sum(captions_per_image)
    steps = total_num_captions / batch_size

    return int(steps)

In [33]:
steps_per_epoch_train = calculate_steps_per_epoch(captions_train_marked, batch_size)
steps_per_epoch_train

4385

In [34]:
steps_per_epoch_val = calculate_steps_per_epoch(captions_val_marked, batch_size)
steps_per_epoch_val

195

# Create Model

### Create pre-trained Embedding Layer

In [35]:
from models.embedding_layer import create_embedding_layer

### Define some global values

In [36]:
# Internal state size of the GRU and width of the dense layers that feed into / follow it
state_size = 256

In [53]:
K.clear_session()

## Encode Images

In [54]:
def image_encoder(feature_shape, state_size):
    """Build the image branch of the model.

    The image feature vector goes through dropout (rate 0.5) and is then
    projected to ``state_size`` units with a ReLU dense layer.

    Args:
        feature_shape: Shape of one feature vector (without the batch axis).
        state_size: Number of units of the output dense layer.

    Returns:
        ``(feature_input, image_model_output)``: the input layer and the
        output tensor of the branch.
    """
    feature_input = Input(shape=feature_shape, name='feature_input')
    dropped_features = Dropout(0.5)(feature_input)
    projection = Dense(state_size, activation='relu', name='image_model_output')
    return feature_input, projection(dropped_features)

In [55]:
feature_input, image_model_output = image_encoder(features_train.shape[1:], state_size)

## Encode Captions

### Create model to encode captions

In [56]:
def caption_encoder(
    topic_input_shape, word_index, glove_file, mark_start, mark_end, state_size, vocab_size, max_tokens
):
    """Build the caption branch of the model.

    The topic vector is projected to ``state_size`` units (tanh) and used as
    the initial state of a GRU that reads the embedded caption prefix.
    Dropout (rate 0.5) is applied to the embeddings.  Note that the recurrent
    layer is named ``'caption_lstm'`` although it is a GRU.

    Args:
        topic_input_shape: Shape of one topic vector (without the batch axis).
        word_index, glove_file, mark_start, mark_end, vocab_size: Passed on
            unchanged to ``create_embedding_layer``.
        state_size: Number of units of the topic projection and of the GRU.
        max_tokens: Length of the (padded) caption input sequence.

    Returns:
        ``(topic_input, caption_input, caption_model_output)``: the two input
        layers and the output tensor of the GRU.
    """
    # Topic branch: provides the initial state of the recurrent layer.
    topic_input = Input(shape=topic_input_shape, name='topic_input')
    initial_state = Dense(state_size, activation='tanh', name='topic_input_reshaped')(topic_input)

    # Caption branch layers.
    caption_input = Input(shape=(max_tokens,), name='caption_input')
    embedding_layer = create_embedding_layer(word_index, glove_file, mark_start, mark_end, vocab_size)
    recurrent_layer = GRU(state_size, name='caption_lstm')

    # Wire them up: tokens -> embedding -> dropout -> GRU (seeded with the topics).
    embedded = embedding_layer(caption_input)
    embedded = Dropout(0.5)(embedded)
    caption_model_output = recurrent_layer(embedded, initial_state=initial_state)

    return topic_input, caption_input, caption_model_output

In [57]:
# Encode Captions
glove_file = 'dataset/glove.6B.300d.txt'
topic_input, caption_input, caption_model_output = caption_encoder(
    topics_train.shape[1:],
    tokenizer.word_index,
    glove_file,
    mark_start,
    mark_end,
    state_size,
    len(tokenizer.word_index) + 1,
    max_tokens
)

Creating word to vec map...
Done!


## Merge the image and the caption model

In [58]:
# Merge the outputs of both models by element-wise addition (both have state_size units)
merge_net = Add()([image_model_output, caption_model_output])

In [59]:
merge_net = Dense(state_size, activation='relu')(merge_net)
outputs = Dense(vocab_size, activation='softmax', name='caption_output')(merge_net)
outputs

<tf.Tensor 'caption_output/Softmax:0' shape=(?, 1000) dtype=float32>

# Create the Training Model

In [60]:
model = Model(
    inputs=[feature_input, topic_input, caption_input],
    outputs=outputs
)

In [61]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
caption_input (InputLayer)      (None, 16)           0                                            
__________________________________________________________________________________________________
feature_input (InputLayer)      (None, 4096)         0                                            
__________________________________________________________________________________________________
decoder_embedding (Embedding)   (None, 16, 300)      8006400     caption_input[0][0]              
__________________________________________________________________________________________________
topic_input (InputLayer)        (None, 80)           0                                            
__________________________________________________________________________________________________
dropout (D

## Compile the Training Model

In [62]:
model.compile(loss='categorical_crossentropy', optimizer='adam')

### Callback Functions

In [63]:
path_checkpoint = 'weights/cplda-weights-{epoch:02d}-{val_loss:.2f}.hdf5'
callback_checkpoint = ModelCheckpoint(
    filepath=path_checkpoint,
    monitor='val_loss',
    verbose=1,
    save_best_only=True
)

In [64]:
callback_tensorboard = TensorBoard(
    log_dir='./weights/caption-lda-logs/',
    histogram_freq=0,
    write_graph=True
)

In [65]:
callback_early_stop = EarlyStopping(monitor='val_loss', patience=50, verbose=1)

In [66]:
callback_reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=4, verbose=1, min_lr=0.00001)

In [67]:
callbacks = [callback_checkpoint, callback_tensorboard, callback_early_stop, callback_reduce_lr]

### Load Checkpoint

In [None]:
# Resume from the most recently written checkpoint, if there is one.
# 'weights/' is the directory the checkpoint callback writes its
# 'cplda-weights-*.hdf5' files into; load_weights needs one of those files,
# not the directory itself.
try:
    checkpoint_dir = 'weights/'
    checkpoint_files = [
        os.path.join(checkpoint_dir, file_name)
        for file_name in os.listdir(checkpoint_dir)
        if file_name.endswith('.hdf5')
    ]
    if not checkpoint_files:
        raise FileNotFoundError('no .hdf5 checkpoint found in {}'.format(checkpoint_dir))

    # With save_best_only=True the newest file is also the best one so far.
    latest_checkpoint = max(checkpoint_files, key=os.path.getmtime)
    model.load_weights(latest_checkpoint)
    print('Weights loaded.')
    print(latest_checkpoint)
except Exception as error:
    # Best effort: training simply starts from scratch when nothing can be loaded.
    print("Error trying to load checkpoint.")
    print(error)

### Visualize model

In [None]:
plot_model(model, to_file='caption_model_dropout.png', show_shapes=True, show_layer_names=True)

### Train the Model

In [68]:
%%time
# Train on the endless random-batch generators. Because the generators never
# stop, the epoch length is set explicitly through steps_per_epoch and
# validation_steps (computed above from the number of captions).
# NOTE(review): fit_generator is deprecated in newer TensorFlow releases in
# favour of model.fit; confirm against the installed version before upgrading.
model.fit_generator(
    generator=generator_train,
    steps_per_epoch=steps_per_epoch_train,
    epochs=60,
    callbacks=callbacks,
    validation_data=generator_val,
    validation_steps=steps_per_epoch_val
)

Epoch 1/60
Epoch 00001: val_loss improved from inf to 3.21933, saving model to weights/cplda-weights-01-3.22.hdf5
Epoch 2/60
Epoch 00002: val_loss improved from 3.21933 to 3.20960, saving model to weights/cplda-weights-02-3.21.hdf5
Epoch 3/60
 117/4385 [..............................] - ETA: 3:42 - loss: 2.1940

KeyboardInterrupt: 