In [1]:
import os
import sys
import argparse
import pickle
import numpy as np

In [2]:
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
from models.caption_model import create_model
from dataset.process_texts import (
    mark_captions,
    clean_captions,
    caption_to_sequence,
    build_vocabulary_with_frequency_threshold
)

# Load Data

In [4]:
# Folder containing the dataset
data_dir = 'dataset/processed_data'

In [5]:
def load_data(data_type, data_dir):
    """Load the cached features, topics and captions of one data split.

    Args:
        data_type: Split name that is interpolated into the cache file
            names (this notebook passes 'train' and 'val').
        data_dir: Directory that holds the pickled cache files.

    Returns:
        Tuple ``(features, topics, captions)`` unpickled, in that order, from
        ``features_<data_type>.pkl``, ``topics_<data_type>.pkl`` and
        ``captions_<data_type>.pkl``.

    Raises:
        SystemExit: If any of the three cache files does not exist.
    """
    # Order matters: it fixes the order of the returned tuple.
    name_templates = ('features_{}.pkl', 'topics_{}.pkl', 'captions_{}.pkl')
    cache_paths = [
        os.path.join(data_dir, template.format(data_type))
        for template in name_templates
    ]

    # Refuse to load a partial split: all three files must be present.
    if not all(os.path.exists(path) for path in cache_paths):
        sys.exit('processed {} data does not exist.'.format(data_type))

    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # point data_dir at cache files produced by a trusted preprocessing run.
    loaded = []
    for path in cache_paths:
        with open(path, mode='rb') as cache_file:
            loaded.append(pickle.load(cache_file))

    print('{} data loaded from cache-file.'.format(data_type))
    feature_obj, topics, captions = loaded
    return feature_obj, topics, captions

In [6]:
def process_captions(captions_list, mark_start, mark_end, freq_threshold):
    """Mark and clean the captions, then build a vocabulary from them.

    Args:
        captions_list: Captions handed to ``mark_captions``.
        mark_start: Start marker passed to ``mark_captions``.
        mark_end: End marker passed to ``mark_captions``.
        freq_threshold: Threshold forwarded to
            ``build_vocabulary_with_frequency_threshold``.

    Returns:
        Tuple ``(prepared_captions, word_idx, vocab_size)`` where
        ``prepared_captions`` are the marked and cleaned captions and
        ``vocab_size`` is ``len(vocab) + 1``.
    """
    prepared_captions = clean_captions(
        mark_captions(captions_list, mark_start, mark_end)
    )
    # The third element returned by the vocabulary builder is not needed here.
    vocab, word_idx, _unused = build_vocabulary_with_frequency_threshold(
        prepared_captions, freq_threshold
    )
    # presumably the extra slot accounts for the padding index 0 — TODO confirm
    vocab_size = len(vocab) + 1
    return prepared_captions, word_idx, vocab_size

In [7]:
def calculate_steps_per_epoch(captions_list, batch_size):
    """Return how many full batches fit into the total number of captions.

    Args:
        captions_list: Sequence whose items are the caption collections of
            the individual images; only ``len()`` of each item is used.
        batch_size: Positive number of captions counted per step.

    Returns:
        int: Total caption count floor-divided by ``batch_size``. A trailing
        partial batch is dropped, so the result is 0 when there are fewer
        captions than ``batch_size``.

    Raises:
        ValueError: If ``batch_size`` is not positive. Without this check a
            zero batch size only surfaced as an obscure error from
            converting inf/nan to int.
    """
    if batch_size <= 0:
        raise ValueError(
            'batch_size must be a positive number, got {!r}'.format(batch_size)
        )

    # Total number of captions over all images
    total_num_captions = sum(len(captions) for captions in captions_list)

    # Floor division keeps the arithmetic exact (no float round-trip)
    return int(total_num_captions // batch_size)

In [8]:
def create_sequences(word_idx, max_length, topic_transfer_value, feature_transfer_value, caption, vocab_size):
    """Expand one caption into next-word training samples for a single image.

    The caption is encoded with ``caption_to_sequence``. For every position
    ``i >= 1`` of the encoded sequence, the tokens before ``i`` become the
    input (post-padded/truncated to ``max_length``) and the token at ``i``
    becomes a one-hot target over ``vocab_size`` classes.

    Returns:
        Four parallel lists ``(topic_values, feature_values, input_captions,
        output_captions)`` with one entry per generated sample; the topic and
        feature entries all refer to the values passed in.
    """
    encoded = caption_to_sequence(caption, word_idx)

    input_captions, output_captions = [], []
    for split_at in range(1, len(encoded)):
        # NOTE(review): truncating='post' drops the newest tokens of a prefix
        # longer than max_length — confirm captions never exceed max_length.
        padded_prefix = pad_sequences(
            [encoded[:split_at]],
            maxlen=max_length,
            padding='post',
            truncating='post'
        )[0]
        one_hot_target = to_categorical(
            [encoded[split_at]], num_classes=vocab_size
        )[0]

        input_captions.append(padded_prefix)
        output_captions.append(one_hot_target)

    # Every sample of this caption shares the same image-level inputs.
    num_samples = len(input_captions)
    topic_values = [topic_transfer_value] * num_samples
    feature_values = [feature_transfer_value] * num_samples

    return topic_values, feature_values, input_captions, output_captions

In [9]:
def batch_generator(topic_transfer_values, feature_transfer_values, captions_list, word_idx, num_images, batch_size, max_length, vocab_size):
    """Endlessly yield randomly sampled batches for training/validation.

    Each iteration draws ``batch_size`` random image indices below
    ``num_images``, picks one random caption per drawn image and expands it
    with ``create_sequences``. The number of samples in a yielded batch
    therefore depends on the lengths of the chosen captions.

    Yields:
        Tuple ``(x_data, y_data)`` of dicts of NumPy arrays, keyed by
        'caption_input', 'topic_input', 'feature_input' and 'caption_output'.
    """
    while True:
        # Independent draws, so the same image may appear more than once.
        sampled_indices = np.random.randint(num_images, size=batch_size)

        topic_rows, feature_rows = [], []
        input_rows, output_rows = [], []
        for image_idx in sampled_indices:
            topic_vec = topic_transfer_values[image_idx]
            feature_vec = feature_transfer_values[image_idx]
            chosen_caption = np.random.choice(captions_list[image_idx])

            topics_part, features_part, inputs_part, outputs_part = create_sequences(
                word_idx, max_length, topic_vec, feature_vec, chosen_caption, vocab_size
            )
            topic_rows += topics_part
            feature_rows += features_part
            input_rows += inputs_part
            output_rows += outputs_part

        x_data = {
            'caption_input': np.array(input_rows),
            'topic_input': np.array(topic_rows),
            'feature_input': np.array(feature_rows)
        }
        y_data = {'caption_output': np.array(output_rows)}

        yield x_data, y_data

In [10]:
# Read the cached training and validation splits from data_dir
features_train, topics_train, captions_train = load_data('train', data_dir)
features_val, topics_val, captions_val = load_data('val', data_dir)

# Report the array shapes of the training split
print('\nFeatures shape:', features_train.shape)
print('Topics shape:', topics_train.shape)

train data loaded from cache-file.
val data loaded from cache-file.

Features shape: (112218, 1000)
Topics shape: (112218, 80)


In [11]:
# Caption preprocessing: wrap with start/end markers and clean
mark_start = 'startseq'
mark_end = 'endseq'

# training: also builds the vocabulary (frequency threshold 10)
captions_train_marked, word_idx, vocab_size = process_captions(
    captions_list=captions_train,
    mark_start=mark_start,
    mark_end=mark_end,
    freq_threshold=10,
)

# validation: same marking/cleaning, no separate vocabulary is built
captions_val_marked = clean_captions(
    mark_captions(captions_val, mark_start, mark_end)
)

In [12]:
# Generator over the training split
generator_train = batch_generator(
    topic_transfer_values=topics_train,
    feature_transfer_values=features_train,
    captions_list=captions_train_marked,
    word_idx=word_idx,
    num_images=len(captions_train),
    batch_size=256,
    max_length=16,
    vocab_size=vocab_size,
)

# Generator over the validation split (encoded with the training word_idx)
generator_val = batch_generator(
    topic_transfer_values=topics_val,
    feature_transfer_values=features_val,
    captions_list=captions_val_marked,
    word_idx=word_idx,
    num_images=len(captions_val),
    batch_size=256,
    max_length=16,
    vocab_size=vocab_size,
)

In [13]:
# Show the vocabulary size returned by process_captions (len(vocab) + 1)
vocab_size

6857

In [14]:
# Create Model
# NOTE(review): create_model comes from models.caption_model and is called
# positionally; the per-argument notes below are inferred from the values
# passed here — confirm against its definition.
model = create_model(
    topics_train.shape[1:],  # per-sample topic shape: (80,) for the (112218, 80) array
    features_train.shape[1:],  # per-sample feature shape: (1000,) for the (112218, 1000) array
    word_idx,  # second value returned by process_captions
    'dataset/glove.6B.300d.txt',  # presumably pre-trained GloVe vectors (300-d per the file name)
    mark_start,
    mark_end,
    vocab_size,
    16  # presumably the max caption length; same value as max_length in the generators
)

Creating word to vec map...
Done!
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
caption_input (InputLayer)      (None, 16)           0                                            
__________________________________________________________________________________________________
topic_input (InputLayer)        (None, 80)           0                                            
__________________________________________________________________________________________________
feature_input (InputLayer)      (None, 1000)         0                                            
__________________________________________________________________________________________________
decoder_embedding (Embedding)   (None, 16, 300)      2057400     caption_input[0][0]              
___________________________________________________________________________

In [15]:
# Training callbacks
path_checkpoint = 'weights/cplda-weights-{epoch:02d}-{val_loss:.2f}.hdf5'

# Write a checkpoint only when val_loss improves
callback_checkpoint = ModelCheckpoint(
    filepath=path_checkpoint, monitor='val_loss', verbose=1, save_best_only=True
)

# TensorBoard logging: graph written, histograms disabled
callback_tensorboard = TensorBoard(
    log_dir='./weights/caption-lda-logs/', histogram_freq=0, write_graph=True
)

# Stop once val_loss has not improved for 25 epochs
callback_early_stop = EarlyStopping(
    monitor='val_loss',
    patience=25,
    verbose=1
)

# Scale the learning rate by 0.2 after 4 epochs without val_loss improvement,
# never going below 1e-5
callback_reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=4,
    verbose=1,
    min_lr=0.00001
)

callbacks = [
    callback_checkpoint,
    callback_tensorboard,
    callback_early_stop,
    callback_reduce_lr,
]

In [16]:
# train model
# NOTE(review): the step counts below assume 128 captions per step, while both
# generators were created with batch_size=256 — confirm the mismatch is
# intentional.
# NOTE(review): fit_generator is deprecated in newer TensorFlow releases in
# favour of Model.fit (which accepts generators) — check the installed version.
model.fit_generator(
    generator=generator_train,
    steps_per_epoch=calculate_steps_per_epoch(captions_train, 128),
    epochs=30,
    callbacks=callbacks,
    validation_data=generator_val,
    validation_steps=calculate_steps_per_epoch(captions_val, 128)
)

Epoch 1/30
  71/4385 [..............................] - ETA: 28:28 - loss: 6.2030

KeyboardInterrupt: 