# Baseline Model
The starting point of our Encoder-Decoder challenge.

In [1]:
# Install packages
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.0.4-py2.py3-none-any.whl (22 kB)
Installing collected packages: rouge-score
Successfully installed rouge-score-0.0.4


In [2]:
import os  # operating system access
import sys  # system specific operations
import datetime as dt  # datetime handling

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split  # data preperation

import tensorflow as tf  # tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, Input, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras import backend as K
from tensorflow.keras.utils import plot_model
import tensorflow_addons as tfa
import tensorboard as tb

import tqdm  # progress bars
import json
import codecs
import pickle

import warnings
warnings.simplefilter(action='ignore')

import matplotlib.pyplot as plt

import nltk.translate.bleu_score as ntbs  # BLEU scoring basics
import rouge_score.rouge_scorer as rouge_scorer  # ROUGE scoring basics
sys.path.insert(0, '../input/aida-metrics-functions')  # custom metrics implementation
import metrics as met  # custom metrics BLUE-1 pecsision and ROUGE-L recall

## Configuration

In [3]:
# Runtime config
EPOCHS = 1
RUN_TRAINING = True  # Train the model during run
SAVE_MODEL = True  # Write model to HDF5 (.h5) file
LOG_MODEL = True  # Write tensorboard and model history logs during training
RUN_DATA_CHECK = False  # Do a basic check on the input data

# Tensorboard view config
TENSORBOARD_SHOW_INLINE = False
# Launch tensorboard
# True: From that notebook, False: connect to already running tensorboard
TENSORBOARD_LAUNCH = True

# Directories and filenames
FP_CAPTIONS = '../input/flickr8k/captions.txt'
DIR_IMAGES = '../input/flickr8k/Images/'
DIR_IMAGE_FEATURES = '../input/aida-image-captioning/Images/'
DIR_MODEL_STORE = './models/'
DIR_MODEL_LOG = './models/'
DIR_RESULT_STORE = './results/'
DIR_TENSORBOARD_LOG = './tensorboard/'

# Create specified directories
# Derived from the constants above (single source of truth) and idempotent:
# re-running the cell does not fail when the directories already exist.
for _out_dir in (DIR_RESULT_STORE, DIR_MODEL_STORE, DIR_MODEL_LOG, DIR_TENSORBOARD_LOG):
    os.makedirs(_out_dir, exist_ok=True)

## Init

In [4]:
# Timestamp of the current run. Stays None until the training cell or one of
# the result-export cells assigns a "%Y%m%d-%H%M%S" string.
timestamp_str = None
# Timestamp suffix of the previously saved model/history files; these file
# names are only used when RUN_TRAINING is False.
FN_TIMESTAMP = "20210428-143443"
FN_MODEL2LOAD = f"Baseline_Train_Test_{FN_TIMESTAMP}.h5"
FN_HIST2LOAD = f"Baseline_Train_Test_{FN_TIMESTAMP}.pickle"

## Helper functions

In [5]:
def preproc_text(texts):
    """
    Tokenize and pad captions for teacher-forced decoder training.

    - Adds <start> to the beginning of the texts used as decoder input
    - Adds <end> to the end of the texts used as decoder target

    Input: texts: sequence of sentences (must support len() and indexing)
    Output: - tokenized, padded text with <start> token for using as input in decoder
            - tokenized, padded text with <end> token for using as target in decoder
            - tokenizer (index 0 is registered as '<pad>')
            - max_seq_len (length of the padded input sequences)
    """

    print('Creating and fitting tokenizer...')
    tokenizer_fit_data = ['<start> ' + s + ' <end>' for s in texts]

    # create tokenizer for captions + <start> and <end> tags
    # '<' and '>' are not part of the filter list, so the tags survive as tokens
    tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;?@[\\]^_`{|}~\t\n')
    tokenizer.fit_on_texts(tokenizer_fit_data)
    # index 0 is what pad_sequences fills in, so give it a readable name
    tokenizer.word_index['<pad>'] = 0
    tokenizer.index_word[0] = '<pad>'
    print('Creating and fitting tokenizer done')

    print('Create text input sequences...')
    # add <start> tag before every caption and tokenize + pad ('post')
    enc_input_data = ['<start> ' + s for s in texts]
    enc_input_token_seqs = tokenizer.texts_to_sequences(enc_input_data)
    # if no "maxlen" defined then pad_sequences takes the longest sequence as max_len
    enc_input_prep = pad_sequences(enc_input_token_seqs, padding='post')
    print('Create text input sequences done')

    print('Create text target sequences...')
    # add <end> tag after every caption and tokenize + pad ('post')
    enc_target_data = [s + ' <end>' for s in texts]
    enc_target_token_seqs = tokenizer.texts_to_sequences(enc_target_data)
    # if no "maxlen" defined then pad_sequences takes the longest sequence as max_len
    enc_target_prep = pad_sequences(enc_target_token_seqs, padding='post')
    print('Create text target sequences done')

    print('\n\n')

    # show some texts processed (at most two; fewer if less captions were given)
    for i in range(min(2, len(texts))):
        print(f'               {i} orig: {texts[i]}')
        print(f'              {i} input: {enc_input_data[i]}')
        print(f' {i} preprocessed input: {enc_input_prep[i]}')
        print(f'             {i} target: {enc_target_data[i]}')
        print(f'{i} preprocessed target: {enc_target_prep[i]}')
        print()

    print('\n\n')
    print(f' preprocessed input shape: {enc_input_prep.shape}')
    print(f'preprocessed target shape: {enc_target_prep.shape}')

    # last dim is the length of the sequences
    return enc_input_prep, enc_target_prep, tokenizer, enc_input_prep.shape[-1]


class History_trained_model:
    """Plain container holding the parts of a training history that get pickled.

    Attributes:
        history: metric values recorded during training
        epoch: epoch indices belonging to the recorded values
        params: parameters the training was run with
    """
    def __init__(self, history, epoch, params):
        self.history, self.epoch, self.params = history, epoch, params
        

def save_hist(path, history):
    """Pickle the history/epoch/params attributes of ``history`` to ``path``.

    The three attributes are copied into a History_trained_model instance,
    which is what ends up in the file.
    """
    with open(path, 'wb') as fh:
        snapshot = History_trained_model(
            history=history.history,
            epoch=history.epoch,
            params=history.params,
        )
        pickle.dump(snapshot, fh, protocol=pickle.HIGHEST_PROTOCOL)

        
def load_hist(path):
    """Unpickle and return the object stored in the file at ``path``.

    Only use this on files you created yourself: unpickling runs arbitrary
    code embedded in the file.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)

## Load Data

In [6]:
# Picture captions
# Read the caption table from FP_CAPTIONS; later cells rely on its
# 'image' and 'caption' columns.
df_cap = pd.read_csv(FP_CAPTIONS)
display(df_cap.head())

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...


# Check Data

In [7]:
if RUN_DATA_CHECK:
    # Number of captions per image
    print(df_cap.groupby('image').agg(cap_count=('caption','count')).describe())

    image_names = df_cap['image'].unique()

    # Test if images are there for every entry
    # `tqdm` is imported as a module, so the progress-bar class is `tqdm.tqdm`
    img_file_not_exist = [
        image for image in tqdm.tqdm(image_names)
        if not os.path.exists(os.path.join(DIR_IMAGES, image))
    ]

    print(f'Number of image files not exists in "{DIR_IMAGES}" directory: {len(img_file_not_exist)}')

    # test if images features are there for every entry
    # (feature files carry the image name with '.npy' instead of '.jpg')
    img_file_not_exist = [
        image for image in tqdm.tqdm(image_names)
        if not os.path.exists(os.path.join(DIR_IMAGE_FEATURES, image.replace('.jpg', '.npy')))
    ]

    print(f'Number of image feature files not exists in "{DIR_IMAGE_FEATURES}" directory: {len(img_file_not_exist)}')

## Preprocess Captions

In [8]:
# Tokenize and pad all captions once: <start>-prefixed inputs, <end>-suffixed
# targets, the fitted tokenizer and the padded sequence length.
enc_input_prep, enc_target_prep, tokenizer, max_seq_len = preproc_text(df_cap['caption'].values)

Creating and fitting tokenizer...
Creating and fitting tokenizer done
Create text input sequences...
Create text input sequences done
Create text target sequences...
Create text target sequences done



               0 orig: A child in a pink dress is climbing up a set of stairs in an entry way .
              0 input: <start> A child in a pink dress is climbing up a set of stairs in an entry way .
 0 preprocessed input: [   2    1   43    4    1   90  172    7  119   51    1  393   12  394
    4   28 5159  670    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0]
             0 target: A child in a pink dress is climbing up a set of stairs in an entry way . <end>
0 preprocessed target: [   1   43    4    1   90  172    7  119   51    1  393   12  394    4
   28 5159  670    3    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0]

               1 orig: A girl going into a wooden building .

## Create Train-Test Split

In [9]:
# Create 80,20 % split on the images
# Splitting on unique image names keeps all captions of one image on the same side.
_, test_img_names = train_test_split(df_cap['image'].unique(), test_size=0.2, random_state=42)
df_cap['is_test'] = df_cap['image'].isin(test_img_names)
display(df_cap.head(2))

# Positional boolean row masks. The preprocessed arrays are row-aligned with
# df_cap, so masks select the right rows regardless of the DataFrame's index
# labels (index labels are not positions in general).
is_test_mask = df_cap['is_test'].to_numpy()
is_train_mask = ~is_test_mask

# Path of the pre-computed feature file (.npy) belonging to every caption row
image_feature_paths = df_cap['image'].map(
    lambda x: os.path.join(DIR_IMAGE_FEATURES, x.replace('.jpg', '.npy'))).values

# Trainset
train_enc_input_prep = enc_input_prep[is_train_mask]
train_enc_target_prep = enc_target_prep[is_train_mask]
train_image_feature_names = image_feature_paths[is_train_mask]

# Testset
test_enc_input_prep = enc_input_prep[is_test_mask]
test_enc_target_prep = enc_target_prep[is_test_mask]
test_image_feature_names = image_feature_paths[is_test_mask]

# show train_image_features
train_image_feature_names.shape, train_image_feature_names[:2]

Unnamed: 0,image,caption,is_test
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...,False
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .,False


((32360,),
 array(['../input/aida-image-captioning/Images/1000268201_693b08cb0e.npy',
        '../input/aida-image-captioning/Images/1000268201_693b08cb0e.npy'],
       dtype=object))

## Create Train-Test Datasets for Tensorflow

__Currently doesn't work, continue with *Create Data Generator* (next section)__

In [10]:
# Disabled tf.data input pipeline, kept as a bare string literal so that none
# of it is executed (the cell only echoes the string).
# NOTE(review): inside the disabled code, test_dataset is built from
# train_dataset.map(...); that looks like it should be test_dataset.map(...).
# Fix before re-enabling.
"""
BUFFER_SIZE = 1000
BATCH_SIZE = 64
EMBEDDING_DIM = 256
UNITS = 512
VOCAB_SIZE = len(tokenizer.index_word)

print(f'vocab size: {VOCAB_SIZE}')

# Load the numpy files
def map_func(img_name, cap_inp, cap_targ):
    #img_tensor = np.load(img_name.decode('utf-8')+'.npy')
    img_tensor = np.load(img_name)
    return img_tensor, cap_inp, cap_targ

# train dataset
train_dataset = tf.data.Dataset.from_tensor_slices((train_image_feature_names, train_enc_input_prep, train_enc_target_prep))
#train_dataset = tf.data.Dataset.from_tensor_slices(({"input_1": train_image_feature_names, "input_2": train_enc_input_prep}, train_enc_target_prep))

# Use map to load the numpy files in parallel
train_dataset = train_dataset.map(lambda item1, item2, item3: tf.numpy_function(
          map_func, [item1, item2, item3], [tf.float32, tf.int32, tf.int32]),
          num_parallel_calls=tf.data.AUTOTUNE)

# Shuffle and batch
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
train_dataset = train_dataset.prefetch(buffer_size=tf.data.AUTOTUNE)

data = next(iter(train_dataset))

type(data), len(data)

# test dataset
test_dataset = tf.data.Dataset.from_tensor_slices((test_image_feature_names, test_enc_input_prep, test_enc_target_prep))

# Use map to load the numpy files in parallel
test_dataset = train_dataset.map(lambda item1, item2, item3: tf.numpy_function(
          map_func, [item1, item2, item3], [tf.float32, tf.int32, tf.int32]),
          num_parallel_calls=tf.data.AUTOTUNE)

# Do not shuffle
test_dataset = test_dataset.prefetch(buffer_size=tf.data.AUTOTUNE)
"""

'\nBUFFER_SIZE = 1000\nBATCH_SIZE = 64\nEMBEDDING_DIM = 256\nUNITS = 512\nVOCAB_SIZE = len(tokenizer.index_word)\n\nprint(f\'vocab size: {VOCAB_SIZE}\')\n\n# Load the numpy files\ndef map_func(img_name, cap_inp, cap_targ):\n    #img_tensor = np.load(img_name.decode(\'utf-8\')+\'.npy\')\n    img_tensor = np.load(img_name)\n    return img_tensor, cap_inp, cap_targ\n\n# train dataset\ntrain_dataset = tf.data.Dataset.from_tensor_slices((train_image_feature_names, train_enc_input_prep, train_enc_target_prep))\n#train_dataset = tf.data.Dataset.from_tensor_slices(({"input_1": train_image_feature_names, "input_2": train_enc_input_prep}, train_enc_target_prep))\n\n# Use map to load the numpy files in parallel\ntrain_dataset = train_dataset.map(lambda item1, item2, item3: tf.numpy_function(\n          map_func, [item1, item2, item3], [tf.float32, tf.int32, tf.int32]),\n          num_parallel_calls=tf.data.AUTOTUNE)\n\n# Shuffle and batch\ntrain_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(

## Setup Data Generator

In [11]:
class MyDataGenerator(tf.keras.utils.Sequence):
    """Custom class for providing the data to the model.

    Each item is one batch of the form
    ``((image_features, caption_inputs), caption_targets)``.
    Image features are loaded from disk with ``np.load`` on demand.
    """
    def __init__(self,
                 image_names: np.ndarray,
                 cap_inputs: np.ndarray,
                 cap_targets: np.ndarray,
                 batch_size: int,
                 shuffle: bool = True):
        """
        Constructor
        Arguments:
            image_names: Paths of the image feature files (one per caption),
                         each loadable with np.load
            cap_inputs: Word vectors of known captions used for training and test,
                        containing <start> keyword
            cap_targets: Word vectors of known captions used for prediction,
                         containing <end> keyword
            batch_size: Batch size to use
            shuffle: True: do shuffling
                     False: use order as is
        """
        # Let the Keras base class initialise itself; harmless where the
        # base class defines no __init__ of its own.
        super().__init__()
        self.image_names = image_names
        self.cap_inputs = cap_inputs
        self.cap_targets = cap_targets
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.on_epoch_end()

    def on_epoch_end(self):
        """Updates indexes after each epoch"""
        self.indexes = np.arange(len(self.image_names))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __len__(self):
        """Returns the number of full batches; a trailing partial batch is dropped."""
        return len(self.image_names) // self.batch_size

    def __getitem__(self, index):
        """Generate one batch of data using the index"""
        # Generate indexes of the batch
        batch_ids = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Generate data
        imgs = [np.load(self.image_names[i]) for i in batch_ids]
        cap_ins = [self.cap_inputs[i] for i in batch_ids]
        cap_outs = [self.cap_targets[i] for i in batch_ids]

        return (np.array(imgs), np.array(cap_ins)), np.array(cap_outs)

In [12]:
# Config batchsize
BATCH_SIZE = 64

# Init generators
# Training batches are reshuffled every epoch; the test generator keeps the
# original order.
train_gen = MyDataGenerator(train_image_feature_names, train_enc_input_prep, train_enc_target_prep, batch_size=BATCH_SIZE, shuffle=True)
test_gen = MyDataGenerator(test_image_feature_names, test_enc_input_prep, test_enc_target_prep, batch_size=BATCH_SIZE, shuffle=False)

# Create Encoder-Decoder model for Training/Test

In [20]:
# Create own loss function
# means: mask out padded targets
def loss_func(targets, preds):
    """Sparse categorical cross-entropy that ignores padded target positions.

    Arguments:
        targets: Target word indices; index 0 marks padding
        preds: Model output, i.e. softmax probabilities over the vocabulary
    Returns:
        Loss value with all padded positions weighted by zero
    """
    # The last layer ('dec_dense') has softmax activation, so preds are
    # probabilities and must not be treated as logits => from_logits=False
    sparse_cat_ce = SparseCategoricalCrossentropy(from_logits=False)
    # weight 1 for real tokens, 0 for padding
    mask = K.cast(K.greater(targets, 0), dtype=tf.int64)
    # calc cross entropy masked
    loss = sparse_cat_ce(targets, preds, sample_weight=mask)
    return loss


# Create own accuracy function
def accuracy_func(targets, preds):
    """Token-level accuracy over the non-padded target positions.

    Arguments:
        targets: Target word indices; index 0 marks padding
        preds: Model output scores over the vocabulary (last axis)
    Returns:
        Fraction of non-padded positions whose argmax prediction equals the
        target; 0 when there is no non-padded position at all
    """
    # bring both sides of the comparison to the same dtype
    targets = K.cast(targets, dtype=tf.float32)
    real_preds = K.cast(K.argmax(preds, axis=-1), dtype=tf.float32)
    correct = K.cast(K.equal(targets, real_preds), dtype=tf.float32)
    # mask out 0 padded elements
    mask = K.cast(K.greater(targets, 0), dtype=tf.float32)
    n_correct = K.sum(mask * correct)
    n_total = K.sum(mask)
    # avoid NaN for a batch that consists of padding only
    return tf.math.divide_no_nan(n_correct, n_total)


def create_base_enc_dec_model(
        enc_input_shape: tuple,
        output_seq_length: int,
        embedding_dim: int,
        vocab_size: int,
        units: int) -> tf.keras.Model:
    """
    Build up the encoder- and decoder-part for training and testing purpose

    Arguments:
        enc_input_shape: Tuple of input dimensions of the encoder
        output_seq_length: Max length of caption sequence to predict
                           (currently not used by the model definition)
        embedding_dim: Length of used word vectors
        vocab_size: Number of words within the vocabulary
        units: Width of the encoder Dense layer and of the decoder LSTM
    Returns:
        compiled keras model; its output layer applies softmax
    """
    # ENCODER
    enc_input = Input(shape=enc_input_shape, name='enc_input')
    enc_output = layers.Flatten()(enc_input) # BSx64x2048 => BSx131072
    enc_output = layers.Dense(units, activation='relu', name='enc_output')(enc_output)

    # DECODER
    dec_input = Input(shape=(None,), name='dec_input')

    dec_embed_out = layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        name='dec_embed')(dec_input)

    # the encoded image initialises both the hidden and the cell state
    dec_lstm_out, _, _ = layers.LSTM(
        units,
        return_sequences=True,
        return_state=True,
        name= 'dec_lstm')(dec_embed_out, initial_state=[enc_output, enc_output])

    dec_out = layers.Dense(vocab_size, activation='softmax', name='dec_dense')(dec_lstm_out)

    # Compile model
    model = Model([enc_input, dec_input], dec_out, name='Baseline_Train_Test')
    model.compile(
        optimizer=Adam(learning_rate=1e-4, clipnorm=5.0),
        loss=loss_func,
        metrics=[accuracy_func]
    )

    return model


# Config model dimensions
# Defined outside of the training branch because the prediction cell needs
# UNITS as well when a stored model is loaded (RUN_TRAINING = False).
EMBEDDING_DIM = 256
UNITS = 512
VOCAB_SIZE = len(tokenizer.index_word)

if RUN_TRAINING:
    # Set timestamp of training
    timestamp_str = dt.datetime.now().strftime("%Y%m%d-%H%M%S")  # set timestamp for current run

    # Build up model
    model = create_base_enc_dec_model(
        enc_input_shape=(64, 2048),
        output_seq_length=enc_input_prep.shape[-1],
        embedding_dim=EMBEDDING_DIM,
        vocab_size=VOCAB_SIZE,
        units=UNITS)

    # Define callbacks
    checkpoint_filepath = DIR_MODEL_STORE + 'checkpoint'
    cb_checkpoint = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor='val_accuracy_func',
        mode='max',
        save_best_only=True)
    cb_early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
    cb_tqdm = tfa.callbacks.TQDMProgressBar()
    callbacks = [cb_early_stopping, cb_tqdm]
    if LOG_MODEL:
        cb_tensorboard = tf.keras.callbacks.TensorBoard(
            log_dir = DIR_TENSORBOARD_LOG + f"{model.name}_{timestamp_str}",
            histogram_freq=0, update_freq='epoch',
            write_graph=True, write_images=False)
        callbacks.append(cb_tensorboard)
    callbacks.append(cb_checkpoint)

    # Plot model
    if SAVE_MODEL:
        plot_model(model,
               show_shapes=True,
               to_file=DIR_MODEL_STORE + model.name + "_" + timestamp_str + ".png")
    else:
        plot_model(model, show_shapes=True)

    # Train model
    history = model.fit(
        train_gen,
        epochs=EPOCHS,
        validation_data=test_gen,
        callbacks=callbacks,
        verbose=0)  # progress is reported by the TQDM callback instead

    # Save model and history
    if SAVE_MODEL:
        # Save history
        fp_hist = DIR_MODEL_STORE + model.name + "_" + timestamp_str + ".pickle"
        save_hist(fp_hist, history)
        print(f"Model history for {model.name} saved to: {fp_hist}")

        # Save model
        fp_model = DIR_MODEL_STORE + model.name + "_" + timestamp_str + ".h5"
        model.save(fp_model, save_format='h5')
        print(f"Model {model.name} saved to: {fp_model}")

Training:   0%|           0/1 ETA: ?s,  ?epochs/s

Epoch 1/1


0/505           ETA: ?s - 

Model history for Baseline_Train_Test saved to: ./models/Baseline_Train_Test_20210428-175738.pickle
Model Baseline_Train_Test saved to: ./models/Baseline_Train_Test_20210428-175738.h5


In [14]:
# Display Tensorboard
if TENSORBOARD_SHOW_INLINE:
    # Load extension
    %load_ext tensorboard

    # Start tensorboard
    if TENSORBOARD_LAUNCH:
        # Launch TensorBoard from this notebook on the configured log directory
        %tensorboard --logdir $DIR_TENSORBOARD_LOG
    else:
        # Show tensorboard inline for monitoring training performance
        tb.notebook.list() # View open TensorBoard instances
        tb.notebook.display(port=6006, height=800)  # Select the (hopefully) already started instance on port 6006

# Create Encoder-Decoder model for Prediction

In [22]:
def creat_prediction_models(model: tf.keras.Model, units: int) -> (tf.keras.Model, tf.keras.Model):
    """
    Derive separate encoder and decoder models from the trained model.

    Both models reuse the layers of ``model`` (looked up by name), so they
    share its weights.

    Arguments:
        model: Training model containing the layers 'enc_input', 'enc_output',
               'dec_input', 'dec_embed', 'dec_lstm' and 'dec_dense'
        units: Size of the LSTM hidden and cell state
    Returns:
        Tuple (encoder_model, decoder_model)
        encoder_model: image features -> [state, state]
        decoder_model: [tokens, hidden state, cell state]
                       -> [word probabilities, hidden state, cell state]
    """
    # --- Encoder ---
    image_in = model.get_layer(name='enc_input').input
    image_state = model.get_layer(name='enc_output').output
    # the same tensor is emitted twice: once as hidden state, once as cell state
    encoder_model = Model(image_in, [image_state, image_state])

    # --- Decoder ---
    # inputs: token ids plus the LSTM state (hidden, cell) of the previous step
    token_in = model.get_layer(name='dec_input').input
    hidden_in = Input(shape=(units,))
    cell_in = Input(shape=(units,))

    # embedding -> LSTM -> dense, all taken from the trained model
    embedded = model.get_layer(name='dec_embed')(token_in)
    lstm_seq, hidden_out, cell_out = model.get_layer(name='dec_lstm')(
        embedded, initial_state=[hidden_in, cell_in]
    )
    word_probs = model.get_layer(name='dec_dense')(lstm_seq)

    # the states are returned as well so they can be fed into the next step
    decoder_model = Model(
        [token_in, hidden_in, cell_in],
        [word_probs, hidden_out, cell_out]
    )

    return encoder_model, decoder_model

# Build up model
if not RUN_TRAINING:
    # Load model
    fp_model = DIR_MODEL_STORE + FN_MODEL2LOAD
    model = tf.keras.models.load_model(
        fp_model,
        custom_objects={"loss_func": loss_func, "accuracy_func" : accuracy_func}
    )
    print(f"Model {model.name} loaded from file: {fp_model}")

    # Load training history
    # (done after the model is loaded because the message uses model.name)
    fp_hist = DIR_MODEL_STORE + FN_HIST2LOAD
    history = load_hist(fp_hist)
    print(f"Training history for model {model.name} loaded from file: {fp_hist}")

# Take the state size from the LSTM layer of the model itself, so this also
# works for a loaded model when the training cell did not define it.
encoder_model, decoder_model = creat_prediction_models(
    model, units=model.get_layer(name='dec_lstm').units)

In [23]:
# Upper bound for the number of generated words (length of the padded captions)
MAX_DECODER_SEQ_LEN = enc_input_prep.shape[-1]

def make_prediction(input_img_feature_path, verbose: int = 0):
    """
    Greedily generate a caption for one image from its stored features.

    Arguments:
        input_img_feature_path: Complete path to the numpy file holding the
                                image features
        verbose: 0: don't show any comment
                 1: show working comments
    Returns:
        predicted caption for the input image (words joined by blanks,
        without the <end> tag)
    """
    # load numpy features and add a batch dimension of size 1
    img_features = np.expand_dims(np.load(input_img_feature_path), axis=0)

    # get encoder states (silent, like the decoder calls below)
    state_h, state_c = encoder_model.predict(img_features, verbose=0)
    if verbose >= 1:
        print('state_h, state_c .shape', state_h.shape, state_c.shape)

    # create start token
    dec_input = tf.constant([[tokenizer.word_index['<start>']]])
    if verbose >= 1:
        print('dec_input.shape', dec_input.shape)

    caption = []
    while len(caption) < MAX_DECODER_SEQ_LEN:
        # one decoding step; the returned states feed the next step
        dec_output, state_h, state_c = decoder_model.predict(
            [dec_input, state_h, state_c],
            verbose = 0,
        )

        # next input must be the current output (this is already a word index)
        dec_input = tf.argmax(dec_output, -1)

        word_ndx = dec_input.numpy()[0,0] # detach from tf
        predicted_word = tokenizer.index_word[word_ndx]

        # do not append <end> tag to caption
        if predicted_word == '<end>':
            break

        caption.append(predicted_word)

    return ' '.join(caption)

## Calc ROUGE-L and BLEU-1 metrics

### Testset

In [None]:
# Prepare true captions (references) and predicted captions (candidates)
df = df_cap.loc[df_cap.is_test]
# Collect the reference captions of every test image in one pass instead of
# re-scanning df_cap per image; sort=False keeps the order of first
# appearance, i.e. the order of df.image.unique().
c_true_test = df.groupby('image', sort=False)['caption'].agg(list).to_dict()
c_pred_test = dict()
for img_fname in tqdm.notebook.tqdm(c_true_test, total=len(c_true_test)):
    c_pred_test[img_fname] = make_prediction(os.path.join(DIR_IMAGE_FEATURES, img_fname.replace('.jpg','.npy')))

# Serialize data into file
if timestamp_str is None:
    timestamp_str = dt.datetime.now().strftime("%Y%m%d-%H%M%S")  # set timestamp for current run
# context managers make sure the files are flushed and closed
with open(DIR_RESULT_STORE + f"c_true_test_{timestamp_str}.json", 'w') as fh:
    json.dump(c_true_test, fh)
with open(DIR_RESULT_STORE + f"c_pred_test_{timestamp_str}.json", 'w') as fh:
    json.dump(c_pred_test, fh)

# Calculate Metrics
rouge_l_test, bleu_1_test = met.evaluate(c_true_test, c_pred_test)
print(f"Metrics for Testset:\nROUGE-L recall:{rouge_l_test:>10.2%}\nBLEU-1 precision: {bleu_1_test:>7.2%}")

### Trainset

In [None]:
# Prepare true captions (references) and predicted captions (candidates)
df = df_cap.loc[~df_cap.is_test]
# Collect the reference captions of every train image in one pass instead of
# re-scanning df_cap per image; sort=False keeps the order of first
# appearance, i.e. the order of df.image.unique().
c_true_train = df.groupby('image', sort=False)['caption'].agg(list).to_dict()
c_pred_train = dict()
for img_fname in tqdm.notebook.tqdm(c_true_train, total=len(c_true_train)):
    c_pred_train[img_fname] = make_prediction(os.path.join(DIR_IMAGE_FEATURES, img_fname.replace('.jpg','.npy')))

# Serialize data into file
if timestamp_str is None:
    timestamp_str = dt.datetime.now().strftime("%Y%m%d-%H%M%S")  # set timestamp for current run
# context managers make sure the files are flushed and closed
with open(DIR_RESULT_STORE + f"c_true_train_{timestamp_str}.json", 'w') as fh:
    json.dump(c_true_train, fh)
with open(DIR_RESULT_STORE + f"c_pred_train_{timestamp_str}.json", 'w') as fh:
    json.dump(c_pred_train, fh)

# Calculate Metrics
rouge_l_train, bleu_1_train = met.evaluate(c_true_train, c_pred_train)
print(f"Metrics for Trainset:\nROUGE-L recall:{rouge_l_train:>10.2%}\nBLEU-1 precision: {bleu_1_train:>7.2%}")

## Save results

In [None]:
# Save metrics to csv file
if timestamp_str is None:
    timestamp_str = dt.datetime.now().strftime("%Y%m%d-%H%M%S")  # set timestamp for current run

# One row per metric, one column per data split.
# NOTE(review): the metric labels below are misspelled ("recallt",
# "precission"); they are kept unchanged because they end up in the result
# file. Correct them once nothing relies on the old spelling.
df = pd.DataFrame({
    "Metric": ["ROUGE-L recallt", "BLEU-1 precission"],
    "Score Test": [rouge_l_test, bleu_1_test],
    "Score Train": [rouge_l_train, bleu_1_train]})

df.to_csv(DIR_RESULT_STORE + f"df_result_{timestamp_str}.csv")

# Predict something from test

In [None]:
def eval_image_from_test(img_name):
    """
    Show one image together with its reference captions and the predicted caption.

    Arguments:
        img_name: File name of the image (as listed in df_cap['image'])
    """
    img = plt.imread(os.path.join(DIR_IMAGES, img_name))
    plt.figure(figsize=(8,8))
    plt.imshow(img)
    plt.show()

    print(f'\nReference captions:\n{"-"*80}')
    for l in df_cap[df_cap['image']==img_name]['caption'].values:
        print(f'- {l}')

    # the prediction works on the stored feature file (.npy) of the image
    prediction = make_prediction(os.path.join(DIR_IMAGE_FEATURES, img_name.replace('.jpg','.npy')))
    print(f'\nPrediction:\n{"-"*80}')
    print(f'- {prediction}')

In [None]:
# Number of randomly drawn test rows to show a prediction for.
# No random_state is passed to sample(), so every run picks different rows.
num_predictions = 2
for img_file_name in df_cap[df_cap['is_test']].sample(num_predictions)['image'].values:
    eval_image_from_test(img_file_name)
    print(f'{"="*80}\n')