## Import Libraries

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from textwrap import wrap

import os
import gc       # garbage collection: for memory allocation and deallocation
import pickle
from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split

from keras.applications import VGG16
import tensorflow as tf

from tensorflow.keras.applications.xception import Xception, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Model, load_model
from tensorflow.keras.utils import to_categorical, plot_model, Sequence
from tensorflow.keras.layers import Input, Dense, GRU, Embedding, Dropout, add, BatchNormalization, Concatenate

from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint

# from nltk.translate.bleu_score import corpus_bleu

from PIL import Image

In [None]:
"""BLEU score implementation."""
import math
import sys
import warnings
from collections import Counter
from fractions import Fraction as _Fraction

from nltk.util import ngrams


class Fraction(_Fraction):
    """``fractions.Fraction`` variant that can expose un-reduced terms.

    When ``_normalize`` is false (the default here) the ``numerator`` and
    ``denominator`` properties return exactly the values passed to the
    constructor instead of the reduced ones, so match counts and totals can be
    accumulated across sentences.
    """

    def __new__(cls, numerator=0, denominator=None, _normalize=False):
        # On Python 3.12+ the base constructor is called without the private
        # ``_normalize`` keyword; on older versions it is forwarded.
        legacy_kwargs = {} if sys.version_info >= (3, 12) else {"_normalize": _normalize}
        self = super().__new__(cls, numerator, denominator, **legacy_kwargs)
        self._normalize = _normalize
        self._original_numerator = numerator
        self._original_denominator = denominator
        return self

    @property
    def numerator(self):
        """Numerator as given (un-normalised mode) or as reduced by the base class."""
        return super().numerator if self._normalize else self._original_numerator

    @property
    def denominator(self):
        """Denominator as given (un-normalised mode) or as reduced by the base class."""
        return super().denominator if self._normalize else self._original_denominator

def sentence_bleu(
    references,
    hypothesis,
    weights=(0.25, 0.25, 0.25, 0.25),
    smoothing_function=None,
    auto_reweigh=False,):
    """Compute BLEU for a single hypothesis.

    Thin wrapper that treats the sentence as a one-element corpus and
    delegates to ``corpus_bleu`` with the same options.

    :param references: reference token sequences for this hypothesis
    :param hypothesis: hypothesis token sequence
    :param weights: n-gram weights (one tuple, or a list of tuples)
    :param smoothing_function: optional smoothing callable
    :param auto_reweigh: forwarded unchanged to ``corpus_bleu``
    :return: whatever ``corpus_bleu`` returns for the one-sentence corpus
    """
    return corpus_bleu(
        list_of_references=[references],
        hypotheses=[hypothesis],
        weights=weights,
        smoothing_function=smoothing_function,
        auto_reweigh=auto_reweigh,
    )

def corpus_bleu(
    list_of_references,
    hypotheses,
    weights=(0.25, 0.25, 0.25, 0.25),
    smoothing_function=None,
    auto_reweigh=False,):
    """Compute a corpus-level BLEU score.

    Modified-precision numerators and denominators are summed over all
    hypotheses before dividing (micro-average), instead of averaging
    per-sentence scores.

    :param list_of_references: for each hypothesis, a list of reference token sequences
    :param hypotheses: list of hypothesis token sequences
    :param weights: one tuple of n-gram weights, or a list of such tuples
    :param smoothing_function: callable applied to the precision list;
        defaults to ``SmoothingFunction().method0`` (no smoothing)
    :param auto_reweigh: if true, re-weight uniformly when the total hypothesis
        length is below 4 and the weights are the default ones
    :return: a single score when one weight tuple is given, otherwise a list
        with one score per weight tuple
    :raises AssertionError: if the numbers of hypotheses and reference lists differ
    """
    p_numerators = Counter()  # Key = ngram order, and value = no. of ngram matches.
    p_denominators = Counter()  # Key = ngram order, and value = no. of ngram in ref.
    hyp_lengths, ref_lengths = 0, 0

    # Sanity check before computing anything.
    assert len(list_of_references) == len(hypotheses), (
        "The number of hypotheses and their reference(s) should be the same"
    )

    # Accept either a single weight tuple or a list of weight tuples.
    # Indexing a scalar weight raises TypeError (or IndexError for NumPy
    # scalars / empty input); only those mean "this is a single tuple".
    try:
        weights[0][0]
    except (TypeError, IndexError):
        weights = [weights]
    max_weight_length = max(len(weight) for weight in weights)

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        for i in range(1, max_weight_length + 1):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator

        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    bp = brevity_penalty(ref_lengths, hyp_lengths)

    # Collects the various precision values for the different ngram orders.
    p_n = [Fraction(p_numerators[i], p_denominators[i], _normalize=False)
        for i in range(1, max_weight_length + 1)]

    # Returns 0 if there's no matching n-grams.
    # We only need to check for p_numerators[1] == 0, since if there's
    # no unigrams, there won't be any higher order ngrams.
    if p_numerators[1] == 0:
        return 0 if len(weights) == 1 else [0] * len(weights)

    # If there's no smoothing, use method0 from the SmoothingFunction class.
    if not smoothing_function:
        smoothing_function = SmoothingFunction().method0
    # Smoothen the modified precision.
    # Note: smoothing_function() may convert values into floats;
    #       it tries to retain the Fraction object as much as the
    #       smoothing method allows.
    # `references` / `hypothesis` here are the last pair from the loop above.
    p_n = smoothing_function(p_n, references=references, hypothesis=hypothesis, hyp_len=hyp_lengths)

    bleu_scores = []
    for weight in weights:
        # Uniformly re-weighting based on maximum hypothesis lengths if largest
        # order of n-grams < 4 and weights is set at default.
        if auto_reweigh:
            if hyp_lengths < 4 and weight == (0.25, 0.25, 0.25, 0.25):
                weight = (1 / hyp_lengths,) * hyp_lengths

        # Weighted geometric mean of the precisions, computed in log space.
        s = (w_i * math.log(p_i) for w_i, p_i in zip(weight, p_n) if p_i > 0)
        s = bp * math.exp(math.fsum(s))
        bleu_scores.append(s)
    return bleu_scores[0] if len(weights) == 1 else bleu_scores


def modified_precision(references, hypothesis, n):
    """Return the clipped n-gram precision of *hypothesis* against *references*.

    Every hypothesis n-gram count is clipped to the largest count of that
    n-gram in any single reference. The result is an un-normalised
    ``Fraction`` of (clipped matches) / (hypothesis n-grams), with the
    denominator forced to at least 1.

    :param references: iterable of reference token sequences
    :param hypothesis: hypothesis token sequence
    :param n: n-gram order
    """
    # n-gram counts of the hypothesis (empty when it is shorter than n).
    hyp_counts = Counter()
    if len(hypothesis) >= n:
        hyp_counts.update(ngrams(hypothesis, n))

    # For each hypothesis n-gram, the highest count seen in any one reference.
    best_ref_counts = {}
    for ref in references:
        ref_counts = Counter()
        if len(ref) >= n:
            ref_counts.update(ngrams(ref, n))
        for gram in hyp_counts:
            seen = best_ref_counts.get(gram, 0)
            best_ref_counts[gram] = max(seen, ref_counts[gram])

    # Clip each hypothesis count by the reference maximum and add them up.
    matched = 0
    for gram, count in hyp_counts.items():
        matched += min(count, best_ref_counts[gram])

    # A denominator of at least 1 avoids ZeroDivisionError when the
    # hypothesis has no n-grams of this order.
    total = max(1, sum(hyp_counts.values()))

    return Fraction(matched, total, _normalize=False)


def closest_ref_length(references, hyp_len):
    """Return the reference length closest to *hyp_len*.

    Ties in distance are broken in favour of the shorter reference.

    :param references: iterable of reference token sequences
    :param hyp_len: length of the hypothesis
    """
    def distance_then_length(ref_len):
        return abs(ref_len - hyp_len), ref_len

    return min(map(len, references), key=distance_then_length)


def brevity_penalty(closest_ref_len, hyp_len):
    """Return the BLEU brevity penalty.

    :param closest_ref_len: (summed) length of the closest references
    :param hyp_len: (summed) length of the hypotheses
    :return: 1 when the hypothesis is longer than the reference, 0 when the
        hypothesis is empty, otherwise ``exp(1 - closest_ref_len / hyp_len)``
    """
    if hyp_len > closest_ref_len:
        return 1
    # An empty hypothesis must drive the whole BLEU score to 0.
    if hyp_len == 0:
        return 0
    length_ratio = closest_ref_len / hyp_len
    return math.exp(1 - length_ratio)


class SmoothingFunction:
    """Smoothing techniques for the per-order BLEU modified precisions.

    Every ``methodN`` receives ``p_n``, the list of modified precisions
    (one entry per n-gram order, objects exposing ``numerator`` and
    ``denominator``), and returns the smoothed list. ``method3`` to
    ``method7`` update the list they are given in place and return it.

    :param epsilon: count added to zero numerators by ``method1``
    :param alpha: interpolation strength used by ``method6``
    :param k: scaling constant used by ``method4``
    """

    def __init__(self, epsilon=0.1, alpha=5, k=5):
        self.epsilon = epsilon
        self.alpha = alpha
        self.k = k

    def method0(self, p_n, *args, **kwargs):
        """
        No smoothing.

        Precisions with a zero numerator are replaced by
        ``sys.float_info.min`` and a warning naming the n-gram order is issued.
        """
        p_n_new = []
        for i, p_i in enumerate(p_n):
            if p_i.numerator != 0:
                p_n_new.append(p_i)
            else:
                _msg = (
                    "\nThe hypothesis contains 0 counts of {}-gram overlaps.\n"
                    "Therefore the BLEU score evaluates to 0, independently of\n"
                    "how many N-gram overlaps of lower order it contains.\n"
                    "Consider using lower n-gram order or use "
                    "SmoothingFunction()"
                ).format(i + 1)
                warnings.warn(_msg)
                # When numerator==0 (whether or not denominator==0), the
                # precision should be 0 or undefined. Because the BLEU
                # geometric mean is computed in log space, the smallest
                # positive float is used so that math.log() stays defined
                # while the resulting score is effectively 0.
                p_n_new.append(sys.float_info.min)
        return p_n_new

    def method1(self, p_n, *args, **kwargs):
        """
        Smoothing method 1: Add *epsilon* counts to precision with 0 counts.
        """
        return [(p_i.numerator + self.epsilon) / p_i.denominator
            if p_i.numerator == 0
            else p_i
            for p_i in p_n]

    def method2(self, p_n, *args, **kwargs):
        """
        Smoothing method 2: Add 1 to both numerator and denominator of every
        precision except the first (unigram) one.
        """
        return [Fraction(p_n[i].numerator + 1, p_n[i].denominator + 1, _normalize=False)
            if i != 0
            else p_n[0]
            for i in range(len(p_n))]

    def method3(self, p_n, *args, **kwargs):
        """
        Smoothing method 3: Replace the k-th zero-count precision (k = 1, 2, ...)
        by ``1 / (2**k * denominator)``.
        """
        incvnt = 1  # From the mteval-v13a.pl, it's referred to as k.
        for i, p_i in enumerate(p_n):
            if p_i.numerator == 0:
                p_n[i] = 1 / (2**incvnt * p_i.denominator)
                incvnt += 1
        return p_n

    def method4(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
        """
        Smoothing method 4: Like method 3, but the replacement count is
        additionally scaled by ``log(hyp_len) / k``. Zero-count precisions are
        only replaced when the hypothesis length is greater than 1.
        """
        incvnt = 1
        hyp_len = hyp_len if hyp_len else len(hypothesis)
        for i, p_i in enumerate(p_n):
            if p_i.numerator == 0 and hyp_len > 1:
                # Note that this K is different from the K from NIST.
                numerator = 1 / (2**incvnt * self.k / math.log(hyp_len))
                p_n[i] = numerator / p_i.denominator
                incvnt += 1
        return p_n

    def method5(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
        """
        Smoothing method 5: Replace each precision by the average of the
        previous smoothed value, itself and the next-order precision.
        """
        m = {}
        # Requires a precision value for an additional ngram order.
        p_n_plus1 = p_n + [modified_precision(references, hypothesis, 5)]
        m[-1] = p_n[0] + 1
        for i, p_i in enumerate(p_n):
            p_n[i] = (m[i - 1] + p_i + p_n_plus1[i + 1]) / 3
            m[i] = p_n[i]
        return p_n

    def method6(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
        """
        Smoothing method 6: Interpolate each precision from the third order
        upwards with a prior ``p_{n-1}**2 / p_{n-2}``, weighted by *alpha*.
        """
        # This smoothing only works when p_1 and p_2 is non-zero.
        # Raise an error with an appropriate message when the input is too short
        # to use this smoothing technique.
        assert p_n[2], "This smoothing method requires non-zero precision for bigrams."
        for i, p_i in enumerate(p_n):
            if i in [0, 1]:  # Skips the first 2 orders of ngrams.
                continue
            else:
                pi0 = 0 if p_n[i - 2] == 0 else p_n[i - 1] ** 2 / p_n[i - 2]
                # No. of ngrams in translation that matches the reference.
                m = p_i.numerator
                # No. of ngrams in translation.
                l = sum(1 for _ in ngrams(hypothesis, i + 1))
                # Calculates the interpolated precision.
                p_n[i] = (m + self.alpha * pi0) / (l + self.alpha)
        return p_n

    def method7(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
        """
        Smoothing method 7:
        Interpolates methods 4 and 5.
        """
        hyp_len = hyp_len if hyp_len else len(hypothesis)
        p_n = self.method4(p_n, references, hypothesis, hyp_len)
        p_n = self.method5(p_n, references, hypothesis, hyp_len)
        return p_n

## Import Data 

In [None]:
# Load the combined caption dataset; later cells read its 'image' and 'urdu_caption' columns.
data = pd.read_csv('/kaggle/input/dataset3cap/FINAL_COMBINED_DATASET_3CAP.csv')
data.head(10)   # preview the first 10 rows

## Display Images with Captions

In [None]:
# def readImage(path, img_size = 224):
#     img = load_img(path, color_mode = 'rgb', target_size = (img_size, img_size))
#     img = img_to_array(img)
#     img = img/255.
#     return img

# # This is not necessary: the Urdu captions are not displayed correctly.
# def display_images(temp_df):   
#     temp_df = temp_df.reset_index(drop = True)
#     plt.figure(figsize = (20, 20))
#     n = 0
#     for i in range(156070 * 5):  
#         n += 1
#         plt.subplot(156070, 5, n)  
#         plt.subplots_adjust(hspace = 0.7, wspace = 0.3)

#         image = readImage(f"/kaggle/input/images/Images/{temp_df.image[i]}")
#         plt.imshow(image)
#         plt.title("\n".join(wrap(temp_df.urdu_caption[i][-1::-1], 20)))
#         plt.axis("off")

In [None]:
# display_images(data)

## Model to Extract Features

In [None]:
# Load the Model
# NOTE(review): `YOLO` is not imported anywhere in the visible source, so this cell raises
# NameError as written; presumably `from ultralytics import YOLO` is intended -- TODO confirm.
cnn = YOLO("yolov8m-cls.pt")

## Extracting Image Features

In [None]:
# features = {}
# directory = '/kaggle/input/images/Images'

# for img_name in tqdm(os.listdir(directory)): 
    
#     # load the image from file
#     img_path = directory + '/' + img_name
#     image = load_img(img_path, target_size = (299, 299))
#     image = img_to_array(image)                # convert image pixels to numpy array
    
#     # reshape data for model
#     image = image.reshape((1, image.shape[0],  # width 
#                            image.shape[1],     # height
#                            image.shape[2]      # channels (3 because of rgb)
#                           ))

#     image = preprocess_input(image)            # preprocess image for Xception
#     feature = cnn.predict(image, verbose=0)    # extract features
#     image_id = img_name.split('.')[0]          # get image ID
#     features[image_id] = feature[0]            # store feature  (size = 2048)

In [None]:
# # store features in pickle
# pickle.dump(features, open('/kaggle/working/ImgFeaturesVGG16.pkl', 'wb'))

In [None]:
# Persist the image features to disk. The context manager guarantees the file
# is flushed and closed even if pickling fails.
# NOTE(review): `features` is only assigned by the (commented-out) extraction
# cell above or the loading cell below -- run one of them first.
with open('ImgFeaturesVGG16Reshaped.pkl', 'wb') as features_file:
    pickle.dump(features, features_file)

In [None]:
# load features from pickle
# NOTE(review): unpickling can execute arbitrary code -- only load feature files from a trusted source.
# Entries are later indexed with [0] and moved with .cpu(), so they are presumably torch tensors -- TODO confirm.
with open('/kaggle/input/yolopkl/ImgFeaturesYOLO.pkl', 'rb') as f:    
    features = pickle.load(f)

## Preprocess captions

In [None]:
def preprocessCaption(df):
    """Wrap the ``urdu_caption`` entry of *df* in the sequence boundary tokens.

    The caption is prefixed with ``'endseq '`` and suffixed with
    ``' startseq'`` (the same order ``predict_caption`` uses: it starts from
    ``endseq`` and stops at ``startseq``). *df* is modified in place and
    returned.
    """
    caption = df['urdu_caption']
    df['urdu_caption'] = 'endseq ' + caption + ' startseq'
    return df

In [None]:
# Wrap every caption in the boundary tokens. preprocessCaption only does string
# concatenation on the 'urdu_caption' column, so it can be applied to the whole
# frame in one vectorised call instead of a slow row-by-row apply(axis=1).
data = preprocessCaption(data)
data

In [None]:
# Spot-check one processed value (row position 87714, third column).
# NOTE(review): hard-coded position; raises IndexError if the dataset has fewer rows.
data.iloc[87714, 2]

## Tokenizing the Text

In Python, indices typically start from 0. However, the Keras `Tokenizer` never assigns index 0 to a word: word indices start at 1, which is why `vocab_size` below is `len(tokenizer.word_index) + 1`.
* **Padding Token:** In many NLP tasks, sequences of words or tokens are padded to ensure uniform length. Padding tokens are used to fill in the extra spaces in sequences to make them uniform. Index 0 is reserved for the padding token (`pad_sequences` pads with 0 by default).
<br><br>
* **Unknown Token:** This token is used to represent words that are not present in the vocabulary. It does not share index 0 with padding: in Keras, an out-of-vocabulary token is only used if `oov_token` is passed to `Tokenizer`, and it then gets its own index in `word_index`. The tokenizer below is created without `oov_token`, so unknown words are simply dropped when texts are converted to sequences.

In [None]:
# tokenize the text: learn the word -> index vocabulary from all captions
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data["urdu_caption"])
vocab_size = len(tokenizer.word_index) + 1   # +1 because index 0 is not assigned to any word (it is used for padding)
vocab_size

In [None]:
tokenizer.word_index   # word -> integer index mapping built by fit_on_texts

This is a dictionary containing Urdu words as keys and their corresponding indices as values. It represents the vocabulary learned by the tokenizer during the fitting process.

In [None]:
# get maximum length of the caption available, counted in whitespace-separated tokens
# NOTE(review): the Tokenizer applies its own filters, so its token count per caption may
# differ slightly from str.split() -- presumably close enough as a padding length; TODO confirm.
max_length = max(len(ucap.split()) for ucap in data['urdu_caption'])
max_length

## Splitting the Data

5 Captions per image
* Train = 72% (562089)
* Validation = 18% (140523)
* Test = 10% (78069)

1 caption per image:
* Train = 72% (112370)
* Validation = 18% (28093)
* Test = 10% (15607)

In [None]:
# Two-stage random split over caption rows: 10% is held out for testing, then 20% of the
# remainder for validation (72% train / 18% validation / 10% test overall).
# NOTE(review): rows are split independently, so when an image has several captions the same
# image can end up in more than one subset -- consider a group-wise split by image; TODO confirm.
X_train_valid, X_test, Y_train_valid, Y_test = train_test_split(data['image'], data['urdu_caption'], 
                                                                test_size = 0.10, random_state = 30, shuffle = True)

X_train, X_valid, Y_train, Y_valid = train_test_split(X_train_valid, Y_train_valid, 
                                                      test_size = 0.2, random_state = 30, shuffle = True)

# Report the subset sizes; the total should equal the number of rows in `data`.
print('Train set size:', X_train.shape)
print('Validation set size:', X_valid.shape)
print('Test set size:', X_test.shape)
print('Total data size', X_train.shape[0] + X_valid.shape[0] + X_test.shape[0])

## Model Building

* A model architecture that will combine visual information from images with textual information from partial captions to generate captions for images.
* The model will learn to generate captions based on both the visual content and the context provided by the captions.

In [None]:
# Distribution strategy; the model in the next cell is built and compiled inside its scope.
strategy = tf.distribute.MultiWorkerMirroredStrategy()

In [None]:
with strategy.scope():
    # image feature extractor model
    inputs1 = Input(shape = (1000,))             # input layer for the precomputed image feature vector (1000 values per image).
    fe1 = BatchNormalization()(inputs1)          # applies batch normalization to the input image features.
    fe2 = Dense(512, activation = 'relu')(fe1)   # applies a dense layer with 512 units and ReLU activation to the batch-normalized image features.

    # partial caption sequence model
    inputs2 = Input(shape = (max_length,))       # defines an input layer for the partial caption sequences.
    se1 = Embedding(vocab_size, 512)(inputs2)    # embeds the input sequences into dense vectors of size 512. 
                                                 # this layer uses an embedding matrix with a vocabulary size of vocab_size.
    se2 = BatchNormalization()(se1)              # applies batch normalization to the embedded sequences.
    se3 = GRU(256)(se2)          # applies a GRU layer with 256 units to the batch-normalized embedded sequences.

    # decoder model
    decoder = Concatenate()([fe2, se3])                  # concatenates output features from the image extractor and partial caption sequence models.
    decoder2 = Dense(256, activation = 'relu')(decoder)  # applies a dense layer with 256 units and ReLU activation to the concatenated features.
    outputs = Dense(vocab_size,                          # applies a dense layer with vocab_size units and softmax activation to produce the output
                    activation = 'softmax')(decoder2)    # probability distribution over the vocabulary.

    # merge 2 networks
    model = Model(inputs = [inputs1, inputs2], outputs = outputs)

    optimizer = Adam(learning_rate = 0.005,
                     clipvalue = 5.0)          # clips each gradient value to limit its magnitude during training.
                                               # helps stabilize the training process, especially with exploding gradients.

    model.compile(loss = "categorical_crossentropy",   # loss for one-hot encoded next-word targets (multi-class classification).
                  optimizer = optimizer,               # used for updating the weights.
                  metrics = ['accuracy'])              # evaluation metric to monitor during training.

The merged model combines the image feature extractor model and the partial caption sequence model into a single model.<br>
It takes both the image features (inputs1) and the partial caption sequences (inputs2) as inputs and produces the output probabilities over the vocabulary (outputs).

In [None]:
model.summary()   # print the layer-by-layer architecture and parameter counts

In [None]:
plot_model(model, show_shapes=True)   # draw the model graph, annotated with tensor shapes

## Model Training

In [None]:
# create data generator to get data in batch (avoids session crash)
def data_generator(data, features, tokenizer, max_length, vocab_size, batch_size):
    """Yield training batches of (image feature, partial caption) -> next word.

    The generator never terminates: it cycles over the images in *data* and
    emits one batch every *batch_size* images. Each caption of an image is
    expanded into one sample per word position (prefix -> next word).

    :param data: DataFrame with 'image' and 'urdu_caption' columns
    :param features: mapping from image id (file name without extension) to
        the stored feature; element ``[0]`` is converted with ``.cpu().numpy()``
    :param tokenizer: fitted tokenizer providing ``texts_to_sequences``
    :param max_length: length to which input prefixes are padded
    :param vocab_size: number of classes for the one-hot targets
    :param batch_size: number of images per yielded batch
    :return: generator yielding ``((X1, X2), y)`` tuples of NumPy arrays
    :raises ValueError: if *batch_size* is smaller than 1 or *data* has no rows
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Group the captions by image once up front. The previous version looked
    # at data.head() only (first five rows) and rescanned the whole frame for
    # every key; grouping visits every image exactly once per pass.
    captions_by_image = data.groupby('image', sort=False)['urdu_caption'].apply(list)
    if captions_by_image.empty:
        # Without this guard the endless loop below would spin without yielding.
        raise ValueError("data contains no captions")

    X1, X2, y = [], [], []
    n = 0
    while True:
        # loop over images
        for key, captions in captions_by_image.items():   # all captions of one image
            n += 1
            # The image feature is identical for every sample of this image.
            image_feature = features[key.split('.')[0]][0].cpu().numpy()
            # process each caption
            for caption in captions:
                # encode the sequence
                seq = tokenizer.texts_to_sequences([caption])[0]
                # split the sequence into X, y pairs
                for i in range(1, len(seq)):
                    # input is the prefix, output is the word that follows it
                    in_seq, out_seq = seq[:i], seq[i]
                    # pad input sequence with zeros so all inputs share one length
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    # one-hot encode the target word index
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]

                    X1.append(image_feature)
                    X2.append(in_seq)
                    y.append(out_seq)
            if n == batch_size:
                yield (np.array(X1), np.array(X2)), np.array(y)
                X1, X2, y = [], [], []
                n = 0

In [None]:
# Rebuild the training split as a two-column frame (image name, caption)
# with a fresh index, which is what data_generator consumes.
train_set = pd.DataFrame({
    'image': X_train.tolist(),
    'urdu_caption': Y_train.tolist(),
})
train_set

In [None]:
# train the model
epochs = 1
batch_size = 60                      # images per generator batch
steps = len(X_train) // batch_size   # NOTE(review): based on caption rows, not unique images -- TODO confirm intent

# Define a ModelCheckpoint callback
# fit() below receives no validation data, so 'val_loss' is never logged and a
# checkpoint monitoring it would never be written; monitor the training loss instead.
checkpoint_filepath = 'model_checkpoint.keras'
model_checkpoint_callback = ModelCheckpoint(
    filepath = checkpoint_filepath,
    save_weights_only = False,
    monitor = 'loss',
    mode = 'min',
    save_best_only = True)

for i in range(epochs):
    # create data generator
    generator = data_generator(train_set, features, tokenizer, max_length, vocab_size, batch_size)
    
    # fit for one epoch
    model.fit(generator, epochs = 1, steps_per_epoch = steps, verbose = 1, callbacks = [model_checkpoint_callback])

In [None]:
model.save('Trained_YOLO_GRU_3c1e.h5')         # file name records the no. of captions used and epochs done (presumably 3 captions, 1 epoch)

In [None]:
# Adam optimizer used when re-compiling the reloaded models in the cells below.
# NOTE(review): one optimizer instance is shared by several compile() calls; some Keras versions
# reject reusing an already-built optimizer with a different model -- TODO confirm, or create one per model.
optimizer = Adam(learning_rate = 0.005,
                     clipvalue = 5.0)          # clips each gradient value to limit its magnitude during training.
                                               # helps stabilize the training process, especially with exploding gradients.

In [None]:
# Reload the one-epoch checkpoint and re-compile it so training can continue:
# categorical cross-entropy loss, the shared optimizer, accuracy as the metric.
m1 = load_model('/kaggle/input/yolo-gru-m1/Trained_YOLO_GRU_3c1e.h5')
m1.compile(
    optimizer = optimizer,
    loss = "categorical_crossentropy",
    metrics = ['accuracy'],
)

In [None]:
# train the model
epochs = 1
batch_size = 60                      # images per generator batch
steps = len(X_train) // batch_size   # NOTE(review): based on caption rows, not unique images -- TODO confirm intent

# Define a ModelCheckpoint callback
# fit() below receives no validation data, so 'val_loss' is never logged and a
# checkpoint monitoring it would never be written; monitor the training loss instead.
checkpoint_filepath = 'model_checkpoint.keras'
model_checkpoint_callback = ModelCheckpoint(
    filepath = checkpoint_filepath,
    save_weights_only = False,
    monitor = 'loss',
    mode = 'min',
    save_best_only = True)

for i in range(epochs):
    # create data generator
    generator = data_generator(train_set, features, tokenizer, max_length, vocab_size, batch_size)
    
    # fit for one epoch
    m1.fit(generator, epochs = 1, steps_per_epoch = steps, verbose = 1, callbacks = [model_checkpoint_callback])

In [None]:
m1.save('Trained_YOLO_GRU_3c2e.h5')         # file name records the no. of captions used and epochs done (presumably 3 captions, 2 epochs)

In [None]:
# Reload the two-epoch checkpoint and re-compile it so training can continue:
# categorical cross-entropy loss, the shared optimizer, accuracy as the metric.
m2 = load_model('/kaggle/input/yolo-gru-m2/Trained_YOLO_GRU_3c2e.h5')
m2.compile(
    optimizer = optimizer,
    loss = "categorical_crossentropy",
    metrics = ['accuracy'],
)

In [None]:
# train the model
epochs = 1
batch_size = 60                      # images per generator batch
steps = len(X_train) // batch_size   # NOTE(review): based on caption rows, not unique images -- TODO confirm intent

# Define a ModelCheckpoint callback
# fit() below receives no validation data, so 'val_loss' is never logged and a
# checkpoint monitoring it would never be written; monitor the training loss instead.
checkpoint_filepath = 'model_checkpoint.keras'
model_checkpoint_callback = ModelCheckpoint(
    filepath = checkpoint_filepath,
    save_weights_only = False,
    monitor = 'loss',
    mode = 'min',
    save_best_only = True)

for i in range(epochs):
    # create data generator
    generator = data_generator(train_set, features, tokenizer, max_length, vocab_size, batch_size)

    # fit for one epoch
    m2.fit(generator, epochs = 1, steps_per_epoch = steps, verbose = 1, callbacks = [model_checkpoint_callback])

In [None]:
m2.save('Trained_YOLO_GRU_3c3e.h5')         # file name records the no. of captions used and epochs done (presumably 3 captions, 3 epochs)

In [None]:
# Reload the three-epoch checkpoint and re-compile it so training can continue:
# categorical cross-entropy loss, the shared optimizer, accuracy as the metric.
m3 = load_model('Trained_YOLO_GRU_3c3e.h5')
m3.compile(
    optimizer = optimizer,
    loss = "categorical_crossentropy",
    metrics = ['accuracy'],
)

In [None]:
# train the model
epochs = 1
batch_size = 60                      # images per generator batch
steps = len(X_train) // batch_size   # NOTE(review): based on caption rows, not unique images -- TODO confirm intent

# Define a ModelCheckpoint callback
# fit() below receives no validation data, so 'val_loss' is never logged and a
# checkpoint monitoring it would never be written; monitor the training loss instead.
checkpoint_filepath = 'model_checkpoint.keras'
model_checkpoint_callback = ModelCheckpoint(
    filepath = checkpoint_filepath,
    save_weights_only = False,
    monitor = 'loss',
    mode = 'min',
    save_best_only = True)

for i in range(epochs):
    # create data generator
    generator = data_generator(train_set, features, tokenizer, max_length, vocab_size, batch_size)

    # fit for one epoch
    m3.fit(generator, epochs = 1, steps_per_epoch = steps, verbose = 1, callbacks = [model_checkpoint_callback])

In [None]:
m3.save('Trained_YOLO_GRU_3c4e.h5')         # file name records the no. of captions used and epochs done (presumably 3 captions, 4 epochs)

In [None]:
# Reload the four-epoch checkpoint and re-compile it so training can continue:
# categorical cross-entropy loss, the shared optimizer, accuracy as the metric.
m4 = load_model('Trained_YOLO_GRU_3c4e.h5')
m4.compile(
    optimizer = optimizer,
    loss = "categorical_crossentropy",
    metrics = ['accuracy'],
)

In [None]:
# train the model
epochs = 1
batch_size = 60                      # images per generator batch
steps = len(X_train) // batch_size   # NOTE(review): based on caption rows, not unique images -- TODO confirm intent

# Define a ModelCheckpoint callback
# fit() below receives no validation data, so 'val_loss' is never logged and a
# checkpoint monitoring it would never be written; monitor the training loss instead.
checkpoint_filepath = 'model_checkpoint.keras'
model_checkpoint_callback = ModelCheckpoint(
    filepath = checkpoint_filepath,
    save_weights_only = False,
    monitor = 'loss',
    mode = 'min',
    save_best_only = True)

for i in range(epochs):
    # create data generator
    generator = data_generator(train_set, features, tokenizer, max_length, vocab_size, batch_size)

    # fit for one epoch
    m4.fit(generator, epochs = 1, steps_per_epoch = steps, verbose = 1, callbacks = [model_checkpoint_callback])

In [None]:
m4.save('Trained_YOLO_GRU_3c5e.h5')         # file name records the no. of captions used and epochs done (presumably 3 captions, 5 epochs)

## Generate a Caption using the Model

In [None]:
def idx_to_word(integer, tokenizer):
    """Return the word whose index in ``tokenizer.word_index`` equals *integer*.

    :param integer: vocabulary index to look up
    :param tokenizer: object exposing a ``word_index`` mapping (word -> index)
    :return: the first matching word, or ``None`` if no word has that index
    """
    matches = (word for word, index in tokenizer.word_index.items() if index == integer)
    return next(matches, None)

In [None]:
# generate caption for an image
def predict_caption(model, image, tokenizer, max_length):
    """Greedily decode a caption for *image*, one word at a time.

    Decoding starts from the ``'endseq'`` tag (the tag placed in front of the
    training captions) and stops after *max_length* steps, when the predicted
    index has no word, or once ``'startseq'`` has been produced.

    :param model: model whose ``predict`` takes ``[image, padded_sequence]``
    :param image: image feature input passed to the model unchanged
    :param tokenizer: fitted tokenizer used to encode and decode words
    :param max_length: padding length and maximum number of decoding steps
    :return: the space-joined caption, including the boundary tag(s)
    """
    words = ['endseq']
    for _ in range(max_length):
        # encode the caption so far and pad it to the model's input length
        encoded = tokenizer.texts_to_sequences([' '.join(words)])[0]
        padded = pad_sequences([encoded], max_length)

        # most probable next word according to the model
        probabilities = model.predict([image, padded], verbose=0)
        next_word = idx_to_word(np.argmax(probabilities), tokenizer)

        # an index without a word ends decoding
        if next_word is None:
            break
        words.append(next_word)

        # the closing tag ends decoding as well
        if next_word == 'startseq':
            break
    return ' '.join(words)

In [None]:
# validate with test data (first 100 rows of the test split)
# NOTE(review): this scores `model`, not the later checkpoints m1..m4 -- TODO confirm which one is intended.
actual, predicted = list(), list()

for key in tqdm(X_test.iloc[:100]):
    # get actual captions: every caption stored for this image
    captions = data[data['image'] == key]['urdu_caption']
    
    # predict the caption for image
    y_pred = predict_caption(model, features[key.split('.')[0]], tokenizer, max_length) 
    
    # split into words
    actual_captions = [caption.split() for caption in captions]
    y_pred = y_pred.split()
    
    # append to the list
    actual.append(actual_captions)
    predicted.append(y_pred)
    
# calculate BLEU scores
# BLEU-1 uses unigrams only and BLEU-2 weights unigrams and bigrams equally.
# The 0.75/0.25 mix that used to be printed as "BLEU-1" is kept, labelled for what it is.
print("BLEU-1: %f" % corpus_bleu(actual, predicted, weights=(1.0,)))
print("BLEU-2: %f" % corpus_bleu(actual, predicted, weights=(0.5, 0.5)))
print("BLEU (0.75 unigram / 0.25 bigram): %f" % corpus_bleu(actual, predicted, weights=(0.75, 0.25)))

In [None]:
def generate_caption(image_name):
    """Predict, print and display the caption for one test image.

    :param image_name: file name of an image under ``/kaggle/input/test-images/``;
        the part before the first ``.`` is used as the key into ``features``
    """
    # load the image
    image_id = image_name.split('.')[0]
    img_path = "/kaggle/input/test-images/" + image_name
    image = Image.open(img_path)

    # Predict caption for image
    words = predict_caption(model, features[image_id], tokenizer, max_length).split()
    # Remove the boundary tags by token rather than by fixed character offsets:
    # captions start with 'endseq' and end with 'startseq', and the closing tag
    # is absent when decoding stopped at max_length.
    if words and words[0] == 'endseq':
        words = words[1:]
    if words and words[-1] == 'startseq':
        words = words[:-1]
    y_pred = ' '.join(words)
    print('--------------------Predicted--------------------')
    print(y_pred)
    plt.imshow(image)

In [None]:
# Example: caption one test image (its id must be present in `features`).
generate_caption("img000000.jpg")