## Import Libraries

In [None]:
pip install nltk

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from textwrap import wrap

import os
import gc       # garbage collection: for memory allocation and deallocation
import pickle
import keras
import tensorflow as tf
from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split

from tensorflow.keras.applications.xception import Xception, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model, Sequence
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add, BatchNormalization, Concatenate

from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint

# from nltk.translate.bleu_score import corpus_bleu

from PIL import Image

2024-07-29 16:28:22.957509: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-29 16:28:22.957639: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-29 16:28:23.066207: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
"""BLEU score implementation."""
import math
import sys
import warnings
from collections import Counter
from fractions import Fraction as _Fraction

from nltk.util import ngrams


class Fraction(_Fraction):
    """Fraction with _normalize=False support for 3.12"""

    def __new__(cls, numerator=0, denominator=None, _normalize=False):
        if sys.version_info >= (3, 12):
            self = super().__new__(cls, numerator, denominator)
        else:
            self = super().__new__(cls, numerator, denominator, _normalize=_normalize)
        self._normalize = _normalize
        self._original_numerator = numerator
        self._original_denominator = denominator
        return self

    @property
    def numerator(self):
        if not self._normalize:
            return self._original_numerator
        return super().numerator

    @property
    def denominator(self):
        if not self._normalize:
            return self._original_denominator
        return super().denominator

def sentence_bleu(
    references,
    hypothesis,
    weights=(0.25, 0.25, 0.25, 0.25),
    smoothing_function=None,
    auto_reweigh=False,):
    return corpus_bleu([references], [hypothesis], weights, smoothing_function, auto_reweigh)

def corpus_bleu(
    list_of_references,
    hypotheses,
    weights=(0.25, 0.25, 0.25, 0.25),
    smoothing_function=None,
    auto_reweigh=False,):
    # Before proceeding to compute BLEU, perform sanity checks.

    p_numerators = Counter()  # Key = ngram order, and value = no. of ngram matches.
    p_denominators = Counter()  # Key = ngram order, and value = no. of ngram in ref.
    hyp_lengths, ref_lengths = 0, 0

    assert len(list_of_references) == len(hypotheses), ()

    try:
        weights[0][0]
    except:
        weights = [weights]
    max_weight_length = max(len(weight) for weight in weights)

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, calculate the numerator and
        # denominator for the corpus-level modified precision.
        for i in range(1, max_weight_length + 1):
            p_i = modified_precision(references, hypothesis, i)
            p_numerators[i] += p_i.numerator
            p_denominators[i] += p_i.denominator

        # Calculate the hypothesis length and the closest reference length.
        # Adds them to the corpus-level hypothesis and reference counts.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    bp = brevity_penalty(ref_lengths, hyp_lengths)

    # Collects the various precision values for the different ngram orders.
    p_n = [Fraction(p_numerators[i], p_denominators[i], _normalize=False)
        for i in range(1, max_weight_length + 1)]

    # Returns 0 if there's no matching n-grams
    # We only need to check for p_numerators[1] == 0, since if there's
    # no unigrams, there won't be any higher order ngrams.
    if p_numerators[1] == 0:
        return 0 if len(weights) == 1 else [0] * len(weights)

    # If there's no smoothing, set use method0 from SmoothinFunction class.
    if not smoothing_function:
        smoothing_function = SmoothingFunction().method0
    # Smoothen the modified precision.
    # Note: smoothing_function() may convert values into floats;
    #       it tries to retain the Fraction object as much as the
    #       smoothing method allows.
    p_n = smoothing_function(p_n, references=references, hypothesis=hypothesis, hyp_len=hyp_lengths)

    bleu_scores = []
    for weight in weights:
        # Uniformly re-weighting based on maximum hypothesis lengths if largest
        # order of n-grams < 4 and weights is set at default.
        if auto_reweigh:
            if hyp_lengths < 4 and weight == (0.25, 0.25, 0.25, 0.25):
                weight = (1 / hyp_lengths,) * hyp_lengths

        s = (w_i * math.log(p_i) for w_i, p_i in zip(weight, p_n) if p_i > 0)
        s = bp * math.exp(math.fsum(s))
        bleu_scores.append(s)
    return bleu_scores[0] if len(weights) == 1 else bleu_scores


def modified_precision(references, hypothesis, n):
    # Extracts all ngrams in hypothesis
    # Set an empty Counter if hypothesis is empty.
    counts = Counter(ngrams(hypothesis, n)) if len(hypothesis) >= n else Counter()
    # Extract a union of references' counts.
    # max_counts = reduce(or_, [Counter(ngrams(ref, n)) for ref in references])
    max_counts = {}
    for reference in references:
        reference_counts = (Counter(ngrams(reference, n)) if len(reference) >= n else Counter())
        for ngram in counts:
            max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram])

    # Assigns the intersection between hypothesis and references' counts.
    clipped_counts = {ngram: min(count, max_counts[ngram]) for ngram, count in counts.items()}

    numerator = sum(clipped_counts.values())
    # Ensures that denominator is minimum 1 to avoid ZeroDivisionError.
    # Usually this happens when the ngram order is > len(reference).
    denominator = max(1, sum(counts.values()))

    return Fraction(numerator, denominator, _normalize=False)


def closest_ref_length(references, hyp_len):
    ref_lens = (len(reference) for reference in references)
    closest_ref_len = min(ref_lens, key=lambda ref_len: (abs(ref_len - hyp_len), ref_len))
    return closest_ref_len


def brevity_penalty(closest_ref_len, hyp_len):
    if hyp_len > closest_ref_len:
        return 1
    # If hypothesis is empty, brevity penalty = 0 should result in BLEU = 0.0
    elif hyp_len == 0:
        return 0
    else:
        return math.exp(1 - closest_ref_len / hyp_len)


class SmoothingFunction:
    def __init__(self, epsilon=0.1, alpha=5, k=5):
        self.epsilon = epsilon
        self.alpha = alpha
        self.k = k

    def method0(self, p_n, *args, **kwargs):
        """
        No smoothing.
        """
        p_n_new = []
        for i, p_i in enumerate(p_n):
            if p_i.numerator != 0:
                p_n_new.append(p_i)
            else:
                _msg = str().format(i + 1)
                warnings.warn(_msg)
                # When numerator==0 where denonminator==0 or !=0, the result
                # for the precision score should be equal to 0 or undefined.
                # Due to BLEU geometric mean computation in logarithm space,
                # we we need to take the return sys.float_info.min such that
                # math.log(sys.float_info.min) returns a 0 precision score.
                p_n_new.append(sys.float_info.min)
        return p_n_new

    def method1(self, p_n, *args, **kwargs):
        """
        Smoothing method 1: Add *epsilon* counts to precision with 0 counts.
        """
        return [(p_i.numerator + self.epsilon) / p_i.denominator
            if p_i.numerator == 0
            else p_i
            for p_i in p_n]

    def method2(self, p_n, *args, **kwargs):
        return [Fraction(p_n[i].numerator + 1, p_n[i].denominator + 1, _normalize=False)
            if i != 0
            else p_n[0]
            for i in range(len(p_n))]

    def method3(self, p_n, *args, **kwargs):
        incvnt = 1  # From the mteval-v13a.pl, it's referred to as k.
        for i, p_i in enumerate(p_n):
            if p_i.numerator == 0:
                p_n[i] = 1 / (2**incvnt * p_i.denominator)
                incvnt += 1
        return p_n

    def method4(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
        incvnt = 1
        hyp_len = hyp_len if hyp_len else len(hypothesis)
        for i, p_i in enumerate(p_n):
            if p_i.numerator == 0 and hyp_len > 1:
                # incvnt = i + 1 * self.k / math.log(
                #     hyp_len
                # )  # Note that this K is different from the K from NIST.
                # p_n[i] = incvnt / p_i.denominator\
                numerator = 1 / (2**incvnt * self.k / math.log(hyp_len))
                p_n[i] = numerator / p_i.denominator
                incvnt += 1
        return p_n

    def method5(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
        hyp_len = hyp_len if hyp_len else len(hypothesis)
        m = {}
        # Requires an precision value for an addition ngram order.
        p_n_plus1 = p_n + [modified_precision(references, hypothesis, 5)]
        m[-1] = p_n[0] + 1
        for i, p_i in enumerate(p_n):
            p_n[i] = (m[i - 1] + p_i + p_n_plus1[i + 1]) / 3
            m[i] = p_n[i]
        return p_n

    def method6(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
        hyp_len = hyp_len if hyp_len else len(hypothesis)
        # This smoothing only works when p_1 and p_2 is non-zero.
        # Raise an error with an appropriate message when the input is too short
        # to use this smoothing technique.
        assert p_n[2], "This smoothing method requires non-zero precision for bigrams."
        for i, p_i in enumerate(p_n):
            if i in [0, 1]:  # Skips the first 2 orders of ngrams.
                continue
            else:
                pi0 = 0 if p_n[i - 2] == 0 else p_n[i - 1] ** 2 / p_n[i - 2]
                # No. of ngrams in translation that matches the reference.
                m = p_i.numerator
                # No. of ngrams in translation.
                l = sum(1 for _ in ngrams(hypothesis, i + 1))
                # Calculates the interpolated precision.
                p_n[i] = (m + self.alpha * pi0) / (l + self.alpha)
        return p_n

    def method7(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
        """
        Smoothing method 7:
        Interpolates methods 4 and 5.
        """
        hyp_len = hyp_len if hyp_len else len(hypothesis)
        p_n = self.method4(p_n, references, hypothesis, hyp_len)
        p_n = self.method5(p_n, references, hypothesis, hyp_len)
        return p_n

## For GPU

In [3]:
strategy = tf.distribute.MultiWorkerMirroredStrategy()

## For TPU

In [None]:
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
strategy = tf.distribute.TPUStrategy(tpu)

AUTO = tf.data.experimental.AUTOTUNE
print("REPLICAS: ", strategy.num_replicas_in_sync)

## Import Data 

In [4]:
data = pd.read_csv('/kaggle/input/dataset3cap/FINAL_COMBINED_DATASET_3CAP.csv')
data.head(10)

Unnamed: 0,image,caption,urdu_caption
0,img000000.jpg,Closeup of bins of food that include broccoli ...,کھانے کے ڈبوں کا کلوز اپ جس میں بروکولی اور رو...
1,img000000.jpg,A bunch of trays that have different food.,ٹرے کا ایک گروپ جس میں مختلف کھانے ہوتے ہیں۔
2,img000000.jpg,"Colorful dishes holding meat, vegetables, frui...",رنگ برنگے پکوان جن میں گوشت، سبزیاں، پھل اور ر...
3,img000001.jpg,A giraffe standing up nearby a tree,ایک زرافہ ایک درخت کے قریب کھڑا ہے۔
4,img000001.jpg,A giraffe mother with its baby in the forest.,ایک زرافے کی ماں اپنے بچے کے ساتھ جنگل میں۔
5,img000001.jpg,Two giraffes standing in a tree filled area.,دو زرافے درختوں سے بھرے علاقے میں کھڑے ہیں۔
6,img000002.jpg,White vase with different colored flowers sitt...,سفید گلدان جس کے اندر مختلف رنگوں کے پھول بیٹھ...
7,img000002.jpg,A flower vase is sitting on a porch stand.,پورچ اسٹینڈ پر پھولوں کا گلدان بیٹھا ہے۔
8,img000002.jpg,a white vase with many flowers on a stage,ایک اسٹیج پر بہت سے پھولوں کے ساتھ ایک سفید گلدان
9,img000003.jpg,Zebra reaching its head down to ground where g...,زیبرا اپنا سر نیچے زمین تک پہنچا رہا ہے جہاں گ...


## Display Images with Captions

In [None]:
# def readImage(path, img_size = 224):
#     img = load_img(path, color_mode = 'rgb', target_size = (img_size, img_size))
#     img = img_to_array(img)
#     img = img/255.
#     return img

# # THIS IS NOT NECESSARY, URDU CAPTIONS THEK DISPLAY NH HO RHE
# def display_images(temp_df):   
#     temp_df = temp_df.reset_index(drop = True)
#     plt.figure(figsize = (20, 20))
#     n = 0
#     for i in range(156070 * 5):  
#         n += 1
#         plt.subplot(156070, 5, n)  
#         plt.subplots_adjust(hspace = 0.7, wspace = 0.3)

#         image = readImage(f"/kaggle/input/images/Images/{temp_df.image[i]}")
#         plt.imshow(image)
#         plt.title("\n".join(wrap(temp_df.urdu_caption[i][-1::-1], 20)))
#         plt.axis("off")

In [None]:
# display_images(data)

## Model to Extract Features

In [None]:
# # Load the Model
# cnn = Xception()

# # Restructure model 
# cnn = Model(inputs = cnn.inputs,               # specify inputs to be the same as Xception      
#             outputs = cnn.layers[-2].output    # specify 2nd last layer as output which extracts the features (last layer is final classification layer)
#            )

# # Summerize
# display(cnn.summary())

## Extracting Image Features

In [None]:
# features = {}
# directory = '/kaggle/input/images/Images'

# for img_name in tqdm(os.listdir(directory)): 
    
#     # load the image from file
#     img_path = directory + '/' + img_name
#     image = load_img(img_path, target_size = (299, 299))
#     image = img_to_array(image)                # convert image pixels to numpy array
    
#     # reshape data for model
#     image = image.reshape((1, image.shape[0],  # width 
#                            image.shape[1],     # height
#                            image.shape[2]      # channels (3 because of rgb)
#                           ))

#     image = preprocess_input(image)            # preprocess image for Xception
#     feature = cnn.predict(image, verbose=0)    # extract features
#     image_id = img_name.split('.')[0]          # get image ID
#     features[image_id] = feature               # store feature  (size = 2048)

In [None]:
# # store features in pickle
# pickle.dump(features, open('/kaggle/input/pklfile/ImgFeatures.pkl', 'wb'))

In [None]:
# load features from pickle
with open('/kaggle/input/pklfile/ImgFeatures.pkl', 'rb') as f:    
    features = pickle.load(f)

In [None]:
# features

## Preprocess captions

In [5]:
def preprocessCaption(df):
    df['urdu_caption'] = 'endseq ' + df['urdu_caption'] + ' startseq'
    return df

In [6]:
data = data.apply(preprocessCaption, axis = 1)
data

Unnamed: 0,image,caption,urdu_caption
0,img000000.jpg,Closeup of bins of food that include broccoli ...,endseq کھانے کے ڈبوں کا کلوز اپ جس میں بروکولی...
1,img000000.jpg,A bunch of trays that have different food.,endseq ٹرے کا ایک گروپ جس میں مختلف کھانے ہوتے...
2,img000000.jpg,"Colorful dishes holding meat, vegetables, frui...",endseq رنگ برنگے پکوان جن میں گوشت، سبزیاں، پھ...
3,img000001.jpg,A giraffe standing up nearby a tree,endseq ایک زرافہ ایک درخت کے قریب کھڑا ہے۔ sta...
4,img000001.jpg,A giraffe mother with its baby in the forest.,endseq ایک زرافے کی ماں اپنے بچے کے ساتھ جنگل ...
...,...,...,...
468205,img155817.jpg,a small sheep standing on top of a grass field,endseq ایک چھوٹی بھیڑ گھاس کے میدان کے اوپر کھ...
468206,img155818.jpg,a small bird sitting on top of a tree branch,endseq درخت کی شاخ کے اوپر بیٹھا ہوا ایک چھوٹا...
468207,img155818.jpg,an owl perched on a branch in a tree,endseq ایک اُلّو درخت کی شاخ پر بیٹھا ہے۔ star...
468208,img155819.jpg,a small brown elephant with brown fur eating g...,endseq بھوری کھال کے ساتھ ایک چھوٹا بھورا ہاتھ...


In [7]:
data.iloc[87014, 2]

'endseq شہر میں گاڑیوں کو سرخ بتی پر روکا جاتا ہے۔ startseq'

## Tokenizing the Text

In Python, indices typically start from 0. However, when working with tokenization in NLP, we often reserve index 0 for special tokens, such as padding tokens or unknown tokens.
* **Padding Token:** In many NLP tasks, sequences of words or tokens are padded to ensure uniform length. Padding tokens are used to fill in the extra spaces in sequences to make them uniform. Index 0 is usually reserved for the padding token.
<br><br>
* **Unknown Token:** This token is used to represent words that are not present in the vocabulary. When a word that is not in the vocabulary is encountered during tokenization, it is replaced by the unknown token. Again, index 0 is often reserved for this purpose.

In [8]:
# tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data["urdu_caption"])
vocab_size = len(tokenizer.word_index) + 1
vocab_size

27180

In [9]:
tokenizer.word_index

{'endseq': 1,
 'startseq': 2,
 'ایک': 3,
 'کے': 4,
 'ہے۔': 5,
 'میں': 6,
 'پر': 7,
 'اور': 8,
 'رہا': 9,
 'کی': 10,
 'ساتھ': 11,
 'ہیں۔': 12,
 'کا': 13,
 'سے': 14,
 'آدمی': 15,
 'کر': 16,
 'رہی': 17,
 'دو': 18,
 'ہے': 19,
 'رہے': 20,
 'کو': 21,
 'ہوئے': 22,
 'جس': 23,
 'عورت': 24,
 'سفید': 25,
 'ہوا': 26,
 'بیٹھا': 27,
 'کھڑا': 28,
 'سڑک': 29,
 'جو': 30,
 'میز': 31,
 'پہنے': 32,
 'بورڈ': 33,
 'اپنے': 34,
 'کچھ': 35,
 'شخص': 36,
 'اوپر': 37,
 'رنگ': 38,
 'لوگ': 39,
 'بال': 40,
 'سامنے': 41,
 'دیکھ': 42,
 'سرخ': 43,
 'کھیل': 44,
 'نوجوان': 45,
 'لیے': 46,
 'والی': 47,
 'گروپ': 48,
 'سوار': 49,
 'پکڑے': 50,
 'پاس': 51,
 'قریب': 52,
 'والے': 53,
 'نیچے': 54,
 'والا': 55,
 'پانی': 56,
 'لوگوں': 57,
 'کھڑی': 58,
 'چل': 59,
 'گھاس': 60,
 'بیٹھی': 61,
 'سیاہ': 62,
 'اس': 63,
 'ہوئی': 64,
 'ٹینس': 65,
 'کھڑے': 66,
 'بہت': 67,
 'لڑکا': 68,
 'ہیں': 69,
 'قمیض': 70,
 'باہر': 71,
 'چھوٹا': 72,
 'تصویر': 73,
 'بیس': 74,
 'بیٹھے': 75,
 'پلیٹ': 76,
 'کتا': 77,
 'لڑکی': 78,
 'بلی': 79,
 'میدان': 80,
 '

This is dictionary containing urdu words as keys and their corresponding indices as values. It represents the vocabulary learned by the tokenizer during the fitting process

In [10]:
# get maximum length of the caption available
max_length = max(len(ucap.split()) for ucap in data['urdu_caption'])
max_length

94

## Splitting the Data

5 Captions per image
* Train = 70% (562089)
* Validation = 20% (140523)
* Test = 10% (78069)

1 caption per image:
* Train = 72% (112370)
* Validation = 18% (28093)
* Test = 10% (15607)

In [11]:
X_train_valid, X_test, Y_train_valid, Y_test = train_test_split(data['image'], data['urdu_caption'], 
                                                                test_size = 0.10, random_state = 30, shuffle = True)

X_train, X_valid, Y_train, Y_valid = train_test_split(X_train_valid, Y_train_valid, 
                                                      test_size = 0.2, random_state = 30, shuffle = True)

print('Train set size:', X_train.shape)
print('Validation set size:', X_valid.shape)
print('Test set size:', X_test.shape)
print('Total data size', X_train.shape[0] + X_valid.shape[0] + X_test.shape[0])

Train set size: (337111,)
Validation set size: (84278,)
Test set size: (46821,)
Total data size 468210


## Model Building

* A model architecture that will combine visual information from images with textual information from partial captions to generate captions for images.
* The model will learn to generate captions based on both the visual content and the context provided by the captions.

In [None]:
with strategy.scope():
    # image feature extractor model
    inputs1 = Input(shape = (2048,))             # defines an input layer for the image features.
    fe1 = BatchNormalization()(inputs1)          # applies batch normalization to the input image features.
    fe2 = Dense(512, activation = 'relu')(fe1)   # applies a dense layer with 512 units and ReLU activation to the batch-normalized image features.

    # partial caption sequence model
    inputs2 = Input(shape = (max_length,))       # defines an input layer for the partial caption sequences.
    se1 = Embedding(vocab_size, 512)(inputs2)   # embeds the input sequences into dense vectors of size 512. 
                                                 # this layer uses an embedding matrix with a vocabulary size of vocab_size.
    se2 = BatchNormalization()(se1)              # applies batch normalization to the embedded sequences.
    se3 = LSTM(256)(se2)                         # applies a LSTM layer with 256 units to the batch-normalized embedded sequences.

    # decoder model
    decoder = Concatenate()([fe2, se3])                  # concatenates output features from the image extractor and partial caption sequence models.
    decoder2 = Dense(512, activation = 'relu')(decoder)  # applies a dense layer with 512 units and ReLU activation to the concatenated features.
    outputs = Dense(vocab_size,                          # applies a dense layer with vocab_size units and softmax activation to produce the output
                    activation = 'softmax')(decoder2)    # probability distribution over the vocabulary.

    # merge 2 networks
    model = Model(inputs = [inputs1, inputs2], outputs = outputs)

    optimizer = Adam(learning_rate = 0.005,
                     clipvalue = 5.0)          # technique used to limit the magnitude of gradients during training.
                                               # helps stabilize the training process, especially with exploding gradients.

    model.compile(loss = "categorical_crossentropy",   # loss function commonly used for multi-class classification problems.
                  optimizer = optimizer,               # used for updating the weights.
                  metrics = ['accuracy'])              # evaluation metric to monitor during training.

The merged model combines the image feature extractor model and the partial caption sequence model into a single model.<br>
It takes both the image features (inputs1) and the partial caption sequences (inputs2) as inputs and produces the output probabilities over the vocabulary (outputs).

In [None]:
model.summary()

In [None]:
plot_model(model, show_shapes=True)

## Model Training

In [None]:
# create data generator to get data in batch (avoids session crash)
def data_generator(data, features, tokenizer, max_length, vocab_size, batch_size):
    # loop over images
    X1, X2, y = list(), list(), list()
    n = 0
    while 1:
        for key in data['image'].tolist():
            n += 1
            captions = data[data['image'] == key]['urdu_caption']   # ek img k sary captions, 'captions' is a list
            # process each caption
            for caption in captions:
                # encode the sequence
                seq = tokenizer.texts_to_sequences([caption])[0]
                # split the sequence into X, y pairs
                for i in range(1, len(seq)):
                    # split into input and output pairs
                    in_seq, out_seq = seq[:i], seq[i]    # for predicting next word for a given word
                    # pad input sequence
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]   # to make length of all captions same by appending zeros
                    # encode output sequence
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]   # to tell the index where the word is stored in tokenizer
                    
                    # store the sequences (all these are 2D lists now)
                    X1.append(features[key.split('.')[0]][0])
                    X2.append(in_seq)
                    y.append(out_seq)
            if n == batch_size:
                X1, X2, y = np.array(X1), np.array(X2), np.array(y)
                yield (X1, X2), y
                X1, X2, y = list(), list(), list()
                n = 0

In [12]:
train_set = {'image': X_train.tolist(), 'urdu_caption': Y_train.tolist()}
train_set = pd.DataFrame(train_set)
train_set

Unnamed: 0,image,urdu_caption
0,img112934.jpg,endseq گائیوں کا ایک غول خاردار باڑ کے پاس کھڑ...
1,img062210.jpg,endseq یہ ایک پل کے بارے میں انتباہ کے نشان کی...
2,img015619.jpg,endseq لوگ میز کے پاس بیٹھے مسکرا رہے ہیں اور ...
3,img060858.jpg,endseq پٹریوں پر نارنجی پٹی والی سفید ٹرین sta...
4,img128054.jpg,endseq پارک میں ایک سیاہ اور سفید کتا برف میں ...
...,...,...
337106,img070574.jpg,endseq مردوں کا ایک گروپ تلوار سے سفید چادر کا...
337107,img046623.jpg,endseq ایک آدمی لمبی بازو کی قمیض پہنے سمارٹ ف...
337108,img007919.jpg,endseq لوگوں کا ایک گروپ زرافے کے گرد کھڑا ہے۔...
337109,img059922.jpg,endseq ایک میز کے اوپر چمکدار ڈونٹس سے بھرے تی...


In [None]:
# train the model
epochs = 1
batch_size = 60
steps = len(X_train) // batch_size

#Define a ModelCheckpoint callback
#Callbacks provide flexibility and customization to the training process,
checkpoint_filepath = '/kaggle/working/model_checkpoint.keras'
model_checkpoint_callback = ModelCheckpoint(
    filepath = checkpoint_filepath,
    save_weights_only = False,
    monitor = 'val_loss',
    mode = 'min',
    save_best_only = True)

for i in range(epochs):
    # create data generator
    generator = data_generator(train_set, features, tokenizer, max_length, vocab_size, batch_size)
    
    # fit for one epoch
    model.fit(generator, epochs = 1, steps_per_epoch = steps, verbose = 1, callbacks = [model_checkpoint_callback])

In [None]:
model.save('Trained_Xception_LSTM_epoch10.h5')

In [13]:
from tensorflow.keras.models import load_model

In [None]:
model2 = load_model('/kaggle/input/modelepoch5/Trained_Xception_LSTM_epoch5.h5')

optimizer = Adam(learning_rate = 0.005,
                     clipvalue = 5.0)          # technique used to limit the magnitude of gradients during training.
                                               # helps stabilize the training process, especially with exploding gradients.

model2.compile(loss = "categorical_crossentropy",   # loss function commonly used for multi-class classification problems.
              optimizer = optimizer,               # used for updating the weights.
              metrics = ['accuracy'])              # evaluation metric to monitor during training.

In [None]:
model2.summary()

In [None]:
# train the model
epochs = 1
batch_size = 40
steps = len(X_train) // batch_size

#Define a ModelCheckpoint callback
#Callbacks provide flexibility and customization to the training process,
# checkpoint_filepath = '/kaggle/working/model_checkpoint.keras'
# model_checkpoint_callback = ModelCheckpoint(
#     filepath = checkpoint_filepath,
#     save_weights_only = False,
#     monitor = 'val_loss',
#     mode = 'min',
#     save_best_only = True)

for i in range(epochs):
    # create data generator
    generator = data_generator(train_set, features, tokenizer, max_length, vocab_size, batch_size)
    
    # fit for one epoch
    model2.fit(generator, epochs = 1, steps_per_epoch = steps, verbose = 1)

In [None]:
model2.save('Trained_Xception_LSTM_1c6e')

## Generate a Caption using the Model

In [14]:
def idx_to_word(integer, tokenizer):
    for word,index, in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

In [15]:
# generate caption for an image
def predict_caption(model, image, tokenizer, max_length):
    # add start tag for generation process
    in_text = 'endseq'
    # iterate over the max length of sequence
    for i in range(max_length):
        # encode input sequence
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        
        # pad the sequence
        sequence = pad_sequences([sequence], max_length)
        
        # predict next word
        yhat = model.predict([image, sequence], verbose=0)
        
        # get index with high probability
        yhat = np.argmax(yhat)
        
        # convert index to word
        word = idx_to_word(yhat, tokenizer)
        
        # stop if word not found
        if word is None:
            break
        # append word as input for generating next word
        in_text += " " + word
        
        # stop if we reach end tag
        if word == 'startseq':
            break
    return in_text

In [25]:
m1 = load_model('/kaggle/input/bleu-test/Trained_YOLO_LSTM_3c10e.h5')

In [29]:
m2 = load_model('/kaggle/input/bleu-test/Trained_YOLO_GRU_3c7e.h5')

In [18]:
pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.2.68-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.3/41.3 kB[0m [31m758.6 kB/s[0m eta [36m0:00:00[0m00:01[0m
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.0-py3-none-any.whl.metadata (8.5 kB)
Downloading ultralytics-8.2.68-py3-none-any.whl (828 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m828.2/828.2 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading ultralytics_thop-2.0.0-py3-none-any.whl (25 kB)
Installing collected packages: ultralytics-thop, ultralytics
Successfully installed ultralytics-8.2.68 ultralytics-thop-2.0.0
Note: you may need to restart the kernel to use updated packages.


In [19]:
import cv2
from ultralytics import YOLO

In [20]:
cnn = YOLO("yolov8m-cls.pt")

Downloading https://github.com/ultralytics/assets/releases/download/v8.2.0/yolov8m-cls.pt to 'yolov8m-cls.pt'...


100%|██████████| 32.7M/32.7M [00:00<00:00, 87.5MB/s]


In [31]:
# validate with test data
# with strategy.scope():
actual, predicted = list(), list()

for key in tqdm(X_test.iloc[:2000]):
    # get actual caption
    image = '/kaggle/input/images/Images/'+key
    captions = data[data['image'] == key]['urdu_caption']
    features = cnn.predict(image, embed = [-1])

    # predict the caption for image
    y_pred = predict_caption(m1, features, tokenizer, max_length) 

    # split into words
    actual_captions = [caption.split() for caption in captions]
    y_pred = y_pred.split()

    # append to the list
    actual.append(actual_captions)
    predicted.append(y_pred)

# calcuate BLEU score
print("BLEU-1: %f" % corpus_bleu(actual, predicted, weights=(0.7, 0.2, 0.1)))
# print("BLEU-2: %f" % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))

  0%|          | 0/200 [00:00<?, ?it/s]









































































































































































































BLEU-1: 0.321402


In [None]:
def generate_caption(image_name):
    # load the image
    image_id = image_name.split('.')[0]
    img_path = "/kaggle/input/images/Images/" + image_name
    image = Image.open(img_path)

    # Predict caption for image
    y_pred = predict_caption(model, features[image_id], tokenizer, max_length)[8:][:-6]
    print('--------------------Predicted--------------------')
    print(y_pred)
    plt.imshow(image)

In [None]:
generate_caption("img000000.jpg")