In [5]:
import os
import pickle
import numpy as np
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add

In [8]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Model


model = ResNet50(weights='imagenet')
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

### I've used the ResNet50 model for image feature extraction

In [6]:
### After extracting the features and I had dumped the image_feature_map in my PC's storage
with open('/kaggle/input/idksomethingsomething11111/image_features.pkl', 'rb') as f:
    features = pickle.load(f)

### After fixing the given captions I dumped them in my PC's storage
with open('/kaggle/input/idksomethingsomething11111/caption_map.pkl', 'rb') as f:
    captions = pickle.load(f)

In [9]:
import os
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.resnet50 import preprocess_input


image_features_map = {}


def extract_features(image_path, model):
    img = load_img(image_path, target_size=(224, 224))  # Resize to match ResNet50 input
    img = img_to_array(img)  # Convert to numpy array
    img = img.reshape((1, img.shape[0], img.shape[1], img.shape[2]))  
    img = preprocess_input(img) 
    features = model.predict(img)  
  



folder_path = "C:/Users/User/Music/archive (1)/Images"
for name in os.listdir(folder_path):
    ##image_path = os.path.join(folder_path, name)
    
    name = name.split('.')[0]
    image_features_map[name] = extract_features(image_path, model)

In [None]:
import re

new_caption_map = {}
for keys in caption_map:
    lsst = [] 
    for sentence in caption_map[keys]:
        sentence = sentence.lower()
        sentence = re.sub(r'[^a-z\s]', '', sentence)  
        sentence = re.sub(r'\s+', ' ', sentence).strip()
        sentence = 'startseq ' + " ".join([word for word in sentence.split() if len(word) > 1]) + ' endseq'
        lsst.append(sentence)
    
    new_caption_map[keys] = lsst


In [7]:
import random

def data_generator(data_keys, mapping, features, tokenizer, max_length, vocab_size, batch_size):
    while 1:
        # Shuffle data keys at the start of each epoch
        random.shuffle(data_keys)
        X1, X2, y = list(), list(), list()
        n = 0

        for key in data_keys:
            captions = mapping[key]
            # Shuffle captions if needed
            random.shuffle(captions)

            # Process each caption
            for caption in captions:
                # Encode the sequence
                seq = tokenizer.texts_to_sequences([caption])[0]
                # Split the sequence into X, y pairs
                for i in range(1, len(seq)):

                    in_seq, out_seq = seq[:i], seq[i]
                    in_seq = pad_sequences([in_seq], maxlen=max_length, padding='post')[0]
                    # Encode output sequence
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                    X1.append(features[key][0])
                    X2.append(in_seq)
                    y.append(out_seq)

            n += 1
            if n == batch_size:
                indices = list(range(len(X1)))
                random.shuffle(indices)
                X1 = np.array([X1[i] for i in indices])
                X2 = np.array([X2[i] for i in indices])
                y = np.array([y[i] for i in indices])

                yield {"image": X1, "text": X2}, y

                X1, X2, y = list(), list(), list()
                n = 0


In [8]:
train_img_features = {}
count = 1

for i in range(1,5601):
    train_img_features[f'img{i}'] = features[f'img{i}']

len(train_img_features)

5600

In [9]:
new_captions = {}
for key in captions:
    new_key = key.split('.')[0]
    new_captions[new_key] = captions[key]


len(new_captions)

8091

In [10]:
train_captions = {}
count = 1

for i in range(1,5601):
    train_captions[f'img{i}'] = new_captions[f'img{i}']

len(train_captions)

5600

In [11]:
all_captions = []
for key in captions:
    for val in captions[key]:
        all_captions.append(val)

len(all_captions)

40455

In [12]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1
max_length = max(len(caption.split()) for caption in all_captions)
print(f'Vocabulary Size of the Tokenizer is : {vocab_size} & the Max Length is : {max_length}')

Vocabulary Size of the Tokenizer is : 8313 & the Max Length is : 31


In [25]:
features['img6000']

array([[0.14186075, 0.49956983, 0.29384995, ..., 0.13239394, 0.52375025,
        0.24152865]], dtype=float32)

In [19]:
def idx_to_word(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

def predict_caption(model, image, tokenizer, max_length):
    # add start tag for generation process
    in_text = 'startseq'
    # iterate over the max length of sequence
    for i in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], max_length, padding='post')
        yhat = model.predict([image, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = idx_to_word(yhat, tokenizer)
        if word is None:
            break
        in_text += " " + word
        if word == 'endseq':
            break
    return in_text

In [11]:
model = define_model(vocab_size, max_length)

epochs = 10
steps = len(train_captions)
for i in range(epochs):
    generator = data_generator(train_captions, train_img_features, tokenizer, max_length, vocab_size)
    model.fit(generator, epochs=1, steps_per_epoch=250, verbose=1)
    model.save('model_' + str(i) + '.h5')

[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 128ms/step - loss: 6.5987
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 127ms/step - loss: 4.9738
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 127ms/step - loss: 4.5494
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 127ms/step - loss: 4.1094
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 130ms/step - loss: 3.7569
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 128ms/step - loss: 3.4717
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 127ms/step - loss: 3.1963
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 129ms/step - loss: 2.9846
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 128ms/step - loss: 2.7613
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 126ms/step - loss: 2.6088


In [34]:
y_pred = predict_caption(model, features['img1513'], tokenizer, max_length)
y_pred

'startseq man is sitting on the street endseq'

In [39]:
from tqdm.notebook import tqdm
from nltk.translate.bleu_score import corpus_bleu

# Validate with test data
actual, predicted = list(), list()
count = 0

for key in tqdm(test):
    # Get actual captions
    captions = new_captions[key]
    # Predict the caption for the image
    y_pred = predict_caption(model, features[key], tokenizer, max_length)
    actual_captions = [caption.split() for caption in captions]
    y_pred = y_pred.split()
    actual.append(actual_captions)
    predicted.append(y_pred)
    
    # Increment count and break after 100 samples
    count += 1
    if count == 100:
        break

# Calculate overall BLEU scores
print("Net BLEU-1: %f" % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
print("Net BLEU-2: %f" % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
print("Net BLEU-3: %f" % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
print("Net BLEU-4: %f" % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))



### I imported the .ipynb file from Kaggle hence the 0%. I found out the BLEU score of the first 100 instances.

  0%|          | 0/810 [00:00<?, ?it/s]

Net BLEU-1: 0.659352
Net BLEU-2: 0.443801
Net BLEU-3: 0.331601
Net BLEU-4: 0.177773
