In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import tensorflow as tf
print("Num GPUs Available:", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available: 0


In [3]:
print("TensorFlow using:", tf.__version__)
print("Running on GPU:", tf.test.is_gpu_available())

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


TensorFlow using: 2.18.0
Running on GPU: False


In [4]:
import os
import re
import numpy as np
import pickle
import tqdm
from keras.applications.resnet50 import ResNet50, preprocess_input
from keras.preprocessing import image
from keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
image_path = '/content/drive/MyDrive/ML/image_caption/data/Images'
captions_file = '/content/drive/MyDrive/ML/image_caption/data/captions.txt'
feature_save_path = '/content/drive/MyDrive/ML/image_caption/image_features.pkl'

# Clean the Captions

In [10]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def load_captions(file_path):
    descriptions = {}
    with open(file_path, 'r') as f:
        next(f)  # skip header
        for line in f:
            tokens = line.strip().split(',')
            if len(tokens) != 2:
                continue
            img_name, caption = tokens
            img_id = img_name.split('.')[0]
            caption = clean_text(caption)
            if img_id not in descriptions:
                descriptions[img_id] = []
            descriptions[img_id].append(f"startseq {caption} endseq")
    return descriptions

descriptions = load_captions(captions_file)
print("Total valid image IDs:", len(descriptions))

Total valid image IDs: 8091


# Extract feaures using ResNet50

In [None]:
def extract_image_features(image_path, valid_img_ids):
    resnet = ResNet50(weights='imagenet')
    model = Model(inputs=resnet.input, outputs=resnet.layers[-2].output)
    print("ResNet50 model loaded (top layer removed)")

    def preprocess_img(img_path):
        img = image.load_img(img_path, target_size=(224, 224))
        img_array = image.img_to_array(img)
        img_array = np.expand_dims(img_array, axis=0)
        img_array = preprocess_input(img_array)
        return img_array

    features = {}
    for img_id in tqdm.tqdm(valid_img_ids):
        full_path = os.path.join(image_path, img_id + '.jpg')
        if not os.path.exists(full_path):
            for ext in ['.jpeg', '.png']:
                full_path = os.path.join(image_path, img_id + ext)
                if os.path.exists(full_path):
                    break
            else:
                print("Image not found for ID:", img_id)
                continue

        img_array = preprocess_img(full_path)
        feature_vector = model.predict(img_array, verbose=0)
        features[img_id] = feature_vector.flatten()

    return features

# Get image IDs that actually exist in the folder
available_images = set(os.path.splitext(f)[0] for f in os.listdir(image_path)
                       if f.lower().endswith(('.jpg', '.jpeg', '.png')))
valid_img_ids = descriptions.keys() & available_images
print("Total matching image IDs for feature extraction:", len(valid_img_ids))

# Extract and save features
features = extract_image_features(image_path, valid_img_ids)

with open(feature_save_path, 'wb') as f:
    pickle.dump(features, f)

print("Image features saved at:", feature_save_path)
print("Total features extracted:", len(features))

Total matching image IDs for feature extraction: 8091
ResNet50 model loaded (top layer removed)


100%|██████████| 8091/8091 [15:25<00:00,  8.74it/s]


Image features saved at: /content/drive/MyDrive/ML/image_caption/image_features.pkl
Total features extracted: 8091


# Tokenize Captions

In [None]:
# flatten all captions into a list
all_captions = [caption for captions in descriptions.values() for caption in captions]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)

# convert captions to sequences of intergers
sequences = tokenizer.texts_to_sequences(all_captions)

# define maximum sequnce length
max_sequence_length = max(len(seq) for seq in sequences)

# create the word-to-index dictionary
vocab_size = len(tokenizer.word_index) + 1

print("Total sequences:", len(sequences))
print("Vocabulary size:", vocab_size)
print("Max sequence length:", max_sequence_length)
print("First 5 sequences:", sequences[:5])

Total sequences: 38008
Vocabulary size: 8520
Max sequence length: 35
First 5 sequences: [[2, 1, 42, 4, 1, 91, 171, 7, 114, 51, 1, 396, 12, 377, 4, 28, 5001, 689, 3], [2, 1, 18, 303, 62, 1, 188, 115, 3], [2, 1, 37, 18, 114, 62, 1, 188, 2328, 3], [2, 1, 37, 18, 114, 5, 377, 20, 61, 2328, 3], [2, 1, 37, 18, 4, 1, 91, 171, 303, 62, 1, 188, 3266, 3]]


# Prepare Input-Output Pairs

In [None]:
# Load features
with open(feature_save_path, 'rb') as f:
    features = pickle.load(f)

print("Total loaded image features:", len(features))

Total loaded image features: 8091


In [None]:
image_features = []
input_sequences = []
output_words = []

for img_id, caption_list in descriptions.items():
    if img_id not in features:
        continue
    image_feature = features[img_id]

    for caption in caption_list:
        sequence = tokenizer.texts_to_sequences([caption])[0]
        for i in range(1, len(sequence)):
            in_seq = sequence[:i]
            out_word = sequence[i]
            in_seq = pad_sequences([in_seq], maxlen=max_sequence_length, padding='post')[0]

            image_features.append(image_feature)
            input_sequences.append(in_seq)
            output_words.append(out_word)

X1 = np.array(image_features)
X2 = np.array(input_sequences)
y = np.array(output_words)

print("X1 (image features) shape:", X1.shape)
print("X2 (caption sequences) shape:", X2.shape)
print("y (next word) shape:", y.shape)

X1 (image features) shape: (441365, 2048)
X2 (caption sequences) shape: (441365, 35)
y (next word) shape: (441365,)


In [None]:
np.save('/content/drive/MyDrive/ML/image_caption/data/X1.npy', X1)
np.save('/content/drive/MyDrive/ML/image_caption/data/X2.npy', X2)
np.save('/content/drive/MyDrive/ML/image_caption/data/y.npy', y)

# Train the model

In [None]:
X1_load = np.load('/content/drive/MyDrive/ML/image_caption/data/X1.npy')
X2_load = np.load('/content/drive/MyDrive/ML/image_caption/data/X2.npy')
y_load = np.load('/content/drive/MyDrive/ML/image_caption/data/y.npy')

In [6]:
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Dropout, add
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam

In [None]:
# One-hot encode the output y
# vocab_size = len(tokenizer.word_index) + 1  # already defined
# vocab_size = 8520
# y = to_categorical(y, num_classes=vocab_size)

In [None]:
y_load

array([  1,  42,   4, ..., 114, 107,   3])

# Define the model architecture

In [None]:
from tensorflow.keras.layers import Input, Dense, Dropout, Embedding, LSTM, add
from tensorflow.keras.models import Model

# image feature vector input (2048-dim)
input1 = Input(shape=(2048,))
img_dense = Dense(128, activation='relu')(input1)  # Reduced to 128
img_dropout = Dropout(0.3)(img_dense)

# caption sequence input
max_sequence_length = 35
vocab_size = 8520

input2 = Input(shape=(max_sequence_length,))
cap_embedding = Embedding(input_dim=vocab_size, output_dim=128, mask_zero=True)(input2)
cap_dropout = Dropout(0.3)(cap_embedding)
cap_lstm = LSTM(128)(cap_dropout)  # Reduced units

# merge both models
decoder = add([img_dropout, cap_lstm])
decoder = Dense(128, activation='relu')(decoder)
output = Dense(vocab_size, activation='softmax')(decoder)

# define model
model = Model(inputs=[input1, input2], outputs=output)

In [None]:
# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
model.summary()

In [None]:
model.fit([X1_load, X2_load], y_load, epochs=10, batch_size=128)

Epoch 1/10
[1m3449/3449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m737s[0m 212ms/step - accuracy: 0.2620 - loss: 4.4448
Epoch 2/10
[1m3449/3449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m741s[0m 212ms/step - accuracy: 0.3587 - loss: 3.2750
Epoch 3/10
[1m3449/3449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m745s[0m 213ms/step - accuracy: 0.3806 - loss: 3.0137
Epoch 4/10
[1m3449/3449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m742s[0m 213ms/step - accuracy: 0.3908 - loss: 2.8628
Epoch 5/10
[1m3449/3449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m743s[0m 213ms/step - accuracy: 0.3999 - loss: 2.7559
Epoch 6/10
[1m3449/3449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m724s[0m 210ms/step - accuracy: 0.4071 - loss: 2.6866
Epoch 7/10
[1m3449/3449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m748s[0m 211ms/step - accuracy: 0.4105 - loss: 2.6292
Epoch 8/10
[1m3449/3449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m740s[0m 211ms/step - accuracy: 0.4150 - loss:

<keras.src.callbacks.history.History at 0x7ea99b9fe690>

In [None]:
model.save('/content/drive/MyDrive/ML/image_caption/caption_model.h5')



In [None]:
model.save('/content/drive/MyDrive/ML/image_caption/img_caption_model.keras')

In [None]:
with open('/content/drive/MyDrive/ML/image_caption/tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

In [None]:
import json

params = {
    "max_sequence_length": max_sequence_length,
    "vocab_size": vocab_size
}

with open('/content/drive/MyDrive/ML/image_caption/model_params.json', 'w') as f:
    json.dump(params, f)

In [7]:
from tensorflow.keras.models import load_model
import pickle
import json

# Load the model without using custom_object_scope initially
# model = load_model('/content/drive/MyDrive/ML/image_caption/caption_model.h5')
model  = load_model('/content/drive/MyDrive/ML/image_caption/img_caption_model.keras')

# Load tokenizer
with open('/content/drive/MyDrive/ML/image_caption/tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

# Load image features
with open('/content/drive/MyDrive/ML/image_caption/image_features.pkl', 'rb') as f:
    features = pickle.load(f)

# Load config.json
with open('/content/drive/MyDrive/ML/image_caption/model_params.json', 'r') as f:
    model_params = json.load(f)

# Extract parameters
max_sequence_length = model_params['max_sequence_length']
vocab_size = model_params['vocab_size']

  saveable.load_own_variables(weights_store.get(inner_path))


In [None]:
import random
import tqdm
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define the caption generator
def generate_caption(model, tokenizer, features, max_length, img_id):
    in_text = 'startseq'
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length, padding='post')
        image_feature = features[img_id]
        y_pred = model.predict([image_feature.reshape(1, 2048), sequence], verbose=0)
        predicted_id = np.argmax(y_pred)
        word = tokenizer.index_word.get(predicted_id)
        if word is None:
            break
        in_text += ' ' + word
        if word == 'endseq':
            break
    final_caption = in_text.replace('startseq', '').replace('endseq', '').strip()
    return final_caption

# Output file path
output_file = '/content/drive/MyDrive/ML/image_caption/predicted_captions_1000.txt'
generated_captions = {}

# Get a random sample of 1000 image IDs that are also in features
available_img_ids = list(set(descriptions.keys()) & set(features.keys()))
sampled_img_ids = random.sample(available_img_ids, 1000)

# Generate and save captions
with open(output_file, 'w') as f:
    for img_id in tqdm.tqdm(sampled_img_ids):
        caption = generate_caption(model, tokenizer, features, max_sequence_length, img_id)
        generated_captions[img_id] = caption
        f.write(f"{img_id}.jpg, {caption}\n")

print("Predicted captions saved for 1000 images:", output_file)

100%|██████████| 1000/1000 [32:48<00:00,  1.97s/it]

✅ Predicted captions saved for 1000 images: /content/drive/MyDrive/ML/image_caption/predicted_captions_1000.txt





In [15]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

smooth = SmoothingFunction().method4
bleu1, bleu2, bleu3, bleu4 = [], [], [], []

for img_id in generated_captions:
    reference_list = [caption.split() for caption in descriptions[img_id]]  # real captions
    predicted = generated_captions[img_id].split()  # predicted caption

    bleu1.append(sentence_bleu(reference_list, predicted, weights=(1, 0, 0, 0), smoothing_function=smooth))
    bleu2.append(sentence_bleu(reference_list, predicted, weights=(0.5, 0.5, 0, 0), smoothing_function=smooth))
    bleu3.append(sentence_bleu(reference_list, predicted, weights=(0.33, 0.33, 0.33, 0), smoothing_function=smooth))
    bleu4.append(sentence_bleu(reference_list, predicted, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smooth))

print("BLEU-1:", np.mean(bleu1))
print("BLEU-2:", np.mean(bleu2))
print("BLEU-3:", np.mean(bleu3))
print("BLEU-4:", np.mean(bleu4))


BLEU-1: 0.43454095627453854
BLEU-2: 0.2741517607012966
BLEU-3: 0.1729977913888319
BLEU-4: 0.11299715211521574


### How to Interpret These BLEU Scores
1. BLEU-1 (~0.43): This is unigram precision, meaning about 43% of the predicted words match the reference captions. This is pretty decent for Flickr8k and shows the model understands individual word predictions well.

2. BLEU-2 to BLEU-4: These drop off because longer sequences (bigrams to 4-grams) are harder to match in image captioning. That’s expected. Your BLEU-4 of ~0.11 is within range for basic models on Flickr8k.




### Model Architecture Context
- ResNet50 encoder + LSTM decoder
-  128 LSTM units
-  Accuracy: 42% after 10 epochs on Flickr8k
- That’s a solid baseline! But remember:
- Accuracy isn't the best metric for this kind of sequence generation task (BLEU is better).
- Still, 42% suggests it’s learning patterns.

### If you’re looking to improve performance:

1. Train longer (20–30 epochs) – 10 might be too early.

2. Increase LSTM units to 256 or 512 if RAM allows.

3. Use Beam Search decoding instead of greedy — this improves BLEU scores.

4. Add attention mechanism.

5. More data — Flickr8k is small. Consider moving to Flickr30k or MS COCO later.