In [None]:
import os
import pickle
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

from tensorflow.keras.applications.vgg16 import VGG16,preprocess_input
from tensorflow.keras.preprocessing.image import load_img,img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical,plot_model
from tensorflow.keras.layers import Input,Dense,LSTM,Embedding,Dropout,add

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset archive stored on Drive can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
import zipfile

zip_path = '/content/drive/MyDrive/flickr8k.zip'  # Path to your zip file
extract_path = '/content/flickr8k'                # Where to extract

# Unpack every member of the archive into extract_path; the `with` block
# closes the archive once extraction finishes. Re-running this cell
# extracts the archive again.
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print("Dataset extracted successfully!")


In [None]:
# Directory holding the dataset images; the feature-extraction loop lists
# and reads its files from here.
IMAGES_PATH = '/content/flickr8k/Images'


In [None]:
# load vgg16 model with its default constructor arguments
model = VGG16()
# restructure the model: expose the second-to-last layer as the output, so
# predict() returns that layer's activations instead of the final layer's
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
# summarize: summary() prints the layer table itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line
model.summary()

In [None]:
# Run every file in IMAGES_PATH through the feature extractor and store the
# result under the image ID
features = {}

for file_name in tqdm(os.listdir(IMAGES_PATH)):
    # Read the image resized to 224x224 and convert it to an array
    pixels = img_to_array(
        load_img(os.path.join(IMAGES_PATH, file_name), target_size=(224, 224))
    )

    # Prepend a batch axis of size 1, then apply the VGG16 input preprocessing
    batch = preprocess_input(pixels.reshape((1,) + pixels.shape))

    # The image ID is the file name up to its first dot
    features[file_name.split('.')[0]] = model.predict(batch, verbose=0)

print("Features extracted for all images!")


In [None]:
# number of images for which a feature entry was stored
len(features)

In [None]:
# storing the features
# The `with` block guarantees the file is flushed and closed as soon as the
# dump finishes; a bare open() leaves the handle open until it happens to be
# garbage collected.
with open('features.pkl', 'wb') as features_file:
    pickle.dump(features, features_file)

In [None]:
# Load from local file
# NOTE: unpickling can execute arbitrary code, so only load a features.pkl
# that this notebook produced itself.
with open('features.pkl', 'rb') as features_file:
    features = pickle.load(features_file)

In [None]:
# loading the caption data
# Direct path to captions file
captions_path = '/content/flickr8k/captions.txt'

# Read captions into a variable. The encoding is pinned so decoding does not
# depend on the platform default.
with open(captions_path, 'r', encoding='utf-8') as f:
    first_line = f.readline()  # returns '' on an empty file instead of raising
    captions_doc = f.read()

# Skip the header only if it is actually present. Rows have the form
# "<image file>,<caption>", so a first field that looks like an image file
# name means the first line is data and must be kept.
if first_line.split(',')[0].strip().lower().endswith(('.jpg', '.jpeg', '.png')):
    captions_doc = first_line + captions_doc


In [None]:
# Build {image_id: [caption, ...]} from the raw captions text
mapping = {}
for line in tqdm(captions_doc.split('\n')):
    # ignore lines shorter than two characters (e.g. blank lines)
    if len(line) < 2:
        continue
    # first comma-separated field is the image file name, the rest is the caption
    image_file, *caption_parts = line.split(',')
    # the image ID is the file name up to its first dot
    image_id = image_file.split('.')[0]
    # rejoin the caption fragments with spaces and file them under the image ID
    mapping.setdefault(image_id, []).append(" ".join(caption_parts))

In [None]:
# number of distinct image IDs that have at least one caption
len(mapping)

In [None]:
def clean(mapping):
    """Normalize every caption stored in ``mapping``, in place.

    Each caption is lower-cased, every character that is neither a letter nor
    whitespace is replaced by a space, one-character words are dropped, and
    the remaining words are joined by single spaces. The result is wrapped as
    ``'startseq <words> endseq'`` unless it already starts with ``'startseq'``,
    so running the function again leaves cleaned captions unchanged.

    Args:
        mapping: dict of image ID -> list of caption strings. The lists are
            modified element by element.

    Returns:
        None.
    """
    def _normalize(text):
        # keep letters and whitespace, blank out everything else
        letters_only = ''.join(
            ch if (ch.isalpha() or ch.isspace()) else ' ' for ch in text.lower()
        )
        # split() collapses runs of whitespace; drop one-character words
        body = ' '.join(word for word in letters_only.split() if len(word) > 1)
        # already wrapped by an earlier pass: leave untouched
        if body.startswith('startseq'):
            return body
        return 'startseq ' + body + ' endseq'

    for caption_list in mapping.values():
        for position, raw_caption in enumerate(caption_list):
            caption_list[position] = _normalize(raw_caption)


In [None]:
# before preprocess of text: the captions stored for one example image ID
mapping['1000268201_693b08cb0e']

In [None]:
# preprocess the text (clean() rewrites the caption lists inside `mapping` in place)
clean(mapping)

# after preprocessing the text: the same example image's captions, now normalized
mapping['1000268201_693b08cb0e']


In [None]:
# Flatten the per-image caption lists into a single list, in mapping order
all_captions = [
    caption
    for image_captions in mapping.values()
    for caption in image_captions
]

len(all_captions)


In [None]:
# peek at the first ten entries of the flattened caption list
all_captions[:10]

In [None]:
# tokenize the text: fit the tokenizer's vocabulary on every caption
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
# one slot more than the number of known words; the extra slot presumably
# accounts for index 0, which the padded sequences and the mask_zero=True
# embedding treat as padding -- TODO confirm against the Tokenizer docs
vocab_size = len(tokenizer.word_index) + 1

In [None]:
# size used for the embedding input and the softmax output of the caption model
vocab_size

In [None]:
# get maximum length of the caption available important for padding
# (length is counted in whitespace-separated words of each caption)

max_length=max(len(caption.split()) for caption in all_captions)
max_length

In [None]:
# Split the image IDs 90/10 into train and test, in mapping order (no shuffling)
image_ids = list(mapping)
split = int(len(image_ids) * 0.90)
train, test = image_ids[:split], image_ids[split:]

In [None]:
# create data generator to get data in batch (avoids session crash)
def data_generator(data_keys, mapping, features, tokenizer, max_length, vocab_size, batch_size):
    """Endlessly yield training batches built from ``batch_size`` images each.

    For every caption of every image, the encoded token sequence is expanded
    into (prefix -> next token) samples: the prefix is post-padded to
    ``max_length`` and the next token is one-hot encoded over ``vocab_size``.
    Every sample is paired with the first row of that image's feature entry.

    Args:
        data_keys: iterable of image IDs to cycle over.
        mapping: dict of image ID -> list of caption strings.
        features: dict of image ID -> feature array; row 0 is used.
        tokenizer: object providing ``texts_to_sequences``.
        max_length: padded length of the text input.
        vocab_size: number of classes for the one-hot targets.
        batch_size: number of images (not samples) per yielded batch.

    Yields:
        ``({"image": X1, "text": X2}, y)`` as numpy arrays. Images left over
        at the end of a pass over ``data_keys`` stay in the buffers and are
        emitted together with the first images of the next pass.
    """
    image_rows, text_rows, target_rows = [], [], []
    images_seen = 0
    while True:
        for key in data_keys:
            images_seen += 1
            for caption in mapping[key]:
                # integer-encode the caption
                token_ids = tokenizer.texts_to_sequences([caption])[0]
                # one sample per position: tokens before `cut` predict token `cut`
                for cut in range(1, len(token_ids)):
                    padded_prefix = pad_sequences(
                        [token_ids[:cut]], maxlen=max_length, padding='post'
                    )[0]
                    one_hot_target = to_categorical(
                        [token_ids[cut]], num_classes=vocab_size
                    )[0]
                    image_rows.append(features[key][0])
                    text_rows.append(padded_prefix)
                    target_rows.append(one_hot_target)
            if images_seen == batch_size:
                # emit the accumulated samples, then start a fresh batch
                yield (
                    {"image": np.array(image_rows), "text": np.array(text_rows)},
                    np.array(target_rows),
                )
                image_rows, text_rows, target_rows = [], [], []
                images_seen = 0




In [None]:
# Model Creation
# encoder model

# image feature layers: 4096-wide image vector -> dropout -> 256-unit dense layer
inputs1 = Input(shape=(4096,), name="image")
fe1 = Dropout(0.4)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)
# sequence feature layers: padded token IDs -> 256-wide embedding -> dropout -> LSTM
# (mask_zero=True marks index 0, the padding value, as masked)
inputs2 = Input(shape=(max_length,), name="text")
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
se2 = Dropout(0.4)(se1)
se3 = LSTM(256)(se2)

# decoder model: add the two 256-wide branches, then dense + softmax over the vocabulary
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

# NOTE: this rebinds `model`, replacing the VGG16 feature extractor defined
# earlier in the notebook; re-running the feature-extraction cell after this
# point would use the caption model instead.
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')

# plot the model
plot_model(model, show_shapes=True)


In [None]:
# keep only training IDs that have extracted features, so the generator's
# features[key] lookup cannot raise KeyError
train = [key for key in train if key in features]
len(train)

In [None]:
# number of distinct words known to the tokenizer (vocab_size minus one)
print(len(tokenizer.word_index))

In [None]:
# train the model
epochs = 20
batch_size = 16  # counted in images: data_generator yields after this many images
steps = len(train) // batch_size  # number of full image batches per pass over `train`

for i in range(epochs):
    # create data generator (a fresh one per epoch, so each epoch starts at the first training ID)
    generator = data_generator(train, mapping, features, tokenizer, max_length, vocab_size, batch_size)
    # fit for one epoch
    model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)

In [None]:
# save the model to best_model.h5 in the current working directory
model.save('best_model.h5')

In [None]:
def idx_to_word(integer, tokenizer):
    """Return the word whose vocabulary index equals ``integer``.

    The tokenizer's reverse map ``index_word`` is consulted first (a constant
    time lookup) when the tokenizer provides one. If that map is missing,
    empty, or has no entry for the index, the function falls back to scanning
    ``word_index``, so tokenizers exposing only ``word_index`` keep working.

    Args:
        integer: vocabulary index (a Python or numpy integer).
        tokenizer: object with a ``word_index`` dict (word -> index) and,
            optionally, an ``index_word`` dict (index -> word).

    Returns:
        The matching word, or None if no word has that index.
    """
    index_word = getattr(tokenizer, 'index_word', None)
    if index_word:
        try:
            word = index_word.get(integer)
        except TypeError:
            # unhashable index (e.g. a 0-d array): let the scan below handle it
            word = None
        if word is not None:
            return word

    # Fallback: linear scan over the forward map
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

In [None]:
# generate caption for an image
def predict_caption(model, image, tokenizer, max_length):
    """Greedily decode a caption for one image.

    Starting from ``'startseq'``, the text generated so far is encoded,
    post-padded to ``max_length`` and passed to the model together with the
    image features; the highest-scoring index is mapped back to a word and
    appended. Decoding stops after ``max_length`` steps, when the index has no
    word, or once ``'endseq'`` has been appended.

    Args:
        model: model whose ``predict`` accepts ``[image, padded_sequence]``.
        image: feature array for the image, passed to the model unchanged.
        tokenizer: tokenizer used to encode the text and to map indices back.
        max_length: padded sequence length and maximum number of steps.

    Returns:
        The generated caption as a space-joined string beginning with
        ``'startseq'`` (and ending with ``'endseq'`` if it was produced).
    """
    words = ['startseq']
    for _ in range(max_length):
        # encode and pad everything generated so far
        token_ids = tokenizer.texts_to_sequences([' '.join(words)])[0]
        padded = pad_sequences([token_ids], max_length, padding='post')
        # most probable next index, mapped back to its word
        scores = model.predict([image, padded], verbose=0)
        next_word = idx_to_word(np.argmax(scores), tokenizer)
        if next_word is None:
            break
        words.append(next_word)
        if next_word == 'endseq':
            break
    return ' '.join(words)


In [None]:
from nltk.translate.bleu_score import corpus_bleu
from tqdm import tqdm

actual, predicted = [], []

for key in tqdm(test):
    # Skip IDs without extracted features; the training IDs are filtered the
    # same way, and features[key] would otherwise raise KeyError here
    if key not in features:
        continue

    # Get all reference captions (ground truth), tokenized on whitespace
    references = [caption.split() for caption in mapping[key]]

    # Get the predicted caption, tokenized the same way
    y_pred = predict_caption(model, features[key], tokenizer, max_length)
    y_pred = y_pred.split()

    # Store references and prediction
    actual.append(references)
    predicted.append(y_pred)

# Calculate BLEU scores after the loop.
# Each weight tuple sums to 1 so the score is a proper weighted geometric mean
# of the n-gram precisions; BLEU-3 therefore uses thirds rather than 0.3.
# NOTE(review): references and predictions both contain the startseq/endseq
# markers, which count as matching tokens -- confirm that this is intended.
print("BLEU-1: %f" % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))     # unigram
print("BLEU-2: %f" % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))   # bigram
print("BLEU-3: %f" % corpus_bleu(actual, predicted, weights=(1 / 3, 1 / 3, 1 / 3, 0))) # trigram
print("BLEU-4: %f" % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25))) # 4-gram


In [None]:
from PIL import Image
import matplotlib.pyplot as plt
import os

def generate_caption(image_name):
    """Print the reference and predicted captions for an image and display it.

    Args:
        image_name: file name of an image inside ``IMAGES_PATH`` (for example
            ``"640053014_549d2f23d2.jpg"``). The part before the first dot is
            used as the key into ``mapping`` and ``features``.

    Raises:
        FileNotFoundError: if the image file does not exist.
        KeyError: if the image ID has no captions or no extracted features.
    """
    image_id = image_name.split('.')[0]
    # Build the path from the shared constant instead of repeating the literal
    img_path = os.path.join(IMAGES_PATH, image_name)

    # Read the pixels up front and close the file handle right away; copy()
    # returns an in-memory image that no longer depends on the open file
    with Image.open(img_path) as source:
        image = source.copy()

    # Actual captions
    captions = mapping[image_id]
    print('---------------------Actual---------------------')
    for caption in captions:
        print(caption)

    # Predicted caption
    y_pred = predict_caption(model, features[image_id], tokenizer, max_length)
    print('--------------------Predicted--------------------')
    print(y_pred)

    # Show image
    plt.imshow(image)
    plt.axis("off")
    plt.show()


In [None]:
# example: print reference and predicted captions for this image and show it
generate_caption("640053014_549d2f23d2.jpg")

In [None]:
# example: print reference and predicted captions for this image and show it
generate_caption("3197981073_3156963446.jpg")

In [None]:
# example: print reference and predicted captions for this image and show it
generate_caption("2562377955_8d670ccec6.jpg")

In [None]:
# example: print reference and predicted captions for this image and show it
generate_caption("58363930_0544844edd.jpg")

In [None]:
# example: print reference and predicted captions for this image and show it
generate_caption("1436760519_8d6101a0ed.jpg")