# Image Caption Generator Using CNN and LSTM

# What is Image Caption Generator?
An image caption generator is a system that predicts a description for an image by using computer vision and deep learning. This project requires both computer vision and NLP operations. A CNN is used for image classification. There is a pretrained model called Xception, which was trained on the ImageNet dataset. Xception is responsible for image feature extraction. We will use this pretrained model to extract the features from our dataset.

# Import Modules

In [None]:
import os
import numpy as np
from PIL import Image
import pickle
from tensorflow.keras.applications.xception import Xception, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, LSTM, Embedding, Concatenate, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.auto import tqdm

In [None]:
# download the data from kagglehub
# The path returned by dataset_download is kept in flickr8k_path, which a later
# cell assigns to BASE_PATH.
import kagglehub

# Alternate data sources are available!
# NOTE: the two variables below only exist if the corresponding line is uncommented.
#coco_2017_dataset_path = kagglehub.dataset_download('awsaf49/coco-2017-dataset')
#flickr30k_path = kagglehub.dataset_download('eeshawn/flickr30k')

flickr8k_path = kagglehub.dataset_download('adityajn105/flickr8k')
# show where the dataset ended up
print(flickr8k_path)

In [None]:
# if you have the data locally, set the base path appropriately.
# This value is only used if BASE_PATH is pointed at it in the next cell.
local_flickr8k = "/store/datasets/flickr8k"

In [None]:
#file = '/store/datasets/flickr8k/Images/'
# Root folder of the dataset in use; later cells read "<BASE_PATH>/Images" and
# "<BASE_PATH>/captions.txt".
BASE_PATH = flickr8k_path  # OR  local_flickr8k OR coco_2017_dataset_path OR flickr30k_path

# Extract Image Features

In [None]:
# Build the image feature extractor: instantiate Xception, then re-wrap it so the
# output is taken from its second-to-last layer instead of the final layer
# (presumably dropping the ImageNet classification head - confirm with modelx.summary()).
modelx = Xception()
modelx = Model(inputs=modelx.inputs, outputs=modelx.layers[-2].output)
#modelx.summary()

### run the following block if the pickle file doesn't exist.  TAKES LONG to RUN

In [None]:
# Extract one feature array per image with the truncated Xception model and
# cache the result on disk as 'features1.pkl'.
features = {}
# folder that holds the image files
directory = os.path.join(BASE_PATH, "Images")

for img_name in tqdm(os.listdir(directory)):
    # join the directory path and image name
    img_path = os.path.join(directory, img_name)

    # skip anything that is not a regular file (e.g. a subdirectory)
    if not os.path.isfile(img_path):
        continue

    try:
        # load the image resized to the 299x299 input used for the extractor
        image = load_img(img_path, target_size=(299, 299))
        # convert to an array and add a leading batch axis of size 1
        image = np.expand_dims(img_to_array(image), axis=0)
        # apply the Xception-specific preprocessing
        image = preprocess_input(image)
        feature = modelx.predict(image, verbose=0)
    except Exception as e:
        # best effort: report the failing file and keep going
        print(f"Error processing {img_path}: {e}")
        continue

    # image id = file name up to its first dot (same rule as the caption mapping)
    image_id = img_name.split('.')[0]
    features[image_id] = feature

# persist the features so this slow step does not have to be repeated
with open('features1.pkl', 'wb') as f:
    pickle.dump(features, f)

# Now 'features' should contain the extracted features for each valid image


### Download the precomputed features file for the Flickr8k dataset

In [None]:
# if the features are available already, download it
# NOTE: the output name is 'features1.pkl', the same file the extraction cell
# writes and the next cell loads.
import gdown
gdown.download(url="https://drive.google.com/file/d/1Xl5aS71ZP5UVi11QAOori4oT9HG5qaZy/view?usp=sharing", fuzzy=True, output="features1.pkl")

In [None]:
# load features
# NOTE(review): unpickling can execute arbitrary code; only load a features file
# that comes from a source you trust.
with open('features1.pkl', 'rb') as fh:
    features = pickle.load(fh)

print(f"loaded {len(features)} data points..")
# look at any one entry to report the width of a feature row
sample_key = next(iter(features))
print("embedding size:", features[sample_key].shape[1])

# Perform Data Cleaning

In [None]:
# Read the whole caption file into one string.
# The encoding is given explicitly so the result does not depend on the platform default.
with open(os.path.join(BASE_PATH, 'captions.txt'), 'r', encoding='utf-8') as f:
    # drop the first line (presumably the column header) before reading the rest
    next(f)
    captions_doc = f.read()

# Cleaning and mapping

In [None]:
# Build a dict: image id -> list of raw caption strings
mapping = {}
for row in tqdm(captions_doc.split('\n')):
    # ignore blank / one-character rows
    if len(row) < 2:
        continue
    # text before the first comma is the file name, the remainder is the caption
    file_name, _, remainder = row.partition(',')
    # the image id is the file name up to its first dot
    image_id = file_name.split('.')[0]
    # any further commas inside the caption are turned into single spaces
    caption = remainder.replace(',', ' ')
    # start a new list on first sight of this id, then record the caption
    mapping.setdefault(image_id, []).append(caption)

In [None]:
# number of distinct image ids that have at least one caption
len(mapping)

# Preprocess Text

In [None]:
import re

# Matches every character that is neither a lowercase letter nor whitespace.
_NON_LETTERS = re.compile(r'[^a-z\s]')


def clean(mapping):
    """Normalise every caption in *mapping* in place.

    Each caption is lower-cased, stripped of digits and special characters,
    reduced to words longer than one character, and wrapped in the
    'startseq' / 'endseq' markers.

    Args:
        mapping: dict whose values are lists of caption strings; the lists
            are modified in place.

    Returns:
        None.
    """
    for captions in mapping.values():
        for i, caption in enumerate(captions):
            # lowercase first so the pattern only needs to keep a-z
            text = _NON_LETTERS.sub('', caption.lower())
            # split() also collapses any run of whitespace
            words = [word for word in text.split() if len(word) > 1]
            # add start and end tags to the caption
            captions[i] = " ".join(['startseq', *words, 'endseq'])

In [None]:
# process text
# NOTE: clean() edits the caption lists in place and adds the startseq/endseq
# tags on every call, so run this cell only once per freshly built mapping.
clean(mapping)

In [None]:
# Flatten the per-image caption lists into one list of caption strings
all_captions = [caption for captions in mapping.values() for caption in captions]

In [None]:
# total number of captions across all images
len(all_captions)

In [None]:
# peek at a slice of the cleaned captions as a sanity check
all_captions[33:56]

### Load some images with captions

Here we need to map the images in the training set to their corresponding descriptions, which are stored in our descriptions variable. Create a list of the names of all training images, then create an empty dictionary and map each image to its descriptions, using the image name as the key and a list of descriptions as the value. While mapping the descriptions, add unique words at the beginning and end to mark the start and end of each sentence.

In [None]:
# first five image ids (file names without extension) in the mapping
list(mapping.keys())[:5]

In [None]:
def visualization(data, num_of_images):
    """Show images next to their captions.

    Draws a grid with one row per image: the picture in the left column and
    its captions, one per text line, in the right column.

    Args:
        data: iterable of image ids (keys of the global ``mapping``); the
            first ``num_of_images`` of them are displayed.
        num_of_images: number of rows in the grid, i.e. how many images to show.
    """
    fig = plt.figure(figsize=(10, 20))
    for row, image_id in enumerate(list(data)[:num_of_images]):
        captions = mapping[image_id]
        image_path = os.path.join(BASE_PATH, "Images", image_id + ".jpg")
        # target_size is (height, width); the channel count is not part of it
        image_load = load_img(image_path, target_size=(199, 199))

        # left cell of this row: the picture, without axis ticks
        ax = fig.add_subplot(num_of_images, 2, 2 * row + 1, xticks=[], yticks=[])
        ax.imshow(image_load)

        # right cell of this row: the captions as plain text on a hidden axis
        ax = fig.add_subplot(num_of_images, 2, 2 * row + 2)
        ax.axis('off')
        ax.set_xlim(0, 1)
        ax.set_ylim(0, len(captions))
        for i, caption in enumerate(captions):
            ax.text(0, i, caption, fontsize=14)
    plt.show()

visualization(list(mapping.keys()), 5)

### Exploring the Caption Length Distribution

We analyze the length of captions to determine an optimal sequence length.

In [None]:
def captions_length(data):
    """Plot a histogram (with KDE curve) of caption lengths.

    The length of a caption is the number of pieces obtained by splitting it
    on single spaces.

    Args:
        data: iterable of caption strings.
    """
    word_counts = [len(text.split(' ')) for text in data]
    bold = {'fontweight': 'bold'}

    plt.figure(figsize=(15, 7), dpi=100)
    sns.set_style('darkgrid')
    # one bin per possible length
    sns.histplot(x=word_counts, kde=True, binwidth=1)
    plt.title('Caption lengths histogram', fontsize=15, **bold)
    plt.xticks(**bold)
    plt.yticks(**bold)
    plt.xlabel('Length', **bold)
    plt.ylabel('Frequency', **bold)
    plt.show()

captions_length(all_captions)

# Tokenize the content

In [None]:
# tokenize the text
# Fit the word index on every cleaned caption.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
# +1 presumably because word indices start at 1 and index 0 is left for padding
# (the model's Embedding uses mask_zero=True) - confirm against the Tokenizer docs.
vocab_size = len(tokenizer.word_index) + 1

In [None]:
# Save the fitted tokenizer to 'tokenizer.pkl' so it can be reloaded without refitting.
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

In [None]:
# display the vocabulary size computed above
vocab_size

In [None]:
# get maximum len of the captions available
# Length is counted in whitespace-separated words (start/end tags included);
# the value is used as the padded input length for the text branch.
# NOTE(review): the tokenizer may split a caption into a different number of
# tokens than str.split() does - verify if captions can contain punctuation.
max_length = max(len(caption.split()) for caption in all_captions)
max_length

# Train Test Split

In [None]:
# Split the image ids, in mapping order, into train / validation / test.
image_ids = list(mapping)
n_data = len(image_ids)
# the first 90% of the ids are used for training
split = int(n_data * 0.90)
# one fifth of the remaining ids (about 2% of the data) is used for validation
v_split = split + (n_data - split) // 5

train, val, test = image_ids[:split], image_ids[split:v_split], image_ids[v_split:]

print(f"training: {len(train)}, val: {len(val)}, test: {len(test)}")

# Create Data Generator

In [None]:
def data_generator(data_keys, mapping, features, tokenizer, max_length, vocab_size, batch_size):
    """Yield training batches of ((image_features, text_prefix), next_word) forever.

    Every caption of every image is expanded into one sample per word: the
    padded prefix seq[:i] is the text input and the one-hot encoded seq[i]
    is the target.

    Args:
        data_keys: sequence of image ids to cycle over.
        mapping: dict image id -> list of caption strings.
        features: dict image id -> feature array; row 0 is used per sample.
        tokenizer: fitted tokenizer providing ``texts_to_sequences``.
        max_length: length every text prefix is padded/truncated to.
        vocab_size: number of classes for the one-hot target.
        batch_size: number of IMAGES (not samples) collected per yielded batch.

    Yields:
        ``((x1, x2), y)`` as numpy arrays built from all samples of
        ``batch_size`` images.

    Raises:
        ValueError: if ``data_keys`` is empty (the loop could never yield).
    """
    if len(data_keys) == 0:
        raise ValueError("data_keys is empty; nothing to generate")

    x1, x2, y = [], [], []
    n = 0  # images accumulated in the current batch
    while True:
        for key in data_keys:
            n += 1
            # process each caption of this image
            for caption in mapping[key]:
                # encode the caption as a list of word indices
                seq = tokenizer.texts_to_sequences([caption])[0]
                # split the sequence into (prefix, next word) pairs
                for i in range(1, len(seq)):
                    in_seq, out_seq = seq[:i], seq[i]
                    # pad the prefix with trailing zeros up to max_length
                    in_seq = pad_sequences([in_seq], maxlen=max_length, padding='post', truncating='post')[0]
                    # one-hot encode the target word
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]

                    x1.append(features[key][0])
                    x2.append(in_seq)
                    y.append(out_seq)

            # Emit the batch only after ALL captions of the last image were added,
            # so the samples of one image are never split across two batches.
            if n == batch_size:
                yield (np.array(x1), np.array(x2)), np.array(y)
                x1, x2, y = [], [], []
                n = 0


# Model Architecture

In [None]:
# Assuming you have defined vocab_size and max_length

# Encoder model
# Image branch: a 2048-wide feature vector -> batch norm -> 512-unit ReLU layer.
inputs1 = Input(shape=(2048,), name="image_input")
fe1 = BatchNormalization(name="image_batch_norm")(inputs1)
fe2 = Dense(512, activation='relu',name="image_mlp_layer")(fe1)

# Text branch: padded word-index sequence of length max_length -> 512-d embedding
# (index 0 is treated as padding via mask_zero) -> batch norm -> bidirectional LSTM.
inputs2 = Input(shape=(max_length,), name="text_input")
se1 = Embedding(vocab_size, 512, mask_zero=True, name='text_embedding')(inputs2)
se2 = BatchNormalization(name="text_batch_norm")(se1)
se3 = Bidirectional(LSTM(256), name="text_bi_lstm")(se2)

# Decoder
# Join both branches, pass them through one hidden layer, and predict a
# softmax distribution over the vocabulary for the next word.
decoder = Concatenate(name='conc_image_text')([fe2, se3])
decoder2 = Dense(512, activation='relu', name="decoder")(decoder)
outputs = Dense(vocab_size, activation='softmax', name="output")(decoder2)

# Input order is [image features, text sequence]; callers of predict()/fit()
# must pass inputs in the same order.
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
optimizer = Adam(learning_rate=0.0001, clipvalue=5.0)
# categorical cross-entropy matches the one-hot targets produced by data_generator
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

model.summary()


# Train model

In [None]:
epochs = 5 # 10 works well
# NOTE: data_generator counts IMAGES towards batch_size, so each yielded batch
# holds the samples of 128 images, not 128 samples.
batch_size = 128
# number of whole image-batches per epoch; leftover images roll into the next epoch
steps_per_epoch = len(train) // batch_size

# Define a ModelCheckpoint callback
# NOTE(review): it monitors 'val_loss', but fit() below is called without
# validation data and without callbacks, so this callback is currently unused.
checkpoint_filepath = 'model_checkpoint.keras'
model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False,
    monitor='val_loss',
    mode='min',
    save_best_only=True
)

# endless batch generator over the training ids
generator = data_generator(train, mapping, features, tokenizer, max_length, vocab_size, batch_size)
#val_generator = data_generator(val, mapping, features, tokenizer, max_length, vocab_size, batch_size)

# Train on the generator only; validation and the checkpoint callback are
# disabled (see the commented-out arguments below).
history = model.fit(generator, epochs=epochs, verbose=1,
                    steps_per_epoch=steps_per_epoch,)
                    #validation_data = val_generator,
                    #callbacks=[model_checkpoint_callback])

In [None]:
# Draw the model graph with the tensor shapes shown on each layer.
from tensorflow.keras.utils import plot_model
plot_model(model, show_shapes=True)

In [None]:
# Save the trained model to 'caption-generator-model.keras' in the working directory.
model.save('caption-generator-model.keras')

# Generate Captions For Images

In [None]:
def idx_to_word(integer, tokenizer):
    """Return the word stored under index *integer*, or None if there is none.

    Uses the tokenizer's reverse table ``index_word`` (the same table the beam
    search reads) instead of scanning ``word_index`` on every call.

    Args:
        integer: word index, e.g. the result of ``np.argmax`` on a prediction.
        tokenizer: fitted tokenizer exposing ``index_word``.

    Returns:
        The matching word, or None for an unknown index (such as padding index 0).
    """
    return tokenizer.index_word.get(integer)

def predict_caption(model, image, tokenizer, max_length):
    """Greedily decode a caption for one image.

    Starting from 'startseq', the most probable next word is appended until
    'endseq' is produced, an unknown index is predicted, or *max_length*
    words have been generated.

    Args:
        model: captioning model taking ``[image, padded_sequence]``.
        image: feature array for the image, as stored in ``features``.
        tokenizer: fitted tokenizer used for encoding and decoding.
        max_length: padded sequence length and maximum number of steps.

    Returns:
        The caption as one string, including the 'startseq' tag and, if it
        was reached, the 'endseq' tag.
    """
    words = ['startseq']
    for _ in range(max_length):
        # re-encode everything generated so far and pad it to the model's input length
        token_ids = tokenizer.texts_to_sequences([" ".join(words)])[0]
        padded = pad_sequences([token_ids], max_length, padding='post', truncating='post')
        # index of the most probable next word
        probabilities = model.predict([image, padded], verbose=0)
        next_word = idx_to_word(np.argmax(probabilities), tokenizer)
        # stop if the index has no word
        if next_word is None:
            break
        words.append(next_word)
        # stop once the end tag has been generated
        if next_word == 'endseq':
            break

    return " ".join(words)

def beam_search_generator(model, image, tokenizer, max_caption_length, K_beams = 3, log = False):
    """Decode a caption for one image with beam search.

    At every step each kept sequence is extended by its K_beams most probable
    next words, and only the K_beams best-scoring extensions are kept. All
    sequences are expanded for exactly ``max_caption_length`` steps; the
    best one is then cut at its first 'endseq'.

    Args:
        model: captioning model taking ``[image, padded_sequence]``.
        image: feature array for the image, as stored in ``features``.
        tokenizer: fitted tokenizer exposing ``word_index`` and ``index_word``.
        max_caption_length: padded sequence length and number of steps.
        K_beams: number of sequences kept after each step.
        log: if True the score is the sum of log-probabilities, otherwise
            the sum of raw probabilities.

    Returns:
        The caption text without the 'startseq' / 'endseq' tags.
    """
    # smallest positive float32, used to keep log() finite for zero probabilities
    tiny = np.finfo(np.float32).tiny

    # each beam is [list of word indices, cumulative score]
    beams = [[[tokenizer.word_index['startseq']], 0.0]]
    for _ in range(max_caption_length):
        candidates = []
        for tokens, score in beams:
            sequence = pad_sequences([tokens], maxlen=max_caption_length, padding='post', truncating='post').reshape((1, max_caption_length))
            preds = model.predict([image, sequence], verbose=0)[0]
            # indices of the K most probable next words (ascending probability)
            for w in np.argsort(preds)[-K_beams:]:
                if log:
                    step_score = np.log(max(preds[w], tiny))
                else:
                    step_score = preds[w]
                candidates.append([tokens + [int(w)], score + step_score])

        # keep the K best candidates, best one last
        beams = sorted(candidates, key=lambda beam: beam[1])[-K_beams:]

    best_tokens = beams[-1][0]
    final_caption = []
    # skip the leading 'startseq' token
    for token in best_tokens[1:]:
        word = tokenizer.index_word.get(token)
        # an index without a word (e.g. padding index 0) is skipped, not an error
        if word is None:
            continue
        if word == 'endseq':
            break
        final_caption.append(word)

    return ' '.join(final_caption)

In [None]:
from nltk.translate.bleu_score import corpus_bleu

# Evaluate greedy and beam-search decoding with BLEU on a few test images.
# The 'startseq' / 'endseq' markers are removed from the references and from
# the greedy output, because the beam-search output never contains them;
# otherwise the two decoders would be scored on different token sets.
SEQ_TAGS = {'startseq', 'endseq'}

actual, predicted_greedy, predicted_beam = [], [], []

for key in tqdm(test[:10]):
    captions = mapping[key]
    # reference captions as word lists, without the sequence markers
    actual_captions = [
        [word for word in caption.split() if word not in SEQ_TAGS]
        for caption in captions
    ]
    actual.append(actual_captions)
    # predict the caption with both decoders
    y_pred_beam = beam_search_generator(model, features[key], tokenizer, max_length)
    y_pred_greedy = predict_caption(model, features[key], tokenizer, max_length)
    predicted_greedy.append([word for word in y_pred_greedy.split() if word not in SEQ_TAGS])
    predicted_beam.append(y_pred_beam.split())

# calculate Bleu Scores
print('Bleu-1 (Greedy): %f' % corpus_bleu(actual, predicted_greedy, weights=(1.0,0,0,0)))
print('Bleu-2 (Greedy): %f' % corpus_bleu(actual, predicted_greedy, weights=(0.5,0.5,0,0)))
print('Bleu-1 (Beam 3): %f' % corpus_bleu(actual, predicted_beam, weights=(1.0,0,0,0)))
print('Bleu-2 (Beam 3): %f' % corpus_bleu(actual, predicted_beam, weights=(0.5,0.5,0,0)))

# Generate Captions For Images

In [None]:
def idx_to_word(integer, tokenizer):
    """Return the word stored under index *integer*, or None if there is none.

    Looks the index up directly in the tokenizer's reverse table
    ``index_word`` rather than scanning ``word_index`` on every call.

    Args:
        integer: word index, e.g. the result of ``np.argmax`` on a prediction.
        tokenizer: fitted tokenizer exposing ``index_word``.

    Returns:
        The matching word, or None for an unknown index (such as padding index 0).
    """
    return tokenizer.index_word.get(integer)

In [None]:
def predict_caption(model, image, tokenizer, max_length):
    """Greedily decode a caption for one image.

    Starting from 'startseq', the most probable next word is appended until
    'endseq' is produced, an unknown index is predicted, or *max_length*
    words have been generated.

    Args:
        model: captioning model taking ``[image, padded_sequence]``.
        image: feature array for the image, as stored in ``features``.
        tokenizer: fitted tokenizer used for encoding and decoding.
        max_length: padded sequence length and maximum number of steps.

    Returns:
        The caption as one string, including the 'startseq' tag and, if it
        was reached, the 'endseq' tag.
    """
    words = ['startseq']
    for _ in range(max_length):
        # encode the text generated so far and pad it to the model's input length
        token_ids = tokenizer.texts_to_sequences([" ".join(words)])[0]
        padded = pad_sequences([token_ids], max_length, padding='post', truncating='post')
        # pick the index with the highest predicted probability
        probabilities = model.predict([image, padded], verbose=0)
        next_word = idx_to_word(np.argmax(probabilities), tokenizer)
        # stop if the index has no word
        if next_word is None:
            break
        words.append(next_word)
        # stop once the end tag has been generated
        if next_word == 'endseq':
            break

    return " ".join(words)

# Visualize the image

In [None]:
# Feature extraction for a single image file (used for images outside the cached features).
def generate_features(img_path):
    """Return the extractor output for the image at *img_path*.

    Args:
        img_path: path to an image file.

    Returns:
        The array returned by ``modelx.predict`` for the preprocessed
        299x299 image, or None if the path is not a regular file or the
        image could not be processed (a message is printed in both cases).
    """
    # a missing path or a directory is reported instead of silently returning None
    if not os.path.isfile(img_path):
        print(f"Error processing {img_path}: not a file")
        return None

    try:
        # load the image resized to the extractor's input size
        image = load_img(img_path, target_size=(299, 299))
        # convert to an array and add a leading batch axis of size 1
        image = np.expand_dims(img_to_array(image), axis=0)
        # apply the Xception-specific preprocessing
        image = preprocess_input(image)
        return modelx.predict(image, verbose=0)
    except Exception as e:
        # best effort: report the problem and signal failure with None
        print(f"Error processing {img_path}: {e}")
        return None

In [None]:
from PIL import Image
import matplotlib.pyplot as plt
def generate_caption(image_name, skip_true=False):
    """Print a greedy caption for a dataset image and display the image.

    Args:
        image_name: file name inside ``<BASE_PATH>/Images`` (e.g. 'abc.jpg').
        skip_true: if False, print the reference captions from ``mapping``
            and use the cached entry of ``features``; if True, skip the
            references and extract the features from the image file.
    """
    # image id = file name up to its first dot
    image_id = image_name.split('.')[0]
    img_path = f"{BASE_PATH}/Images/{image_name}"
    image = Image.open(img_path)

    if not skip_true:
        captions = mapping[image_id]
        print('---------------------Actual---------------------')
        for caption in captions:
            print(caption)
        feature = features[image_id]
    else:
        # extract from the same file that is displayed (the bare file name
        # alone would be resolved against the working directory)
        feature = generate_features(img_path)

    # generate_features returns None on failure; there is nothing to predict then
    if feature is None:
        print(f"Could not extract features for {img_path}")
        return

    # predict the caption
    y_pred = predict_caption(model, feature, tokenizer, max_length)
    print('--------------------Predicted--------------------')
    print(y_pred)
    plt.imshow(image)

def generate_caption_2(image_path):
    """Print a greedy caption for an arbitrary image file and display the image.

    Args:
        image_path: path to the image file; its features are extracted on
            the fly with ``generate_features``.
    """
    picture = Image.open(image_path)
    # extract the features, then decode the caption from them
    caption_text = predict_caption(model, generate_features(image_path), tokenizer, max_length)
    print('--------------------Predicted--------------------')
    print(caption_text)
    plt.imshow(picture)


In [None]:
# Shuffle the test ids in place (unseeded, so the order differs between runs)
# and show five of them to pick examples from.
np.random.shuffle(test)
test[:5]

In [None]:
# Caption one dataset image by file name; also prints its reference captions.
generate_caption('503090187_8758ab5680.jpg')

In [None]:
# Caption an arbitrary image file given by its full path.
generate_caption_2('/tmp/test.jpg')

Ref: https://www.geeksforgeeks.org/image-caption-generator-using-deep-learning-on-flickr8k-dataset/