In [1]:
import pandas as pd
import os
import re
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet import preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Concatenate
import pickle

# Load the caption table. Later cells in this notebook read its 'full_caption'
# and 'imageid' columns.
csv_file = "captions.csv"
df = pd.read_csv(csv_file)

def clean_caption(caption):
    """Normalize a raw figure caption before tokenization.

    The caption is lower-cased, figure labels such as "figure 3", "fig 2.1"
    or "fig. 4" are removed, every character other than ASCII letters, digits
    and whitespace is dropped, and runs of whitespace are collapsed.

    Args:
        caption: Raw caption text. Non-string values (for example the NaN
            pandas produces for an empty CSV cell, or None) are treated as
            an empty caption instead of raising AttributeError.

    Returns:
        The cleaned, single-spaced, stripped caption string.
    """
    if not isinstance(caption, str):
        return ""
    caption = caption.lower()
    # The optional "." lets the abbreviated form "fig. 3" be removed as well;
    # without it the label would survive as "fig 3" after punctuation removal.
    caption = re.sub(r'\b(?:figure|fig)\.?\s*\d+(?:\.\d+)?', '', caption, flags=re.IGNORECASE)
    caption = re.sub(r"[^a-z0-9A-Z\s]", "", caption)
    caption = re.sub(r"\s+", " ", caption).strip()
    return caption


# Derive the cleaned caption column element-wise from the raw 'full_caption' text.
df['processed_caption'] = df['full_caption'].map(clean_caption)


def preprocess_captions(df):
    """Wrap every processed caption in "<start>" / "<end>" markers.

    The 'processed_caption' column is rewritten in place and the same frame
    is returned. The operation is idempotent: a caption that already begins
    with "<start>" and ends with "<end>" is left unchanged, so re-running the
    notebook cell does not stack a second pair of markers.

    Args:
        df: DataFrame with a string 'processed_caption' column.

    Returns:
        The same DataFrame object, with 'processed_caption' updated.
    """
    def _wrap(text):
        text = text.lower()
        words = text.split()
        already_wrapped = (
            len(words) >= 2 and words[0] == "<start>" and words[-1] == "<end>"
        )
        if already_wrapped:
            return text
        return f"<start> {text} <end>"

    df['processed_caption'] = df['processed_caption'].apply(_wrap)
    return df


df = preprocess_captions(df)
# Show a sample only; printing the whole column floods the cell output.
print(df["processed_caption"].head())

# "<" and ">" are intentionally absent from `filters`. With them present the
# tokenizer strips the angle brackets and indexes the markers as plain
# "start" / "end", so "<start>" / "<end>" never appear in any sequence and
# caption generation can neither be seeded correctly nor detect the end.
tokenizer = Tokenizer(oov_token="<OOV>", filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')

# Fit on the markers first so they are indexed even if the caption column is empty.
tokenizer.fit_on_texts(["<start>", "<end>"])
tokenizer.fit_on_texts(df['processed_caption'])

# Fail loudly instead of patching word_index by hand: a hand-assigned index
# would not correspond to anything the model is trained on.
for _marker in ("<start>", "<end>"):
    if _marker not in tokenizer.word_index:
        raise ValueError(f"Tokenizer did not index the {_marker!r} marker.")

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Look up the vocabulary index of each sequence-boundary marker (None if absent).
start_token_index = tokenizer.word_index.get("<start>")
end_token_index = tokenizer.word_index.get("<end>")
print("<start> token index:", start_token_index)
print("<end> token index:", end_token_index)

In [None]:
# Fixed length that every caption sequence is padded to.
max_caption_length = 250

# Persist the fitted tokenizer so it can be reloaded without refitting.
with open("tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

# Pad captions
sequences = tokenizer.texts_to_sequences(df['processed_caption'])
padded_captions = pad_sequences(
    sequences,
    maxlen=max_caption_length,
    padding='post',
)

# Vocabulary size (one more than the number of indexed words)
vocab_size = 1 + len(tokenizer.word_index)
print(f"Vocabulary size: {vocab_size}")

In [None]:
# ResNet50 for feature extraction
resnet = ResNet50(weights="imagenet", include_top=False, pooling="avg")

image_folder = "ImageList/"
# Maps image id (as a string) -> feature vector produced by `resnet`.
image_features = {}

for image_id in df['imageid'].astype(str).unique():
    image_path = os.path.join(image_folder, f"{image_id}.png")
    if not os.path.exists(image_path):
        # Missing files are reported and skipped rather than aborting the run.
        print(f"Image {image_id}.png not found in {image_folder}.")
        continue

    img = load_img(image_path, target_size=(224, 224))
    img_array = img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)  # add the batch axis
    img_array = preprocess_input(img_array)
    # verbose=0: predict() is called once per image, and the default setting
    # prints a progress bar for every call, burying the "not found" messages.
    features = resnet.predict(img_array, verbose=0)
    image_features[image_id] = features[0]  # drop the batch axis again

# np.save stores the dict as a pickled object array; the loading cell reads
# it back with allow_pickle=True and .item().
np.save("image_features.npy", image_features)

In [None]:
def prepare_data(image_features, captions, tokenizer, max_length):
    """Expand each caption into next-word training samples.

    For a caption tokenized as [w0, w1, ..., wn], one sample is produced per
    position i >= 1: the input is the prefix [w0, ..., w(i-1)] and the target
    is w(i). Rows whose image id has no entry in `image_features` are skipped.

    Args:
        image_features: Mapping from image id (string) to feature vector.
        captions: DataFrame with 'imageid' and 'processed_caption' columns.
        tokenizer: Fitted tokenizer exposing `texts_to_sequences`.
        max_length: Length every prefix is padded (or truncated) to.

    Returns:
        Tuple (X1, X2, y) of numpy arrays: image features, padded prefixes
        and target word indices, aligned by position.
    """
    X1, prefixes, y = [], [], []
    # Stringify the ids column-wise, the same way the feature-extraction cell
    # builds its keys; per-row values from iterrows() can be upcast (e.g. an
    # int id turning into "7.0") and then fail the lookup.
    image_ids = captions['imageid'].astype(str)
    for image_id, caption in zip(image_ids, captions['processed_caption']):
        if image_id not in image_features:
            continue
        image_feature = image_features[image_id]
        seq = tokenizer.texts_to_sequences([caption])[0]
        for i in range(1, len(seq)):
            X1.append(image_feature)
            prefixes.append(seq[:i])
            y.append(seq[i])

    if not prefixes:
        # Nothing matched: return empty arrays, as plain np.array([]) would.
        return np.array(X1), np.array(prefixes), np.array(y)

    # Pad every prefix in a single call, using the same padding='post' layout
    # that generate_caption feeds the model at inference time.
    X2 = pad_sequences(prefixes, maxlen=max_length, padding='post')
    return np.array(X1), X2, np.array(y)


# Build the training arrays: image features, padded caption prefixes, next-word targets.
X1, X2, y = prepare_data(
    image_features=image_features,
    captions=df,
    tokenizer=tokenizer,
    max_length=max_caption_length,
)

In [None]:
embedding_dim = 256
lstm_units = 256

# Image branch: 2048-wide feature vector (the width generate_caption reshapes
# to) projected down to embedding_dim through two stacked Dense layers.
image_input = Input(shape=(2048,), name="image_input")
image_hidden = Dense(embedding_dim*2, activation="relu")(image_input)
# Feed the first projection into the second. Previously the second Dense was
# applied to image_input directly, leaving the first layer disconnected from
# the model.
image_dense = Dense(embedding_dim, activation="relu")(image_hidden)


# Caption branch: padded word indices -> embeddings -> LSTM summary vector.
caption_input = Input(shape=(max_caption_length,), name="caption_input")
# mask_zero=True makes index 0 (the padding value) be ignored downstream.
caption_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True)(caption_input)
caption_lstm = LSTM(lstm_units, use_cudnn=False)(caption_embedding)


# Decoder: merge both branches and predict a distribution over the vocabulary.
combined = Concatenate()([image_dense, caption_lstm])
decoder_output = Dense(vocab_size, activation="softmax")(combined)

model = Model(inputs=[image_input, caption_input], outputs=decoder_output)
# Targets are integer word indices, hence the sparse categorical loss.
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
model.summary()

In [None]:
# Train on (image feature, caption prefix) -> next word samples.
model.fit(
    x=[X1, X2],
    y=y,
    batch_size=64,
    epochs=10,
    verbose=1,
)

# Persist the trained model for the inference cell.
model.save("lvlm_model.keras")

In [None]:
def generate_caption(image_feature, tokenizer, model, max_length):
    """Greedily decode a caption for one image feature vector.

    Decoding starts from the "<start>" index and repeatedly appends the
    highest-probability next word until the model predicts "<end>", predicts
    an index with no vocabulary entry, or `max_length` steps have run.

    Args:
        image_feature: 1-D numpy feature vector for the image.
        tokenizer: Fitted tokenizer providing `word_index` and `index_word`.
        model: Trained model taking [image batch, padded caption batch].
        max_length: Padded caption length the model expects; also the
            maximum number of decoding steps.

    Returns:
        The generated words joined by single spaces. The "<start>" marker
        that seeds decoding is a control token and is not part of the result.

    Raises:
        ValueError: If "<start>" or "<end>" is missing from the tokenizer.
    """
    if "<start>" not in tokenizer.word_index or "<end>" not in tokenizer.word_index:
        raise ValueError(
            "Error: '<start>' or '<end>' token missing from tokenizer!")

    caption = [tokenizer.word_index["<start>"]]
    # Loop-invariant: add the batch axis once; -1 avoids hard-coding the
    # feature width.
    image_batch = image_feature.reshape((1, -1))
    for _ in range(max_length):
        sequence = pad_sequences([caption], maxlen=max_length, padding='post')
        y_pred = model.predict([image_batch, sequence], verbose=0)
        # Plain int so the token ids stored in `caption` are ordinary Python ints.
        next_index = int(np.argmax(y_pred))
        word = tokenizer.index_word.get(next_index)
        if word == "<end>" or word is None:
            break
        caption.append(next_index)
    # caption[0] is the "<start>" seed; skip it so it does not leak into the text.
    return " ".join(tokenizer.index_word[i] for i in caption[1:] if i > 0)


# Reload the artifacts saved by the training and feature-extraction cells.
model = load_model("lvlm_model.keras")
# allow_pickle=True is needed because the file holds a pickled dict; only load
# files produced by this notebook, since unpickling untrusted data is unsafe.
image_features = np.load("image_features.npy", allow_pickle=True).item()

print("ResNet50 + LSTM")
# Holds the caption generated by this model, or None if the test image is missing.
Resnet50_LSTM = None
test_image_id = "7"
if test_image_id in image_features:
    test_image_feature = image_features[test_image_id]
    generated_caption = generate_caption(test_image_feature, tokenizer, model, max_caption_length)
    # Store the generated text; previously the generate_caption function
    # object itself was assigned here.
    Resnet50_LSTM = generated_caption
    print(f"Generated Caption for Image {test_image_id}: {generated_caption}")
else:
    print(f"Image ID {test_image_id} not found in features.")