In [3]:
# 📦 Image Caption Generator - Fixed with tf.data.Dataset
import os, string, numpy as np, tensorflow as tf
from tqdm import tqdm
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add
from tensorflow.keras.preprocessing.image import load_img, img_to_array
import pickle

# 📂 Paths
CAPTIONS_PATH = r"C:\Users\sagni\Downloads\Image Captioner\archive (1)\captions.txt"
IMAGES_PATH = r"C:\Users\sagni\Downloads\Image Captioner\archive (1)\Images"
SAVE_DIR = r"C:\Users\sagni\Downloads\Image Captioner"

# 📖 Load captions
def load_captions(filename):
    captions = {}
    with open(filename, 'r') as file:
        next(file)
        for line in file:
            tokens = line.strip().split(',')
            img_id, caption = tokens[0].split('.')[0], tokens[1]
            caption = caption.lower().translate(str.maketrans('', '', string.punctuation))
            captions.setdefault(img_id, []).append('startseq ' + caption + ' endseq')
    return captions
captions = load_captions(CAPTIONS_PATH)
print(f"✅ Loaded {len(captions)} image captions.")

# 🖼 Feature extraction
def extract_features(directory):
    model = InceptionV3(weights='imagenet')
    model = Model(inputs=model.input, outputs=model.layers[-2].output)
    features = {}
    for img_name in tqdm(os.listdir(directory), desc="Extracting Features"):
        img_path = os.path.join(directory, img_name)
        image = load_img(img_path, target_size=(299, 299))
        image = img_to_array(image)
        image = np.expand_dims(image, axis=0)
        image = preprocess_input(image)
        feature = model.predict(image, verbose=0)
        img_id = img_name.split('.')[0]
        features[img_id] = feature
    return features
features = extract_features(IMAGES_PATH)
print(f"✅ Extracted features for {len(features)} images.")

# 🔤 Tokenizer
all_captions = sum(captions.values(), [])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1
max_len = max(len(c.split()) for c in all_captions)
print(f"🔤 Vocab size: {vocab_size}, Max length: {max_len}")

# 🏗 Model
def define_model(vocab_size, max_len):
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    inputs2 = Input(shape=(max_len,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)
    return Model(inputs=[inputs1, inputs2], outputs=outputs)
model = define_model(vocab_size, max_len)
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.summary()

# 🚀 Data generator
def create_sequences(desc_list, photo):
    for desc in desc_list:
        seq = tokenizer.texts_to_sequences([desc])[0]
        for i in range(1, len(seq)):
            in_seq, out_seq = seq[:i], seq[i]
            in_seq = pad_sequences([in_seq], maxlen=max_len)[0]
            out_seq = tf.one_hot(out_seq, vocab_size)
            yield (photo[0], in_seq), out_seq

def full_generator():
    for img_id, desc_list in captions.items():
        photo = features[img_id]
        yield from create_sequences(desc_list, photo)

dataset = tf.data.Dataset.from_generator(
    full_generator,
    output_signature=(
        (tf.TensorSpec(shape=(2048,), dtype=tf.float32),
         tf.TensorSpec(shape=(max_len,), dtype=tf.int32)),
        tf.TensorSpec(shape=(vocab_size,), dtype=tf.float32)
    )
).batch(64).prefetch(tf.data.AUTOTUNE)

# 🏃‍♂️ Train
EPOCHS = 5  # Increase for real training
model.fit(dataset, epochs=EPOCHS)

# 💾 Save model & tokenizer
model.save(os.path.join(SAVE_DIR, "image_caption_model.h5"))
with open(os.path.join(SAVE_DIR, "tokenizer.pkl"), 'wb') as f:
    pickle.dump(tokenizer, f)
print(f"✅ Saved model & tokenizer to {SAVE_DIR}")

# 📝 Generate caption
def generate_caption(model, tokenizer, photo, max_len):
    in_text = 'startseq'
    for _ in range(max_len):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_len)
        yhat = model.predict([photo, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = tokenizer.index_word.get(yhat, None)
        if word is None or word == 'endseq':
            break
        in_text += ' ' + word
    return in_text.replace('startseq', '').replace('endseq', '').strip()

# 🖼 Test
test_img_id = list(features.keys())[0]
print("📢 Generated Caption:", generate_caption(model, tokenizer, features[test_img_id], max_len))


✅ Loaded 8091 image captions.


Extracting Features: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 8091/8091 [18:49<00:00,  7.16it/s]


✅ Extracted features for 8091 images.
🔤 Vocab size: 8633, Max length: 35


Epoch 1/5
[1m7179/7179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m621s[0m 86ms/step - loss: 4.2808
Epoch 2/5
[1m   1/7179[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m17:05[0m 143ms/step - loss: 3.1365



[1m7179/7179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m609s[0m 85ms/step - loss: 3.2188
Epoch 3/5
[1m7179/7179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m619s[0m 86ms/step - loss: 2.9773
Epoch 4/5
[1m7179/7179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m630s[0m 88ms/step - loss: 2.8390
Epoch 5/5
[1m7179/7179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m633s[0m 88ms/step - loss: 2.7515




✅ Saved model & tokenizer to C:\Users\sagni\Downloads\Image Captioner
📢 Generated Caption: a man in a red shirt is sitting on a bench with a red umbrella
