In [4]:
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.image import load_img, img_to_array
import numpy as np
import pickle

# 📂 Locations of the saved captioning artifacts and of the image to describe
MODEL_PATH = r"C:\Users\sagni\Downloads\Image Captioner\image_caption_model.keras"
TOKENIZER_PATH = r"C:\Users\sagni\Downloads\Image Captioner\tokenizer.pkl"
IMAGE_PATH = r"C:\Users\sagni\Downloads\Image Captioner\WhatsApp Image 2025-07-21 at 14.39.35_c9b4dbd3.jpg"

# ✅ Length every partial caption is padded to before it is fed to the model
max_len = 35  # <-- must be the exact value used during training

# 🔄 Restore the trained Keras model first, then the pickled tokenizer
# NOTE(review): pickle.load can execute code embedded in the file — only
# point TOKENIZER_PATH at a tokenizer file you produced or trust.
model = load_model(MODEL_PATH)
with open(TOKENIZER_PATH, mode='rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)
print("✅ Model & tokenizer loaded.")

# 📸 Preprocess and extract features
def extract_features(image_path, encoder=None):
    """Encode one image file into an InceptionV3 feature array.

    Args:
        image_path: Path of the image file to read.
        encoder: Optional ready-made feature extractor used for the
            prediction. When omitted, an ImageNet-weighted InceptionV3 cut
            at its second-to-last layer is built on the first call and
            reused by every later call.

    Returns:
        The array returned by ``encoder.predict`` for a batch holding the
        single preprocessed image.
    """
    if encoder is None:
        # Building InceptionV3 loads the whole weight set, so do it once and
        # keep the truncated network on the function object for later calls.
        encoder = getattr(extract_features, "_cached_encoder", None)
        if encoder is None:
            backbone = InceptionV3(weights='imagenet')
            encoder = Model(inputs=backbone.input, outputs=backbone.layers[-2].output)
            extract_features._cached_encoder = encoder

    image = load_img(image_path, target_size=(299, 299))  # resized to 299x299 on load
    pixels = img_to_array(image)
    batch = np.expand_dims(pixels, axis=0)  # add a leading batch axis
    batch = preprocess_input(batch)
    return encoder.predict(batch, verbose=0)

# 📝 Generate caption (pads sequence to max_len)
def generate_caption(model, tokenizer, photo, max_len):
    """Greedily decode a caption for ``photo``, one word per model call.

    The running text starts as 'startseq'. On each of at most ``max_len``
    steps the text is converted to token ids, padded to ``max_len``, and
    passed to ``model.predict`` together with ``photo``; the arg-max index of
    the prediction is looked up in ``tokenizer.index_word`` and appended.
    Decoding stops when the index has no word or the word is 'endseq'.

    Args:
        model: Object whose ``predict([photo, sequence], verbose=0)`` scores
            the next word.
        tokenizer: Object providing ``texts_to_sequences`` and ``index_word``.
        photo: Image features handed to ``model.predict`` unchanged.
        max_len: Padding length and upper bound on decoding steps.

    Returns:
        The decoded text with 'startseq' and 'endseq' removed and the
        surrounding whitespace stripped.
    """
    caption = 'startseq'
    for _step in range(max_len):
        token_ids = tokenizer.texts_to_sequences([caption])[0]
        padded = pad_sequences([token_ids], maxlen=max_len)  # ✅ Pad to max_len
        scores = model.predict([photo, padded], verbose=0)
        next_word = tokenizer.index_word.get(np.argmax(scores), None)
        if next_word is None:
            # Predicted index is not in the vocabulary: stop decoding.
            break
        caption = caption + ' ' + next_word
        if next_word == 'endseq':
            break

    # Drop the start marker first, then the end marker.
    for marker in ('startseq', 'endseq'):
        caption = caption.replace(marker, '')
    return caption.strip()

# 📸 Encode the new image, then decode a caption from its features
photo_feature = extract_features(image_path=IMAGE_PATH)
caption = generate_caption(
    model=model,
    tokenizer=tokenizer,
    photo=photo_feature,
    max_len=max_len,
)
print(f"📢 Generated Caption: {caption}")


✅ Model & tokenizer loaded.
📢 Generated Caption: a man in a blue shirt is standing on a bench with a bottle of water


In [5]:
!pip install transformers pillow torch




In [None]:
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import requests
import torch

# 📂 Paths
IMAGE_PATH = r"C:\Users\sagni\Downloads\Image Captioner\WhatsApp Image 2025-07-21 at 14.39.35_c9b4dbd3.jpg"

# 🖼 Load image
# The context manager releases the file handle once the RGB copy exists.
with Image.open(IMAGE_PATH) as image_file:
    raw_image = image_file.convert('RGB')

# 🌐 Load the BLIP captioning checkpoint from HuggingFace
# (this checkpoint and the Blip* classes are BLIP, not BLIP-2)
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model_blip = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

# ✅ Caption with BLIP — only the image is passed in, so this caption is
# generated from the picture alone, with at most 50 new tokens.
inputs = processor(raw_image, return_tensors="pt")
out = model_blip.generate(**inputs, max_new_tokens=50)
caption_blip = processor.decode(out[0], skip_special_tokens=True)

print("✨ BLIP Refined Caption:", caption_blip)


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer_config.json:   0%|          | 0.00/527 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]