In [1]:
# 📦 Imports
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from transformers import BlipProcessor, BlipForConditionalGeneration
import numpy as np
import pickle
from PIL import Image

# 📂 Paths
MODEL_PATH = r"C:\Users\sagni\Downloads\Image Captioner\image_caption_model.keras"
TOKENIZER_PATH = r"C:\Users\sagni\Downloads\Image Captioner\tokenizer.pkl"
IMAGE_PATH = r"C:\Users\sagni\Downloads\Image Captioner\WhatsApp Image 2025-07-21 at 14.39.35_c9b4dbd3.jpg"
MAX_LEN = 35  # Use your training max_len

# 🔄 Load your model and tokenizer
model = load_model(MODEL_PATH)
with open(TOKENIZER_PATH, 'rb') as f:
    tokenizer = pickle.load(f)
print("✅ CNN+LSTM model & tokenizer loaded.")

# 🌐 Load BLIP-2
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model_blip = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
print("✅ BLIP-2 model loaded.")

# 📸 Preprocess and extract features from image
def extract_features(image_path):
    inception_model = InceptionV3(weights='imagenet')
    inception_model = Model(inputs=inception_model.input, outputs=inception_model.layers[-2].output)
    image = load_img(image_path, target_size=(299, 299))
    image = img_to_array(image)
    image = np.expand_dims(image, axis=0)
    image = preprocess_input(image)
    feature = inception_model.predict(image, verbose=0)
    return feature

# 📝 Generate base caption using CNN+LSTM
def generate_base_caption(model, tokenizer, photo, max_len):
    in_text = 'startseq'
    for _ in range(max_len):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_len)
        yhat = model.predict([photo, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = tokenizer.index_word.get(yhat, None)
        if word is None:
            break
        in_text += ' ' + word
        if word == 'endseq':
            break
    return in_text.replace('startseq', '').replace('endseq', '').strip()

# 🌸 Refine caption using BLIP-2
def refine_caption_blip(image_path, base_caption):
    raw_image = Image.open(image_path).convert('RGB')
    prompt = f"Base caption: {base_caption}. Refine it to a more descriptive sentence."
    inputs = processor(raw_image, text=prompt, return_tensors="pt")
    output = model_blip.generate(**inputs, max_new_tokens=50)
    refined_caption = processor.decode(output[0], skip_special_tokens=True)
    return refined_caption

# 📸 Run the pipeline
photo_feature = extract_features(IMAGE_PATH)
base_caption = generate_base_caption(model, tokenizer, photo_feature, MAX_LEN)
refined_caption = refine_caption_blip(IMAGE_PATH, base_caption)

# 📢 Output
print("📝 Base Caption:", base_caption)
print("✨ Refined Caption:", refined_caption)


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


✅ CNN+LSTM model & tokenizer loaded.
✅ BLIP-2 model loaded.
📝 Base Caption: a man in a blue shirt is standing on a bench with a bottle of water
✨ Refined Caption: base caption : a man in a blue shirt is standing on a bench with a bottle of water. refine it to a more descriptive sentence.


In [2]:
# 📦 Imports
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from transformers import BlipProcessor, BlipForConditionalGeneration
import numpy as np
import pickle
from PIL import Image

# 📂 Paths
MODEL_PATH = r"C:\Users\sagni\Downloads\Image Captioner\image_caption_model.keras"
TOKENIZER_PATH = r"C:\Users\sagni\Downloads\Image Captioner\tokenizer.pkl"
IMAGE_PATH = r"C:\Users\sagni\Downloads\Image Captioner\WhatsApp Image 2025-07-21 at 14.39.35_c9b4dbd3.jpg"
MAX_LEN = 35  # Match your training max_len

# 🔄 Load your model and tokenizer
model = load_model(MODEL_PATH)
with open(TOKENIZER_PATH, 'rb') as f:
    tokenizer = pickle.load(f)
print("✅ CNN+LSTM model & tokenizer loaded.")

# 🌐 Load BLIP-2
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model_blip = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
print("✅ BLIP-2 model loaded.")

# 📸 Extract features from image
def extract_features(image_path):
    inception_model = InceptionV3(weights='imagenet')
    inception_model = Model(inputs=inception_model.input, outputs=inception_model.layers[-2].output)
    image = load_img(image_path, target_size=(299, 299))
    image = img_to_array(image)
    image = np.expand_dims(image, axis=0)
    image = preprocess_input(image)
    feature = inception_model.predict(image, verbose=0)
    return feature

# 📝 Generate base caption
def generate_base_caption(model, tokenizer, photo, max_len):
    in_text = 'startseq'
    for _ in range(max_len):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_len)
        yhat = model.predict([photo, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = tokenizer.index_word.get(yhat, None)
        if word is None:
            break
        in_text += ' ' + word
        if word == 'endseq':
            break
    return in_text.replace('startseq', '').replace('endseq', '').strip()

# 🌸 Refine caption with a poetic prompt
def refine_caption_poetic(image_path, base_caption):
    raw_image = Image.open(image_path).convert('RGB')
    prompt = (
        f"The image shows: {base_caption}. "
        "Rewrite this into a creative, accurate, and slightly poetic caption that brings the scene to life."
    )
    inputs = processor(raw_image, text=prompt, return_tensors="pt")
    output = model_blip.generate(**inputs, max_new_tokens=50)
    refined_caption = processor.decode(output[0], skip_special_tokens=True)
    return refined_caption

# 📸 Run the pipeline
photo_feature = extract_features(IMAGE_PATH)
base_caption = generate_base_caption(model, tokenizer, photo_feature, MAX_LEN)
poetic_caption = refine_caption_poetic(IMAGE_PATH, base_caption)

# 📢 Output
print("📝 Base Caption:", base_caption)
print("🌸 Poetic Refined Caption:", poetic_caption)


✅ CNN+LSTM model & tokenizer loaded.
✅ BLIP-2 model loaded.
📝 Base Caption: a man in a blue shirt is standing on a bench with a bottle of water
🌸 Poetic Refined Caption: the image shows : a man in a blue shirt is standing on a bench with a bottle of water. rewrite this into a creative, accurate, and slightly poetic caption that brings the scene to life.


In [3]:
# 📦 Imports
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from transformers import BlipProcessor, BlipForConditionalGeneration
import numpy as np
import pickle
from PIL import Image

# 📂 Paths
MODEL_PATH = r"C:\Users\sagni\Downloads\Image Captioner\image_caption_model.keras"
TOKENIZER_PATH = r"C:\Users\sagni\Downloads\Image Captioner\tokenizer.pkl"
IMAGE_PATH = r"C:\Users\sagni\Downloads\Image Captioner\WhatsApp Image 2025-07-21 at 14.39.35_c9b4dbd3.jpg"
MAX_LEN = 35  # Match your training max_len

# 🔄 Load your model and tokenizer
model = load_model(MODEL_PATH)
with open(TOKENIZER_PATH, 'rb') as f:
    tokenizer = pickle.load(f)
print("✅ CNN+LSTM model & tokenizer loaded.")

# 🌐 Load BLIP-2
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model_blip = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
print("✅ BLIP-2 model loaded.")

# 📸 Extract features from image
def extract_features(image_path):
    inception_model = InceptionV3(weights='imagenet')
    inception_model = Model(inputs=inception_model.input, outputs=inception_model.layers[-2].output)
    image = load_img(image_path, target_size=(299, 299))
    image = img_to_array(image)
    image = np.expand_dims(image, axis=0)
    image = preprocess_input(image)
    feature = inception_model.predict(image, verbose=0)
    return feature

# 📝 Generate base caption
def generate_base_caption(model, tokenizer, photo, max_len):
    in_text = 'startseq'
    for _ in range(max_len):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_len)
        yhat = model.predict([photo, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = tokenizer.index_word.get(yhat, None)
        if word is None:
            break
        in_text += ' ' + word
        if word == 'endseq':
            break
    return in_text.replace('startseq', '').replace('endseq', '').strip()

# 🌸 Refine caption with BLIP-2 using optional base hint
def refine_caption_blip(image_path, base_caption=None):
    raw_image = Image.open(image_path).convert('RGB')
    if base_caption:
        # 🎯 Blend base caption as a "hint" for BLIP-2
        prompt = (
            f"The base caption is: '{base_caption}'. "
            "Using this as a hint and the image itself, write a creative, accurate, and poetic description that vividly brings the scene to life."
        )
    else:
        # 🌱 Pure BLIP-2 without hint
        prompt = (
            "Describe this image creatively, accurately, and with a poetic touch. "
            "Focus on small details and emotions in the scene."
        )

    inputs = processor(raw_image, text=prompt, return_tensors="pt")
    output = model_blip.generate(**inputs, max_new_tokens=50)
    refined_caption = processor.decode(output[0], skip_special_tokens=True)
    return refined_caption

# 📸 Run the pipeline
photo_feature = extract_features(IMAGE_PATH)
base_caption = generate_base_caption(model, tokenizer, photo_feature, MAX_LEN)
poetic_caption = refine_caption_blip(IMAGE_PATH, base_caption)

# 📢 Output
print("📝 Base Caption:", base_caption)
print("🌸 Poetic Refined Caption:", poetic_caption)


✅ CNN+LSTM model & tokenizer loaded.
✅ BLIP-2 model loaded.
📝 Base Caption: a man in a blue shirt is standing on a bench with a bottle of water
🌸 Poetic Refined Caption: the base caption is : ' a man in a blue shirt is standing on a bench with a bottle of water '. using this as a hint and the image itself, write a creative, accurate, and poetic description that vividly brings the scene to life.
