# Image Captioning and Explanation Generation

This notebook shows how to generate a descriptive caption for an image using the BLIP model from Hugging Face Transformers, and how to turn that caption into a short explanation by attaching the user's query to it.

In [1]:
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import requests
import torch

### Load Image Utility

In [2]:
def load_image(image_path_or_url, timeout=10):
    """Load an image from a URL or a local file and return it converted to RGB.

    Parameters
    ----------
    image_path_or_url : str
        Either an ``http://`` / ``https://`` URL, or a local file path.
        Local paths always get ``'../data/'`` prepended, so they are
        interpreted relative to that directory.
    timeout : float, optional
        Seconds to wait on the HTTP request before giving up. Only used for
        URLs. Defaults to 10.

    Returns
    -------
    The result of ``Image.open(...).convert("RGB")`` for the requested image.

    Raises
    ------
    requests.HTTPError
        If the server responds with an error status (URL input only).
    """
    if image_path_or_url.startswith(("http://", "https://")):
        # Without a timeout a stalled server would block the kernel forever.
        with requests.get(image_path_or_url, stream=True, timeout=timeout) as response:
            # Fail loudly on 4xx/5xx instead of handing an error page to PIL.
            response.raise_for_status()
            with Image.open(response.raw) as remote_image:
                # convert() produces the RGB result before the stream is closed.
                return remote_image.convert("RGB")

    local_path = '../data/' + image_path_or_url
    # The context manager closes the underlying file once the RGB copy exists.
    with Image.open(local_path) as local_image:
        return local_image.convert("RGB")

### Load BLIP Model and Processor

In [3]:
# The processor and the model are loaded from the same pretrained checkpoint.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"

processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)

# Put the model in evaluation mode; as the cell's last expression, the call's
# return value (the model) is what the notebook displays below.
model.eval()

  return self.fget.__get__(instance, owner)()


BlipForConditionalGeneration(
  (vision_model): BlipVisionModel(
    (embeddings): BlipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (encoder): BlipEncoder(
      (layers): ModuleList(
        (0-11): 12 x BlipEncoderLayer(
          (self_attn): BlipAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=768, out_features=2304, bias=True)
            (projection): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): BlipMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((768,), eps=1e-0

### Generate Caption Function (sync)

In [4]:
def generate_caption(image_path_or_url, query=None):
    """Caption an image with the BLIP model, optionally tagging it with a query.

    Parameters
    ----------
    image_path_or_url : str
        Passed straight to ``load_image``.
    query : str, optional
        When truthy, its text is appended to the caption. It is not fed to the
        model, so the generated caption itself does not depend on it.

    Returns
    -------
    str
        The decoded caption, or ``"<caption>. Relevant to query: '<query>'"``
        when a query is given.
    """
    model_inputs = processor(load_image(image_path_or_url), return_tensors="pt")
    # Only the first generated sequence is decoded.
    first_sequence = model.generate(**model_inputs)[0]
    text = processor.decode(first_sequence, skip_special_tokens=True)
    return f"{text}. Relevant to query: '{query}'" if query else text

In [5]:
# load_image prepends '../data/' to local paths, so the path given here is
# relative to that directory and must not repeat the prefix.
# An http(s) image URL can be used instead.
image_path = "processed/10001.jpg"
query = "dog with hoodie"

caption = generate_caption(image_path)
explanation = generate_caption(image_path, query=query)

print("Caption:", caption)
print("Explanation:", explanation)



Caption: a dog wearing a yellow and black shirt
Explanation: a dog wearing a yellow and black shirt. Relevant to query: 'dog with hoodie'
