# BLIP: Inference Demo
 - [Image Captioning](#Image-Captioning)
 - [VQA](#VQA)
 - [Image Text Matching](#Image-Text-Matching)

In [2]:
from PIL import Image
import requests
import torch
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def load_demo_image(image_size, device,
                    img_url='https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'):
    """Download an image, show a small preview, and return it as a batched tensor.

    Args:
        image_size: Edge length in pixels; the image is resized to
            (image_size, image_size) with bicubic interpolation.
        device: Device the resulting tensor is moved to.
        img_url: URL of the image to download. Defaults to the BLIP demo image.

    Returns:
        The transformed image with a leading batch axis added by
        ``unsqueeze(0)``, placed on ``device``.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If the server does not answer within the timeout.
    """
    # Check the status and bound the wait so a failed download surfaces as an
    # HTTP error instead of PIL failing to parse an error page; the context
    # manager releases the streamed connection once the image is decoded.
    with requests.get(img_url, stream=True, timeout=30) as response:
        response.raise_for_status()
        raw_image = Image.open(response.raw).convert('RGB')

    w, h = raw_image.size
    # Preview at one fifth of the original size; clamp to 1 so images smaller
    # than 5 px per side do not produce an invalid zero-sized resize.
    display(raw_image.resize((max(1, w // 5), max(1, h // 5))))

    transform = transforms.Compose([
        transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
        transforms.ToTensor(),
        # Per-channel (mean, std) normalization constants.
        transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
        ])
    image = transform(raw_image).unsqueeze(0).to(device)
    return image

# Image Captioning
Perform image captioning using a fine-tuned BLIP model.

In [8]:
from huggingface_hub import login

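# Log in to the Hugging Face Hub here, e.g. by calling login() and entering your access token when prompted (never hardcode the token in the notebook).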

In [9]:
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import requests
import torch

from IPython.display import display

# Load the captioning processor and model.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Use a torch.device (not a bare string) so `device` has the same type as in
# the other cells of this notebook.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()  # explicit inference mode, matching the VQA / ITM cells

# Load the demo image. Check the HTTP status and bound the wait so a failed
# download raises a clear error instead of PIL failing on an error page; the
# context manager releases the streamed connection afterwards.
img_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg"
with requests.get(img_url, stream=True, timeout=30) as response:
    response.raise_for_status()
    raw_image = Image.open(response.raw).convert("RGB")

# Preprocess the image into model inputs on the target device.
inputs = processor(images=raw_image, return_tensors="pt").to(device)

# Generate the caption; no gradients are needed for inference.
with torch.no_grad():
    out = model.generate(**inputs)
caption = processor.decode(out[0], skip_special_tokens=True)

display(raw_image)
print("🖼️ Generated Caption:", caption)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

🖼️ Generated Caption: a woman sitting on the beach with her dog


In [11]:
# Show the image again. `raw_image` is created by the captioning cell above,
# so that cell must have been run first.
from IPython.display import display
display(raw_image)

Output hidden; open in https://colab.research.google.com to view.

# VQA
Perform visual question answering using a fine-tuned BLIP model.

In [12]:
from transformers import BlipProcessor, BlipForQuestionAnswering
from PIL import Image
import requests
import torch

from IPython.display import display  # do not rely on an earlier cell having imported it

# NOTE(review): `image_size` is not read by any code visible in this cell;
# it is kept only in case another cell depends on it.
image_size = 480

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the VQA processor and model, then switch to inference mode.
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
model.eval()
model = model.to(device)

# Load the demo image. Check the HTTP status and bound the wait so a failed
# download raises a clear error instead of PIL failing on an error page; the
# context manager releases the streamed connection afterwards.
img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
with requests.get(img_url, stream=True, timeout=30) as response:
    response.raise_for_status()
    raw_image = Image.open(response.raw).convert("RGB")

# Question
question = 'where is the woman sitting?'

display(raw_image)

# Answer the question; gradients are not needed for inference.
with torch.no_grad():
    inputs = processor(raw_image, question, return_tensors="pt").to(device)
    output = model.generate(**inputs)
    answer = processor.decode(output[0], skip_special_tokens=True)
    print('answer: ' + answer)

Output hidden; open in https://colab.research.google.com to view.

# Image Text Matching
Perform image-text matching using a fine-tuned BLIP model.

In [3]:
import torch
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForImageTextRetrieval


from IPython.display import display  # do not rely on an earlier cell having imported it

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Half precision on GPU, full precision on CPU.
dtype = torch.float16 if device.type == "cuda" else torch.float32

# Load the image-text matching processor and model.
processor = BlipProcessor.from_pretrained("Salesforce/blip-itm-base-coco")
model = BlipForImageTextRetrieval.from_pretrained(
    "Salesforce/blip-itm-base-coco", torch_dtype=dtype
).to(device)
model.eval()

# Load the demo image. Check the HTTP status and bound the wait so a failed
# download raises a clear error instead of PIL failing on an error page; the
# context manager releases the streamed connection afterwards.
img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
with requests.get(img_url, stream=True, timeout=30) as response:
    response.raise_for_status()
    image = Image.open(response.raw).convert('RGB')

# Text
caption = "A woman and a dog sitting together in a beach."
print(f"text: {caption}")

# Build the model inputs on the target device, in the model's dtype.
inputs = processor(image, caption, return_tensors="pt").to(device, dtype)
display(image)

# ITM score
with torch.no_grad():
    itm_logits = model(**inputs)[0]  # shape: [1, 2]
    # Upcast before the softmax: on CUDA the logits are float16, whose
    # precision is coarser than the four decimals printed below.
    itm_prob = torch.nn.functional.softmax(itm_logits.float(), dim=1)[:, 1]  # matched prob
    print("The image and text is matched with a probability of %.4f" % itm_prob.item())


# ITC score
with torch.no_grad():
    # With use_itm_head=False the first output is treated here as the
    # image-text cosine similarity; it must hold exactly one element for
    # .item() to succeed.
    cosine_sim = model(**inputs, use_itm_head=False)[0]
    print("The image and text cosine similarity is %.4f" % cosine_sim.item())

Output hidden; open in https://colab.research.google.com to view.