In [None]:
!pip install -q git+https://github.com/huggingface/transformers.git accelerate bitsandbytes

Import the InstructBLIP processor and model, then load the `Salesforce/instructblip-vicuna-7b` checkpoint with 4-bit quantization.

In [None]:
from transformers import BitsAndBytesConfig, InstructBlipProcessor, InstructBlipForConditionalGeneration
import torch

# One checkpoint id shared by the processor and the model so the two cannot drift apart.
MODEL_ID = "Salesforce/instructblip-vicuna-7b"

# processor
processor = InstructBlipProcessor.from_pretrained(MODEL_ID)

# model
# 4-bit quantization is requested through an explicit BitsAndBytesConfig. Passing the
# bare `load_in_4bit=True` keyword to from_pretrained is deprecated in recent
# transformers releases, and this notebook installs transformers from the main branch.
model = InstructBlipForConditionalGeneration.from_pretrained(
    MODEL_ID,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    torch_dtype=torch.float16,
)

Load and display one sample image from the dataset as a sanity check.

In [None]:
from PIL import Image
import requests

# Sample taken from the local Kaggle dataset. A remote alternative is the LAVIS demo picture:
# https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg
sample_path = "/kaggle/input/image-dataset/image-20.jpg"

# Open the file, then normalise it to 3-channel RGB.
image = Image.open(sample_path)
image = image.convert("RGB")

# Bare last expression so the notebook renders the picture.
image

Generate a description for each of the 50 dataset images with InstructBLIP (three prompts per image) and write the results to `image_desc.csv`.

In [None]:
import csv

# Prompts for the model. Each image is queried once per prompt and the answers are
# joined into a single description. (Spelling of "description" fixed: the prompt text
# is fed to the model verbatim.)
prompts = ["create the description of main figure in the image",
           "create the description of everything except the main figure",
           "create the perfect description of the scene"]

csv_file_path = 'image_desc.csv'

# Dataset layout: files are named image-<idx>.jpg for idx in FIRST..LAST (inclusive).
IMAGE_PATH_TEMPLATE = "/kaggle/input/image-dataset/image-{idx}.jpg"
FIRST_IMAGE_INDEX = 1
LAST_IMAGE_INDEX = 50

# Upper bound on generated tokens per prompt. Without an explicit bound, generation
# falls back to the library/checkpoint default cap, which may be lower than
# min_length=50 and would then cut the text short.
MAX_NEW_TOKENS = 256

# Generation samples (do_sample=True), so seed the RNG to make re-runs repeatable
# (as far as the underlying GPU kernels are deterministic).
SEED = 42
torch.manual_seed(SEED)


def describe_image(image, prompts):
    """Run every prompt against one image and return the answers joined by spaces.

    Args:
        image: The image handed to the processor (the loop below passes an RGB PIL image).
        prompts: Iterable of instruction strings, one generation pass per prompt.

    Returns:
        A single string: the stripped decoded text of each prompt, in prompt order,
        separated by one space (no trailing space).
    """
    answers = []
    for prompt in prompts:
        # Put the inputs on whatever device the model lives on instead of assuming "cuda";
        # the dtype cast matches the float16 dtype the model was loaded with.
        inputs = processor(images=image, text=prompt, return_tensors="pt").to(
            device=model.device, dtype=torch.float16
        )
        outputs = model.generate(
            **inputs,
            do_sample=True,
            max_new_tokens=MAX_NEW_TOKENS,
            min_length=50,
            repetition_penalty=1.5,
            length_penalty=1.0,
            temperature=1,
        )
        # One image per call, so only the first decoded sequence is relevant.
        decoded = processor.batch_decode(outputs, skip_special_tokens=True)
        answers.append(decoded[0].strip())
    return " ".join(answers)


# Write one CSV row per image. Explicit UTF-8 so generated text with non-ASCII
# characters is written the same way regardless of the platform default encoding.
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['index', 'Description'])

    # iterate over the images
    for idx in range(FIRST_IMAGE_INDEX, LAST_IMAGE_INDEX + 1):
        # The context manager releases the file handle as soon as the RGB copy exists.
        with Image.open(IMAGE_PATH_TEMPLATE.format(idx=idx)) as raw_image:
            image = raw_image.convert("RGB")

        desc = describe_image(image, prompts)
        writer.writerow([idx, desc])
        # Flush so rows already produced survive an interruption of this long-running loop.
        file.flush()

        # Release cached GPU blocks between images to keep memory usage flat.
        torch.cuda.empty_cache()