In [1]:
from PIL import Image

In [2]:
from transformers import AutoProcessor, Blip2ForConditionalGeneration
import torch

# Both the preprocessing pipeline and the captioning model come from the same
# Hugging Face Hub checkpoint, so the identifier is spelled out only once.
BLIP2_CHECKPOINT = "Salesforce/blip2-opt-2.7b"

# The processor is what turns a PIL image into model-ready tensors and decodes
# generated token ids back into text further down in the notebook.
processor = AutoProcessor.from_pretrained(BLIP2_CHECKPOINT)

# The weights are requested in half precision (torch.float16).
model = Blip2ForConditionalGeneration.from_pretrained(
    BLIP2_CHECKPOINT,
    torch_dtype=torch.float16,
)

Downloading (…)rocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/904 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/6.96k [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/122k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/5.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Prefer the GPU whenever PyTorch can see one; otherwise stay on the CPU.
# NOTE(review): the model is loaded with torch_dtype=torch.float16 — confirm
# that half-precision inference is acceptable on the CPU fallback.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Left as the bare last expression on purpose: the cell echoes the model repr.
model.to(device)

Blip2ForConditionalGeneration(
  (vision_model): Blip2VisionModel(
    (embeddings): Blip2VisionEmbeddings(
      (patch_embedding): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
    )
    (encoder): Blip2Encoder(
      (layers): ModuleList(
        (0-38): 39 x Blip2EncoderLayer(
          (self_attn): Blip2Attention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=1408, out_features=4224, bias=True)
            (projection): Linear(in_features=1408, out_features=1408, bias=True)
          )
          (layer_norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
          (mlp): Blip2MLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=1408, out_features=6144, bias=True)
            (fc2): Linear(in_features=6144, out_features=1408, bias=True)
          )
          (layer_norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((

In [7]:
import os
from tqdm.auto import tqdm
# Every entry of the raw dataset directory is treated as one sample; the names
# are ordered lexicographically so the processing order is stable between runs.
filenames = os.listdir("datasets/scale1_2/unsplash2000_raw")
filenames.sort()

In [8]:
# Caption every sample with the model and collect the results keyed by the
# sample's directory name. Re-running the cell starts from an empty dict.
texts = {}
for filename in tqdm(filenames):
    # Image.open is lazy and keeps the underlying file open; the context manager
    # releases the handle deterministically once the processor has consumed the
    # pixels, instead of leaving one open file per sample to the garbage collector.
    with Image.open(f"datasets/scale1_2/unsplash2000_raw/{filename}/image.png") as image:
        # Inputs are moved to the model's device and cast to float16, the dtype
        # the model weights were loaded with.
        inputs = processor(image, return_tensors="pt").to(device, torch.float16)
    # No text prompt is passed, and at most 20 new tokens are generated per image.
    generated_ids = model.generate(**inputs, max_new_tokens=20)
    # One image per call, so only the first decoded sequence is kept.
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
    texts[filename] = generated_text


  0%|          | 0/2000 [00:00<?, ?it/s]

In [9]:
# Echo the sample-name -> caption dict as the cell output for a quick visual check.
texts

{'1.00000000': 'a black and white photo of people walking on the beach',
 '1.00019737': 'a black and orange axe sitting on top of a stump',
 '1.00021182': 'a plane flying through the clouds at sunset',
 '1.00102300': 'the karlskrona tower in black and white',
 '1.00173970': 'a mountain with a cloud cover over it',
 '1.00187476': 'a man is kneeling down in front of a waterfall',
 '1.00196995': 'a toy airplane on a desk',
 '1.00291991': 'a dark ocean with waves and water',
 '1.00324787': 'a waterfall in the background',
 '1.00586515': 'a man standing on a snowy mountain top',
 '1.00610399': 'a waterfall in the middle of a green valley',
 '1.00630171': 'top view of palm trees in the jungle',
 '1.00756587': 'flowers in the sun',
 '1.00764210': 'a dark road with mountains in the background',
 '1.00771052': 'a lone plant in the middle of a desert',
 '1.00774183': 'the milky way over the ocean at night',
 '1.00783084': 'an empty road in the middle of a forest',
 '1.00835092': 'the glacier in 

In [10]:
import json 
# Persist the sample-name -> caption mapping as pretty-printed JSON (4-space indent).
# NOTE(review): the output file is spelled "unplash2000" while the input
# directory is "unsplash2000_raw" — confirm which spelling downstream readers
# expect before renaming anything.
with open('datasets/scale1_2/unplash2000_blip2.json', 'w') as f:
    f.write(json.dumps(texts, indent=4))