In [1]:
from transformers import Blip2ForConditionalGeneration, AutoProcessor
import torch


# Prefer the GPU when torch can see one; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# The processor comes from the same checkpoint as the model weights.
processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")

# Load the BLIP-2 weights in half precision and place them on the chosen device.
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    torch_dtype=torch.float16,
).to(device)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
from datasets import Dataset
import requests
from PIL import Image

# Download one COCO validation image and build a single preference example
# (prompt + chosen/rejected answers + pre-computed model inputs).
url = "http://images.cocodataset.org/val2017/000000039769.jpg"

# Fail fast on network problems: without a timeout the request can block
# indefinitely, and without raise_for_status() an HTTP error body would be
# handed to PIL and surface as a confusing image-decoding error.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()

# convert("RGB") forces the pixel data to be read now and guarantees three
# channels regardless of how the source file is encoded.
image = Image.open(response.raw).convert("RGB")
prompt = "How many cats are there?"

# Keyword arguments make the image/text pairing independent of positional order.
encoding = processor(images=image, text=prompt, return_tensors="pt")

# The processor returns batched tensors; [0] drops the batch dimension so each
# record holds the tensors for exactly one example.
data_list = [
    {
        'images': image,
        'prompt': prompt,
        'chosen': "two",
        'rejected': 'three',
        'input_ids': encoding['input_ids'][0],
        'attention_mask': encoding['attention_mask'][0],
        'pixel_values': encoding['pixel_values'][0],
        # Same image tensor stored under a second key.
        'prompt_pixel_values': encoding['pixel_values'][0],
    }
]

def gen():
    """Yield the records of the module-level ``data_list`` one by one, in order."""
    yield from data_list

# Turn the generator into a Dataset, then echo the first row as a sanity check.
dataset = Dataset.from_generator(gen)
first_row = dataset[0]
for field in ("chosen", "rejected", "prompt", "input_ids"):
    print("dataset 0 " + field, first_row[field])

# Report the nested extents of the stored image data, outermost first.
pixels = first_row['pixel_values']
print("dataset 0 pixel_values len", len(pixels), len(pixels[0]), len(pixels[0][0]))

dataset 0 chosen two
dataset 0 rejected three
dataset 0 prompt How many cats are there?
dataset 0 input_ids [50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 2, 6179, 171, 10017, 32, 89, 116]
dataset 0 pixel_values len 3 224 224


In [3]:
from trl import DPOConfig, DPOTrainer
from peft import LoraConfig

# Train the model
# NOTE(review): the model is loaded with torch_dtype=torch.float16 while
# bf16=True is requested here — confirm which precision is intended.
training_args = DPOConfig(
    output_dir="sadmankiba/blip2-dpo",
    bf16=True,
    gradient_checkpointing=True,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    num_train_epochs=1,
    logging_steps=1,
    # Keep every dataset column when batches are built. The recorded run of
    # this cell failed with "forward() missing ... 'pixel_values'" even though
    # the dataset has a `pixel_values` column.
    # NOTE(review): the likely cause is Trainer's unused-column pruning dropping
    # that column before collation — confirm against the installed trl version.
    remove_unused_columns=False,
)

# With a PEFT config and ref_model=None no separate reference model is passed in.
trainer = DPOTrainer(
    model,
    ref_model=None,  # not needed when using peft
    args=training_args,
    train_dataset=dataset,
    tokenizer=processor,
    peft_config=LoraConfig(),
)

trainer.train()

is_vision_model True
tokenizing train dataset
is_vision_model True
train_dataset Dataset({
    features: ['images', 'prompt', 'chosen', 'rejected', 'input_ids', 'attention_mask', 'pixel_values', 'prompt_pixel_values', 'prompt_input_ids', 'chosen_input_ids', 'rejected_input_ids'],
    num_rows: 1
})
train_dataset [50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 2, 6179, 171, 10017, 32, 89, 116]


TypeError: Blip2ForConditionalGeneration.forward() missing 1 required positional argument: 'pixel_values'