In [1]:
from transformers import AutoProcessor, AutoModelForVision2Seq
import torch
from PIL import Image

In [2]:
# Load the processor from the local checkpoint directory; later cells use it
# both to build the chat prompt and to turn text + image into model inputs.
processor = AutoProcessor.from_pretrained(
    pretrained_model_name_or_path='./SmolVLM-256M-Detection',
)

In [3]:
# Use the GPU when one is available and fall back to CPU otherwise, so the
# attention-backend switch below is meaningful and the notebook still loads
# on machines without CUDA.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load the model in bfloat16 from the same local checkpoint directory that the
# processor is loaded from, then move it to DEVICE.
model = AutoModelForVision2Seq.from_pretrained(
    "./SmolVLM-256M-Detection",
    torch_dtype=torch.bfloat16,
    # Flash Attention 2 is only requested on CUDA; eager attention is used otherwise.
    _attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager",
).to(DEVICE)

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


In [20]:
# Single-turn chat conversation: one user message made of an image slot
# followed by a text question. The image itself is supplied separately when
# the processor is called.
example = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            # Plain string literal: there is nothing to interpolate, so no f-prefix.
            {"type": "text", "text": "what do you see?"},
        ],
    },
]

In [21]:
# Read the test picture and normalise it to 3-channel RGB.
with Image.open("pets.jpg") as source_image:
    image = source_image.convert("RGB")

# Render the conversation into a prompt string (not token ids), ending with
# the generation prompt so the model continues as the assistant.
text = processor.apply_chat_template(
    example,
    tokenize=False,
    add_generation_prompt=True,
)
text

'<|im_start|>User:<image>what do you see?<end_of_utterance>\nAssistant:'

In [22]:
# Turn the prompt and image into PyTorch tensors.
# NOTE(review): `size={'longest_edge': 512}` presumably caps the resized
# image's longest side at 512 px — confirm against the image processor config.
inputs = processor(
    text=text,
    images=image,
    return_tensors='pt',
    size={'longest_edge': 512},
    padding=True,
)
inputs = inputs.to(DEVICE)

# Run generation with gradient tracking disabled (inference only).
with torch.no_grad():
    output_ids = model.generate(max_new_tokens=128, **inputs)

# Decode the first (only) sequence, keeping special tokens visible so the
# full prompt layout can be inspected alongside the model's continuation.
output_text = processor.tokenizer.decode(
    output_ids[0],
    skip_special_tokens=False,
)
print(output_text)

<|im_start|>User:<fake_token_around_image><global-img><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><fake_token_around_image>what do you see?<end_of_utterance>
Assistant:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
