In [1]:
from transformers import AutoProcessor, AutoModelForVision2Seq
import torch
from peft import PeftModel
from PIL import Image

In [2]:
processor = AutoProcessor.from_pretrained('./SmolVLM-256M-Detection')

In [3]:
model = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceTB/SmolVLM-256M-Instruct",
    torch_dtype=torch.bfloat16,
    _attn_implementation="flash_attention_2"
).cuda()
model.resize_token_embeddings(len(processor.tokenizer))

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(50305, 576, padding_idx=2)

In [4]:
peft_model = PeftModel.from_pretrained(model, './SmolVLM-256M-Detection/checkpoint-8000')

In [5]:
peft_model.device

device(type='cuda', index=0)

In [12]:
example = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": f"<detect> person"}
        ]
    },
]

In [13]:
image = Image.open("pets.jpg").convert("RGB")
text = processor.apply_chat_template(example,tokenize=False,add_generation_prompt=True)
text

'<|im_start|>User:<image><detect> person<end_of_utterance>\nAssistant:'

In [14]:
inputs = processor(
        text=text, 
        images=image, 
        return_tensors='pt',
        size={'longest_edge': 512},
    ).to('cuda')

# Generate output
with torch.no_grad():
    output_ids = peft_model.generate(**inputs, max_new_tokens=128, do_sample=True)

print(processor.tokenizer.decode(output_ids.cpu().flatten().numpy(), skip_special_tokens=False))

<|im_start|>User:<fake_token_around_image><global-img><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><image><fake_token_around_image><detect> person<end_of_utterance>
Assistant:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::


In [9]:
print()


