In [110]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

# Default: load the model on the available device(s). dtype="auto" and
# device_map="auto" leave precision and placement to the library.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-3B-Instruct", dtype="auto", device_map="auto"
)

# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
# NOTE: this alternative additionally needs `import torch` in this cell.
# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
#     "Qwen/Qwen2.5-VL-3B-Instruct",
#     dtype=torch.bfloat16,
#     attn_implementation="flash_attention_2",
#     device_map="auto",
# )

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [111]:
# Default processor; max_pixels is lowered here to limit the image size fed to the model.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct", max_pixels=768 * 768)

# The default range for the number of visual tokens per image in the model is 4-16384.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# min_pixels = 256*28*28
# max_pixels = 1280*28*28
# processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)

# A single conversation: one user turn with an image followed by a text instruction.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

# Preparation for inference: render the chat template to a prompt string and
# collect the image/video inputs referenced by the messages.
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# Follow the device chosen by device_map="auto" instead of hard-coding "mps",
# so the cell also runs on hosts without an MPS backend.
inputs = inputs.to(model.device)

# Inference: generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=128)
# Drop the prompt tokens from each sequence so only the newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)


['The image depicts a serene beach scene at sunset. A person is sitting on the sandy shore, facing a large, light-colored dog that appears to be a Labrador Retriever. The dog is sitting on its hind legs and is giving the person a high-five with its front paws. The person is wearing a plaid shirt and black pants, and they are smiling warmly. The background shows the ocean with gentle waves lapping against the shore, and the sky is filled with warm hues of orange and yellow from the setting sun. The overall atmosphere is peaceful and joyful, capturing a moment of connection between the person and their dog.']


In [33]:
# Map every processor output to its tensor shape.
{field: tensor.shape for field, tensor in inputs.items()}

{'input_ids': torch.Size([1, 751]),
 'attention_mask': torch.Size([1, 751]),
 'pixel_values': torch.Size([2904, 1176]),
 'image_grid_thw': torch.Size([1, 3])}

In [34]:
# Count the image placeholder tokens in the first sequence. The id is looked up
# by token name rather than hard-coded as 151655, so the cell stays correct if
# the vocabulary changes.
image_pad_token_id = processor.tokenizer.convert_tokens_to_ids("<|image_pad|>")
(inputs["input_ids"][0] == image_pad_token_id).sum()

tensor(726, device='mps:0')

## Tokenizer

In [28]:
# Check whether the closing marker token already exists in the tokenizer vocabulary.
"</EMBED>" in processor.tokenizer.get_vocab()

False

In [37]:
# Inspect what process_vision_info returned: the image inputs and the video inputs.
image_inputs, video_inputs

([<PIL.Image.Image image mode=RGB size=2044x1372>], None)

In [35]:
# Display the prompt produced by the chat template.
text

'<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n<|im_start|>assistant\n'

In [38]:
# Register the opening/closing embedding markers as special tokens.
embed_marker_tokens = ["<EMBED>", "</EMBED>"]
processor.tokenizer.add_special_tokens({"additional_special_tokens": embed_marker_tokens})

# Make the embedding matrix size match the tokenizer length; the returned
# Embedding module is the cell output.
# NOTE(review): confirm whether this grows or shrinks the checkpoint's original matrix.
model.resize_token_embeddings(len(processor.tokenizer))

Embedding(151667, 2048)

In [39]:
# Resolve both marker ids in one tokenizer call; the trailing tuple is the cell output.
embed_start_token_id, embed_end_token_id = processor.tokenizer.convert_tokens_to_ids(
    ["<EMBED>", "</EMBED>"]
)
embed_start_token_id, embed_end_token_id

(151665, 151666)

In [47]:
# Display the current prompt(s).
# NOTE(review): the recorded output is a list without the "Describe this image."
# text, which matches the batched image-only `messages` cell further down rather
# than the single-conversation prompt built above — confirm the intended cell order.
text

['<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>\n<|im_start|>assistant\n']

In [51]:
# `text` is a plain string when the chat template was applied to a single
# conversation and a list when it was applied to a batch. Normalize to a list so
# a lone prompt is wrapped once instead of being iterated character by character.
prompts = [text] if isinstance(text, str) else list(text)

# Wrap each prompt in the <EMBED>...</EMBED> markers and tokenize it together
# with the vision inputs, then move the batch to the model's device.
inputs = processor(
    text=[f"<EMBED>{prompt}</EMBED>" for prompt in prompts],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
).to(model.device)

In [52]:
# Map every processor output to its tensor shape.
{field: tensor.shape for field, tensor in inputs.items()}

{'input_ids': torch.Size([1, 749]),
 'attention_mask': torch.Size([1, 749]),
 'pixel_values': torch.Size([2904, 1176]),
 'image_grid_thw': torch.Size([1, 3])}

In [53]:
# Inspect the token ids of the first sequence in the batch.
inputs["input_ids"][0]

tensor([151665, 151644,   8948,    198,   2610,    525,    264,  10950,  17847,
            13, 151645,    198, 151644,    872,    198, 151652, 151655, 151655,
        151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
        151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
        151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
        151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
        151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
        151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
        151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
        151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
        151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
        151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
        151655, 151655, 151655, 151655, 

In [None]:
import torch

# Single forward pass (no generation) under torch.inference_mode();
# output_hidden_states=True makes `out.hidden_states` available to later cells.
with torch.inference_mode():
    out = model(**inputs, output_hidden_states=True)

In [72]:
# Number of hidden-state tensors returned by the forward pass
# (presumably one per decoder layer plus the embedding output — confirm).
len(out.hidden_states)

37

In [75]:
# Select the final hidden state at every position holding the closing marker token.
is_embed_end = inputs["input_ids"] == embed_end_token_id
final_hidden = out.hidden_states[-1]
final_hidden[is_embed_end].shape

torch.Size([1, 2048])

In [62]:
# Shape of the last hidden-state tensor returned by the forward pass.
out.hidden_states[-1].shape

torch.Size([1, 749, 2048])

In [None]:
# One user turn containing only an image (no text instruction).
image_only_turn = {
    "role": "user",
    "content": [
        {
            "type": "image",
            "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
        },
    ],
}

# Batched form: a list of conversations, each a list of turns.
messages = [[image_only_turn]]

# Render the batch through the chat template and display the result.
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
text

['<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>\n<|im_start|>assistant\n']

In [83]:
import peft
from loguru import logger

In [94]:
# Build a DoRA-enabled LoRA configuration and wrap the model with it.
# get_peft_model() injects the adapter layers into `model` itself: the parameter
# listing of `model` recorded below already contains lora_A / lora_B entries.
lora_config = peft.LoraConfig(
    r=32,
    lora_alpha=8,
    lora_dropout=0.1,
    # NOTE(review): the parameter listings recorded in this notebook only show
    # adapters on visual.patch_embed.proj, attn.qkv and attn.proj — confirm that
    # "fc1", "fc2" and "linear" match any module of this model.
    target_modules=["qkv", "fc1", "fc2", "linear", "proj"],
    use_dora=True,
    init_lora_weights="gaussian",
)
lora_model = peft.get_peft_model(model, lora_config)



In [95]:
# Dump every parameter of the (in-place adapted) base model with its shape.
for param_name, tensor in model.named_parameters():
    print(param_name, tensor.shape)


model.visual.patch_embed.proj.base_layer.weight torch.Size([1280, 3, 2, 14, 14])
model.visual.patch_embed.proj.lora_A.default.weight torch.Size([32, 3, 2, 14, 14])
model.visual.patch_embed.proj.lora_B.default.weight torch.Size([1280, 32, 1, 1, 1])
model.visual.patch_embed.proj.lora_magnitude_vector.default.weight torch.Size([1, 1280, 1, 1, 1])
model.visual.blocks.0.norm1.weight torch.Size([1280])
model.visual.blocks.0.norm2.weight torch.Size([1280])
model.visual.blocks.0.attn.qkv.base_layer.weight torch.Size([3840, 1280])
model.visual.blocks.0.attn.qkv.base_layer.bias torch.Size([3840])
model.visual.blocks.0.attn.qkv.lora_A.default.weight torch.Size([32, 1280])
model.visual.blocks.0.attn.qkv.lora_B.default.weight torch.Size([3840, 32])
model.visual.blocks.0.attn.qkv.lora_magnitude_vector.default.weight torch.Size([3840])
model.visual.blocks.0.attn.proj.base_layer.weight torch.Size([1280, 1280])
model.visual.blocks.0.attn.proj.base_layer.bias torch.Size([1280])
model.visual.blocks.0.att

In [97]:
# Print only the adapter parameters (those whose name contains "lora").
adapter_params = (
    (param_name, tensor)
    for param_name, tensor in lora_model.named_parameters()
    if "lora" in param_name
)
for param_name, tensor in adapter_params:
    print(param_name, tensor.shape)


base_model.model.model.visual.patch_embed.proj.lora_A.default.weight torch.Size([32, 3, 2, 14, 14])
base_model.model.model.visual.patch_embed.proj.lora_B.default.weight torch.Size([1280, 32, 1, 1, 1])
base_model.model.model.visual.patch_embed.proj.lora_magnitude_vector.default.weight torch.Size([1, 1280, 1, 1, 1])
base_model.model.model.visual.blocks.0.attn.qkv.lora_A.default.weight torch.Size([32, 1280])
base_model.model.model.visual.blocks.0.attn.qkv.lora_B.default.weight torch.Size([3840, 32])
base_model.model.model.visual.blocks.0.attn.qkv.lora_magnitude_vector.default.weight torch.Size([3840])
base_model.model.model.visual.blocks.0.attn.proj.lora_A.default.weight torch.Size([32, 1280])
base_model.model.model.visual.blocks.0.attn.proj.lora_B.default.weight torch.Size([1280, 32])
base_model.model.model.visual.blocks.0.attn.proj.lora_magnitude_vector.default.weight torch.Size([1280])
base_model.model.model.visual.blocks.1.attn.qkv.lora_A.default.weight torch.Size([32, 1280])
base_mod

In [84]:
trainable_params, all_params = lora_model.get_nb_trainable_parameters()  # (trainable count, total count)
logger.info(f"Trainable portion: {trainable_params / all_params:.4f}, trainable params: {trainable_params}")  # fraction + absolute count


[32m2025-10-04 17:07:28.674[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mTrainable portion: 0.0014, trainable params: 5365760[0m


In [101]:
# Total number of parameter tensors in the wrapped model.
sum(1 for _ in lora_model.parameters())

1019

In [100]:
# Number of parameter tensors whose name contains "lora".
sum(1 for param_name, _ in lora_model.named_parameters() if "lora" in param_name)

195

In [105]:
# List name, shape and trainability of the token-embedding parameters.
[
    (param_name, tensor.shape, tensor.requires_grad)
    for param_name, tensor in lora_model.named_parameters()
    if "embed_tokens" in param_name
]

[('base_model.model.model.language_model.embed_tokens.weight',
  torch.Size([151667, 2048]),
  False)]

In [106]:
# The earlier get_peft_model() call injected adapter layers into `model` in
# place. Wrapping that already-adapted model again makes PEFT warn and is not a
# clean way to apply a new config, so first strip the previous adapters and get
# the plain base model back.
model = lora_model.unload()

# Same DoRA/LoRA settings as before, plus fully trainable copies of the token
# embedding and output head (needed so the newly added marker tokens can learn).
lora_config = peft.LoraConfig(
    r=32,
    lora_alpha=8,
    lora_dropout=0.1,
    target_modules=["qkv", "fc1", "fc2", "linear", "proj"],
    use_dora=True,
    init_lora_weights="gaussian",
    modules_to_save=["embed_tokens", "lm_head"],
)
lora_model = peft.get_peft_model(model, lora_config)



In [109]:
# List name, shape and trainability of the output-head parameters.
[
    (param_name, tensor.shape, tensor.requires_grad)
    for param_name, tensor in lora_model.named_parameters()
    if "lm_head" in param_name
]

[('base_model.model.lm_head.modules_to_save.default.weight',
  torch.Size([151667, 2048]),
  True)]

In [108]:
# Same inspection on the base `model` object: list the token-embedding
# parameters with their shape and trainability.
[
    (param_name, tensor.shape, tensor.requires_grad)
    for param_name, tensor in model.named_parameters()
    if "embed_tokens" in param_name
]


[('model.language_model.embed_tokens.original_module.weight',
  torch.Size([151667, 2048]),
  False),
 ('model.language_model.embed_tokens.modules_to_save.default.weight',
  torch.Size([151667, 2048]),
  True)]