In [1]:
from transformers import Qwen2VLForConditionalGeneration
from transformers import AutoTokenizer
from transformers import AutoProcessor
from qwen_vl_utils import process_vision_info
import os

In [2]:
# Restrict this process to GPUs 0 and 1.
# The value is written as a plain comma-separated list with no whitespace,
# which is the documented format for CUDA_VISIBLE_DEVICES.
# NOTE(review): this only has an effect if it runs before CUDA is first
# initialised in this kernel — keep this cell ahead of any GPU work.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

In [3]:
# Location of the model checkpoint; the same value is passed to both the
# model loader and the processor loader later in the notebook.
model_dir = "Qwen2-VL-2B-Instruct"

In [4]:
# Load the Qwen2-VL model weights from `model_dir`.
# Both the dtype and the device placement are left to the library ("auto").
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_dir, 
    torch_dtype="auto", 
    device_map="auto"
)

`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Display the model's module tree (the bare last expression renders its repr).
model

Qwen2VLForConditionalGeneration(
  (visual): Qwen2VisionTransformerPretrainedModel(
    (patch_embed): PatchEmbed(
      (proj): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
    )
    (rotary_pos_emb): VisionRotaryEmbedding()
    (blocks): ModuleList(
      (0-31): 32 x Qwen2VLVisionBlock(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): VisionSdpaAttention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (mlp): VisionMlp(
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (act): QuickGELUActivation()
          (fc2): Linear(in_features=5120, out_features=1280, bias=True)
        )
      )
    )
    (merger): PatchMerger(
      (ln_q): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
      (mlp): Seq

In [6]:
# Count every parameter element in the vision tower (`model.visual`).
nums = sum(weight.numel() for weight in model.visual.parameters())

In [7]:
# Show the parameter count accumulated over `model.visual` in the previous cell.
nums

665271296

In [8]:
# Count every parameter element in the `model.model` submodule
# (presumably the language-model part, as opposed to `model.visual`)
# and display the total.
nums = sum(weight.numel() for weight in model.model.parameters())
nums

1543714304

In [9]:
# Load the default processor stored alongside the checkpoint in `model_dir`.
processor = AutoProcessor.from_pretrained(model_dir)

In [10]:
# Display the processor configuration (image-processor and tokenizer settings).
processor

Qwen2VLProcessor:
- image_processor: Qwen2VLImageProcessor {
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "Qwen2VLImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "max_pixels": 12845056,
  "merge_size": 2,
  "min_pixels": 3136,
  "patch_size": 14,
  "processor_class": "Qwen2VLProcessor",
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "max_pixels": 12845056,
    "min_pixels": 3136
  },
  "temporal_patch_size": 2
}

- tokenizer: Qwen2TokenizerFast(name_or_path='Qwen2-VL-2B-Instruct', vocab_size=151643, model_max_length=32768, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|bo

In [11]:
# Build the chat messages: a system prompt followed by a single user turn
# that pairs a text instruction with an image reference (relative path).
system_turn = {"role": "system", "content": "你是一个有用的助手！"}
user_content = [
    {"type": "text", "text": "请描述一下这张图像"},
    {"type": "image", "image": "ViT.png"},
]
user_turn = {"role": "user", "content": user_content}
messages = [system_turn, user_turn]

In [12]:
# Preparation for inference: render `messages` through the chat template.
# tokenize=False keeps the result as a plain string (printed in the next cell);
# add_generation_prompt=True ends the prompt with the assistant header so the
# model continues from there.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

In [13]:
# Print the rendered prompt to inspect the chat-template markup.
print(text)

<|im_start|>system
你是一个有用的助手！<|im_end|>
<|im_start|>user
请描述一下这张图像<|vision_start|><|image_pad|><|vision_end|><|im_end|>
<|im_start|>assistant



In [14]:
# Collect the image and video inputs referenced by the chat messages.
image_inputs, video_inputs = process_vision_info(messages)

In [15]:
# Inspect the extracted image inputs.
image_inputs

[<PIL.Image.Image image mode=RGB size=1428x728>]

In [16]:
# Turn the prompt string and the vision inputs into one batch of model inputs.
# The prompt is wrapped in a list (a batch of one), padding is enabled, and
# the result is requested as PyTorch tensors (return_tensors="pt").
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)

In [17]:
# List the fields produced by the processor.
inputs.keys()

dict_keys(['input_ids', 'attention_mask', 'pixel_values', 'image_grid_thw'])

In [18]:
# Total number of scalar values in the preprocessed pixel tensor.
inputs.pixel_values.numel()

6237504

In [19]:
# Per-image grid descriptor; presumably (temporal, height, width) counted in
# patches — TODO confirm against the processor documentation.
inputs.image_grid_thw

tensor([[  1,  52, 102]])

In [20]:
# Scratch arithmetic: 52 * 102 is the grid reported by `inputs.image_grid_thw`.
# NOTE(review): the origin of 98 * 146 is not shown anywhere in this notebook —
# name these numbers or remove the cell.
98 * 146 + 52 * 102

19612

In [21]:
# Move the batch to the device the model reports as its own instead of a
# hard-coded "cuda": the model was loaded with device_map="auto", so its
# placement is decided at load time and this keeps the inputs consistent with
# it (and avoids failing on a machine where no CUDA device is available).
inputs = inputs.to(model.device)

In [22]:
# Inference: generate a continuation of at most 512 new tokens for the
# prepared inputs.
generated_ids = model.generate(**inputs, max_new_tokens=512)

In [23]:
# Drop the leading prompt tokens from every generated sequence so that only
# the newly generated continuation is kept (the slice starts at the prompt
# length of the matching input row).
generated_ids_trimmed = []
for prompt_ids, full_ids in zip(inputs.input_ids, generated_ids):
    generated_ids_trimmed.append(full_ids[len(prompt_ids):])

# Decode the remaining token ids back to text, omitting special tokens and
# leaving tokenization spaces untouched.
output_text = processor.batch_decode(
    generated_ids_trimmed,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(output_text)

['这张图像展示了视觉Transformer（ViT）模型的结构。ViT是一种用于图像分类和视觉理解的神经网络模型，它结合了Transformer和卷积神经网络（CNN）的优点。\n\n### 左侧部分\n- **输入层**：图像作为输入，通过一个分类头（MLP Head）进行分类。\n- **Transformer Encoder**：ViT的核心部分，它使用Transformer架构对输入图像进行编码。\n- **Patch + Position Embedding**：图像被分割成多个矩形区域（patch）并附加位置编码，用于捕捉图像的局部特征。\n- **Linear Projection of Flattened Patches**：将每个patch的特征投影到一个共享的特征空间中。\n- **Transformer Encoder**：使用Transformer架构对投影后的特征进行编码，以捕捉图像的全局结构和特征。\n\n### 右侧部分\n- **Transformer Encoder**：与左侧部分相同，但使用了不同的结构和组件。\n- **Multi-Head Attention**：用于处理不同位置的特征之间的交互。\n- **Norm**：用于正则化和缩放输出。\n- **Embedded Patches**：将图像分割成嵌入的特征向量。\n\n### 结论\nViT通过将Transformer架构与CNN结合，能够有效地处理图像的局部和全局特征，从而在图像分类和视觉理解任务中表现出色。']
