From 58f156a7552c0f128ae6fc21860b6fb33351e229 Mon Sep 17 00:00:00 2001 From: Dipankar Sarkar Date: Wed, 10 Sep 2025 09:17:26 +0000 Subject: [PATCH] Onboarding Qwen2_5_vl Signed-off-by: Dipankar Sarkar --- .../transformers/models/pytorch_transforms.py | 8 + .../models/qwen_2_5_vl/__init__.py | 6 + .../models/qwen_2_5_vl/modeling_qwen2_5_vl.py | 138 ++++++++++++++++++ .../qwenvl_example/qwen2_5_vl_inference.py | 107 ++++++++++++++ examples/qwenvl_example/test_hf.py | 59 ++++++++ 5 files changed, 318 insertions(+) create mode 100644 QEfficient/transformers/models/qwen_2_5_vl/__init__.py create mode 100644 QEfficient/transformers/models/qwen_2_5_vl/modeling_qwen2_5_vl.py create mode 100644 examples/qwenvl_example/qwen2_5_vl_inference.py create mode 100644 examples/qwenvl_example/test_hf.py diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py index ca74c0ddd..362febec5 100644 --- a/QEfficient/transformers/models/pytorch_transforms.py +++ b/QEfficient/transformers/models/pytorch_transforms.py @@ -136,6 +136,9 @@ Qwen2Model, Qwen2RMSNorm, ) +from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import ( + Qwen2_5_VLForConditionalGeneration, +) from transformers.models.starcoder2.modeling_starcoder2 import ( Starcoder2Attention, Starcoder2DecoderLayer, @@ -303,6 +306,9 @@ QEffQwen2ForCausalLM, QEffQwen2Model, ) +from QEfficient.transformers.models.qwen_2_5_vl.modeling_qwen2_5_vl import ( + QEffQwen_2_5_vl_ForConditionalGeneration, +) from QEfficient.transformers.models.starcoder2.modeling_starcoder2 import ( QEffStarcoder2Attention, QEFFStarcoder2DecoderLayer, @@ -383,6 +389,8 @@ class KVCacheTransform(ModuleMappingTransform): LlavaForConditionalGeneration: QEffLlavaForConditionalGeneration, # Llava Next LlavaNextForConditionalGeneration: QEffLlavaNextForConditionalGeneration, + # Qwen2.5 VL + Qwen2_5_VLForConditionalGeneration: QEffQwen_2_5_vl_ForConditionalGeneration, # Gemma GemmaAttention: QEffGemmaAttention, GemmaDecoderLayer: QEffGemmaDecoderLayer, diff --git a/QEfficient/transformers/models/qwen_2_5_vl/__init__.py b/QEfficient/transformers/models/qwen_2_5_vl/__init__.py new file mode 100644 index 000000000..d647b73a6 --- /dev/null +++ b/QEfficient/transformers/models/qwen_2_5_vl/__init__.py @@ -0,0 +1,6 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
diff --git a/QEfficient/transformers/models/qwen_2_5_vl/modeling_qwen2_5_vl.py b/QEfficient/transformers/models/qwen_2_5_vl/modeling_qwen2_5_vl.py
new file mode 100644
index 000000000..8cc6f74ac
--- /dev/null
+++ b/QEfficient/transformers/models/qwen_2_5_vl/modeling_qwen2_5_vl.py
@@ -0,0 +1,138 @@
+import torch
+import torch.nn as nn
+from transformers import Qwen2_5_VLForConditionalGeneration
+
+
+class QEffQwen_2_5_vl_EncoderWrapper(nn.Module):
+    """Wraps the Qwen2.5-VL vision tower so it can be exported as a standalone vision QPC."""
+
+    def __init__(self, model):
+        super().__init__()
+        self.model = model
+        # Expose the vision tower under the common `vision_model` attribute name.
+        self.model.vision_model = self.model.visual
+
+    def forward(self, pixel_values, image_grid_thw):
+        pixel_values = pixel_values.type(self.model.visual.dtype)
+        image_embeds = self.model.visual(pixel_values, grid_thw=image_grid_thw)
+        return image_embeds
+
+
+class QEffQwen_2_5_vl_DecoderWrapper(nn.Module):
+    """Wraps the Qwen2.5-VL language model for the dual-QPC (kv_offload) path."""
+
+    def __init__(self, model):
+        super().__init__()
+        self.model = model
+        self.config = self.model.config
+        self.language_model = self.model.model
+
+    def forward(self, input_ids, vision_embeds, position_ids, image_idx, past_key_values):
+        # TODO: scatter `vision_embeds` into the input embeddings at the image-token
+        # positions (tracked via `image_idx`), run the language model with the retained
+        # KV cache, and return the logits together with the updated cache.
+        raise NotImplementedError("Qwen2.5-VL language-decoder forward is not implemented yet.")
+
+
+class QEffQwen_2_5_vl_ForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
+    def get_qeff_vision_encoder(self):
+        return QEffQwen_2_5_vl_EncoderWrapper(self)
+
+    def get_qeff_language_decoder(self):
+        return QEffQwen_2_5_vl_DecoderWrapper(self)
+
+    def get_dummy_inputs(self, kv_offload: bool = False, **kwargs):
+        num_layers = self.config.num_hidden_layers
+        num_key_value_heads = self.config.num_key_value_heads
+        head_dim = self.config.hidden_size // self.config.num_attention_heads
+        # Dummy context length; keep in sync with the KV-cache length used below.
+        ctx_len = 6000
+
+        # 11016 patches = 1 (t) * 108 (h) * 102 (w); 1176 = channels * temporal_patch_size * patch_size**2.
+        vision_inputs = {
+            "pixel_values": torch.zeros(
+                (11016, 1176),
+                dtype=torch.float32,
+            ),
+            "image_grid_thw": torch.tensor([[1, 108, 102]]),
+        }
+
+        lang_inputs = {
+            "input_ids": torch.ones((1, 2779), dtype=torch.int64),
+            "attention_mask": torch.ones((1, 2779), dtype=torch.int64),
+            "vision_embeds": torch.ones(
+                (11016, 1176),
+                dtype=torch.float32,
+            ),
+            "image_idx": torch.zeros((1, 1), dtype=torch.int64),
+        }
+        lang_inputs["position_ids"] = lang_inputs.pop("attention_mask").cumsum(1)
+        lang_inputs["past_key_values"] = []
+        for i in range(num_layers):
+            lang_inputs["past_key_values"].append(
+                (
+                    torch.zeros(
+                        1,
+                        num_key_value_heads,
+                        ctx_len,
+                        head_dim,
+                    ),
+                    torch.zeros(
+                        1,
+                        num_key_value_heads,
+                        ctx_len,
+                        head_dim,
+                    ),
+                )
+            )
+
+        lang_inputs["position_ids"] = torch.full(lang_inputs["position_ids"].shape, ctx_len - 1)
+        inputs = {}
+        if kv_offload:
+            inputs["vision"] = vision_inputs
+        inputs["lang"] = lang_inputs
+        return inputs
+
+    def get_specializations(
+        self,
+        batch_size: int,
+        prefill_seq_len: int,
+        ctx_len: int,
+        img_size: int,
+        kv_offload: bool = False,
+        **compiler_options,
+    ):
+        # TODO: return the prefill/decode specializations for the vision and language QPCs.
+        raise NotImplementedError("Specializations for Qwen2.5-VL are not defined yet.")
+
+    def get_onnx_dynamic_axes(self, kv_offload: bool = False):
+        # Define dynamic axes for ONNX export.
+        num_layers = self.config.num_hidden_layers
+        vision_dynamic_axes = {
+            # pixel_values: (num_patches, channels * temporal_patch_size * patch_size**2)
+            "pixel_values": {0: "num_patches"},
+            # image_grid_thw: (num_images, 3); the trailing axis is fixed
+            "image_grid_thw": {0: "batch_size"},
+        }
+        lang_dynamic_axes = {
+            "input_ids": {0: "batch_size", 1: "seq_len"},
+            "position_ids": {0: "batch_size", 1: "seq_len", 2: "seq_len"},
+            "vision_embeds": {0: "batch_size", 1: "vision_size"},
+        }
+        for i in range(num_layers):
+            lang_dynamic_axes[f"past_key.{i}"] = {0: "batch_size", 2: "ctx_len"}
+            lang_dynamic_axes[f"past_value.{i}"] = {0: "batch_size", 2: "ctx_len"}
+        dynamic_axes = {}
+        if kv_offload:
+            dynamic_axes["vision"] = vision_dynamic_axes
+        dynamic_axes["lang"] = lang_dynamic_axes
+        return dynamic_axes
+
+    def get_inputs_info(self):
+        # TODO: describe the expected runtime inputs (names, dtypes, shapes).
+        raise NotImplementedError("Input info for Qwen2.5-VL is not defined yet.")
+
+    def get_output_names(self, kv_offload: bool = False):
+        vision_output_names = ["vision_embeds"]
+        lang_output_names = ["logits"]
+        for i in range(self.config.num_hidden_layers):
+            for kv in ["key", "value"]:
+                lang_output_names.append(f"past_{kv}.{i}_RetainedState")
+
+        output_names = {}
+        if kv_offload:
+            lang_output_names.insert(1, "vision_embeds_RetainedState")
+            lang_output_names.insert(2, "image_idx_output")
+            output_names["vision"] = vision_output_names
+        output_names["lang"] = lang_output_names
+
+        return output_names
diff --git a/examples/qwenvl_example/qwen2_5_vl_inference.py b/examples/qwenvl_example/qwen2_5_vl_inference.py
new file mode 100644
index 000000000..36e38342a
--- /dev/null
+++ b/examples/qwenvl_example/qwen2_5_vl_inference.py
@@ -0,0 +1,107 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+import requests
+from PIL import Image
+from transformers import AutoProcessor, TextStreamer
+
+from QEfficient import QEFFAutoModelForImageTextToText
+
+# Add your HuggingFace token here to access the model
+HF_TOKEN = ""
+
+
+def run_model(
+    model_name,
+    token,
+    query,
+    image_url,
+    kv_offload=False,
+    prefill_seq_len=5500,
+    ctx_len=6000,
+    generation_len=128,
+    img_size=384,
+    num_cores=16,
+    num_devices=1,
+):
+    ## STEP - 1 Load the Processor and Model
+
+    processor = AutoProcessor.from_pretrained(model_name, token=token)
+
+    # `kv_offload` compiles the model into two QPCs (dual-QPC mode). Single-QPC mode
+    # (kv_offload=False) is not supported yet, so the flag must be set to True.
+    # The dual-QPC approach splits the model so that image encoding and output generation
+    # run in separate QPCs; the vision-encoder outputs are passed to the language model
+    # through the host.
+
+    model = QEFFAutoModelForImageTextToText.from_pretrained(model_name, token=token, kv_offload=kv_offload)
+
+    ## STEP - 2 Export & Compile the Model
+
+    model.compile(
+        prefill_seq_len=prefill_seq_len,
+        ctx_len=ctx_len,
+        img_size=img_size,
+        num_cores=num_cores,
+        num_devices=num_devices,
+        mxfp6_matmul=False,
+    )
+
+    ## STEP - 3 Load and process the inputs for Inference
+
+    image = Image.open(requests.get(image_url, stream=True).raw)
+    messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": query}]}]
+    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
+    inputs = processor(image, input_text, add_special_tokens=False, return_tensors="pt")
+
+    ## STEP - 4 Run Inference on the compiled model
+
+    streamer = TextStreamer(processor.tokenizer)
+    output = model.generate(inputs=inputs, streamer=streamer, generation_len=generation_len)
+    print(output)
+
+
+if __name__ == "__main__":
+    # Model name and Input parameters
+    model_name = "Qwen/Qwen2.5-VL-32B-Instruct"
+
+    # Please add your prompt here
+    query = "Describe the image"
+
+    # Please pass an image URL. The image should be in JPG format.
+    image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+
+    # Compilation parameters for the model
+    kv_offload = True
+    prefill_seq_len = 5500
+    ctx_len = 6000
+    generation_len = 128
+    img_size = 384
+    num_cores = 16
+    num_devices = 4
+
+    run_model(
+        model_name=model_name,
+        token=HF_TOKEN,
+        query=query,
+        kv_offload=kv_offload,
+        image_url=image_url,
+        prefill_seq_len=prefill_seq_len,
+        ctx_len=ctx_len,
+        generation_len=generation_len,
+        img_size=img_size,
+        num_cores=num_cores,
+        num_devices=num_devices,
+    )
+
+
+"""
+Expected Response:
+
+
+
+"""
diff --git a/examples/qwenvl_example/test_hf.py b/examples/qwenvl_example/test_hf.py
new file mode 100644
index 000000000..79f919a68
--- /dev/null
+++ b/examples/qwenvl_example/test_hf.py
@@ -0,0 +1,59 @@
+from qwen_vl_utils import process_vision_info
+from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
+
+# default: Load the model on the available device(s)
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    "Qwen/Qwen2.5-VL-32B-Instruct", torch_dtype="auto", device_map="auto"
+)
+
+
+# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
+# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+#     "Qwen/Qwen2.5-VL-32B-Instruct",
+#     torch_dtype=torch.bfloat16,
+#     attn_implementation="flash_attention_2",
+#     device_map="auto",
+# )
+
+# default processor
+processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-32B-Instruct")
+
+# The default range for the number of visual tokens per image in the model is 4-16384.
+# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
+# min_pixels = 256*28*28
+# max_pixels = 1280*28*28
+# processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-32B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)
+
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "image",
+                "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg",
+            },
+            {"type": "text", "text": "Describe this image."},
+        ],
+    }
+]
+
+# Preparation for inference
+text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+image_inputs, video_inputs = process_vision_info(messages)
+inputs = processor(
+    text=[text],
+    images=image_inputs,
+    videos=video_inputs,
+    padding=True,
+    return_tensors="pt",
+)
+inputs = inputs.to("cuda")
+
+# Inference: Generation of the output
+generated_ids = model.generate(**inputs, max_new_tokens=128)
+generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+output_text = processor.batch_decode(
+    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+)
+print(output_text)
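
Reviewer note: the language-decoder forward in QEffQwen_2_5_vl_DecoderWrapper is still a stub in this patch. The sketch below is only a reference for the embedding-merge step that dual-QPC VLM wrappers typically perform during prefill: rows of the pre-computed vision_embeds replace the placeholder image-token embeddings, and image_idx tracks how many rows have already been consumed across prefill chunks. Everything in it (the helper name merge_vision_embeddings, the image_token_id argument, the dummy shapes) is an assumption for illustration, not the final Qwen2.5-VL implementation; the real forward would additionally run the language model with the retained KV cache and return the logits and updated cache.

import torch


def merge_vision_embeddings(inputs_embeds, vision_embeds, input_ids, image_token_id, image_idx):
    # Boolean mask of the placeholder image tokens in this prefill chunk.
    mask = input_ids == image_token_id
    num_image_tokens = int(mask.sum())
    # Rows of the pre-computed vision embeddings consumed by this chunk, starting at image_idx.
    selected = vision_embeds[image_idx : image_idx + num_image_tokens]
    merged = inputs_embeds.clone()
    merged[mask] = selected.to(merged.dtype)
    return merged, image_idx + num_image_tokens


if __name__ == "__main__":
    # Tiny smoke test with made-up shapes: 4 image tokens (id 7) inside a 6-token prompt.
    torch.manual_seed(0)
    input_ids = torch.tensor([[5, 7, 7, 7, 7, 9]])
    inputs_embeds = torch.zeros(1, 6, 8)
    vision_embeds = torch.randn(10, 8)  # would come from the vision QPC in the dual-QPC flow
    merged, next_idx = merge_vision_embeddings(inputs_embeds, vision_embeds, input_ids, image_token_id=7, image_idx=0)
    print(merged.shape, next_idx)  # torch.Size([1, 6, 8]) 4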