From 58f156a7552c0f128ae6fc21860b6fb33351e229 Mon Sep 17 00:00:00 2001 From: Dipankar Sarkar Date: Wed, 10 Sep 2025 09:17:26 +0000 Subject: [PATCH] Onboarding Qwen2_5_vl Signed-off-by: Dipankar Sarkar --- .../transformers/models/pytorch_transforms.py | 8 + .../models/qwen_2_5_vl/__init__.py | 6 + .../models/qwen_2_5_vl/modeling_qwen2_5_vl.py | 138 ++++++++++++++++++ .../qwenvl_example/qwen2_5_vl_inference.py | 107 ++++++++++++++ examples/qwenvl_example/test_hf.py | 59 ++++++++ 5 files changed, 318 insertions(+) create mode 100644 QEfficient/transformers/models/qwen_2_5_vl/__init__.py create mode 100644 QEfficient/transformers/models/qwen_2_5_vl/modeling_qwen2_5_vl.py create mode 100644 examples/qwenvl_example/qwen2_5_vl_inference.py create mode 100644 examples/qwenvl_example/test_hf.py diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py index ca74c0ddd..362febec5 100644 --- a/QEfficient/transformers/models/pytorch_transforms.py +++ b/QEfficient/transformers/models/pytorch_transforms.py @@ -136,6 +136,9 @@ Qwen2Model, Qwen2RMSNorm, ) +from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import ( + Qwen2_5_VLForConditionalGeneration, +) from transformers.models.starcoder2.modeling_starcoder2 import ( Starcoder2Attention, Starcoder2DecoderLayer, @@ -303,6 +306,9 @@ QEffQwen2ForCausalLM, QEffQwen2Model, ) +from QEfficient.transformers.models.qwen_2_5_vl.modeling_qwen2_5_vl import ( + QEffQwen_2_5_vl_ForConditionalGeneration, +) from QEfficient.transformers.models.starcoder2.modeling_starcoder2 import ( QEffStarcoder2Attention, QEFFStarcoder2DecoderLayer, @@ -383,6 +389,8 @@ class KVCacheTransform(ModuleMappingTransform): LlavaForConditionalGeneration: QEffLlavaForConditionalGeneration, # Llava Next LlavaNextForConditionalGeneration: QEffLlavaNextForConditionalGeneration, + # Qwen2.5 VL + Qwen2_5_VLForConditionalGeneration: QEffQwen_2_5_vl_ForConditionalGeneration, # Gemma GemmaAttention: QEffGemmaAttention, GemmaDecoderLayer: QEffGemmaDecoderLayer, diff --git a/QEfficient/transformers/models/qwen_2_5_vl/__init__.py b/QEfficient/transformers/models/qwen_2_5_vl/__init__.py new file mode 100644 index 000000000..d647b73a6 --- /dev/null +++ b/QEfficient/transformers/models/qwen_2_5_vl/__init__.py @@ -0,0 +1,6 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
diff --git a/QEfficient/transformers/models/qwen_2_5_vl/modeling_qwen2_5_vl.py b/QEfficient/transformers/models/qwen_2_5_vl/modeling_qwen2_5_vl.py
new file mode 100644
index 000000000..8cc6f74ac
--- /dev/null
+++ b/QEfficient/transformers/models/qwen_2_5_vl/modeling_qwen2_5_vl.py
@@ -0,0 +1,138 @@
+import torch
+import torch.nn as nn
+from transformers import Qwen2_5_VLForConditionalGeneration
+
+
+class QEffQwen_2_5_vl_EncoderWrapper(nn.Module):
+    """Wraps the Qwen2.5-VL vision tower so it can be exported as a standalone vision QPC."""
+
+    def __init__(self, model):
+        super().__init__()
+        self.model = model
+        # Expose the vision tower under the common `vision_model` attribute name.
+        self.model.vision_model = self.model.visual
+
+    def forward(self, pixel_values, image_grid_thw):
+        pixel_values = pixel_values.type(self.model.visual.dtype)
+        image_embeds = self.model.visual(pixel_values, grid_thw=image_grid_thw)
+        return image_embeds
+
+
+class QEffQwen_2_5_vl_DecoderWrapper(nn.Module):
+    """Wraps the Qwen2.5-VL language model for the dual-QPC (kv_offload) path."""
+
+    def __init__(self, model):
+        super().__init__()
+        self.model = model
+        self.config = self.model.config
+        self.language_model = self.model.model
+
+    def forward(self, input_ids, vision_embeds, position_ids, image_idx, past_key_values):
+        # TODO: scatter `vision_embeds` into the input embeddings at the image-token
+        # positions (tracked via `image_idx`), run the language model with the retained
+        # KV cache, and return the logits together with the updated cache.
+        raise NotImplementedError("Qwen2.5-VL language-decoder forward is not implemented yet.")
+
+
+class QEffQwen_2_5_vl_ForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
+    def get_qeff_vision_encoder(self):
+        return QEffQwen_2_5_vl_EncoderWrapper(self)
+
+    def get_qeff_language_decoder(self):
+        return QEffQwen_2_5_vl_DecoderWrapper(self)
+
+    def get_dummy_inputs(self, kv_offload: bool = False, **kwargs):
+        num_layers = self.config.num_hidden_layers
+        num_key_value_heads = self.config.num_key_value_heads
+        head_dim = self.config.hidden_size // self.config.num_attention_heads
+        # Dummy context length; keep in sync with the KV-cache length used below.
+        ctx_len = 6000
+
+        # 11016 patches = 1 (t) * 108 (h) * 102 (w); 1176 = channels * temporal_patch_size * patch_size**2.
+        vision_inputs = {
+            "pixel_values": torch.zeros(
+                (11016, 1176),
+                dtype=torch.float32,
+            ),
+            "image_grid_thw": torch.tensor([[1, 108, 102]]),
+        }
+
+        lang_inputs = {
+            "input_ids": torch.ones((1, 2779), dtype=torch.int64),
+            "attention_mask": torch.ones((1, 2779), dtype=torch.int64),
+            "vision_embeds": torch.ones(
+                (11016, 1176),
+                dtype=torch.float32,
+            ),
+            "image_idx": torch.zeros((1, 1), dtype=torch.int64),
+        }
+        lang_inputs["position_ids"] = lang_inputs.pop("attention_mask").cumsum(1)
+        lang_inputs["past_key_values"] = []
+        for i in range(num_layers):
+            lang_inputs["past_key_values"].append(
+                (
+                    torch.zeros(
+                        1,
+                        num_key_value_heads,
+                        ctx_len,
+                        head_dim,
+                    ),
+                    torch.zeros(
+                        1,
+                        num_key_value_heads,
+                        ctx_len,
+                        head_dim,
+                    ),
+                )
+            )
+
+        lang_inputs["position_ids"] = torch.full(lang_inputs["position_ids"].shape, ctx_len - 1)
+        inputs = {}
+        if kv_offload:
+            inputs["vision"] = vision_inputs
+        inputs["lang"] = lang_inputs
+        return inputs
+
+    def get_specializations(
+        self,
+        batch_size: int,
+        prefill_seq_len: int,
+        ctx_len: int,
+        img_size: int,
+        kv_offload: bool = False,
+        **compiler_options,
+    ):
+        # TODO: return the prefill/decode specializations for the vision and language QPCs.
+        raise NotImplementedError("Specializations for Qwen2.5-VL are not defined yet.")
+
+    def get_onnx_dynamic_axes(self, kv_offload: bool = False):
+        # Define dynamic axes for ONNX export.
+        num_layers = self.config.num_hidden_layers
+        vision_dynamic_axes = {
+            # pixel_values: (num_patches, channels * temporal_patch_size * patch_size**2)
+            "pixel_values": {0: "num_patches"},
+            # image_grid_thw: (num_images, 3); the trailing axis is fixed
+            "image_grid_thw": {0: "batch_size"},
+        }
+        lang_dynamic_axes = {
+            "input_ids": {0: "batch_size", 1: "seq_len"},
+            "position_ids": {0: "batch_size", 1: "seq_len", 2: "seq_len"},
+            "vision_embeds": {0: "batch_size", 1: "vision_size"},
+        }
+        for i in range(num_layers):
+            lang_dynamic_axes[f"past_key.{i}"] = {0: "batch_size", 2: "ctx_len"}
+            lang_dynamic_axes[f"past_value.{i}"] = {0: "batch_size", 2: "ctx_len"}
+        dynamic_axes = {}
+        if kv_offload:
+            dynamic_axes["vision"] = vision_dynamic_axes
+        dynamic_axes["lang"] = lang_dynamic_axes
+        return dynamic_axes
+
+    def get_inputs_info(self):
+        # TODO: describe the expected runtime inputs (names, dtypes, shapes).
+        raise NotImplementedError("Input info for Qwen2.5-VL is not defined yet.")
+
+    def get_output_names(self, kv_offload: bool = False):
+        vision_output_names = ["vision_embeds"]
+        lang_output_names = ["logits"]
+        for i in range(self.config.num_hidden_layers):
+            for kv in ["key", "value"]:
+                lang_output_names.append(f"past_{kv}.{i}_RetainedState")
+
+        output_names = {}
+        if kv_offload:
+            lang_output_names.insert(1, "vision_embeds_RetainedState")
+            lang_output_names.insert(2, "image_idx_output")
+            output_names["vision"] = vision_output_names
+        output_names["lang"] = lang_output_names
+
+        return output_names
diff --git a/examples/qwenvl_example/qwen2_5_vl_inference.py b/examples/qwenvl_example/qwen2_5_vl_inference.py
new file mode 100644
index 000000000..36e38342a
--- /dev/null
+++ b/examples/qwenvl_example/qwen2_5_vl_inference.py
@@ -0,0 +1,107 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+import requests
+from PIL import Image
+from transformers import AutoProcessor, TextStreamer
+
+from QEfficient import QEFFAutoModelForImageTextToText
+
+# Add your HuggingFace token here to access the model
+HF_TOKEN = ""
+
+
+def run_model(
+    model_name,
+    token,
+    query,
+    image_url,
+    kv_offload=False,
+    prefill_seq_len=5500,
+    ctx_len=6000,
+    generation_len=128,
+    img_size=384,
+    num_cores=16,
+    num_devices=1,
+):
+    ## STEP - 1 Load the Processor and Model
+
+    processor = AutoProcessor.from_pretrained(model_name, token=token)
+
+    # `kv_offload` compiles the model into two QPCs (dual-QPC mode). Single-QPC mode
+    # (kv_offload=False) is not supported yet, so the flag must be set to True.
+    # The dual-QPC approach splits the model so that image encoding and output generation
+    # run in separate QPCs; the vision-encoder outputs are passed to the language model
+    # through the host.
+
+    model = QEFFAutoModelForImageTextToText.from_pretrained(model_name, token=token, kv_offload=kv_offload)
+
+    ## STEP - 2 Export & Compile the Model
+
+    model.compile(
+        prefill_seq_len=prefill_seq_len,
+        ctx_len=ctx_len,
+        img_size=img_size,
+        num_cores=num_cores,
+        num_devices=num_devices,
+        mxfp6_matmul=False,
+    )
+
+    ## STEP - 3 Load and process the inputs for Inference
+
+    image = Image.open(requests.get(image_url, stream=True).raw)
+    messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": query}]}]
+    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
+    inputs = processor(image, input_text, add_special_tokens=False, return_tensors="pt")
+
+    ## STEP - 4 Run Inference on the compiled model
+
+    streamer = TextStreamer(processor.tokenizer)
+    output = model.generate(inputs=inputs, streamer=streamer, generation_len=generation_len)
+    print(output)
+
+
+if __name__ == "__main__":
+    # Model name and Input parameters
+    model_name = "Qwen/Qwen2.5-VL-32B-Instruct"
+
+    # Please add your prompt here
+    query = "Describe the image"
+
+    # Please pass an image URL. The image should be in JPG format.
+    image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+
+    # Compilation parameters for the model
+    kv_offload = True
+    prefill_seq_len = 5500
+    ctx_len = 6000
+    generation_len = 128
+    img_size = 384
+    num_cores = 16
+    num_devices = 4
+
+    run_model(
+        model_name=model_name,
+        token=HF_TOKEN,
+        query=query,
+        kv_offload=kv_offload,
+        image_url=image_url,
+        prefill_seq_len=prefill_seq_len,
+        ctx_len=ctx_len,
+        generation_len=generation_len,
+        img_size=img_size,
+        num_cores=num_cores,
+        num_devices=num_devices,
+    )
+
+
+"""
+Expected Response:
+
+
+
+"""
diff --git a/examples/qwenvl_example/test_hf.py b/examples/qwenvl_example/test_hf.py
new file mode 100644
index 000000000..79f919a68
--- /dev/null
+++ b/examples/qwenvl_example/test_hf.py
@@ -0,0 +1,59 @@
+from qwen_vl_utils import process_vision_info
+from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
+
+# default: Load the model on the available device(s)
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    "Qwen/Qwen2.5-VL-32B-Instruct", torch_dtype="auto", device_map="auto"
+)
+
+
+# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
+# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+#     "Qwen/Qwen2.5-VL-32B-Instruct",
+#     torch_dtype=torch.bfloat16,
+#     attn_implementation="flash_attention_2",
+#     device_map="auto",
+# )
+
+# default processor
+processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-32B-Instruct")
+
+# The default range for the number of visual tokens per image in the model is 4-16384.
+# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
+# min_pixels = 256*28*28
+# max_pixels = 1280*28*28
+# processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-32B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)
+
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "image",
+                "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg",
+            },
+            {"type": "text", "text": "Describe this image."},
+        ],
+    }
+]
+
+# Preparation for inference
+text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+image_inputs, video_inputs = process_vision_info(messages)
+inputs = processor(
+    text=[text],
+    images=image_inputs,
+    videos=video_inputs,
+    padding=True,
+    return_tensors="pt",
+)
+inputs = inputs.to("cuda")
+
+# Inference: Generation of the output
+generated_ids = model.generate(**inputs, max_new_tokens=128)
+generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+output_text = processor.batch_decode(
+    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+)
+print(output_text)
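
Reviewer note: the language-decoder forward in QEffQwen_2_5_vl_DecoderWrapper is still a stub in this patch. The sketch below is only a reference for the embedding-merge step that dual-QPC VLM wrappers typically perform during prefill: rows of the pre-computed vision_embeds replace the placeholder image-token embeddings, and image_idx tracks how many rows have already been consumed across prefill chunks. Everything in it (the helper name merge_vision_embeddings, the image_token_id argument, the dummy shapes) is an assumption for illustration, not the final Qwen2.5-VL implementation; the real forward would additionally run the language model with the retained KV cache and return the logits and updated cache.

import torch


def merge_vision_embeddings(inputs_embeds, vision_embeds, input_ids, image_token_id, image_idx):
    # Boolean mask of the placeholder image tokens in this prefill chunk.
    mask = input_ids == image_token_id
    num_image_tokens = int(mask.sum())
    # Rows of the pre-computed vision embeddings consumed by this chunk, starting at image_idx.
    selected = vision_embeds[image_idx : image_idx + num_image_tokens]
    merged = inputs_embeds.clone()
    merged[mask] = selected.to(merged.dtype)
    return merged, image_idx + num_image_tokens


if __name__ == "__main__":
    # Tiny smoke test with made-up shapes: 4 image tokens (id 7) inside a 6-token prompt.
    torch.manual_seed(0)
    input_ids = torch.tensor([[5, 7, 7, 7, 7, 9]])
    inputs_embeds = torch.zeros(1, 6, 8)
    vision_embeds = torch.randn(10, 8)  # would come from the vision QPC in the dual-QPC flow
    merged, next_idx = merge_vision_embeddings(inputs_embeds, vision_embeds, input_ids, image_token_id=7, image_idx=0)
    print(merged.shape, next_idx)  # torch.Size([1, 6, 8]) 4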