From 1d98635a50f9ff692c0b3ed3abbb5562fc626721 Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Sun, 5 Oct 2025 21:22:20 +0000 Subject: [PATCH 1/4] Enable CB for vlms with multiple images and multiple prompts Signed-off-by: Mamta Singh --- .../generation/text_generation_inference.py | 252 ++++++++++++++---- .../models/llama4/modeling_llama4.py | 98 +++++-- .../transformers/models/modeling_auto.py | 72 +++-- examples/llama4_CB_example_vision_lang.py | 65 +++++ 4 files changed, 384 insertions(+), 103 deletions(-) create mode 100644 examples/llama4_CB_example_vision_lang.py diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index cf9cbcacc..554f030af 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -13,6 +13,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np +import torch import transformers from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast @@ -313,7 +314,10 @@ def calculate_latency(total_decoded_tokens, loop_start, start, end, decode_pause def cloud_ai_100_exec_kv( tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], - qpc_path: str, + processor, + lang_qpc_path: str, + vision_qpc_path: str, + images, prompt: Optional[str] = None, prompts_txt_file_path: Optional[str] = None, device_id: Optional[List[int]] = None, @@ -370,7 +374,7 @@ def cloud_ai_100_exec_kv( exec_info = QEfficient.cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc_path=qpc_path, prompt="Hi there!!", device_id=[0]) """ - batch_size, ctx_len, full_batch_size = get_compilation_dims(qpc_path) + batch_size, ctx_len, full_batch_size = get_compilation_dims(lang_qpc_path) prompt: List[str] = get_input_prompts(prompt, prompts_txt_file_path) prompt = fix_prompts(prompt, batch_size, full_batch_size) if prompt_to_lora_id_mapping is not None: @@ -379,7 +383,9 @@ def cloud_ai_100_exec_kv( ) generate_text = TextGeneration( tokenizer=tokenizer, - qpc_path=qpc_path, + processor=processor, + lang_qpc_path=lang_qpc_path, + vision_qpc_path=vision_qpc_path, device_id=device_id, ctx_len=ctx_len, enable_debug_logs=enable_debug_logs, @@ -410,7 +416,10 @@ def cloud_ai_100_exec_kv( ) else: exec_info = generate_text.generate( - prompt=prompt, generation_len=generation_len, prompt_to_lora_id_mapping=prompt_to_lora_id_mapping + images=images, + prompt=prompt, + generation_len=generation_len, + prompt_to_lora_id_mapping=prompt_to_lora_id_mapping, ) print_latency_stats_kv(prompt, exec_info=exec_info, automation=automation) @@ -420,8 +429,10 @@ def cloud_ai_100_exec_kv( class QEffTextGenerationBase: def __init__( self, + processor, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], - qpc_path: str, + lang_qpc_path: str, + vision_qpc_path: Optional[str] = None, full_batch_size: Optional[int] = None, ctx_len: Optional[int] = None, device_id: Optional[List[int]] = None, @@ -439,11 +450,15 @@ def __init__( self.sampling_params = sampling_params # Load QPC - self._session = QAICInferenceSession(qpc_path, device_id, enable_debug_logs=enable_debug_logs) + if not lang_qpc_path: + raise TypeError("Please run compile API for language model first!") + self._lang_session = QAICInferenceSession(lang_qpc_path, device_id, activate=False) + if vision_qpc_path: + self._vision_session = QAICInferenceSession(vision_qpc_path, device_id) # Validate sampler inputs for On-Device Sampling self.include_sampler = validate_sampler_inputs( - 
session_inputs=set(self._session.input_names), include_sampler=include_sampler + session_inputs=set(self._lang_session.input_names), include_sampler=include_sampler ) # Fetch the variables from the QPC @@ -468,10 +483,23 @@ def __init__( self.generation_len = None self.tokenizer = tokenizer + self.processor = processor self._set_tokenizer_params() # set tokenizer params # Skip inputs/outputs - self._session.skip_buffers( - [x for x in self._session.input_names + self._session.output_names if x.startswith("past_")] + if self._vision_session: + self._vision_session.skip_buffers( + [ + x + for x in self._vision_session.input_names + self._vision_session.output_names + if x.startswith("past_") or x.endswith("_RetainedState") + ] + ) + self._lang_session.skip_buffers( + [ + x + for x in self._lang_session.input_names + self._lang_session.output_names + if x.startswith("past_") or x.endswith("_RetainedState") + ] ) def _set_tokenizer_params(self): @@ -496,13 +524,16 @@ def _fetch_full_batch_size( """ full_batch_size = None - if "batch_index" in self._session.binding_index_map: - if self._session.allowed_shapes: + if "batch_index" in self._lang_session.binding_index_map: + if self._lang_session.allowed_shapes: full_batch_size, _ = [ - x[self._session.binding_index_map["batch_index"]][1][0] for x in self._session.allowed_shapes + x[self._lang_session.binding_index_map["batch_index"]][1][0] + for x in self._lang_session.allowed_shapes ] else: - full_batch_size, _ = self._session.bindings[self._session.binding_index_map["batch_index"]].dims + full_batch_size, _ = self._lang_session.bindings[ + self._lang_session.binding_index_map["batch_index"] + ].dims return full_batch_size def _fetch_batch_size_prefill_seq_len( @@ -515,15 +546,17 @@ def _fetch_batch_size_prefill_seq_len( batch_size: The batch size fetched from the session's bindings or allowed shapes. prefill_seq_len: The prefill sequence length fetched from the session's bindings or allowed shapes. """ - if self._session.allowed_shapes: + if self._lang_session.allowed_shapes: batch_size = max( - [x[self._session.binding_index_map["input_ids"]][1][0] for x in self._session.allowed_shapes] + [x[self._lang_session.binding_index_map["input_ids"]][1][0] for x in self._lang_session.allowed_shapes] ) prefill_seq_len = max( - [x[self._session.binding_index_map["input_ids"]][1][1] for x in self._session.allowed_shapes] + [x[self._lang_session.binding_index_map["input_ids"]][1][1] for x in self._lang_session.allowed_shapes] ) else: - batch_size, prefill_seq_len = self._session.bindings[self._session.binding_index_map["input_ids"]].dims + batch_size, prefill_seq_len = self._lang_session.bindings[ + self._lang_session.binding_index_map["input_ids"] + ].dims return batch_size, prefill_seq_len def _fetch_decode_seq_len( @@ -536,9 +569,9 @@ def _fetch_decode_seq_len( decode_seq_len: The decode sequence length fetched from the session's bindings or allowed shapes. 
""" decode_seq_len = None - if self._session.allowed_shapes: + if self._lang_session.allowed_shapes: decode_seq_len = min( - [x[self._session.binding_index_map["input_ids"]][1][1] for x in self._session.allowed_shapes] + [x[self._lang_session.binding_index_map["input_ids"]][1][1] for x in self._lang_session.allowed_shapes] ) return decode_seq_len @@ -557,10 +590,10 @@ def _fetch_vocab_size( if self.include_sampler else "logits" ) - if self._session.allowed_shapes: - return [x[self._session.binding_index_map[key]] for x in self._session.allowed_shapes][0][1][2] + if self._lang_session.allowed_shapes: + return [x[self._lang_session.binding_index_map[key]] for x in self._lang_session.allowed_shapes][0][1][2] - return self._session.bindings[self._session.binding_index_map[key]].dims[2] + return self._lang_session.bindings[self._lang_session.binding_index_map[key]].dims[2] def _fetch_generation_len(self, generation_len, max_gen_len): """ @@ -649,7 +682,7 @@ def _fetch_next_token_id(self, outputs): logits = np.expand_dims(logits, 1) return logits.argmax(2) - def initialize_decode_inputs(self, num_prompts, execution_batch_size, max_gen_length): + def initialize_decode_inputs(self, num_images, num_prompts, execution_batch_size, max_gen_length): """ Initialize np arrays for storing the prefill output for all the decode batch size. """ @@ -696,7 +729,7 @@ def update_decode_input(self, outputs, position_ids, generation_len, decode_batc self.generation_len[decode_batch_id or slice(None)] = generation_len return next_token_id - def run_prefill_for_all_inputs(self, prompt_queue, generation_len): + def run_prefill_for_all_inputs(self, image_queue, prompt_queue, generation_len): """ Runs prefill for all inputs in the prompt queue and updates the decode input. @@ -709,10 +742,14 @@ def run_prefill_for_all_inputs(self, prompt_queue, generation_len): """ for decode_batch_id in range(self.full_batch_size): next_prompt = prompt_queue.popleft() + next_image = image_queue.popleft() # run prefill for num_chunks outputs, position_ids, generation_len = self.run_prefill( - next_prompt, generation_len, decode_batch_id=np.array(decode_batch_id, dtype=np.int64).reshape(1, 1) + next_image, + next_prompt, + generation_len, + decode_batch_id=np.array(decode_batch_id, dtype=np.int64).reshape(1, 1), ) _ = self.update_decode_input(outputs, position_ids, generation_len, decode_batch_id) @@ -727,14 +764,39 @@ def _set_output_buffers(self, batch_size: int = 1, sequence_length: int = 1): if self.include_sampler: if self.return_pdfs: probs_out_placeholder = np.zeros((batch_size, sequence_length, self._vocab_size), dtype=np.float32) - self._session.set_buffers({"probs": probs_out_placeholder}) + self._lang_session.set_buffers({"probs": probs_out_placeholder}) next_tokens_out_placeholder = np.zeros((batch_size, sequence_length, 1), dtype=np.int64) - self._session.set_buffers({"next_tokens": next_tokens_out_placeholder}) + self._lang_session.set_buffers({"next_tokens": next_tokens_out_placeholder}) else: logits_out_placeholder = np.zeros((batch_size, sequence_length, self._vocab_size), dtype=np.float32) - self._session.set_buffers({"logits": logits_out_placeholder}) + self._lang_session.set_buffers({"logits": logits_out_placeholder}) + + vision_embeds_out_placeholder = np.zeros((2448, 5120), dtype=np.float16) + self._vision_session.set_buffers({"vision_embeds": vision_embeds_out_placeholder}) + + def prepare_vision_language_inputs(self, prompt, image_url): + messages = [ + { + "role": "user", + "content": [ + {"type": "image", 
"url": image_url}, + {"type": "text", "text": prompt}, + ], + }, + ] + inputs = self.processor.apply_chat_template( + messages, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + return_tensors="pt", + # padding="max_length", + # max_length=padded_len, + ) + inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) + return inputs - def run_prefill(self, prompt, generation_len, prefill_logit_bs=1, decode_batch_id=None): + def run_prefill(self, image, prompt, generation_len, prefill_logit_bs=1, decode_batch_id=None): """ Runs prefill for a given prompt and generation length. @@ -752,7 +814,8 @@ def run_prefill(self, prompt, generation_len, prefill_logit_bs=1, decode_batch_i generation_len (int): The generation length. """ # Run prefill - inputs = self.tokenizer(prompt, return_tensors="np", padding=True) + inputs = self.prepare_vision_language_inputs(prompt, image) + position_ids = inputs["attention_mask"].sum(1, keepdims=True) padded_len = inputs["input_ids"].shape[1] num_chunks = -(padded_len // -self._prefill_seq_len) # ceil divide without float @@ -766,51 +829,110 @@ def run_prefill(self, prompt, generation_len, prefill_logit_bs=1, decode_batch_i # Set the prefill output buffers self._set_output_buffers(batch_size=prefill_logit_bs, sequence_length=1) - inputs = self.tokenizer(prompt, return_tensors="np", padding="max_length", max_length=padded_len) - inputs["position_ids"] = np.where(inputs.pop("attention_mask"), np.arange(padded_len), -1) - inputs.pop("token_type_ids", None) + pad_token_id = 1 + input_ids_length = inputs["input_ids"].shape[1] + num_chunks = -(input_ids_length // -self._prefill_seq_len) # ceil divide without float + padded_len = num_chunks * self._prefill_seq_len # Convert to a multiple of prompt_len + + inputs["input_ids"] = torch.nn.functional.pad( + inputs["input_ids"], + (0, padded_len - input_ids_length), + "constant", + pad_token_id, + ) + inputs["attention_mask"] = torch.nn.functional.pad( + inputs["attention_mask"], (0, padded_len - input_ids_length), "constant", 0 + ) + if "cross_attention_mask" in inputs: + inputs["cross_attention_mask"] = torch.nn.functional.pad( + inputs["cross_attention_mask"], (0, 0, 0, 0, 0, padded_len - input_ids_length) + ) + + for k, v in inputs.items(): + inputs[k] = np.array(v) + + vision_inputs = { + k: v for k, v in inputs.items() if k in {"pixel_values", "aspect_ratio_ids", "aspect_ratio_mask"} + } + if vision_inputs: + vision_inputs["pixel_values"] = vision_inputs["pixel_values"].astype("float16") + + vision_outputs = {} + if self._vision_session: + self._vision_session.activate() + # Run vision prefill + if vision_inputs: + vision_outputs = self._vision_session.run(vision_inputs) + + lang_inputs = {k: v for k, v in inputs.items() if k not in vision_inputs} + lang_inputs["position_ids"] = np.where( + lang_inputs.pop("attention_mask"), np.arange(padded_len), -1 + ) # Need to use -1 as position_ids for invalid tokens + + # not_mllama = hasattr(self.model.config, "model_type") and self.model.config.model_type != "mllama" + # if not_mllama: + lang_inputs["image_idx"] = np.array([[0]]) + + if self._vision_session: + self._vision_session.deactivate() + self._lang_session.activate() + self._lang_session.set_buffers(vision_outputs) if decode_batch_id is not None: - inputs["batch_index"] = decode_batch_id + lang_inputs["batch_index"] = decode_batch_id if self.is_tlm: - inputs["num_logits_to_keep"] = np.zeros((1, 1)) + lang_inputs["num_logits_to_keep"] = np.zeros((1, 1)) if self.include_sampler: - 
inputs["last_accepted_output_tokens"] = inputs["input_ids"] + lang_inputs["last_accepted_output_tokens"] = lang_inputs["input_ids"] for op in Constants.SAMPLER_OPS: if decode_batch_id is not None: - inputs[op] = self.sampling_params[op][decode_batch_id.flatten()] + lang_inputs[op] = self.sampling_params[op][decode_batch_id.flatten()] else: - inputs[op] = self.sampling_params[op] + lang_inputs[op] = self.sampling_params[op] if self._prompt_to_lora_id_mapping_prefill: if self.full_batch_size: - inputs["lora_ids"] = np.array( + lang_inputs["lora_ids"] = np.array( self._prompt_to_lora_id_mapping_prefill.popleft(), dtype=np.int64 ).reshape(1, 1) else: batch_lora_ids = [self._prompt_to_lora_id_mapping_prefill.popleft() for i in range(self.batch_size)] - inputs["lora_ids"] = np.array(batch_lora_ids, dtype=np.int64).reshape(self.batch_size, 1) + lang_inputs["lora_ids"] = np.array(batch_lora_ids, dtype=np.int64).reshape(self.batch_size, 1) + # Run language prefill + chunk_inputs = lang_inputs.copy() for i in range(num_chunks): - chunk_inputs = inputs.copy() - chunk_inputs["input_ids"] = inputs["input_ids"][ + chunk_inputs = lang_inputs.copy() + chunk_inputs["input_ids"] = lang_inputs["input_ids"][ :, i * self._prefill_seq_len : (i + 1) * self._prefill_seq_len ] - chunk_inputs["position_ids"] = inputs["position_ids"][ + chunk_inputs["position_ids"] = lang_inputs["position_ids"][ :, i * self._prefill_seq_len : (i + 1) * self._prefill_seq_len ] if self.include_sampler: chunk_inputs["last_accepted_output_tokens"] = chunk_inputs["input_ids"] - outputs = self._session.run(chunk_inputs) + outputs = self._lang_session.run(chunk_inputs) + chunk_inputs["image_idx"] = outputs["image_idx_output"] if self._write_io_dir is not None: write_io_files(inputs, outputs, self._write_io_dir, "prefill", "aic_batch_io", True, False) + + # Skip inputs/outputs again + self._lang_session.skip_buffers( + [ + x + for x in self._lang_session.input_names + self._lang_session.output_names + if x.startswith("past_") or x.endswith("_RetainedState") + ] + ) + if self._lang_session: + self._lang_session.deactivate() return ( outputs, position_ids, generation_len, ) - def run_continuous_batching_decode(self, prompt_queue, generation_len): + def run_continuous_batching_decode(self, image_queue, prompt_queue, generation_len): """ Runs continuous batching decode for the given prompt queue and generation length. 
@@ -842,7 +964,8 @@ def run_continuous_batching_decode(self, prompt_queue, generation_len): decode_inputs = self.prepare_decode_inputs() while prompt_queue or current_decode_ongoing.any(): - outputs = self._session.run(decode_inputs) + self._lang_session.activate() + outputs = self._lang_session.run(decode_inputs) # Prepare inputs for next iteration next_token_id = self._fetch_next_token_id(outputs) @@ -892,6 +1015,8 @@ def run_continuous_batching_decode(self, prompt_queue, generation_len): generated_id_current_index[decode_batch_id] += 1 + self._lang_session.deactivate() + return decode_pause_time def run_decode(self, decode_inputs, generation_len, streamer: Optional[transformers.TextStreamer] = None): @@ -911,13 +1036,13 @@ def run_decode(self, decode_inputs, generation_len, streamer: Optional[transform logits_out_placeholder = np.zeros( (self.batch_size, self._decode_seq_len, self._vocab_size), dtype=np.float32 ) - self._session.set_buffers({"logits": logits_out_placeholder}) + self._lang_session.set_buffers({"logits": logits_out_placeholder}) finished_sequences = decode_inputs["input_ids"] == self.tokenizer.eos_token_id num_token = 0 for num_token in range(1, generation_len): if streamer: streamer.put(decode_inputs["input_ids"][0]) - outputs = self._session.run(decode_inputs) + outputs = self._lang_session.run(decode_inputs) if self._write_io_dir is not None: write_io_files(decode_inputs, outputs, self._write_io_dir, "decode", "aic_batch_io", True, False) @@ -951,7 +1076,7 @@ def generate_decode_stream(self, decode_inputs, generation_len): finished_sequences = decode_inputs["input_ids"] == self.tokenizer.eos_token_id for num_token in range(1, generation_len): yield decode_inputs["input_ids"] - outputs = self._session.run(decode_inputs) + outputs = self._lang_session.run(decode_inputs) if self._write_io_dir is not None: write_io_files(decode_inputs, outputs, self._write_io_dir, "decode", "aic_batch_io", True, False) @@ -971,8 +1096,10 @@ def generate_decode_stream(self, decode_inputs, generation_len): class TextGeneration: def __init__( self, + processor, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], - qpc_path: str, + vision_qpc_path: str, + lang_qpc_path: str, full_batch_size: Optional[int] = None, ctx_len: Optional[int] = None, device_id: Optional[List[int]] = None, @@ -984,8 +1111,10 @@ def __init__( sampling_params: Optional[Dict[str, Any]] = None, ) -> None: self._qaic_model = QEffTextGenerationBase( + processor, tokenizer=tokenizer, - qpc_path=qpc_path, + lang_qpc_path=lang_qpc_path, + vision_qpc_path=vision_qpc_path, full_batch_size=full_batch_size, ctx_len=ctx_len, device_id=device_id, @@ -998,9 +1127,11 @@ def __init__( ) self._full_batch_size = self._qaic_model.full_batch_size self._tokenizer = self._qaic_model.tokenizer + self._processor = self._qaic_model.processor self._ctx_len = ctx_len self._perf_metrics = None self._prompt_queue = None + self._image_queue = None self._text_streamer = None @property @@ -1009,6 +1140,7 @@ def perf_metrics(self): def _setup_model_execution_inputs( self, + images, prompt: List[str], generation_len: Optional[int] = None, prompt_to_lora_id_mapping: Optional[List[int]] = None, @@ -1027,13 +1159,15 @@ def _setup_model_execution_inputs( # Create a prompt queue. self._prompt_queue = deque(prompt) + self._image_queue = deque(images) # Initialize np arrays for storing the prefill output for all the decode batch size. 
num_prompts = len(self._prompt_queue) + num_images = len(self._image_queue) if prompt_to_lora_id_mapping: self._qaic_model.initialize_lora_id_mapping(prompt_to_lora_id_mapping) - self._qaic_model.initialize_decode_inputs(num_prompts, execution_batch_size, max_gen_length) + self._qaic_model.initialize_decode_inputs(num_images, num_prompts, execution_batch_size, max_gen_length) def _regular_model_execution( self, @@ -1080,6 +1214,7 @@ def _regular_model_execution( def _continuous_batching_execution( self, + images, prompt: List[str], generation_len: Optional[int] = None, prompt_to_lora_id_mapping: Optional[List[int]] = None, @@ -1096,13 +1231,17 @@ def _continuous_batching_execution( Returns: :tuple: A tuple containing performance metrics and generated texts. """ - self._setup_model_execution_inputs(prompt, generation_len, prompt_to_lora_id_mapping) + self._setup_model_execution_inputs(images, prompt, generation_len, prompt_to_lora_id_mapping) self._qaic_model.batch_index = np.arange(self._full_batch_size).reshape(-1, 1) start = perf_counter() - self._qaic_model.run_prefill_for_all_inputs(self._prompt_queue, generation_len) + self._qaic_model.run_prefill_for_all_inputs(self._image_queue, self._prompt_queue, generation_len) + + print("\n\n\n\n Prefill for all inputs completed\n\n\n\n") loop_start = perf_counter() # Start decode loop timer - decode_pause_time = self._qaic_model.run_continuous_batching_decode(self._prompt_queue, generation_len) + decode_pause_time = self._qaic_model.run_continuous_batching_decode( + self._image_queue, self._prompt_queue, generation_len + ) end = perf_counter() generated_texts = self._tokenizer.batch_decode(self._qaic_model.generated_ids, skip_special_tokens=True) @@ -1166,6 +1305,7 @@ def generate_stream_tokens( def generate( self, + images, prompt: List[str], generation_len: Optional[int] = None, stream: bool = True, @@ -1186,7 +1326,7 @@ def generate( if self._full_batch_size is not None: logger.warning("Streamer is currently unavailable for continuous batch execution.") perf_metrics, generated_texts = self._continuous_batching_execution( - prompt, generation_len, prompt_to_lora_id_mapping + images, prompt, generation_len, prompt_to_lora_id_mapping ) else: if stream: diff --git a/QEfficient/transformers/models/llama4/modeling_llama4.py b/QEfficient/transformers/models/llama4/modeling_llama4.py index 4b957ebec..d46ca8b14 100644 --- a/QEfficient/transformers/models/llama4/modeling_llama4.py +++ b/QEfficient/transformers/models/llama4/modeling_llama4.py @@ -815,7 +815,7 @@ def forward(self, pixel_values): ) vision_flat = image_features.view(-1, image_features.size(-1)) projected_vision_flat = self.model.multi_modal_projector(vision_flat) - return projected_vision_flat + return projected_vision_flat # , pixel_values # This wrapper utilizes the 'vision_embeds', which contains vision embeddings, and an 'image_idx' index starting at 0. 
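The comment above states the contract for the language-model wrapper: vision_embeds carries the projected image features and image_idx starts at 0 and is advanced as image placeholder tokens are consumed. Below is a minimal, self-contained sketch (not part of this patch) of that gather-and-advance mechanism across two prefill chunks; IMAGE_TOKEN_ID and the toy sizes are assumptions for illustration only.

import torch

IMAGE_TOKEN_ID = 99        # assumed stand-in for config.image_token_index
HIDDEN = 4                 # toy hidden size
vision_embeds = torch.arange(6 * HIDDEN, dtype=torch.float32).view(6, HIDDEN)

def scatter_chunk(input_ids, inputs_embeds, image_idx):
    # Positions holding the image placeholder token select rows of vision_embeds;
    # image_idx offsets the row index so later chunks continue where earlier ones stopped.
    selected = input_ids == IMAGE_TOKEN_ID
    indices = selected.to(torch.int64).cumsum(1) - 1
    indices = torch.where(indices != -1, indices + image_idx, indices)
    gathered = vision_embeds[indices]          # any -1 rows sit at non-image positions and are masked below
    merged = torch.where(selected.unsqueeze(-1), gathered, inputs_embeds)
    next_idx = (indices.max() + 1).reshape(1, 1)
    return merged, torch.where(image_idx < next_idx, next_idx, image_idx)

image_idx = torch.zeros(1, 1, dtype=torch.int64)   # starts at 0, as in the wrapper
for chunk_ids in (torch.tensor([[99, 99, 1, 2]]), torch.tensor([[99, 3, 99, 99]])):
    token_embeds = torch.zeros(1, chunk_ids.shape[1], HIDDEN)
    token_embeds, image_idx = scatter_chunk(chunk_ids, token_embeds, image_idx)
print(image_idx)   # tensor([[5]]): five image tokens consumed across the two chunks

In the chunked language prefill shown earlier, feeding image_idx_output back as image_idx between chunks plays this same role, so each chunk picks up the next unused rows of vision_embeds.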
@@ -831,7 +831,15 @@ def __init__(self, model): self.language_model = self.model.language_model self.config = self.model.config - def forward(self, input_ids, vision_embeds, position_ids, image_idx, past_key_values): + def forward( + self, + input_ids, + vision_embeds, + position_ids, + image_idx, + past_key_values, + batch_index: Optional[torch.LongTensor] = None, + ): inputs_embeds = self.model.language_model.get_input_embeddings()(input_ids) selected = input_ids == self.model.config.image_token_index indices1 = selected.to(torch.int64).cumsum(1) - 1 @@ -841,7 +849,11 @@ def forward(self, input_ids, vision_embeds, position_ids, image_idx, past_key_va image_embeds = torch.where(selected.unsqueeze(-1), image_features_expanded, inputs_embeds) inputs_embeds = torch.where(input_ids.shape[1] == torch.tensor(1), inputs_embeds, image_embeds) outputs = self.model.language_model( - inputs_embeds=inputs_embeds, position_ids=position_ids, past_key_values=past_key_values, use_cache=True + inputs_embeds=inputs_embeds, + position_ids=position_ids, + past_key_values=past_key_values, + batch_index=batch_index, + use_cache=True, ) next_idx = (indices1.max() + 1).unsqueeze(0).unsqueeze(0) image_idx = torch.where(image_idx < next_idx, next_idx, image_idx) @@ -888,6 +900,9 @@ def get_specializations( ctx_len: int, img_size: int, kv_offload: bool = False, + continuous_batching: bool = False, + kv_cache_batch_size: Optional[int] = None, + full_batch_size: Optional[int] = None, **compiler_options, ): max_num_tiles = compiler_options.pop("max_num_tiles", None) @@ -936,28 +951,42 @@ def get_specializations( "img_size": img_size, } ] - lang = [ - { - "batch_size": batch_size, - "seq_len": prefill_seq_len, - "ctx_len": ctx_len, - "max_num_tiles": max_num_tiles, - "img_size": img_size, - "vision_size": vision_size, - "chunk_length": prefill_seq_len, - "chunk_ctx_len": chunk_ctx_len, - }, - { - "batch_size": batch_size, - "seq_len": "1", - "ctx_len": ctx_len, - "max_num_tiles": max_num_tiles, - "img_size": img_size, - "vision_size": vision_size, - "chunk_length": prefill_seq_len, - "chunk_ctx_len": chunk_ctx_len, - }, - ] + + lang_prefill = { + "batch_size": 1 if continuous_batching else batch_size, + "seq_len": prefill_seq_len, + "ctx_len": ctx_len, + "max_num_tiles": max_num_tiles, + "img_size": img_size, + "vision_size": vision_size, + "chunk_length": prefill_seq_len, + "chunk_ctx_len": chunk_ctx_len, + } + if continuous_batching: + lang_prefill["full_batch_size"] = kv_cache_batch_size + else: + lang_prefill["batch_size"] = kv_cache_batch_size + if full_batch_size: + lang_prefill["full_batch_exec_size"] = full_batch_size + + lang_decode = { + "batch_size": full_batch_size if continuous_batching else batch_size, + "seq_len": 1, + "ctx_len": ctx_len, + "max_num_tiles": max_num_tiles, + "img_size": img_size, + "vision_size": vision_size, + "chunk_length": prefill_seq_len, + "chunk_ctx_len": chunk_ctx_len, + } + if continuous_batching: + lang_decode["full_batch_size"] = kv_cache_batch_size + else: + lang_decode["batch_size"] = kv_cache_batch_size + + lang = [] + lang.append(lang_prefill) + lang.append(lang_decode) specializations = {} @@ -966,18 +995,22 @@ def get_specializations( specializations["lang"] = lang return specializations, compiler_options else: + lang[0].pop("vision_size") + lang[1].pop("vision_size") return lang, compiler_options - def get_onnx_dynamic_axes(self, kv_offload: bool = False): + def get_onnx_dynamic_axes(self, kv_offload: bool = False, continuous_batching: bool = False): # Define dynamic 
axes vision_dynamic_axes = {} lang_dynamic_axes = {} lang_dynamic_axes["input_ids"] = {0: "batch_size", 1: "seq_len"} lang_dynamic_axes["position_ids"] = {0: "batch_size", 1: "seq_len"} lang_dynamic_axes["vision_embeds"] = {0: "vision_size"} + if continuous_batching: + lang_dynamic_axes["batch_index"] = {0: "batch_size"} vision_dynamic_axes["pixel_values"] = {0: "max_num_tiles", 2: "img_size", 3: "img_size"} - pkv_dynamic_axes = {0: "batch_size"} + pkv_dynamic_axes = {0: "full_batch_size" if continuous_batching else "batch_size"} for i in range(self.language_model.config.num_hidden_layers): # switch between chunk_ctx_len and ctx_len for RoPE and NoPE layers. if int((i + 1) % 4 != 0): @@ -1006,6 +1039,7 @@ def get_output_names(self, kv_offload: bool = False): output_names = {} if kv_offload: + # vision_output_names.insert(1, "pixel_values_RetainedState") lang_output_names.insert(1, "vision_embeds_RetainedState") lang_output_names.insert(2, "image_idx_output") output_names["vision"] = vision_output_names @@ -1040,7 +1074,7 @@ def get_dummy_pkv_cache(self, config, batch_size, seq_len): past_key_values.append(pkv) return past_key_values - def get_dummy_inputs(self, kv_offload: bool = False): + def get_dummy_inputs(self, kv_offload: bool = False, continuous_batching: bool = False): if vis_cfg := getattr(self.config, "vision_config", None): img_size = getattr(vis_cfg, "image_size", 336) else: @@ -1085,10 +1119,14 @@ def get_dummy_inputs(self, kv_offload: bool = False): .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1) ) lang_inputs["image_idx"] = torch.zeros((inputs_shapes["image_idx"]), dtype=torch.int64) + + bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE + fbs: int = constants.ONNX_EXPORT_EXAMPLE_FBS + # Add data for KV past_key_values = self.get_dummy_pkv_cache( config=self.language_model.config, - batch_size=constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, + batch_size=fbs if continuous_batching else bs, seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) @@ -1097,6 +1135,8 @@ def get_dummy_inputs(self, kv_offload: bool = False): for kv in ["key", "value"]: lang_inputs["past_key_values"][i].append(torch.zeros(past_key_values[0][0].shape, dtype=torch.float32)) + if continuous_batching: + lang_inputs["batch_index"] = torch.arange(bs).view(bs, 1) inputs = {} if kv_offload: inputs["vision"] = vision_inputs diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index cd1c13a00..c0efb7277 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -851,6 +851,7 @@ class _QEffAutoModelForImageTextToTextDualQPC: def __init__( self, model: nn.Module, + continuous_batching, **kwargs, ): """ @@ -874,6 +875,7 @@ def __init__( self.config = model.config self.vision_model = QEffVisionEncoderForTextImageToTextModel(model, **kwargs) self.lang_model = QEffCausalLMForTextImageToTextModel(model, **kwargs) + self.continuous_batching = continuous_batching self.input_shapes, self.output_names = None, None @property @@ -973,8 +975,8 @@ def export( List[str] A list containing the paths to the generated ONNX graph files for both components. 
""" - inputs = self.model.get_dummy_inputs(kv_offload=True) - dynamic_axes = self.model.get_onnx_dynamic_axes(kv_offload=True) + inputs = self.model.get_dummy_inputs(kv_offload=True, continuous_batching=self.continuous_batching) + dynamic_axes = self.model.get_onnx_dynamic_axes(kv_offload=True, continuous_batching=self.continuous_batching) output_names = self.model.get_output_names(kv_offload=True) self.vision_model.export( @@ -1063,14 +1065,20 @@ def compile( If `full_batch_size`, `kv_cache_batch_size`, or `num_speculative_tokens` are not None. If both `skip_lang` and `skip_vision` are True. """ - if any(param is not None for param in [full_batch_size, kv_cache_batch_size, num_speculative_tokens]): + if skip_lang and skip_vision: + raise ValueError("Expected at least one of 'skip_lang' or 'skip_vision' to be False") + + if self.continuous_batching and full_batch_size is None: + raise TypeError("`full_batch_size` is required when `continuous_batching=True`.") + + if kv_cache_batch_size and not full_batch_size: raise ValueError( - f"Expected 'full_batch_size', 'kv_cache_batch_size', 'num_speculative_tokens' to be None but got: " - f"full_batch_size={full_batch_size}, kv_cache_batch_size={kv_cache_batch_size}, num_speculative_tokens={num_speculative_tokens}, " + "KV caching requires continuous batching. Please set `full_batch_size` and " + "enable `continuous_batching=True` in `from_pretrained`." ) - if skip_lang and skip_vision: - raise ValueError("Expected at least one of 'skip_lang' or 'skip_vision' to be False") + # Infer kv_cache_batch_size if not provided + kv_cache_batch_size = kv_cache_batch_size or full_batch_size or batch_size output_names = self.model.get_output_names(kv_offload=True) @@ -1080,6 +1088,9 @@ def compile( ctx_len=ctx_len, img_size=img_size, kv_offload=True, + continuous_batching=self.continuous_batching, + kv_cache_batch_size=kv_cache_batch_size, + full_batch_size=full_batch_size, **compiler_options, ) @@ -1147,7 +1158,11 @@ def compile( def generate( self, - inputs: torch.Tensor, + inputs: Optional[torch.Tensor] = None, + tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer] = None, + processor=None, + images: List[str] = None, + prompts: List[str] = None, streamer: Optional[TextStreamer] = None, device_ids: List[int] = None, runtime_ai100: bool = True, @@ -1187,6 +1202,17 @@ def generate( if not runtime_ai100: raise NotImplementedError("PyTorch execution is not supported yet for this model!") + if (processor and images) or (tokenizer and prompts): + return QEfficient.cloud_ai_100_exec_kv( + tokenizer, + processor, + self.lang_model.qpc_path, + self.vision_model.qpc_path, + images=images, + prompt=prompts, + device_id=device_ids, + generation_len=generation_len, + ) return self.kv_offload_generate( inputs=inputs, device_ids=device_ids, streamer=streamer, generation_len=generation_len ) @@ -1314,9 +1340,7 @@ def kv_offload_generate( lang_session.set_buffers(vision_outputs) - # Prepare inputs for prefill - chunk_inputs = lang_inputs.copy() - prefill_start = perf_counter() + lang_start = perf_counter() # Run prefill chunk_inputs = lang_inputs.copy() @@ -1328,7 +1352,7 @@ def kv_offload_generate( outputs = lang_session.run(chunk_inputs) chunk_inputs["image_idx"] = outputs["image_idx_output"] - prefill_time = perf_counter() - prefill_start + vision_end - vision_start + prefill_time = perf_counter() - lang_start + vision_end - vision_start # Skip inputs/outputs again lang_session.skip_buffers( [ @@ -1909,7 +1933,7 @@ class QEFFAutoModelForImageTextToText: 
_hf_auto_class = AutoModelForImageTextToText - def __new__(self, model: nn.Module, kv_offload: Optional[bool] = True, **kwargs): + def __new__(self, model: nn.Module, kv_offload: Optional[bool] = True, continuous_batching: bool = False, **kwargs): """ Instantiate the appropriate internal class for single or dual QPC mode. @@ -1930,13 +1954,19 @@ def __new__(self, model: nn.Module, kv_offload: Optional[bool] = True, **kwargs) The wrapped model instance, configured for either dual or single QPC. """ if kv_offload: - return _QEffAutoModelForImageTextToTextDualQPC(model, **kwargs) + return _QEffAutoModelForImageTextToTextDualQPC(model, continuous_batching, **kwargs) else: return _QEFFAutoModelForImageTextToTextSingleQPC(model, **kwargs) @classmethod @with_replaced_quantizers - def from_pretrained(cls, pretrained_model_name_or_path: str, kv_offload: Optional[bool] = None, **kwargs): + def from_pretrained( + cls, + pretrained_model_name_or_path: str, + kv_offload: Optional[bool] = None, + continuous_batching: bool = False, + **kwargs, + ): """ Load a QEfficient image-text-to-text model from a pretrained HuggingFace model or local path. @@ -1971,12 +2001,18 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, kv_offload: Optiona if kwargs.get("low_cpu_mem_usage", None): logger.warning("Updating low_cpu_mem_usage=False") - if kwargs.pop("continuous_batching", None): - NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.") + if continuous_batching and not kv_offload: + NotImplementedError("Continuous batching is not supported for kv_offload = False") kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False}) model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, **kwargs) - return cls(model, kv_offload=kv_offload, pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs) + return cls( + model, + kv_offload=kv_offload, + continuous_batching=continuous_batching, + pretrained_model_name_or_path=pretrained_model_name_or_path, + **kwargs, + ) MISCLASSIFIED_CAUSAL_LM_TO_QEFF_AUTO_CLASS_MAP = {"InternVLChatModel": QEFFAutoModelForImageTextToText} diff --git a/examples/llama4_CB_example_vision_lang.py b/examples/llama4_CB_example_vision_lang.py new file mode 100644 index 000000000..f6cd2bf5c --- /dev/null +++ b/examples/llama4_CB_example_vision_lang.py @@ -0,0 +1,65 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ---------------------------------------------------------------------------- + +import transformers +from transformers import AutoConfig, AutoProcessor + +from QEfficient import QEFFAutoModelForImageTextToText + +model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct" +config = AutoConfig.from_pretrained(model_id) +# For Testing Purpose Only +config.text_config.num_hidden_layers = 4 +config.vision_config.num_hidden_layers = 2 + +qeff_model = QEFFAutoModelForImageTextToText.from_pretrained( + model_id, + attn_implementation="eager", + kv_offload=True, + config=config, + continuous_batching=True, +) +tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) +processor = AutoProcessor.from_pretrained(model_id) + +qeff_model.compile( + prefill_seq_len=128, + ctx_len=3072, + img_size=336, + num_cores=16, + num_devices=4, + max_num_tiles=17, + batch_size=1, + full_batch_size=4, + mxfp6_matmul=True, + mxint8_kv_cache=True, + aic_enable_depth_first=True, + mos=1, +) + +image_urls = [ + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", +] + +prompts = [ + "Can you describe the image in detail?", + "What are the objects in the image?", + "What is the main subject of the image?", + "What colors are predominant in the image?", +] + +output = qeff_model.generate( + images=image_urls, + tokenizer=tokenizer, + processor=processor, + device_ids=[0, 1, 2, 3], + prompts=prompts, + generation_len=100, +) From 38e777a7407607967bbb1a99f18b6699b510ed52 Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Tue, 7 Oct 2025 07:57:45 +0000 Subject: [PATCH 2/4] update text_generation_interface Signed-off-by: Mamta Singh --- .../generation/text_generation_inference.py | 182 ++++++++++-------- .../transformers/models/modeling_auto.py | 15 +- examples/llama4_CB_example_vision_lang.py | 4 +- 3 files changed, 115 insertions(+), 86 deletions(-) diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index 554f030af..9b8d84ef7 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -15,7 +15,7 @@ import numpy as np import torch import transformers -from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast +from transformers import AutoImageProcessor, PreTrainedTokenizer, PreTrainedTokenizerFast from QEfficient.generation.cloud_infer import QAICInferenceSession from QEfficient.utils import padding_check_and_fix @@ -314,10 +314,10 @@ def calculate_latency(total_decoded_tokens, loop_start, start, end, decode_pause def cloud_ai_100_exec_kv( tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], - processor, lang_qpc_path: str, - vision_qpc_path: str, - images, + processor: Optional[AutoImageProcessor] = None, + vision_qpc_path: Optional[str] = None, + images: Optional[str] = None, prompt: Optional[str] = None, prompts_txt_file_path: Optional[str] = None, device_id: Optional[List[int]] = None, @@ -398,7 +398,12 @@ def cloud_ai_100_exec_kv( ) if full_batch_size is None: exec_info = [ - 
generate_text.generate(prompt[i : i + batch_size], generation_len, stream, prompt_to_lora_id_mapping) + generate_text.generate( + prompt=prompt[i : i + batch_size], + generation_len=generation_len, + stream=stream, + prompt_to_lora_id_mapping=prompt_to_lora_id_mapping, + ) for i in range(0, len(prompt), batch_size) ] prefill_time = np.average([info.perf_metrics.prefill_time for info in exec_info]) @@ -416,8 +421,8 @@ def cloud_ai_100_exec_kv( ) else: exec_info = generate_text.generate( - images=images, prompt=prompt, + images=images, generation_len=generation_len, prompt_to_lora_id_mapping=prompt_to_lora_id_mapping, ) @@ -429,9 +434,9 @@ def cloud_ai_100_exec_kv( class QEffTextGenerationBase: def __init__( self, - processor, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], lang_qpc_path: str, + processor: Optional[AutoImageProcessor] = None, vision_qpc_path: Optional[str] = None, full_batch_size: Optional[int] = None, ctx_len: Optional[int] = None, @@ -450,11 +455,13 @@ def __init__( self.sampling_params = sampling_params # Load QPC + self._lang_session = None + self._vision_session = None if not lang_qpc_path: raise TypeError("Please run compile API for language model first!") self._lang_session = QAICInferenceSession(lang_qpc_path, device_id, activate=False) if vision_qpc_path: - self._vision_session = QAICInferenceSession(vision_qpc_path, device_id) + self._vision_session = QAICInferenceSession(vision_qpc_path, device_id, activate=False) # Validate sampler inputs for On-Device Sampling self.include_sampler = validate_sampler_inputs( @@ -682,7 +689,7 @@ def _fetch_next_token_id(self, outputs): logits = np.expand_dims(logits, 1) return logits.argmax(2) - def initialize_decode_inputs(self, num_images, num_prompts, execution_batch_size, max_gen_length): + def initialize_decode_inputs(self, num_prompts, execution_batch_size, max_gen_length): """ Initialize np arrays for storing the prefill output for all the decode batch size. """ @@ -740,14 +747,18 @@ def run_prefill_for_all_inputs(self, image_queue, prompt_queue, generation_len): generation_len (int): The generation length. 
""" + next_prompt = None + next_image = None for decode_batch_id in range(self.full_batch_size): - next_prompt = prompt_queue.popleft() - next_image = image_queue.popleft() + if prompt_queue: + next_prompt = prompt_queue.popleft() + if image_queue: + next_image = image_queue.popleft() # run prefill for num_chunks outputs, position_ids, generation_len = self.run_prefill( - next_image, next_prompt, + next_image, generation_len, decode_batch_id=np.array(decode_batch_id, dtype=np.int64).reshape(1, 1), ) @@ -771,8 +782,9 @@ def _set_output_buffers(self, batch_size: int = 1, sequence_length: int = 1): logits_out_placeholder = np.zeros((batch_size, sequence_length, self._vocab_size), dtype=np.float32) self._lang_session.set_buffers({"logits": logits_out_placeholder}) - vision_embeds_out_placeholder = np.zeros((2448, 5120), dtype=np.float16) - self._vision_session.set_buffers({"vision_embeds": vision_embeds_out_placeholder}) + if self._vision_session: + vision_embeds_out_placeholder = np.zeros((2448, 5120), dtype=np.float16) + self._vision_session.set_buffers({"vision_embeds": vision_embeds_out_placeholder}) def prepare_vision_language_inputs(self, prompt, image_url): messages = [ @@ -790,13 +802,18 @@ def prepare_vision_language_inputs(self, prompt, image_url): tokenize=True, return_dict=True, return_tensors="pt", - # padding="max_length", - # max_length=padded_len, ) inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) return inputs - def run_prefill(self, image, prompt, generation_len, prefill_logit_bs=1, decode_batch_id=None): + def run_prefill( + self, + prompt: str, + image: Optional[str] = None, + generation_len: Optional[int] = None, + prefill_logit_bs=1, + decode_batch_id=None, + ): """ Runs prefill for a given prompt and generation length. @@ -813,8 +830,12 @@ def run_prefill(self, image, prompt, generation_len, prefill_logit_bs=1, decode_ position_ids (array): The position IDs. generation_len (int): The generation length. 
""" + # Run prefill - inputs = self.prepare_vision_language_inputs(prompt, image) + if image: + inputs = self.prepare_vision_language_inputs(prompt, image) + else: + inputs = self.tokenizer(prompt, return_tensors="np", padding=True) position_ids = inputs["attention_mask"].sum(1, keepdims=True) padded_len = inputs["input_ids"].shape[1] @@ -829,40 +850,45 @@ def run_prefill(self, image, prompt, generation_len, prefill_logit_bs=1, decode_ # Set the prefill output buffers self._set_output_buffers(batch_size=prefill_logit_bs, sequence_length=1) - pad_token_id = 1 - input_ids_length = inputs["input_ids"].shape[1] - num_chunks = -(input_ids_length // -self._prefill_seq_len) # ceil divide without float - padded_len = num_chunks * self._prefill_seq_len # Convert to a multiple of prompt_len - - inputs["input_ids"] = torch.nn.functional.pad( - inputs["input_ids"], - (0, padded_len - input_ids_length), - "constant", - pad_token_id, - ) - inputs["attention_mask"] = torch.nn.functional.pad( - inputs["attention_mask"], (0, padded_len - input_ids_length), "constant", 0 - ) - if "cross_attention_mask" in inputs: - inputs["cross_attention_mask"] = torch.nn.functional.pad( - inputs["cross_attention_mask"], (0, 0, 0, 0, 0, padded_len - input_ids_length) + vision_inputs = {} + vision_outputs = {} + if image: + pad_token_id = 1 + input_ids_length = inputs["input_ids"].shape[1] + num_chunks = -(input_ids_length // -self._prefill_seq_len) # ceil divide without float + padded_len = num_chunks * self._prefill_seq_len # Convert to a multiple of prompt_len + + inputs["input_ids"] = torch.nn.functional.pad( + inputs["input_ids"], + (0, padded_len - input_ids_length), + "constant", + pad_token_id, ) + inputs["attention_mask"] = torch.nn.functional.pad( + inputs["attention_mask"], (0, padded_len - input_ids_length), "constant", 0 + ) + if "cross_attention_mask" in inputs: + inputs["cross_attention_mask"] = torch.nn.functional.pad( + inputs["cross_attention_mask"], (0, 0, 0, 0, 0, padded_len - input_ids_length) + ) - for k, v in inputs.items(): - inputs[k] = np.array(v) + for k, v in inputs.items(): + inputs[k] = np.array(v) - vision_inputs = { - k: v for k, v in inputs.items() if k in {"pixel_values", "aspect_ratio_ids", "aspect_ratio_mask"} - } - if vision_inputs: - vision_inputs["pixel_values"] = vision_inputs["pixel_values"].astype("float16") + vision_inputs = { + k: v for k, v in inputs.items() if k in {"pixel_values", "aspect_ratio_ids", "aspect_ratio_mask"} + } + if vision_inputs: + vision_inputs["pixel_values"] = vision_inputs["pixel_values"].astype("float16") - vision_outputs = {} - if self._vision_session: - self._vision_session.activate() - # Run vision prefill - if vision_inputs: - vision_outputs = self._vision_session.run(vision_inputs) + # Run vision prefill + if vision_inputs: + self._vision_session.activate() + vision_outputs = self._vision_session.run(vision_inputs) + self._vision_session.deactivate() + else: + inputs = self.tokenizer(prompt, return_tensors="np", padding="max_length", max_length=padded_len) + inputs.pop("token_type_ids", None) lang_inputs = {k: v for k, v in inputs.items() if k not in vision_inputs} lang_inputs["position_ids"] = np.where( @@ -871,10 +897,9 @@ def run_prefill(self, image, prompt, generation_len, prefill_logit_bs=1, decode_ # not_mllama = hasattr(self.model.config, "model_type") and self.model.config.model_type != "mllama" # if not_mllama: - lang_inputs["image_idx"] = np.array([[0]]) + if image: + lang_inputs["image_idx"] = np.array([[0]]) - if self._vision_session: - 
self._vision_session.deactivate() self._lang_session.activate() self._lang_session.set_buffers(vision_outputs) @@ -900,7 +925,7 @@ def run_prefill(self, image, prompt, generation_len, prefill_logit_bs=1, decode_ lang_inputs["lora_ids"] = np.array(batch_lora_ids, dtype=np.int64).reshape(self.batch_size, 1) # Run language prefill - chunk_inputs = lang_inputs.copy() + for i in range(num_chunks): chunk_inputs = lang_inputs.copy() chunk_inputs["input_ids"] = lang_inputs["input_ids"][ @@ -912,7 +937,8 @@ def run_prefill(self, image, prompt, generation_len, prefill_logit_bs=1, decode_ if self.include_sampler: chunk_inputs["last_accepted_output_tokens"] = chunk_inputs["input_ids"] outputs = self._lang_session.run(chunk_inputs) - chunk_inputs["image_idx"] = outputs["image_idx_output"] + if image: + chunk_inputs["image_idx"] = outputs["image_idx_output"] if self._write_io_dir is not None: write_io_files(inputs, outputs, self._write_io_dir, "prefill", "aic_batch_io", True, False) @@ -924,15 +950,15 @@ def run_prefill(self, image, prompt, generation_len, prefill_logit_bs=1, decode_ if x.startswith("past_") or x.endswith("_RetainedState") ] ) - if self._lang_session: - self._lang_session.deactivate() + self._lang_session.deactivate() + return ( outputs, position_ids, generation_len, ) - def run_continuous_batching_decode(self, image_queue, prompt_queue, generation_len): + def run_continuous_batching_decode(self, prompt_queue, generation_len): """ Runs continuous batching decode for the given prompt queue and generation length. @@ -979,8 +1005,8 @@ def run_continuous_batching_decode(self, image_queue, prompt_queue, generation_l start = perf_counter() # run prefill for next prompt input. outputs, position_ids, generation_len = self.run_prefill( - prompt_queue.popleft(), - generation_len, + prompt=prompt_queue.popleft(), + generation_len=generation_len, decode_batch_id=np.array(decode_batch_id, dtype=np.int64).reshape(1, 1), ) @@ -1039,6 +1065,7 @@ def run_decode(self, decode_inputs, generation_len, streamer: Optional[transform self._lang_session.set_buffers({"logits": logits_out_placeholder}) finished_sequences = decode_inputs["input_ids"] == self.tokenizer.eos_token_id num_token = 0 + self._lang_session.activate() for num_token in range(1, generation_len): if streamer: streamer.put(decode_inputs["input_ids"][0]) @@ -1058,6 +1085,7 @@ def run_decode(self, decode_inputs, generation_len, streamer: Optional[transform if finished_sequences.all(): break + self._lang_session.deactivate() return num_token def generate_decode_stream(self, decode_inputs, generation_len): @@ -1074,6 +1102,7 @@ def generate_decode_stream(self, decode_inputs, generation_len): token_id (int): The token generated in the decoding process. 
""" finished_sequences = decode_inputs["input_ids"] == self.tokenizer.eos_token_id + self._lang_session.activate() for num_token in range(1, generation_len): yield decode_inputs["input_ids"] outputs = self._lang_session.run(decode_inputs) @@ -1090,16 +1119,17 @@ def generate_decode_stream(self, decode_inputs, generation_len): if finished_sequences.all(): break + self._lang_session.deactivate() yield decode_inputs["input_ids"] # yield the last token class TextGeneration: def __init__( self, - processor, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], - vision_qpc_path: str, lang_qpc_path: str, + processor: Optional[AutoImageProcessor] = None, + vision_qpc_path: Optional[str] = None, full_batch_size: Optional[int] = None, ctx_len: Optional[int] = None, device_id: Optional[List[int]] = None, @@ -1111,9 +1141,9 @@ def __init__( sampling_params: Optional[Dict[str, Any]] = None, ) -> None: self._qaic_model = QEffTextGenerationBase( - processor, tokenizer=tokenizer, lang_qpc_path=lang_qpc_path, + processor=processor, vision_qpc_path=vision_qpc_path, full_batch_size=full_batch_size, ctx_len=ctx_len, @@ -1140,8 +1170,8 @@ def perf_metrics(self): def _setup_model_execution_inputs( self, - images, prompt: List[str], + images: Optional[List[str]] = None, generation_len: Optional[int] = None, prompt_to_lora_id_mapping: Optional[List[int]] = None, ): @@ -1159,15 +1189,15 @@ def _setup_model_execution_inputs( # Create a prompt queue. self._prompt_queue = deque(prompt) - self._image_queue = deque(images) + if images: + self._image_queue = deque(images) # Initialize np arrays for storing the prefill output for all the decode batch size. num_prompts = len(self._prompt_queue) - num_images = len(self._image_queue) if prompt_to_lora_id_mapping: self._qaic_model.initialize_lora_id_mapping(prompt_to_lora_id_mapping) - self._qaic_model.initialize_decode_inputs(num_images, num_prompts, execution_batch_size, max_gen_length) + self._qaic_model.initialize_decode_inputs(num_prompts, execution_batch_size, max_gen_length) def _regular_model_execution( self, @@ -1189,12 +1219,14 @@ def _regular_model_execution( :tuple: A tuple containing performance metrics and generated texts. """ - self._setup_model_execution_inputs(prompt, generation_len, prompt_to_lora_id_mapping) + self._setup_model_execution_inputs( + prompt=prompt, generation_len=generation_len, prompt_to_lora_id_mapping=prompt_to_lora_id_mapping + ) if stream and self._text_streamer is None: self._text_streamer = transformers.TextStreamer(self._tokenizer) start = perf_counter() outputs, position_ids, generation_len = self._qaic_model.run_prefill( - prompt, generation_len, prefill_logit_bs=self._qaic_model.batch_size + prompt=prompt, generation_len=generation_len, prefill_logit_bs=self._qaic_model.batch_size ) self._qaic_model.update_decode_input(outputs, position_ids, generation_len) @@ -1214,8 +1246,8 @@ def _regular_model_execution( def _continuous_batching_execution( self, - images, prompt: List[str], + images: Optional[List[str]] = None, generation_len: Optional[int] = None, prompt_to_lora_id_mapping: Optional[List[int]] = None, ): @@ -1231,17 +1263,13 @@ def _continuous_batching_execution( Returns: :tuple: A tuple containing performance metrics and generated texts. 
""" - self._setup_model_execution_inputs(images, prompt, generation_len, prompt_to_lora_id_mapping) + self._setup_model_execution_inputs(prompt, images, generation_len, prompt_to_lora_id_mapping) self._qaic_model.batch_index = np.arange(self._full_batch_size).reshape(-1, 1) start = perf_counter() self._qaic_model.run_prefill_for_all_inputs(self._image_queue, self._prompt_queue, generation_len) - print("\n\n\n\n Prefill for all inputs completed\n\n\n\n") - loop_start = perf_counter() # Start decode loop timer - decode_pause_time = self._qaic_model.run_continuous_batching_decode( - self._image_queue, self._prompt_queue, generation_len - ) + decode_pause_time = self._qaic_model.run_continuous_batching_decode(self._prompt_queue, generation_len) end = perf_counter() generated_texts = self._tokenizer.batch_decode(self._qaic_model.generated_ids, skip_special_tokens=True) @@ -1281,7 +1309,7 @@ def generate_stream_tokens( self._setup_model_execution_inputs(prompt, generation_len, prompt_to_lora_id_mapping) start = perf_counter() outputs, position_ids, generation_len = self._qaic_model.run_prefill( - prompt, generation_len, prefill_logit_bs=self._qaic_model.batch_size + prompt=prompt, generation_len=generation_len, prefill_logit_bs=self._qaic_model.batch_size ) self._qaic_model.update_decode_input(outputs, position_ids, generation_len) @@ -1305,8 +1333,8 @@ def generate_stream_tokens( def generate( self, - images, prompt: List[str], + images: Optional[List[str]] = None, generation_len: Optional[int] = None, stream: bool = True, prompt_to_lora_id_mapping: Optional[List[int]] = None, @@ -1326,7 +1354,7 @@ def generate( if self._full_batch_size is not None: logger.warning("Streamer is currently unavailable for continuous batch execution.") perf_metrics, generated_texts = self._continuous_batching_execution( - images, prompt, generation_len, prompt_to_lora_id_mapping + prompt, images, generation_len, prompt_to_lora_id_mapping ) else: if stream: diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index c0efb7277..2cada6ab4 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -14,6 +14,7 @@ import torch import torch.nn as nn from transformers import ( + AutoImageProcessor, AutoModel, AutoModelForCausalLM, AutoModelForImageTextToText, @@ -1160,7 +1161,7 @@ def generate( self, inputs: Optional[torch.Tensor] = None, tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer] = None, - processor=None, + processor: Optional[AutoImageProcessor] = None, images: List[str] = None, prompts: List[str] = None, streamer: Optional[TextStreamer] = None, @@ -1204,10 +1205,10 @@ def generate( if (processor and images) or (tokenizer and prompts): return QEfficient.cloud_ai_100_exec_kv( - tokenizer, - processor, - self.lang_model.qpc_path, - self.vision_model.qpc_path, + tokenizer=tokenizer, + processor=processor, + lang_qpc_path=self.lang_model.qpc_path, + vision_qpc_path=self.vision_model.qpc_path, images=images, prompt=prompts, device_id=device_ids, @@ -2717,8 +2718,8 @@ def generate( raise TypeError("Please run compile API first!") generation_len = kwargs.pop("generation_len", None) return QEfficient.cloud_ai_100_exec_kv( - tokenizer, - self.qpc_path, + tokenizer=tokenizer, + lang_qpc_path=self.qpc_path, prompt=prompts, device_id=device_id, generation_len=generation_len, diff --git a/examples/llama4_CB_example_vision_lang.py b/examples/llama4_CB_example_vision_lang.py index f6cd2bf5c..ebe65bf82 
100644 --- a/examples/llama4_CB_example_vision_lang.py +++ b/examples/llama4_CB_example_vision_lang.py @@ -56,10 +56,10 @@ ] output = qeff_model.generate( - images=image_urls, tokenizer=tokenizer, + prompts=prompts, processor=processor, + images=image_urls, device_ids=[0, 1, 2, 3], - prompts=prompts, generation_len=100, ) From 7944a7e85f1c9610ed6426239e5604c1b082b8d2 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Thu, 23 Oct 2025 08:40:16 +0000 Subject: [PATCH 3/4] Updated text_generation to run CB for VLMs Signed-off-by: Asmita Goswami --- QEfficient/generation/cloud_infer.py | 31 ++++-- .../generation/text_generation_inference.py | 98 ++++++++----------- 2 files changed, 66 insertions(+), 63 deletions(-) diff --git a/QEfficient/generation/cloud_infer.py b/QEfficient/generation/cloud_infer.py index 8519d824c..42c8b342e 100644 --- a/QEfficient/generation/cloud_infer.py +++ b/QEfficient/generation/cloud_infer.py @@ -90,8 +90,10 @@ def __init__( self.program = qaicrt.Program(self.context, None, qpc, prog_properties) if self.program.load() != qaicrt.QStatus.QS_SUCCESS: raise RuntimeError("Failed to load program") + self.is_active = False if activate: self.activate() + self.is_active = True # Create input qbuffers and buf_dims self.qbuffers = [qaicrt.QBuffer(bytes(binding.size)) for binding in self.bindings] self.buf_dims = qaicrt.BufferDimensionsVecRef( @@ -108,15 +110,32 @@ def output_names(self) -> List[str]: def activate(self): """Activate qpc""" - - self.program.activate() - self.execObj = qaicrt.ExecObj(self.context, self.program) + if not self.is_active: + self.program.activate() + self.execObj = qaicrt.ExecObj(self.context, self.program) + self.is_active = True def deactivate(self): """Deactivate qpc""" - - del self.execObj - self.program.deactivate() + if self.is_active: + del self.execObj + self.program.deactivate() + self.is_active = False + + def pause(self): + """Pause the session while preserving state""" + if self.is_active: + # Just deactivate the program and set state + self.program.deactivate() + self.is_active = False + + def resume(self): + """Resume a paused session""" + if not self.is_active: + # Reactivate program and create new execObj + self.program.activate() + self.execObj = qaicrt.ExecObj(self.context, self.program) + self.is_active = True def set_buffers(self, buffers: Dict[str, np.ndarray]): """ diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index ea0e155f7..f014bf6a0 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -11,6 +11,8 @@ from dataclasses import dataclass from time import perf_counter from typing import Any, Dict, List, Optional, Tuple, Union +import requests +from PIL import Image import numpy as np import torch @@ -399,41 +401,15 @@ def cloud_ai_100_exec_kv( sampling_params=sampling_params, ) - if full_batch_size is None: - exec_info = [ - generate_text.generate( - prompt=prompt[i : i + batch_size], - generation_len=generation_len, - stream=stream, - prompt_to_lora_id_mapping=prompt_to_lora_id_mapping, - ) - for i in range(0, len(prompt), batch_size) - ] - prefill_time = np.average([info.perf_metrics.prefill_time for info in exec_info]) - decode_perf = np.average([info.perf_metrics.decode_perf for info in exec_info]) - total_perf = np.average([info.perf_metrics.total_perf for info in exec_info]) - total_time = np.average([info.perf_metrics.total_time for info in exec_info]) - generated_texts = 
[info.generated_texts for info in exec_info] - generated_ids = [info.generated_ids for info in exec_info] - - exec_info = CloudAI100ExecInfo( - batch_size=batch_size, - generated_texts=generated_texts, - generated_ids=generated_ids, - perf_metrics=PerfMetrics(prefill_time, decode_perf, total_perf, total_time), - ) - else: - exec_info = generate_text.generate( - prompt=prompt, - images=images, - generation_len=generation_len, - prompt_to_lora_id_mapping=prompt_to_lora_id_mapping, - ) - for _ in range(0, int(iteration)): if full_batch_size is None: exec_info = [ - generate_text.generate(prompt[i : i + batch_size], generation_len, stream, prompt_to_lora_id_mapping) + generate_text.generate( + prompt=prompt[i : i + batch_size], + generation_len=generation_len, + stream=stream, + prompt_to_lora_id_mapping=prompt_to_lora_id_mapping, + ) for i in range(0, len(prompt), batch_size) ] prefill_time = np.average([info.perf_metrics.prefill_time for info in exec_info]) @@ -451,7 +427,10 @@ def cloud_ai_100_exec_kv( ) else: exec_info = generate_text.generate( - prompt=prompt, generation_len=generation_len, prompt_to_lora_id_mapping=prompt_to_lora_id_mapping + prompt=prompt, + images=images, + generation_len=generation_len, + prompt_to_lora_id_mapping=prompt_to_lora_id_mapping, ) print_latency_stats_kv(prompt, exec_info=exec_info, automation=automation) @@ -765,7 +744,7 @@ def update_decode_input(self, outputs, position_ids, generation_len, decode_batc self.generation_len[decode_batch_id or slice(None)] = generation_len return next_token_id - def run_prefill_for_all_inputs(self, image_queue, prompt_queue, generation_len): + def run_prefill_for_all_inputs(self, image_queue, prompt_queue, processor, generation_len): """ Runs prefill for all inputs in the prompt queue and updates the decode input. 
@@ -788,6 +767,7 @@ def run_prefill_for_all_inputs(self, image_queue, prompt_queue, generation_len): outputs, position_ids, generation_len = self.run_prefill( next_prompt, next_image, + processor, generation_len, decode_batch_id=np.array(decode_batch_id, dtype=np.int64).reshape(1, 1), ) @@ -815,30 +795,28 @@ def _set_output_buffers(self, batch_size: int = 1, sequence_length: int = 1): vision_embeds_out_placeholder = np.zeros((2448, 5120), dtype=np.float16) self._vision_session.set_buffers({"vision_embeds": vision_embeds_out_placeholder}) - def prepare_vision_language_inputs(self, prompt, image_url): - messages = [ + def prepare_vision_language_inputs(self, processor, query, image_url): + image = Image.open(requests.get(image_url, stream=True).raw) + conversation = [ { "role": "user", "content": [ - {"type": "image", "url": image_url}, - {"type": "text", "text": prompt}, + {"type": "text", "text": query}, + {"type": "image"}, ], }, ] - inputs = self.processor.apply_chat_template( - messages, - add_generation_prompt=True, - tokenize=True, - return_dict=True, - return_tensors="pt", - ) - inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) + prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) + inputs = processor(images=image, text=prompt, return_tensors="pt") + if "pixel_values" in inputs: + inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) return inputs def run_prefill( self, prompt: str, image: Optional[str] = None, + processor: Optional[AutoImageProcessor] = None, generation_len: Optional[int] = None, prefill_logit_bs=1, decode_batch_id=None, @@ -862,7 +840,7 @@ def run_prefill( # Run prefill if image: - inputs = self.prepare_vision_language_inputs(prompt, image) + inputs = self.prepare_vision_language_inputs(processor, prompt, image) else: inputs = self.tokenizer(prompt, return_tensors="np", padding=True) @@ -905,16 +883,22 @@ def run_prefill( inputs[k] = np.array(v) vision_inputs = { - k: v for k, v in inputs.items() if k in {"pixel_values", "aspect_ratio_ids", "aspect_ratio_mask"} + k: v for k, v in inputs.items() if k in {"pixel_values", "image_masks", "image_input_idx", "valid_idx", "aspect_ratio_ids", "aspect_ratio_mask"} } - if vision_inputs: - vision_inputs["pixel_values"] = vision_inputs["pixel_values"].astype("float16") + # if vision_inputs: + # vision_inputs["pixel_values"] = vision_inputs["pixel_values"].astype("float16") + vision_inputs_fp16 = {"pixel_values", "image_masks"} + vision_inputs.update({k: vision_inputs[k].astype("float16") for k in vision_inputs_fp16 if k in vision_inputs}) + # if not(self._lang_session.is_active): + # self._lang_session.activate() # Run vision prefill if vision_inputs: + # self._lang_session.pause() self._vision_session.activate() vision_outputs = self._vision_session.run(vision_inputs) self._vision_session.deactivate() + # self._lang_session.resume() else: inputs = self.tokenizer(prompt, return_tensors="np", padding="max_length", max_length=padded_len) inputs.pop("token_type_ids", None) @@ -924,8 +908,9 @@ def run_prefill( lang_inputs.pop("attention_mask"), np.arange(padded_len), -1 ) # Need to use -1 as position_ids for invalid tokens - # not_mllama = hasattr(self.model.config, "model_type") and self.model.config.model_type != "mllama" - # if not_mllama: + # not_mllama = hasattr(self.model.config, "model_type") and self.model.config.model_type != "mllama" + # if not_mllama: + # lang_inputs["image_idx"] = np.array([[0]]) if image: lang_inputs["image_idx"] = np.array([[0]]) @@ -954,9 
+939,8 @@ def run_prefill( lang_inputs["lora_ids"] = np.array(batch_lora_ids, dtype=np.int64).reshape(self.batch_size, 1) # Run language prefill - + chunk_inputs = lang_inputs.copy() for i in range(num_chunks): - chunk_inputs = lang_inputs.copy() chunk_inputs["input_ids"] = lang_inputs["input_ids"][ :, i * self._prefill_seq_len : (i + 1) * self._prefill_seq_len ] @@ -979,7 +963,7 @@ def run_prefill( if x.startswith("past_") or x.endswith("_RetainedState") ] ) - self._lang_session.deactivate() + # self._lang_session.deactivate() return ( outputs, @@ -1018,8 +1002,8 @@ def run_continuous_batching_decode(self, prompt_queue, generation_len): # Prepare decode inputs inputs. decode_inputs = self.prepare_decode_inputs() + # self._lang_session.activate() # Due to activating new session (new exec_obj) run values are changing while prompt_queue or current_decode_ongoing.any(): - self._lang_session.activate() outputs = self._lang_session.run(decode_inputs) # Prepare inputs for next iteration @@ -1298,7 +1282,7 @@ def _continuous_batching_execution( self._setup_model_execution_inputs(prompt, images, generation_len, prompt_to_lora_id_mapping) self._qaic_model.batch_index = np.arange(self._full_batch_size).reshape(-1, 1) start = perf_counter() - self._qaic_model.run_prefill_for_all_inputs(self._image_queue, self._prompt_queue, generation_len) + self._qaic_model.run_prefill_for_all_inputs(self._image_queue, self._prompt_queue, self._processor, generation_len) loop_start = perf_counter() # Start decode loop timer decode_pause_time = self._qaic_model.run_continuous_batching_decode(self._prompt_queue, generation_len) From 3065310a6d4c3bb7415479f4cbf82f91a614f9b2 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Thu, 23 Oct 2025 08:50:48 +0000 Subject: [PATCH 4/4] Ruff format Signed-off-by: Asmita Goswami --- .../generation/text_generation_inference.py | 30 ++++++++++++++----- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index f014bf6a0..190bcf764 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -11,12 +11,12 @@ from dataclasses import dataclass from time import perf_counter from typing import Any, Dict, List, Optional, Tuple, Union -import requests -from PIL import Image import numpy as np +import requests import torch import transformers +from PIL import Image from transformers import AutoImageProcessor, PreTrainedTokenizer, PreTrainedTokenizerFast from QEfficient.generation.cloud_infer import QAICInferenceSession @@ -883,12 +883,24 @@ def run_prefill( inputs[k] = np.array(v) vision_inputs = { - k: v for k, v in inputs.items() if k in {"pixel_values", "image_masks", "image_input_idx", "valid_idx", "aspect_ratio_ids", "aspect_ratio_mask"} + k: v + for k, v in inputs.items() + if k + in { + "pixel_values", + "image_masks", + "image_input_idx", + "valid_idx", + "aspect_ratio_ids", + "aspect_ratio_mask", + } } # if vision_inputs: # vision_inputs["pixel_values"] = vision_inputs["pixel_values"].astype("float16") vision_inputs_fp16 = {"pixel_values", "image_masks"} - vision_inputs.update({k: vision_inputs[k].astype("float16") for k in vision_inputs_fp16 if k in vision_inputs}) + vision_inputs.update( + {k: vision_inputs[k].astype("float16") for k in vision_inputs_fp16 if k in vision_inputs} + ) # if not(self._lang_session.is_active): # self._lang_session.activate() @@ -908,9 +920,9 @@ def run_prefill( 
lang_inputs.pop("attention_mask"), np.arange(padded_len), -1 ) # Need to use -1 as position_ids for invalid tokens - # not_mllama = hasattr(self.model.config, "model_type") and self.model.config.model_type != "mllama" - # if not_mllama: - # lang_inputs["image_idx"] = np.array([[0]]) + # not_mllama = hasattr(self.model.config, "model_type") and self.model.config.model_type != "mllama" + # if not_mllama: + # lang_inputs["image_idx"] = np.array([[0]]) if image: lang_inputs["image_idx"] = np.array([[0]]) @@ -1282,7 +1294,9 @@ def _continuous_batching_execution( self._setup_model_execution_inputs(prompt, images, generation_len, prompt_to_lora_id_mapping) self._qaic_model.batch_index = np.arange(self._full_batch_size).reshape(-1, 1) start = perf_counter() - self._qaic_model.run_prefill_for_all_inputs(self._image_queue, self._prompt_queue, self._processor, generation_len) + self._qaic_model.run_prefill_for_all_inputs( + self._image_queue, self._prompt_queue, self._processor, generation_len + ) loop_start = perf_counter() # Start decode loop timer decode_pause_time = self._qaic_model.run_continuous_batching_decode(self._prompt_queue, generation_len)