From 1d98635a50f9ff692c0b3ed3abbb5562fc626721 Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Sun, 5 Oct 2025 21:22:20 +0000 Subject: [PATCH 1/4] Enable CB for vlms with multiple images and multiple prompts Signed-off-by: Mamta Singh --- .../generation/text_generation_inference.py | 252 ++++++++++++++---- .../models/llama4/modeling_llama4.py | 98 +++++-- .../transformers/models/modeling_auto.py | 72 +++-- examples/llama4_CB_example_vision_lang.py | 65 +++++ 4 files changed, 384 insertions(+), 103 deletions(-) create mode 100644 examples/llama4_CB_example_vision_lang.py diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index cf9cbcacc..554f030af 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -13,6 +13,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np +import torch import transformers from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast @@ -313,7 +314,10 @@ def calculate_latency(total_decoded_tokens, loop_start, start, end, decode_pause def cloud_ai_100_exec_kv( tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], - qpc_path: str, + processor, + lang_qpc_path: str, + vision_qpc_path: str, + images, prompt: Optional[str] = None, prompts_txt_file_path: Optional[str] = None, device_id: Optional[List[int]] = None, @@ -370,7 +374,7 @@ def cloud_ai_100_exec_kv( exec_info = QEfficient.cloud_ai_100_exec_kv(tokenizer=tokenizer, qpc_path=qpc_path, prompt="Hi there!!", device_id=[0]) """ - batch_size, ctx_len, full_batch_size = get_compilation_dims(qpc_path) + batch_size, ctx_len, full_batch_size = get_compilation_dims(lang_qpc_path) prompt: List[str] = get_input_prompts(prompt, prompts_txt_file_path) prompt = fix_prompts(prompt, batch_size, full_batch_size) if prompt_to_lora_id_mapping is not None: @@ -379,7 +383,9 @@ def cloud_ai_100_exec_kv( ) generate_text = TextGeneration( tokenizer=tokenizer, - qpc_path=qpc_path, + processor=processor, + lang_qpc_path=lang_qpc_path, + vision_qpc_path=vision_qpc_path, device_id=device_id, ctx_len=ctx_len, enable_debug_logs=enable_debug_logs, @@ -410,7 +416,10 @@ def cloud_ai_100_exec_kv( ) else: exec_info = generate_text.generate( - prompt=prompt, generation_len=generation_len, prompt_to_lora_id_mapping=prompt_to_lora_id_mapping + images=images, + prompt=prompt, + generation_len=generation_len, + prompt_to_lora_id_mapping=prompt_to_lora_id_mapping, ) print_latency_stats_kv(prompt, exec_info=exec_info, automation=automation) @@ -420,8 +429,10 @@ def cloud_ai_100_exec_kv( class QEffTextGenerationBase: def __init__( self, + processor, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], - qpc_path: str, + lang_qpc_path: str, + vision_qpc_path: Optional[str] = None, full_batch_size: Optional[int] = None, ctx_len: Optional[int] = None, device_id: Optional[List[int]] = None, @@ -439,11 +450,15 @@ def __init__( self.sampling_params = sampling_params # Load QPC - self._session = QAICInferenceSession(qpc_path, device_id, enable_debug_logs=enable_debug_logs) + if not lang_qpc_path: + raise TypeError("Please run compile API for language model first!") + self._lang_session = QAICInferenceSession(lang_qpc_path, device_id, activate=False) + if vision_qpc_path: + self._vision_session = QAICInferenceSession(vision_qpc_path, device_id) # Validate sampler inputs for On-Device Sampling self.include_sampler = validate_sampler_inputs( - 
session_inputs=set(self._session.input_names), include_sampler=include_sampler + session_inputs=set(self._lang_session.input_names), include_sampler=include_sampler ) # Fetch the variables from the QPC @@ -468,10 +483,23 @@ def __init__( self.generation_len = None self.tokenizer = tokenizer + self.processor = processor self._set_tokenizer_params() # set tokenizer params # Skip inputs/outputs - self._session.skip_buffers( - [x for x in self._session.input_names + self._session.output_names if x.startswith("past_")] + if self._vision_session: + self._vision_session.skip_buffers( + [ + x + for x in self._vision_session.input_names + self._vision_session.output_names + if x.startswith("past_") or x.endswith("_RetainedState") + ] + ) + self._lang_session.skip_buffers( + [ + x + for x in self._lang_session.input_names + self._lang_session.output_names + if x.startswith("past_") or x.endswith("_RetainedState") + ] ) def _set_tokenizer_params(self): @@ -496,13 +524,16 @@ def _fetch_full_batch_size( """ full_batch_size = None - if "batch_index" in self._session.binding_index_map: - if self._session.allowed_shapes: + if "batch_index" in self._lang_session.binding_index_map: + if self._lang_session.allowed_shapes: full_batch_size, _ = [ - x[self._session.binding_index_map["batch_index"]][1][0] for x in self._session.allowed_shapes + x[self._lang_session.binding_index_map["batch_index"]][1][0] + for x in self._lang_session.allowed_shapes ] else: - full_batch_size, _ = self._session.bindings[self._session.binding_index_map["batch_index"]].dims + full_batch_size, _ = self._lang_session.bindings[ + self._lang_session.binding_index_map["batch_index"] + ].dims return full_batch_size def _fetch_batch_size_prefill_seq_len( @@ -515,15 +546,17 @@ def _fetch_batch_size_prefill_seq_len( batch_size: The batch size fetched from the session's bindings or allowed shapes. prefill_seq_len: The prefill sequence length fetched from the session's bindings or allowed shapes. """ - if self._session.allowed_shapes: + if self._lang_session.allowed_shapes: batch_size = max( - [x[self._session.binding_index_map["input_ids"]][1][0] for x in self._session.allowed_shapes] + [x[self._lang_session.binding_index_map["input_ids"]][1][0] for x in self._lang_session.allowed_shapes] ) prefill_seq_len = max( - [x[self._session.binding_index_map["input_ids"]][1][1] for x in self._session.allowed_shapes] + [x[self._lang_session.binding_index_map["input_ids"]][1][1] for x in self._lang_session.allowed_shapes] ) else: - batch_size, prefill_seq_len = self._session.bindings[self._session.binding_index_map["input_ids"]].dims + batch_size, prefill_seq_len = self._lang_session.bindings[ + self._lang_session.binding_index_map["input_ids"] + ].dims return batch_size, prefill_seq_len def _fetch_decode_seq_len( @@ -536,9 +569,9 @@ def _fetch_decode_seq_len( decode_seq_len: The decode sequence length fetched from the session's bindings or allowed shapes. 
""" decode_seq_len = None - if self._session.allowed_shapes: + if self._lang_session.allowed_shapes: decode_seq_len = min( - [x[self._session.binding_index_map["input_ids"]][1][1] for x in self._session.allowed_shapes] + [x[self._lang_session.binding_index_map["input_ids"]][1][1] for x in self._lang_session.allowed_shapes] ) return decode_seq_len @@ -557,10 +590,10 @@ def _fetch_vocab_size( if self.include_sampler else "logits" ) - if self._session.allowed_shapes: - return [x[self._session.binding_index_map[key]] for x in self._session.allowed_shapes][0][1][2] + if self._lang_session.allowed_shapes: + return [x[self._lang_session.binding_index_map[key]] for x in self._lang_session.allowed_shapes][0][1][2] - return self._session.bindings[self._session.binding_index_map[key]].dims[2] + return self._lang_session.bindings[self._lang_session.binding_index_map[key]].dims[2] def _fetch_generation_len(self, generation_len, max_gen_len): """ @@ -649,7 +682,7 @@ def _fetch_next_token_id(self, outputs): logits = np.expand_dims(logits, 1) return logits.argmax(2) - def initialize_decode_inputs(self, num_prompts, execution_batch_size, max_gen_length): + def initialize_decode_inputs(self, num_images, num_prompts, execution_batch_size, max_gen_length): """ Initialize np arrays for storing the prefill output for all the decode batch size. """ @@ -696,7 +729,7 @@ def update_decode_input(self, outputs, position_ids, generation_len, decode_batc self.generation_len[decode_batch_id or slice(None)] = generation_len return next_token_id - def run_prefill_for_all_inputs(self, prompt_queue, generation_len): + def run_prefill_for_all_inputs(self, image_queue, prompt_queue, generation_len): """ Runs prefill for all inputs in the prompt queue and updates the decode input. @@ -709,10 +742,14 @@ def run_prefill_for_all_inputs(self, prompt_queue, generation_len): """ for decode_batch_id in range(self.full_batch_size): next_prompt = prompt_queue.popleft() + next_image = image_queue.popleft() # run prefill for num_chunks outputs, position_ids, generation_len = self.run_prefill( - next_prompt, generation_len, decode_batch_id=np.array(decode_batch_id, dtype=np.int64).reshape(1, 1) + next_image, + next_prompt, + generation_len, + decode_batch_id=np.array(decode_batch_id, dtype=np.int64).reshape(1, 1), ) _ = self.update_decode_input(outputs, position_ids, generation_len, decode_batch_id) @@ -727,14 +764,39 @@ def _set_output_buffers(self, batch_size: int = 1, sequence_length: int = 1): if self.include_sampler: if self.return_pdfs: probs_out_placeholder = np.zeros((batch_size, sequence_length, self._vocab_size), dtype=np.float32) - self._session.set_buffers({"probs": probs_out_placeholder}) + self._lang_session.set_buffers({"probs": probs_out_placeholder}) next_tokens_out_placeholder = np.zeros((batch_size, sequence_length, 1), dtype=np.int64) - self._session.set_buffers({"next_tokens": next_tokens_out_placeholder}) + self._lang_session.set_buffers({"next_tokens": next_tokens_out_placeholder}) else: logits_out_placeholder = np.zeros((batch_size, sequence_length, self._vocab_size), dtype=np.float32) - self._session.set_buffers({"logits": logits_out_placeholder}) + self._lang_session.set_buffers({"logits": logits_out_placeholder}) + + vision_embeds_out_placeholder = np.zeros((2448, 5120), dtype=np.float16) + self._vision_session.set_buffers({"vision_embeds": vision_embeds_out_placeholder}) + + def prepare_vision_language_inputs(self, prompt, image_url): + messages = [ + { + "role": "user", + "content": [ + {"type": "image", 
"url": image_url}, + {"type": "text", "text": prompt}, + ], + }, + ] + inputs = self.processor.apply_chat_template( + messages, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + return_tensors="pt", + # padding="max_length", + # max_length=padded_len, + ) + inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) + return inputs - def run_prefill(self, prompt, generation_len, prefill_logit_bs=1, decode_batch_id=None): + def run_prefill(self, image, prompt, generation_len, prefill_logit_bs=1, decode_batch_id=None): """ Runs prefill for a given prompt and generation length. @@ -752,7 +814,8 @@ def run_prefill(self, prompt, generation_len, prefill_logit_bs=1, decode_batch_i generation_len (int): The generation length. """ # Run prefill - inputs = self.tokenizer(prompt, return_tensors="np", padding=True) + inputs = self.prepare_vision_language_inputs(prompt, image) + position_ids = inputs["attention_mask"].sum(1, keepdims=True) padded_len = inputs["input_ids"].shape[1] num_chunks = -(padded_len // -self._prefill_seq_len) # ceil divide without float @@ -766,51 +829,110 @@ def run_prefill(self, prompt, generation_len, prefill_logit_bs=1, decode_batch_i # Set the prefill output buffers self._set_output_buffers(batch_size=prefill_logit_bs, sequence_length=1) - inputs = self.tokenizer(prompt, return_tensors="np", padding="max_length", max_length=padded_len) - inputs["position_ids"] = np.where(inputs.pop("attention_mask"), np.arange(padded_len), -1) - inputs.pop("token_type_ids", None) + pad_token_id = 1 + input_ids_length = inputs["input_ids"].shape[1] + num_chunks = -(input_ids_length // -self._prefill_seq_len) # ceil divide without float + padded_len = num_chunks * self._prefill_seq_len # Convert to a multiple of prompt_len + + inputs["input_ids"] = torch.nn.functional.pad( + inputs["input_ids"], + (0, padded_len - input_ids_length), + "constant", + pad_token_id, + ) + inputs["attention_mask"] = torch.nn.functional.pad( + inputs["attention_mask"], (0, padded_len - input_ids_length), "constant", 0 + ) + if "cross_attention_mask" in inputs: + inputs["cross_attention_mask"] = torch.nn.functional.pad( + inputs["cross_attention_mask"], (0, 0, 0, 0, 0, padded_len - input_ids_length) + ) + + for k, v in inputs.items(): + inputs[k] = np.array(v) + + vision_inputs = { + k: v for k, v in inputs.items() if k in {"pixel_values", "aspect_ratio_ids", "aspect_ratio_mask"} + } + if vision_inputs: + vision_inputs["pixel_values"] = vision_inputs["pixel_values"].astype("float16") + + vision_outputs = {} + if self._vision_session: + self._vision_session.activate() + # Run vision prefill + if vision_inputs: + vision_outputs = self._vision_session.run(vision_inputs) + + lang_inputs = {k: v for k, v in inputs.items() if k not in vision_inputs} + lang_inputs["position_ids"] = np.where( + lang_inputs.pop("attention_mask"), np.arange(padded_len), -1 + ) # Need to use -1 as position_ids for invalid tokens + + # not_mllama = hasattr(self.model.config, "model_type") and self.model.config.model_type != "mllama" + # if not_mllama: + lang_inputs["image_idx"] = np.array([[0]]) + + if self._vision_session: + self._vision_session.deactivate() + self._lang_session.activate() + self._lang_session.set_buffers(vision_outputs) if decode_batch_id is not None: - inputs["batch_index"] = decode_batch_id + lang_inputs["batch_index"] = decode_batch_id if self.is_tlm: - inputs["num_logits_to_keep"] = np.zeros((1, 1)) + lang_inputs["num_logits_to_keep"] = np.zeros((1, 1)) if self.include_sampler: - 
inputs["last_accepted_output_tokens"] = inputs["input_ids"] + lang_inputs["last_accepted_output_tokens"] = lang_inputs["input_ids"] for op in Constants.SAMPLER_OPS: if decode_batch_id is not None: - inputs[op] = self.sampling_params[op][decode_batch_id.flatten()] + lang_inputs[op] = self.sampling_params[op][decode_batch_id.flatten()] else: - inputs[op] = self.sampling_params[op] + lang_inputs[op] = self.sampling_params[op] if self._prompt_to_lora_id_mapping_prefill: if self.full_batch_size: - inputs["lora_ids"] = np.array( + lang_inputs["lora_ids"] = np.array( self._prompt_to_lora_id_mapping_prefill.popleft(), dtype=np.int64 ).reshape(1, 1) else: batch_lora_ids = [self._prompt_to_lora_id_mapping_prefill.popleft() for i in range(self.batch_size)] - inputs["lora_ids"] = np.array(batch_lora_ids, dtype=np.int64).reshape(self.batch_size, 1) + lang_inputs["lora_ids"] = np.array(batch_lora_ids, dtype=np.int64).reshape(self.batch_size, 1) + # Run language prefill + chunk_inputs = lang_inputs.copy() for i in range(num_chunks): - chunk_inputs = inputs.copy() - chunk_inputs["input_ids"] = inputs["input_ids"][ + chunk_inputs = lang_inputs.copy() + chunk_inputs["input_ids"] = lang_inputs["input_ids"][ :, i * self._prefill_seq_len : (i + 1) * self._prefill_seq_len ] - chunk_inputs["position_ids"] = inputs["position_ids"][ + chunk_inputs["position_ids"] = lang_inputs["position_ids"][ :, i * self._prefill_seq_len : (i + 1) * self._prefill_seq_len ] if self.include_sampler: chunk_inputs["last_accepted_output_tokens"] = chunk_inputs["input_ids"] - outputs = self._session.run(chunk_inputs) + outputs = self._lang_session.run(chunk_inputs) + chunk_inputs["image_idx"] = outputs["image_idx_output"] if self._write_io_dir is not None: write_io_files(inputs, outputs, self._write_io_dir, "prefill", "aic_batch_io", True, False) + + # Skip inputs/outputs again + self._lang_session.skip_buffers( + [ + x + for x in self._lang_session.input_names + self._lang_session.output_names + if x.startswith("past_") or x.endswith("_RetainedState") + ] + ) + if self._lang_session: + self._lang_session.deactivate() return ( outputs, position_ids, generation_len, ) - def run_continuous_batching_decode(self, prompt_queue, generation_len): + def run_continuous_batching_decode(self, image_queue, prompt_queue, generation_len): """ Runs continuous batching decode for the given prompt queue and generation length. 
@@ -842,7 +964,8 @@ def run_continuous_batching_decode(self, prompt_queue, generation_len): decode_inputs = self.prepare_decode_inputs() while prompt_queue or current_decode_ongoing.any(): - outputs = self._session.run(decode_inputs) + self._lang_session.activate() + outputs = self._lang_session.run(decode_inputs) # Prepare inputs for next iteration next_token_id = self._fetch_next_token_id(outputs) @@ -892,6 +1015,8 @@ def run_continuous_batching_decode(self, prompt_queue, generation_len): generated_id_current_index[decode_batch_id] += 1 + self._lang_session.deactivate() + return decode_pause_time def run_decode(self, decode_inputs, generation_len, streamer: Optional[transformers.TextStreamer] = None): @@ -911,13 +1036,13 @@ def run_decode(self, decode_inputs, generation_len, streamer: Optional[transform logits_out_placeholder = np.zeros( (self.batch_size, self._decode_seq_len, self._vocab_size), dtype=np.float32 ) - self._session.set_buffers({"logits": logits_out_placeholder}) + self._lang_session.set_buffers({"logits": logits_out_placeholder}) finished_sequences = decode_inputs["input_ids"] == self.tokenizer.eos_token_id num_token = 0 for num_token in range(1, generation_len): if streamer: streamer.put(decode_inputs["input_ids"][0]) - outputs = self._session.run(decode_inputs) + outputs = self._lang_session.run(decode_inputs) if self._write_io_dir is not None: write_io_files(decode_inputs, outputs, self._write_io_dir, "decode", "aic_batch_io", True, False) @@ -951,7 +1076,7 @@ def generate_decode_stream(self, decode_inputs, generation_len): finished_sequences = decode_inputs["input_ids"] == self.tokenizer.eos_token_id for num_token in range(1, generation_len): yield decode_inputs["input_ids"] - outputs = self._session.run(decode_inputs) + outputs = self._lang_session.run(decode_inputs) if self._write_io_dir is not None: write_io_files(decode_inputs, outputs, self._write_io_dir, "decode", "aic_batch_io", True, False) @@ -971,8 +1096,10 @@ def generate_decode_stream(self, decode_inputs, generation_len): class TextGeneration: def __init__( self, + processor, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], - qpc_path: str, + vision_qpc_path: str, + lang_qpc_path: str, full_batch_size: Optional[int] = None, ctx_len: Optional[int] = None, device_id: Optional[List[int]] = None, @@ -984,8 +1111,10 @@ def __init__( sampling_params: Optional[Dict[str, Any]] = None, ) -> None: self._qaic_model = QEffTextGenerationBase( + processor, tokenizer=tokenizer, - qpc_path=qpc_path, + lang_qpc_path=lang_qpc_path, + vision_qpc_path=vision_qpc_path, full_batch_size=full_batch_size, ctx_len=ctx_len, device_id=device_id, @@ -998,9 +1127,11 @@ def __init__( ) self._full_batch_size = self._qaic_model.full_batch_size self._tokenizer = self._qaic_model.tokenizer + self._processor = self._qaic_model.processor self._ctx_len = ctx_len self._perf_metrics = None self._prompt_queue = None + self._image_queue = None self._text_streamer = None @property @@ -1009,6 +1140,7 @@ def perf_metrics(self): def _setup_model_execution_inputs( self, + images, prompt: List[str], generation_len: Optional[int] = None, prompt_to_lora_id_mapping: Optional[List[int]] = None, @@ -1027,13 +1159,15 @@ def _setup_model_execution_inputs( # Create a prompt queue. self._prompt_queue = deque(prompt) + self._image_queue = deque(images) # Initialize np arrays for storing the prefill output for all the decode batch size. 
num_prompts = len(self._prompt_queue) + num_images = len(self._image_queue) if prompt_to_lora_id_mapping: self._qaic_model.initialize_lora_id_mapping(prompt_to_lora_id_mapping) - self._qaic_model.initialize_decode_inputs(num_prompts, execution_batch_size, max_gen_length) + self._qaic_model.initialize_decode_inputs(num_images, num_prompts, execution_batch_size, max_gen_length) def _regular_model_execution( self, @@ -1080,6 +1214,7 @@ def _regular_model_execution( def _continuous_batching_execution( self, + images, prompt: List[str], generation_len: Optional[int] = None, prompt_to_lora_id_mapping: Optional[List[int]] = None, @@ -1096,13 +1231,17 @@ def _continuous_batching_execution( Returns: :tuple: A tuple containing performance metrics and generated texts. """ - self._setup_model_execution_inputs(prompt, generation_len, prompt_to_lora_id_mapping) + self._setup_model_execution_inputs(images, prompt, generation_len, prompt_to_lora_id_mapping) self._qaic_model.batch_index = np.arange(self._full_batch_size).reshape(-1, 1) start = perf_counter() - self._qaic_model.run_prefill_for_all_inputs(self._prompt_queue, generation_len) + self._qaic_model.run_prefill_for_all_inputs(self._image_queue, self._prompt_queue, generation_len) + + print("\n\n\n\n Prefill for all inputs completed\n\n\n\n") loop_start = perf_counter() # Start decode loop timer - decode_pause_time = self._qaic_model.run_continuous_batching_decode(self._prompt_queue, generation_len) + decode_pause_time = self._qaic_model.run_continuous_batching_decode( + self._image_queue, self._prompt_queue, generation_len + ) end = perf_counter() generated_texts = self._tokenizer.batch_decode(self._qaic_model.generated_ids, skip_special_tokens=True) @@ -1166,6 +1305,7 @@ def generate_stream_tokens( def generate( self, + images, prompt: List[str], generation_len: Optional[int] = None, stream: bool = True, @@ -1186,7 +1326,7 @@ def generate( if self._full_batch_size is not None: logger.warning("Streamer is currently unavailable for continuous batch execution.") perf_metrics, generated_texts = self._continuous_batching_execution( - prompt, generation_len, prompt_to_lora_id_mapping + images, prompt, generation_len, prompt_to_lora_id_mapping ) else: if stream: diff --git a/QEfficient/transformers/models/llama4/modeling_llama4.py b/QEfficient/transformers/models/llama4/modeling_llama4.py index 4b957ebec..d46ca8b14 100644 --- a/QEfficient/transformers/models/llama4/modeling_llama4.py +++ b/QEfficient/transformers/models/llama4/modeling_llama4.py @@ -815,7 +815,7 @@ def forward(self, pixel_values): ) vision_flat = image_features.view(-1, image_features.size(-1)) projected_vision_flat = self.model.multi_modal_projector(vision_flat) - return projected_vision_flat + return projected_vision_flat # , pixel_values # This wrapper utilizes the 'vision_embeds', which contains vision embeddings, and an 'image_idx' index starting at 0. 
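The comment above states the contract for the language-model wrapper: vision_embeds carries the projected image features and image_idx starts at 0 and is advanced as image placeholder tokens are consumed. Below is a minimal, self-contained sketch (not part of this patch) of that gather-and-advance mechanism across two prefill chunks; IMAGE_TOKEN_ID and the toy sizes are assumptions for illustration only.

import torch

IMAGE_TOKEN_ID = 99        # assumed stand-in for config.image_token_index
HIDDEN = 4                 # toy hidden size
vision_embeds = torch.arange(6 * HIDDEN, dtype=torch.float32).view(6, HIDDEN)

def scatter_chunk(input_ids, inputs_embeds, image_idx):
    # Positions holding the image placeholder token select rows of vision_embeds;
    # image_idx offsets the row index so later chunks continue where earlier ones stopped.
    selected = input_ids == IMAGE_TOKEN_ID
    indices = selected.to(torch.int64).cumsum(1) - 1
    indices = torch.where(indices != -1, indices + image_idx, indices)
    gathered = vision_embeds[indices]          # any -1 rows sit at non-image positions and are masked below
    merged = torch.where(selected.unsqueeze(-1), gathered, inputs_embeds)
    next_idx = (indices.max() + 1).reshape(1, 1)
    return merged, torch.where(image_idx < next_idx, next_idx, image_idx)

image_idx = torch.zeros(1, 1, dtype=torch.int64)   # starts at 0, as in the wrapper
for chunk_ids in (torch.tensor([[99, 99, 1, 2]]), torch.tensor([[99, 3, 99, 99]])):
    token_embeds = torch.zeros(1, chunk_ids.shape[1], HIDDEN)
    token_embeds, image_idx = scatter_chunk(chunk_ids, token_embeds, image_idx)
print(image_idx)   # tensor([[5]]): five image tokens consumed across the two chunks

In the chunked language prefill shown earlier, feeding image_idx_output back as image_idx between chunks plays this same role, so each chunk picks up the next unused rows of vision_embeds.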
@@ -831,7 +831,15 @@ def __init__(self, model): self.language_model = self.model.language_model self.config = self.model.config - def forward(self, input_ids, vision_embeds, position_ids, image_idx, past_key_values): + def forward( + self, + input_ids, + vision_embeds, + position_ids, + image_idx, + past_key_values, + batch_index: Optional[torch.LongTensor] = None, + ): inputs_embeds = self.model.language_model.get_input_embeddings()(input_ids) selected = input_ids == self.model.config.image_token_index indices1 = selected.to(torch.int64).cumsum(1) - 1 @@ -841,7 +849,11 @@ def forward(self, input_ids, vision_embeds, position_ids, image_idx, past_key_va image_embeds = torch.where(selected.unsqueeze(-1), image_features_expanded, inputs_embeds) inputs_embeds = torch.where(input_ids.shape[1] == torch.tensor(1), inputs_embeds, image_embeds) outputs = self.model.language_model( - inputs_embeds=inputs_embeds, position_ids=position_ids, past_key_values=past_key_values, use_cache=True + inputs_embeds=inputs_embeds, + position_ids=position_ids, + past_key_values=past_key_values, + batch_index=batch_index, + use_cache=True, ) next_idx = (indices1.max() + 1).unsqueeze(0).unsqueeze(0) image_idx = torch.where(image_idx < next_idx, next_idx, image_idx) @@ -888,6 +900,9 @@ def get_specializations( ctx_len: int, img_size: int, kv_offload: bool = False, + continuous_batching: bool = False, + kv_cache_batch_size: Optional[int] = None, + full_batch_size: Optional[int] = None, **compiler_options, ): max_num_tiles = compiler_options.pop("max_num_tiles", None) @@ -936,28 +951,42 @@ def get_specializations( "img_size": img_size, } ] - lang = [ - { - "batch_size": batch_size, - "seq_len": prefill_seq_len, - "ctx_len": ctx_len, - "max_num_tiles": max_num_tiles, - "img_size": img_size, - "vision_size": vision_size, - "chunk_length": prefill_seq_len, - "chunk_ctx_len": chunk_ctx_len, - }, - { - "batch_size": batch_size, - "seq_len": "1", - "ctx_len": ctx_len, - "max_num_tiles": max_num_tiles, - "img_size": img_size, - "vision_size": vision_size, - "chunk_length": prefill_seq_len, - "chunk_ctx_len": chunk_ctx_len, - }, - ] + + lang_prefill = { + "batch_size": 1 if continuous_batching else batch_size, + "seq_len": prefill_seq_len, + "ctx_len": ctx_len, + "max_num_tiles": max_num_tiles, + "img_size": img_size, + "vision_size": vision_size, + "chunk_length": prefill_seq_len, + "chunk_ctx_len": chunk_ctx_len, + } + if continuous_batching: + lang_prefill["full_batch_size"] = kv_cache_batch_size + else: + lang_prefill["batch_size"] = kv_cache_batch_size + if full_batch_size: + lang_prefill["full_batch_exec_size"] = full_batch_size + + lang_decode = { + "batch_size": full_batch_size if continuous_batching else batch_size, + "seq_len": 1, + "ctx_len": ctx_len, + "max_num_tiles": max_num_tiles, + "img_size": img_size, + "vision_size": vision_size, + "chunk_length": prefill_seq_len, + "chunk_ctx_len": chunk_ctx_len, + } + if continuous_batching: + lang_decode["full_batch_size"] = kv_cache_batch_size + else: + lang_decode["batch_size"] = kv_cache_batch_size + + lang = [] + lang.append(lang_prefill) + lang.append(lang_decode) specializations = {} @@ -966,18 +995,22 @@ def get_specializations( specializations["lang"] = lang return specializations, compiler_options else: + lang[0].pop("vision_size") + lang[1].pop("vision_size") return lang, compiler_options - def get_onnx_dynamic_axes(self, kv_offload: bool = False): + def get_onnx_dynamic_axes(self, kv_offload: bool = False, continuous_batching: bool = False): # Define dynamic 
axes vision_dynamic_axes = {} lang_dynamic_axes = {} lang_dynamic_axes["input_ids"] = {0: "batch_size", 1: "seq_len"} lang_dynamic_axes["position_ids"] = {0: "batch_size", 1: "seq_len"} lang_dynamic_axes["vision_embeds"] = {0: "vision_size"} + if continuous_batching: + lang_dynamic_axes["batch_index"] = {0: "batch_size"} vision_dynamic_axes["pixel_values"] = {0: "max_num_tiles", 2: "img_size", 3: "img_size"} - pkv_dynamic_axes = {0: "batch_size"} + pkv_dynamic_axes = {0: "full_batch_size" if continuous_batching else "batch_size"} for i in range(self.language_model.config.num_hidden_layers): # switch between chunk_ctx_len and ctx_len for RoPE and NoPE layers. if int((i + 1) % 4 != 0): @@ -1006,6 +1039,7 @@ def get_output_names(self, kv_offload: bool = False): output_names = {} if kv_offload: + # vision_output_names.insert(1, "pixel_values_RetainedState") lang_output_names.insert(1, "vision_embeds_RetainedState") lang_output_names.insert(2, "image_idx_output") output_names["vision"] = vision_output_names @@ -1040,7 +1074,7 @@ def get_dummy_pkv_cache(self, config, batch_size, seq_len): past_key_values.append(pkv) return past_key_values - def get_dummy_inputs(self, kv_offload: bool = False): + def get_dummy_inputs(self, kv_offload: bool = False, continuous_batching: bool = False): if vis_cfg := getattr(self.config, "vision_config", None): img_size = getattr(vis_cfg, "image_size", 336) else: @@ -1085,10 +1119,14 @@ def get_dummy_inputs(self, kv_offload: bool = False): .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1) ) lang_inputs["image_idx"] = torch.zeros((inputs_shapes["image_idx"]), dtype=torch.int64) + + bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE + fbs: int = constants.ONNX_EXPORT_EXAMPLE_FBS + # Add data for KV past_key_values = self.get_dummy_pkv_cache( config=self.language_model.config, - batch_size=constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, + batch_size=fbs if continuous_batching else bs, seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) @@ -1097,6 +1135,8 @@ def get_dummy_inputs(self, kv_offload: bool = False): for kv in ["key", "value"]: lang_inputs["past_key_values"][i].append(torch.zeros(past_key_values[0][0].shape, dtype=torch.float32)) + if continuous_batching: + lang_inputs["batch_index"] = torch.arange(bs).view(bs, 1) inputs = {} if kv_offload: inputs["vision"] = vision_inputs diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index cd1c13a00..c0efb7277 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -851,6 +851,7 @@ class _QEffAutoModelForImageTextToTextDualQPC: def __init__( self, model: nn.Module, + continuous_batching, **kwargs, ): """ @@ -874,6 +875,7 @@ def __init__( self.config = model.config self.vision_model = QEffVisionEncoderForTextImageToTextModel(model, **kwargs) self.lang_model = QEffCausalLMForTextImageToTextModel(model, **kwargs) + self.continuous_batching = continuous_batching self.input_shapes, self.output_names = None, None @property @@ -973,8 +975,8 @@ def export( List[str] A list containing the paths to the generated ONNX graph files for both components. 
""" - inputs = self.model.get_dummy_inputs(kv_offload=True) - dynamic_axes = self.model.get_onnx_dynamic_axes(kv_offload=True) + inputs = self.model.get_dummy_inputs(kv_offload=True, continuous_batching=self.continuous_batching) + dynamic_axes = self.model.get_onnx_dynamic_axes(kv_offload=True, continuous_batching=self.continuous_batching) output_names = self.model.get_output_names(kv_offload=True) self.vision_model.export( @@ -1063,14 +1065,20 @@ def compile( If `full_batch_size`, `kv_cache_batch_size`, or `num_speculative_tokens` are not None. If both `skip_lang` and `skip_vision` are True. """ - if any(param is not None for param in [full_batch_size, kv_cache_batch_size, num_speculative_tokens]): + if skip_lang and skip_vision: + raise ValueError("Expected at least one of 'skip_lang' or 'skip_vision' to be False") + + if self.continuous_batching and full_batch_size is None: + raise TypeError("`full_batch_size` is required when `continuous_batching=True`.") + + if kv_cache_batch_size and not full_batch_size: raise ValueError( - f"Expected 'full_batch_size', 'kv_cache_batch_size', 'num_speculative_tokens' to be None but got: " - f"full_batch_size={full_batch_size}, kv_cache_batch_size={kv_cache_batch_size}, num_speculative_tokens={num_speculative_tokens}, " + "KV caching requires continuous batching. Please set `full_batch_size` and " + "enable `continuous_batching=True` in `from_pretrained`." ) - if skip_lang and skip_vision: - raise ValueError("Expected at least one of 'skip_lang' or 'skip_vision' to be False") + # Infer kv_cache_batch_size if not provided + kv_cache_batch_size = kv_cache_batch_size or full_batch_size or batch_size output_names = self.model.get_output_names(kv_offload=True) @@ -1080,6 +1088,9 @@ def compile( ctx_len=ctx_len, img_size=img_size, kv_offload=True, + continuous_batching=self.continuous_batching, + kv_cache_batch_size=kv_cache_batch_size, + full_batch_size=full_batch_size, **compiler_options, ) @@ -1147,7 +1158,11 @@ def compile( def generate( self, - inputs: torch.Tensor, + inputs: Optional[torch.Tensor] = None, + tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer] = None, + processor=None, + images: List[str] = None, + prompts: List[str] = None, streamer: Optional[TextStreamer] = None, device_ids: List[int] = None, runtime_ai100: bool = True, @@ -1187,6 +1202,17 @@ def generate( if not runtime_ai100: raise NotImplementedError("PyTorch execution is not supported yet for this model!") + if (processor and images) or (tokenizer and prompts): + return QEfficient.cloud_ai_100_exec_kv( + tokenizer, + processor, + self.lang_model.qpc_path, + self.vision_model.qpc_path, + images=images, + prompt=prompts, + device_id=device_ids, + generation_len=generation_len, + ) return self.kv_offload_generate( inputs=inputs, device_ids=device_ids, streamer=streamer, generation_len=generation_len ) @@ -1314,9 +1340,7 @@ def kv_offload_generate( lang_session.set_buffers(vision_outputs) - # Prepare inputs for prefill - chunk_inputs = lang_inputs.copy() - prefill_start = perf_counter() + lang_start = perf_counter() # Run prefill chunk_inputs = lang_inputs.copy() @@ -1328,7 +1352,7 @@ def kv_offload_generate( outputs = lang_session.run(chunk_inputs) chunk_inputs["image_idx"] = outputs["image_idx_output"] - prefill_time = perf_counter() - prefill_start + vision_end - vision_start + prefill_time = perf_counter() - lang_start + vision_end - vision_start # Skip inputs/outputs again lang_session.skip_buffers( [ @@ -1909,7 +1933,7 @@ class QEFFAutoModelForImageTextToText: 
_hf_auto_class = AutoModelForImageTextToText - def __new__(self, model: nn.Module, kv_offload: Optional[bool] = True, **kwargs): + def __new__(self, model: nn.Module, kv_offload: Optional[bool] = True, continuous_batching: bool = False, **kwargs): """ Instantiate the appropriate internal class for single or dual QPC mode. @@ -1930,13 +1954,19 @@ def __new__(self, model: nn.Module, kv_offload: Optional[bool] = True, **kwargs) The wrapped model instance, configured for either dual or single QPC. """ if kv_offload: - return _QEffAutoModelForImageTextToTextDualQPC(model, **kwargs) + return _QEffAutoModelForImageTextToTextDualQPC(model, continuous_batching, **kwargs) else: return _QEFFAutoModelForImageTextToTextSingleQPC(model, **kwargs) @classmethod @with_replaced_quantizers - def from_pretrained(cls, pretrained_model_name_or_path: str, kv_offload: Optional[bool] = None, **kwargs): + def from_pretrained( + cls, + pretrained_model_name_or_path: str, + kv_offload: Optional[bool] = None, + continuous_batching: bool = False, + **kwargs, + ): """ Load a QEfficient image-text-to-text model from a pretrained HuggingFace model or local path. @@ -1971,12 +2001,18 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, kv_offload: Optiona if kwargs.get("low_cpu_mem_usage", None): logger.warning("Updating low_cpu_mem_usage=False") - if kwargs.pop("continuous_batching", None): - NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.") + if continuous_batching and not kv_offload: + NotImplementedError("Continuous batching is not supported for kv_offload = False") kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False}) model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, **kwargs) - return cls(model, kv_offload=kv_offload, pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs) + return cls( + model, + kv_offload=kv_offload, + continuous_batching=continuous_batching, + pretrained_model_name_or_path=pretrained_model_name_or_path, + **kwargs, + ) MISCLASSIFIED_CAUSAL_LM_TO_QEFF_AUTO_CLASS_MAP = {"InternVLChatModel": QEFFAutoModelForImageTextToText} diff --git a/examples/llama4_CB_example_vision_lang.py b/examples/llama4_CB_example_vision_lang.py new file mode 100644 index 000000000..f6cd2bf5c --- /dev/null +++ b/examples/llama4_CB_example_vision_lang.py @@ -0,0 +1,65 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ---------------------------------------------------------------------------- + +import transformers +from transformers import AutoConfig, AutoProcessor + +from QEfficient import QEFFAutoModelForImageTextToText + +model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct" +config = AutoConfig.from_pretrained(model_id) +# For Testing Purpose Only +config.text_config.num_hidden_layers = 4 +config.vision_config.num_hidden_layers = 2 + +qeff_model = QEFFAutoModelForImageTextToText.from_pretrained( + model_id, + attn_implementation="eager", + kv_offload=True, + config=config, + continuous_batching=True, +) +tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) +processor = AutoProcessor.from_pretrained(model_id) + +qeff_model.compile( + prefill_seq_len=128, + ctx_len=3072, + img_size=336, + num_cores=16, + num_devices=4, + max_num_tiles=17, + batch_size=1, + full_batch_size=4, + mxfp6_matmul=True, + mxint8_kv_cache=True, + aic_enable_depth_first=True, + mos=1, +) + +image_urls = [ + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", +] + +prompts = [ + "Can you describe the image in detail?", + "What are the objects in the image?", + "What is the main subject of the image?", + "What colors are predominant in the image?", +] + +output = qeff_model.generate( + images=image_urls, + tokenizer=tokenizer, + processor=processor, + device_ids=[0, 1, 2, 3], + prompts=prompts, + generation_len=100, +) From 38e777a7407607967bbb1a99f18b6699b510ed52 Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Tue, 7 Oct 2025 07:57:45 +0000 Subject: [PATCH 2/4] update text_generation_interface Signed-off-by: Mamta Singh --- .../generation/text_generation_inference.py | 182 ++++++++++-------- .../transformers/models/modeling_auto.py | 15 +- examples/llama4_CB_example_vision_lang.py | 4 +- 3 files changed, 115 insertions(+), 86 deletions(-) diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index 554f030af..9b8d84ef7 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -15,7 +15,7 @@ import numpy as np import torch import transformers -from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast +from transformers import AutoImageProcessor, PreTrainedTokenizer, PreTrainedTokenizerFast from QEfficient.generation.cloud_infer import QAICInferenceSession from QEfficient.utils import padding_check_and_fix @@ -314,10 +314,10 @@ def calculate_latency(total_decoded_tokens, loop_start, start, end, decode_pause def cloud_ai_100_exec_kv( tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], - processor, lang_qpc_path: str, - vision_qpc_path: str, - images, + processor: Optional[AutoImageProcessor] = None, + vision_qpc_path: Optional[str] = None, + images: Optional[str] = None, prompt: Optional[str] = None, prompts_txt_file_path: Optional[str] = None, device_id: Optional[List[int]] = None, @@ -398,7 +398,12 @@ def cloud_ai_100_exec_kv( ) if full_batch_size is None: exec_info = [ - 
generate_text.generate(prompt[i : i + batch_size], generation_len, stream, prompt_to_lora_id_mapping) + generate_text.generate( + prompt=prompt[i : i + batch_size], + generation_len=generation_len, + stream=stream, + prompt_to_lora_id_mapping=prompt_to_lora_id_mapping, + ) for i in range(0, len(prompt), batch_size) ] prefill_time = np.average([info.perf_metrics.prefill_time for info in exec_info]) @@ -416,8 +421,8 @@ def cloud_ai_100_exec_kv( ) else: exec_info = generate_text.generate( - images=images, prompt=prompt, + images=images, generation_len=generation_len, prompt_to_lora_id_mapping=prompt_to_lora_id_mapping, ) @@ -429,9 +434,9 @@ def cloud_ai_100_exec_kv( class QEffTextGenerationBase: def __init__( self, - processor, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], lang_qpc_path: str, + processor: Optional[AutoImageProcessor] = None, vision_qpc_path: Optional[str] = None, full_batch_size: Optional[int] = None, ctx_len: Optional[int] = None, @@ -450,11 +455,13 @@ def __init__( self.sampling_params = sampling_params # Load QPC + self._lang_session = None + self._vision_session = None if not lang_qpc_path: raise TypeError("Please run compile API for language model first!") self._lang_session = QAICInferenceSession(lang_qpc_path, device_id, activate=False) if vision_qpc_path: - self._vision_session = QAICInferenceSession(vision_qpc_path, device_id) + self._vision_session = QAICInferenceSession(vision_qpc_path, device_id, activate=False) # Validate sampler inputs for On-Device Sampling self.include_sampler = validate_sampler_inputs( @@ -682,7 +689,7 @@ def _fetch_next_token_id(self, outputs): logits = np.expand_dims(logits, 1) return logits.argmax(2) - def initialize_decode_inputs(self, num_images, num_prompts, execution_batch_size, max_gen_length): + def initialize_decode_inputs(self, num_prompts, execution_batch_size, max_gen_length): """ Initialize np arrays for storing the prefill output for all the decode batch size. """ @@ -740,14 +747,18 @@ def run_prefill_for_all_inputs(self, image_queue, prompt_queue, generation_len): generation_len (int): The generation length. 
""" + next_prompt = None + next_image = None for decode_batch_id in range(self.full_batch_size): - next_prompt = prompt_queue.popleft() - next_image = image_queue.popleft() + if prompt_queue: + next_prompt = prompt_queue.popleft() + if image_queue: + next_image = image_queue.popleft() # run prefill for num_chunks outputs, position_ids, generation_len = self.run_prefill( - next_image, next_prompt, + next_image, generation_len, decode_batch_id=np.array(decode_batch_id, dtype=np.int64).reshape(1, 1), ) @@ -771,8 +782,9 @@ def _set_output_buffers(self, batch_size: int = 1, sequence_length: int = 1): logits_out_placeholder = np.zeros((batch_size, sequence_length, self._vocab_size), dtype=np.float32) self._lang_session.set_buffers({"logits": logits_out_placeholder}) - vision_embeds_out_placeholder = np.zeros((2448, 5120), dtype=np.float16) - self._vision_session.set_buffers({"vision_embeds": vision_embeds_out_placeholder}) + if self._vision_session: + vision_embeds_out_placeholder = np.zeros((2448, 5120), dtype=np.float16) + self._vision_session.set_buffers({"vision_embeds": vision_embeds_out_placeholder}) def prepare_vision_language_inputs(self, prompt, image_url): messages = [ @@ -790,13 +802,18 @@ def prepare_vision_language_inputs(self, prompt, image_url): tokenize=True, return_dict=True, return_tensors="pt", - # padding="max_length", - # max_length=padded_len, ) inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) return inputs - def run_prefill(self, image, prompt, generation_len, prefill_logit_bs=1, decode_batch_id=None): + def run_prefill( + self, + prompt: str, + image: Optional[str] = None, + generation_len: Optional[int] = None, + prefill_logit_bs=1, + decode_batch_id=None, + ): """ Runs prefill for a given prompt and generation length. @@ -813,8 +830,12 @@ def run_prefill(self, image, prompt, generation_len, prefill_logit_bs=1, decode_ position_ids (array): The position IDs. generation_len (int): The generation length. 
""" + # Run prefill - inputs = self.prepare_vision_language_inputs(prompt, image) + if image: + inputs = self.prepare_vision_language_inputs(prompt, image) + else: + inputs = self.tokenizer(prompt, return_tensors="np", padding=True) position_ids = inputs["attention_mask"].sum(1, keepdims=True) padded_len = inputs["input_ids"].shape[1] @@ -829,40 +850,45 @@ def run_prefill(self, image, prompt, generation_len, prefill_logit_bs=1, decode_ # Set the prefill output buffers self._set_output_buffers(batch_size=prefill_logit_bs, sequence_length=1) - pad_token_id = 1 - input_ids_length = inputs["input_ids"].shape[1] - num_chunks = -(input_ids_length // -self._prefill_seq_len) # ceil divide without float - padded_len = num_chunks * self._prefill_seq_len # Convert to a multiple of prompt_len - - inputs["input_ids"] = torch.nn.functional.pad( - inputs["input_ids"], - (0, padded_len - input_ids_length), - "constant", - pad_token_id, - ) - inputs["attention_mask"] = torch.nn.functional.pad( - inputs["attention_mask"], (0, padded_len - input_ids_length), "constant", 0 - ) - if "cross_attention_mask" in inputs: - inputs["cross_attention_mask"] = torch.nn.functional.pad( - inputs["cross_attention_mask"], (0, 0, 0, 0, 0, padded_len - input_ids_length) + vision_inputs = {} + vision_outputs = {} + if image: + pad_token_id = 1 + input_ids_length = inputs["input_ids"].shape[1] + num_chunks = -(input_ids_length // -self._prefill_seq_len) # ceil divide without float + padded_len = num_chunks * self._prefill_seq_len # Convert to a multiple of prompt_len + + inputs["input_ids"] = torch.nn.functional.pad( + inputs["input_ids"], + (0, padded_len - input_ids_length), + "constant", + pad_token_id, ) + inputs["attention_mask"] = torch.nn.functional.pad( + inputs["attention_mask"], (0, padded_len - input_ids_length), "constant", 0 + ) + if "cross_attention_mask" in inputs: + inputs["cross_attention_mask"] = torch.nn.functional.pad( + inputs["cross_attention_mask"], (0, 0, 0, 0, 0, padded_len - input_ids_length) + ) - for k, v in inputs.items(): - inputs[k] = np.array(v) + for k, v in inputs.items(): + inputs[k] = np.array(v) - vision_inputs = { - k: v for k, v in inputs.items() if k in {"pixel_values", "aspect_ratio_ids", "aspect_ratio_mask"} - } - if vision_inputs: - vision_inputs["pixel_values"] = vision_inputs["pixel_values"].astype("float16") + vision_inputs = { + k: v for k, v in inputs.items() if k in {"pixel_values", "aspect_ratio_ids", "aspect_ratio_mask"} + } + if vision_inputs: + vision_inputs["pixel_values"] = vision_inputs["pixel_values"].astype("float16") - vision_outputs = {} - if self._vision_session: - self._vision_session.activate() - # Run vision prefill - if vision_inputs: - vision_outputs = self._vision_session.run(vision_inputs) + # Run vision prefill + if vision_inputs: + self._vision_session.activate() + vision_outputs = self._vision_session.run(vision_inputs) + self._vision_session.deactivate() + else: + inputs = self.tokenizer(prompt, return_tensors="np", padding="max_length", max_length=padded_len) + inputs.pop("token_type_ids", None) lang_inputs = {k: v for k, v in inputs.items() if k not in vision_inputs} lang_inputs["position_ids"] = np.where( @@ -871,10 +897,9 @@ def run_prefill(self, image, prompt, generation_len, prefill_logit_bs=1, decode_ # not_mllama = hasattr(self.model.config, "model_type") and self.model.config.model_type != "mllama" # if not_mllama: - lang_inputs["image_idx"] = np.array([[0]]) + if image: + lang_inputs["image_idx"] = np.array([[0]]) - if self._vision_session: - 
self._vision_session.deactivate() self._lang_session.activate() self._lang_session.set_buffers(vision_outputs) @@ -900,7 +925,7 @@ def run_prefill(self, image, prompt, generation_len, prefill_logit_bs=1, decode_ lang_inputs["lora_ids"] = np.array(batch_lora_ids, dtype=np.int64).reshape(self.batch_size, 1) # Run language prefill - chunk_inputs = lang_inputs.copy() + for i in range(num_chunks): chunk_inputs = lang_inputs.copy() chunk_inputs["input_ids"] = lang_inputs["input_ids"][ @@ -912,7 +937,8 @@ def run_prefill(self, image, prompt, generation_len, prefill_logit_bs=1, decode_ if self.include_sampler: chunk_inputs["last_accepted_output_tokens"] = chunk_inputs["input_ids"] outputs = self._lang_session.run(chunk_inputs) - chunk_inputs["image_idx"] = outputs["image_idx_output"] + if image: + chunk_inputs["image_idx"] = outputs["image_idx_output"] if self._write_io_dir is not None: write_io_files(inputs, outputs, self._write_io_dir, "prefill", "aic_batch_io", True, False) @@ -924,15 +950,15 @@ def run_prefill(self, image, prompt, generation_len, prefill_logit_bs=1, decode_ if x.startswith("past_") or x.endswith("_RetainedState") ] ) - if self._lang_session: - self._lang_session.deactivate() + self._lang_session.deactivate() + return ( outputs, position_ids, generation_len, ) - def run_continuous_batching_decode(self, image_queue, prompt_queue, generation_len): + def run_continuous_batching_decode(self, prompt_queue, generation_len): """ Runs continuous batching decode for the given prompt queue and generation length. @@ -979,8 +1005,8 @@ def run_continuous_batching_decode(self, image_queue, prompt_queue, generation_l start = perf_counter() # run prefill for next prompt input. outputs, position_ids, generation_len = self.run_prefill( - prompt_queue.popleft(), - generation_len, + prompt=prompt_queue.popleft(), + generation_len=generation_len, decode_batch_id=np.array(decode_batch_id, dtype=np.int64).reshape(1, 1), ) @@ -1039,6 +1065,7 @@ def run_decode(self, decode_inputs, generation_len, streamer: Optional[transform self._lang_session.set_buffers({"logits": logits_out_placeholder}) finished_sequences = decode_inputs["input_ids"] == self.tokenizer.eos_token_id num_token = 0 + self._lang_session.activate() for num_token in range(1, generation_len): if streamer: streamer.put(decode_inputs["input_ids"][0]) @@ -1058,6 +1085,7 @@ def run_decode(self, decode_inputs, generation_len, streamer: Optional[transform if finished_sequences.all(): break + self._lang_session.deactivate() return num_token def generate_decode_stream(self, decode_inputs, generation_len): @@ -1074,6 +1102,7 @@ def generate_decode_stream(self, decode_inputs, generation_len): token_id (int): The token generated in the decoding process. 
""" finished_sequences = decode_inputs["input_ids"] == self.tokenizer.eos_token_id + self._lang_session.activate() for num_token in range(1, generation_len): yield decode_inputs["input_ids"] outputs = self._lang_session.run(decode_inputs) @@ -1090,16 +1119,17 @@ def generate_decode_stream(self, decode_inputs, generation_len): if finished_sequences.all(): break + self._lang_session.deactivate() yield decode_inputs["input_ids"] # yield the last token class TextGeneration: def __init__( self, - processor, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], - vision_qpc_path: str, lang_qpc_path: str, + processor: Optional[AutoImageProcessor] = None, + vision_qpc_path: Optional[str] = None, full_batch_size: Optional[int] = None, ctx_len: Optional[int] = None, device_id: Optional[List[int]] = None, @@ -1111,9 +1141,9 @@ def __init__( sampling_params: Optional[Dict[str, Any]] = None, ) -> None: self._qaic_model = QEffTextGenerationBase( - processor, tokenizer=tokenizer, lang_qpc_path=lang_qpc_path, + processor=processor, vision_qpc_path=vision_qpc_path, full_batch_size=full_batch_size, ctx_len=ctx_len, @@ -1140,8 +1170,8 @@ def perf_metrics(self): def _setup_model_execution_inputs( self, - images, prompt: List[str], + images: Optional[List[str]] = None, generation_len: Optional[int] = None, prompt_to_lora_id_mapping: Optional[List[int]] = None, ): @@ -1159,15 +1189,15 @@ def _setup_model_execution_inputs( # Create a prompt queue. self._prompt_queue = deque(prompt) - self._image_queue = deque(images) + if images: + self._image_queue = deque(images) # Initialize np arrays for storing the prefill output for all the decode batch size. num_prompts = len(self._prompt_queue) - num_images = len(self._image_queue) if prompt_to_lora_id_mapping: self._qaic_model.initialize_lora_id_mapping(prompt_to_lora_id_mapping) - self._qaic_model.initialize_decode_inputs(num_images, num_prompts, execution_batch_size, max_gen_length) + self._qaic_model.initialize_decode_inputs(num_prompts, execution_batch_size, max_gen_length) def _regular_model_execution( self, @@ -1189,12 +1219,14 @@ def _regular_model_execution( :tuple: A tuple containing performance metrics and generated texts. """ - self._setup_model_execution_inputs(prompt, generation_len, prompt_to_lora_id_mapping) + self._setup_model_execution_inputs( + prompt=prompt, generation_len=generation_len, prompt_to_lora_id_mapping=prompt_to_lora_id_mapping + ) if stream and self._text_streamer is None: self._text_streamer = transformers.TextStreamer(self._tokenizer) start = perf_counter() outputs, position_ids, generation_len = self._qaic_model.run_prefill( - prompt, generation_len, prefill_logit_bs=self._qaic_model.batch_size + prompt=prompt, generation_len=generation_len, prefill_logit_bs=self._qaic_model.batch_size ) self._qaic_model.update_decode_input(outputs, position_ids, generation_len) @@ -1214,8 +1246,8 @@ def _regular_model_execution( def _continuous_batching_execution( self, - images, prompt: List[str], + images: Optional[List[str]] = None, generation_len: Optional[int] = None, prompt_to_lora_id_mapping: Optional[List[int]] = None, ): @@ -1231,17 +1263,13 @@ def _continuous_batching_execution( Returns: :tuple: A tuple containing performance metrics and generated texts. 
""" - self._setup_model_execution_inputs(images, prompt, generation_len, prompt_to_lora_id_mapping) + self._setup_model_execution_inputs(prompt, images, generation_len, prompt_to_lora_id_mapping) self._qaic_model.batch_index = np.arange(self._full_batch_size).reshape(-1, 1) start = perf_counter() self._qaic_model.run_prefill_for_all_inputs(self._image_queue, self._prompt_queue, generation_len) - print("\n\n\n\n Prefill for all inputs completed\n\n\n\n") - loop_start = perf_counter() # Start decode loop timer - decode_pause_time = self._qaic_model.run_continuous_batching_decode( - self._image_queue, self._prompt_queue, generation_len - ) + decode_pause_time = self._qaic_model.run_continuous_batching_decode(self._prompt_queue, generation_len) end = perf_counter() generated_texts = self._tokenizer.batch_decode(self._qaic_model.generated_ids, skip_special_tokens=True) @@ -1281,7 +1309,7 @@ def generate_stream_tokens( self._setup_model_execution_inputs(prompt, generation_len, prompt_to_lora_id_mapping) start = perf_counter() outputs, position_ids, generation_len = self._qaic_model.run_prefill( - prompt, generation_len, prefill_logit_bs=self._qaic_model.batch_size + prompt=prompt, generation_len=generation_len, prefill_logit_bs=self._qaic_model.batch_size ) self._qaic_model.update_decode_input(outputs, position_ids, generation_len) @@ -1305,8 +1333,8 @@ def generate_stream_tokens( def generate( self, - images, prompt: List[str], + images: Optional[List[str]] = None, generation_len: Optional[int] = None, stream: bool = True, prompt_to_lora_id_mapping: Optional[List[int]] = None, @@ -1326,7 +1354,7 @@ def generate( if self._full_batch_size is not None: logger.warning("Streamer is currently unavailable for continuous batch execution.") perf_metrics, generated_texts = self._continuous_batching_execution( - images, prompt, generation_len, prompt_to_lora_id_mapping + prompt, images, generation_len, prompt_to_lora_id_mapping ) else: if stream: diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index c0efb7277..2cada6ab4 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -14,6 +14,7 @@ import torch import torch.nn as nn from transformers import ( + AutoImageProcessor, AutoModel, AutoModelForCausalLM, AutoModelForImageTextToText, @@ -1160,7 +1161,7 @@ def generate( self, inputs: Optional[torch.Tensor] = None, tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer] = None, - processor=None, + processor: Optional[AutoImageProcessor] = None, images: List[str] = None, prompts: List[str] = None, streamer: Optional[TextStreamer] = None, @@ -1204,10 +1205,10 @@ def generate( if (processor and images) or (tokenizer and prompts): return QEfficient.cloud_ai_100_exec_kv( - tokenizer, - processor, - self.lang_model.qpc_path, - self.vision_model.qpc_path, + tokenizer=tokenizer, + processor=processor, + lang_qpc_path=self.lang_model.qpc_path, + vision_qpc_path=self.vision_model.qpc_path, images=images, prompt=prompts, device_id=device_ids, @@ -2717,8 +2718,8 @@ def generate( raise TypeError("Please run compile API first!") generation_len = kwargs.pop("generation_len", None) return QEfficient.cloud_ai_100_exec_kv( - tokenizer, - self.qpc_path, + tokenizer=tokenizer, + lang_qpc_path=self.qpc_path, prompt=prompts, device_id=device_id, generation_len=generation_len, diff --git a/examples/llama4_CB_example_vision_lang.py b/examples/llama4_CB_example_vision_lang.py index f6cd2bf5c..ebe65bf82 
100644 --- a/examples/llama4_CB_example_vision_lang.py +++ b/examples/llama4_CB_example_vision_lang.py @@ -56,10 +56,10 @@ ] output = qeff_model.generate( - images=image_urls, tokenizer=tokenizer, + prompts=prompts, processor=processor, + images=image_urls, device_ids=[0, 1, 2, 3], - prompts=prompts, generation_len=100, ) From 7944a7e85f1c9610ed6426239e5604c1b082b8d2 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Thu, 23 Oct 2025 08:40:16 +0000 Subject: [PATCH 3/4] Updated text_generation to run CB for VLMs Signed-off-by: Asmita Goswami --- QEfficient/generation/cloud_infer.py | 31 ++++-- .../generation/text_generation_inference.py | 98 ++++++++----------- 2 files changed, 66 insertions(+), 63 deletions(-) diff --git a/QEfficient/generation/cloud_infer.py b/QEfficient/generation/cloud_infer.py index 8519d824c..42c8b342e 100644 --- a/QEfficient/generation/cloud_infer.py +++ b/QEfficient/generation/cloud_infer.py @@ -90,8 +90,10 @@ def __init__( self.program = qaicrt.Program(self.context, None, qpc, prog_properties) if self.program.load() != qaicrt.QStatus.QS_SUCCESS: raise RuntimeError("Failed to load program") + self.is_active = False if activate: self.activate() + self.is_active = True # Create input qbuffers and buf_dims self.qbuffers = [qaicrt.QBuffer(bytes(binding.size)) for binding in self.bindings] self.buf_dims = qaicrt.BufferDimensionsVecRef( @@ -108,15 +110,32 @@ def output_names(self) -> List[str]: def activate(self): """Activate qpc""" - - self.program.activate() - self.execObj = qaicrt.ExecObj(self.context, self.program) + if not self.is_active: + self.program.activate() + self.execObj = qaicrt.ExecObj(self.context, self.program) + self.is_active = True def deactivate(self): """Deactivate qpc""" - - del self.execObj - self.program.deactivate() + if self.is_active: + del self.execObj + self.program.deactivate() + self.is_active = False + + def pause(self): + """Pause the session while preserving state""" + if self.is_active: + # Just deactivate the program and set state + self.program.deactivate() + self.is_active = False + + def resume(self): + """Resume a paused session""" + if not self.is_active: + # Reactivate program and create new execObj + self.program.activate() + self.execObj = qaicrt.ExecObj(self.context, self.program) + self.is_active = True def set_buffers(self, buffers: Dict[str, np.ndarray]): """ diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index ea0e155f7..f014bf6a0 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -11,6 +11,8 @@ from dataclasses import dataclass from time import perf_counter from typing import Any, Dict, List, Optional, Tuple, Union +import requests +from PIL import Image import numpy as np import torch @@ -399,41 +401,15 @@ def cloud_ai_100_exec_kv( sampling_params=sampling_params, ) - if full_batch_size is None: - exec_info = [ - generate_text.generate( - prompt=prompt[i : i + batch_size], - generation_len=generation_len, - stream=stream, - prompt_to_lora_id_mapping=prompt_to_lora_id_mapping, - ) - for i in range(0, len(prompt), batch_size) - ] - prefill_time = np.average([info.perf_metrics.prefill_time for info in exec_info]) - decode_perf = np.average([info.perf_metrics.decode_perf for info in exec_info]) - total_perf = np.average([info.perf_metrics.total_perf for info in exec_info]) - total_time = np.average([info.perf_metrics.total_time for info in exec_info]) - generated_texts = 
[info.generated_texts for info in exec_info] - generated_ids = [info.generated_ids for info in exec_info] - - exec_info = CloudAI100ExecInfo( - batch_size=batch_size, - generated_texts=generated_texts, - generated_ids=generated_ids, - perf_metrics=PerfMetrics(prefill_time, decode_perf, total_perf, total_time), - ) - else: - exec_info = generate_text.generate( - prompt=prompt, - images=images, - generation_len=generation_len, - prompt_to_lora_id_mapping=prompt_to_lora_id_mapping, - ) - for _ in range(0, int(iteration)): if full_batch_size is None: exec_info = [ - generate_text.generate(prompt[i : i + batch_size], generation_len, stream, prompt_to_lora_id_mapping) + generate_text.generate( + prompt=prompt[i : i + batch_size], + generation_len=generation_len, + stream=stream, + prompt_to_lora_id_mapping=prompt_to_lora_id_mapping, + ) for i in range(0, len(prompt), batch_size) ] prefill_time = np.average([info.perf_metrics.prefill_time for info in exec_info]) @@ -451,7 +427,10 @@ def cloud_ai_100_exec_kv( ) else: exec_info = generate_text.generate( - prompt=prompt, generation_len=generation_len, prompt_to_lora_id_mapping=prompt_to_lora_id_mapping + prompt=prompt, + images=images, + generation_len=generation_len, + prompt_to_lora_id_mapping=prompt_to_lora_id_mapping, ) print_latency_stats_kv(prompt, exec_info=exec_info, automation=automation) @@ -765,7 +744,7 @@ def update_decode_input(self, outputs, position_ids, generation_len, decode_batc self.generation_len[decode_batch_id or slice(None)] = generation_len return next_token_id - def run_prefill_for_all_inputs(self, image_queue, prompt_queue, generation_len): + def run_prefill_for_all_inputs(self, image_queue, prompt_queue, processor, generation_len): """ Runs prefill for all inputs in the prompt queue and updates the decode input. 
@@ -788,6 +767,7 @@ def run_prefill_for_all_inputs(self, image_queue, prompt_queue, generation_len): outputs, position_ids, generation_len = self.run_prefill( next_prompt, next_image, + processor, generation_len, decode_batch_id=np.array(decode_batch_id, dtype=np.int64).reshape(1, 1), ) @@ -815,30 +795,28 @@ def _set_output_buffers(self, batch_size: int = 1, sequence_length: int = 1): vision_embeds_out_placeholder = np.zeros((2448, 5120), dtype=np.float16) self._vision_session.set_buffers({"vision_embeds": vision_embeds_out_placeholder}) - def prepare_vision_language_inputs(self, prompt, image_url): - messages = [ + def prepare_vision_language_inputs(self, processor, query, image_url): + image = Image.open(requests.get(image_url, stream=True).raw) + conversation = [ { "role": "user", "content": [ - {"type": "image", "url": image_url}, - {"type": "text", "text": prompt}, + {"type": "text", "text": query}, + {"type": "image"}, ], }, ] - inputs = self.processor.apply_chat_template( - messages, - add_generation_prompt=True, - tokenize=True, - return_dict=True, - return_tensors="pt", - ) - inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) + prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) + inputs = processor(images=image, text=prompt, return_tensors="pt") + if "pixel_values" in inputs: + inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) return inputs def run_prefill( self, prompt: str, image: Optional[str] = None, + processor: Optional[AutoImageProcessor] = None, generation_len: Optional[int] = None, prefill_logit_bs=1, decode_batch_id=None, @@ -862,7 +840,7 @@ def run_prefill( # Run prefill if image: - inputs = self.prepare_vision_language_inputs(prompt, image) + inputs = self.prepare_vision_language_inputs(processor, prompt, image) else: inputs = self.tokenizer(prompt, return_tensors="np", padding=True) @@ -905,16 +883,22 @@ def run_prefill( inputs[k] = np.array(v) vision_inputs = { - k: v for k, v in inputs.items() if k in {"pixel_values", "aspect_ratio_ids", "aspect_ratio_mask"} + k: v for k, v in inputs.items() if k in {"pixel_values", "image_masks", "image_input_idx", "valid_idx", "aspect_ratio_ids", "aspect_ratio_mask"} } - if vision_inputs: - vision_inputs["pixel_values"] = vision_inputs["pixel_values"].astype("float16") + # if vision_inputs: + # vision_inputs["pixel_values"] = vision_inputs["pixel_values"].astype("float16") + vision_inputs_fp16 = {"pixel_values", "image_masks"} + vision_inputs.update({k: vision_inputs[k].astype("float16") for k in vision_inputs_fp16 if k in vision_inputs}) + # if not(self._lang_session.is_active): + # self._lang_session.activate() # Run vision prefill if vision_inputs: + # self._lang_session.pause() self._vision_session.activate() vision_outputs = self._vision_session.run(vision_inputs) self._vision_session.deactivate() + # self._lang_session.resume() else: inputs = self.tokenizer(prompt, return_tensors="np", padding="max_length", max_length=padded_len) inputs.pop("token_type_ids", None) @@ -924,8 +908,9 @@ def run_prefill( lang_inputs.pop("attention_mask"), np.arange(padded_len), -1 ) # Need to use -1 as position_ids for invalid tokens - # not_mllama = hasattr(self.model.config, "model_type") and self.model.config.model_type != "mllama" - # if not_mllama: + # not_mllama = hasattr(self.model.config, "model_type") and self.model.config.model_type != "mllama" + # if not_mllama: + # lang_inputs["image_idx"] = np.array([[0]]) if image: lang_inputs["image_idx"] = np.array([[0]]) @@ -954,9 
+939,8 @@ def run_prefill( lang_inputs["lora_ids"] = np.array(batch_lora_ids, dtype=np.int64).reshape(self.batch_size, 1) # Run language prefill - + chunk_inputs = lang_inputs.copy() for i in range(num_chunks): - chunk_inputs = lang_inputs.copy() chunk_inputs["input_ids"] = lang_inputs["input_ids"][ :, i * self._prefill_seq_len : (i + 1) * self._prefill_seq_len ] @@ -979,7 +963,7 @@ def run_prefill( if x.startswith("past_") or x.endswith("_RetainedState") ] ) - self._lang_session.deactivate() + # self._lang_session.deactivate() return ( outputs, @@ -1018,8 +1002,8 @@ def run_continuous_batching_decode(self, prompt_queue, generation_len): # Prepare decode inputs inputs. decode_inputs = self.prepare_decode_inputs() + # self._lang_session.activate() # Due to activating new session (new exec_obj) run values are changing while prompt_queue or current_decode_ongoing.any(): - self._lang_session.activate() outputs = self._lang_session.run(decode_inputs) # Prepare inputs for next iteration @@ -1298,7 +1282,7 @@ def _continuous_batching_execution( self._setup_model_execution_inputs(prompt, images, generation_len, prompt_to_lora_id_mapping) self._qaic_model.batch_index = np.arange(self._full_batch_size).reshape(-1, 1) start = perf_counter() - self._qaic_model.run_prefill_for_all_inputs(self._image_queue, self._prompt_queue, generation_len) + self._qaic_model.run_prefill_for_all_inputs(self._image_queue, self._prompt_queue, self._processor, generation_len) loop_start = perf_counter() # Start decode loop timer decode_pause_time = self._qaic_model.run_continuous_batching_decode(self._prompt_queue, generation_len) From 3065310a6d4c3bb7415479f4cbf82f91a614f9b2 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Thu, 23 Oct 2025 08:50:48 +0000 Subject: [PATCH 4/4] Ruff format Signed-off-by: Asmita Goswami --- .../generation/text_generation_inference.py | 30 ++++++++++++++----- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index f014bf6a0..190bcf764 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -11,12 +11,12 @@ from dataclasses import dataclass from time import perf_counter from typing import Any, Dict, List, Optional, Tuple, Union -import requests -from PIL import Image import numpy as np +import requests import torch import transformers +from PIL import Image from transformers import AutoImageProcessor, PreTrainedTokenizer, PreTrainedTokenizerFast from QEfficient.generation.cloud_infer import QAICInferenceSession @@ -883,12 +883,24 @@ def run_prefill( inputs[k] = np.array(v) vision_inputs = { - k: v for k, v in inputs.items() if k in {"pixel_values", "image_masks", "image_input_idx", "valid_idx", "aspect_ratio_ids", "aspect_ratio_mask"} + k: v + for k, v in inputs.items() + if k + in { + "pixel_values", + "image_masks", + "image_input_idx", + "valid_idx", + "aspect_ratio_ids", + "aspect_ratio_mask", + } } # if vision_inputs: # vision_inputs["pixel_values"] = vision_inputs["pixel_values"].astype("float16") vision_inputs_fp16 = {"pixel_values", "image_masks"} - vision_inputs.update({k: vision_inputs[k].astype("float16") for k in vision_inputs_fp16 if k in vision_inputs}) + vision_inputs.update( + {k: vision_inputs[k].astype("float16") for k in vision_inputs_fp16 if k in vision_inputs} + ) # if not(self._lang_session.is_active): # self._lang_session.activate() @@ -908,9 +920,9 @@ def run_prefill( 
lang_inputs.pop("attention_mask"), np.arange(padded_len), -1 ) # Need to use -1 as position_ids for invalid tokens - # not_mllama = hasattr(self.model.config, "model_type") and self.model.config.model_type != "mllama" - # if not_mllama: - # lang_inputs["image_idx"] = np.array([[0]]) + # not_mllama = hasattr(self.model.config, "model_type") and self.model.config.model_type != "mllama" + # if not_mllama: + # lang_inputs["image_idx"] = np.array([[0]]) if image: lang_inputs["image_idx"] = np.array([[0]]) @@ -1282,7 +1294,9 @@ def _continuous_batching_execution( self._setup_model_execution_inputs(prompt, images, generation_len, prompt_to_lora_id_mapping) self._qaic_model.batch_index = np.arange(self._full_batch_size).reshape(-1, 1) start = perf_counter() - self._qaic_model.run_prefill_for_all_inputs(self._image_queue, self._prompt_queue, self._processor, generation_len) + self._qaic_model.run_prefill_for_all_inputs( + self._image_queue, self._prompt_queue, self._processor, generation_len + ) loop_start = perf_counter() # Start decode loop timer decode_pause_time = self._qaic_model.run_continuous_batching_decode(self._prompt_queue, generation_len)