From 8a4cf1f3ce5ac21d06243f82f857e5086a05860c Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Tue, 22 Jul 2025 12:59:20 +0000 Subject: [PATCH 1/8] Deleted model after export to save memory Signed-off-by: Asmita Goswami --- QEfficient/base/modeling_qeff.py | 11 +++ .../transformers/models/modeling_auto.py | 95 ++++++++++--------- .../models/test_speech_seq2seq_models.py | 2 +- 3 files changed, 62 insertions(+), 46 deletions(-) diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index 0b07bb6b3..f5bd40bdc 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -217,6 +217,11 @@ def _export( shutil.rmtree(tmp_onnx_dir, ignore_errors=True) self.onnx_path = onnx_path + + # Clear the model to free up memory + self.model = None + gc.collect() + return onnx_path @dump_qconfig @@ -256,6 +261,12 @@ def _compile( if onnx_path is None and self.onnx_path is None: self.export() + # Method 1 + # with init_empty_weights(): + # self.model = self.public_class.from_pretrained( + # pretrained_model_name_or_path=self.config._name_or_path, + # ).model + onnx_path = Path(onnx_path or self.onnx_path) compile_dir = Path(compile_dir or onnx_path.parent) qpc_path = compile_dir / "qpc" diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 3e50a2783..9cfa3aaf0 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -108,7 +108,12 @@ def __new__(cls, *args, **kwargs): def auto_correct_inputs(self, inputs): checked = True - inputs_info = self.model.get_inputs_info() + + # Method 2 + inputs_info = self.inputs_info if hasattr(self, "inputs_info") else None + + # Method 1 + # inputs_info = self.model.get_inputs_info() for valid_input_info in inputs_info: if valid_input_info.name not in inputs: checked = False @@ -230,7 +235,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, pooling=None, *args, **k @property def get_model_config(self) -> dict: - return self.model.config.__dict__ + return self.config.__dict__ def export(self, export_dir: Optional[str] = None) -> str: """ @@ -340,7 +345,9 @@ def generate( return self.cloud_ai_100_feature_generate(inputs=inputs, device_ids=device_ids) # PyTorch runtime else: - return self.pytorch_feature_generate(model=self.model, inputs=inputs) + return self.pytorch_feature_generate( + model=self.model, inputs=inputs + ) # TODO: Handle this case when self.model = None def cloud_ai_100_feature_generate( self, @@ -623,9 +630,9 @@ def compile( if skip_lang and skip_vision: raise ValueError("Expected at least one of 'skip_lang' or 'skip_vision' to be False") - output_names = self.model.get_output_names(kv_offload=True) + output_names = self.output_names(kv_offload=True) - specializations, compiler_options = self.model.get_specializations( + specializations, compiler_options = self.specializations( batch_size=batch_size, prefill_seq_len=prefill_seq_len, ctx_len=ctx_len, @@ -805,7 +812,7 @@ def kv_offload_generate( lang_inputs.pop("attention_mask"), np.arange(padded_len), -1 ) # Need to use -1 as position_ids for invalid tokens - not_mllama = hasattr(self.model.config, "model_type") and self.model.config.model_type != "mllama" + not_mllama = hasattr(self.config, "model_type") and self.config.model_type != "mllama" if not_mllama: lang_inputs["image_idx"] = np.array([[0]]) @@ -901,11 +908,13 @@ def __init__( raise NotImplementedError("Continuous batching is not supported for image-text-to-text models 
yet.") super().__init__(model, **kwargs) + self.public_class = QEFFAutoModelForImageTextToText + # to handle internvl models - if hasattr(self.model.config, "llm_config") and hasattr(self.model.config, "vision_config"): - self.model.config.llm_config.use_cache = True - self.model.config.llm_config._attn_implementation = "eager" - self.model.config.vision_config.use_flash_attn = "false" + if hasattr(self.config, "llm_config") and hasattr(self.config, "vision_config"): + self.config.llm_config.use_cache = True + self.config.llm_config._attn_implementation = "eager" + self.config.vision_config.use_flash_attn = "false" else: self.model.config.text_config.use_cache = True self.hash_params["qeff_auto_class"] = self.__class__.__name__ @@ -929,7 +938,7 @@ def from_pretrained( config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True) config._attn_implementation = "eager" config.vision_config.use_flash_attn = "false" - model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, config, *args, **kwargs) + model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, config=config, *args, **kwargs) return cls(model, pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs) @@ -967,11 +976,11 @@ def compile( f"full_batch_size={full_batch_size}, kv_cache_batch_size={kv_cache_batch_size}, num_speculative_tokens={num_speculative_tokens}, " ) - output_names = self.model.get_output_names() + output_names = self.output_names() # Get specializations from modelling file # TODO: expose this via the auto class as well - specializations, compiler_options = self.model.get_specializations( + specializations, compiler_options = self.specializations( batch_size=batch_size, prefill_seq_len=prefill_seq_len, ctx_len=ctx_len, @@ -1170,7 +1179,7 @@ def model_name(self) -> str: @property def get_model_config(self) -> dict: - return self.model.config.__dict__ + return self.config.__dict__ class QEFFAutoModelForImageTextToText: @@ -1364,7 +1373,7 @@ def __init__( # previous transform function. 
self.model, transformed = SamplerTransform.apply(self.model, qaic_config, **kwargs) if self.is_tlm: - self.model.qaic_config["return_pdfs"] = True + self.qaic_config["return_pdfs"] = True @property def model_name(self) -> str: @@ -1457,7 +1466,7 @@ def from_pretrained( @property def get_model_config(self) -> dict: - return self.model.config.__dict__ + return self.config.__dict__ def export(self, export_dir: Optional[str] = None) -> str: """ @@ -1472,9 +1481,7 @@ def export(self, export_dir: Optional[str] = None) -> str: bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE seq_len: int = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN fbs: int = constants.ONNX_EXPORT_EXAMPLE_FBS - kv_cache_shape = get_padding_shape_from_config( - self.model.config, fbs if self.continuous_batching else bs, seq_len - ) + kv_cache_shape = get_padding_shape_from_config(self.config, fbs if self.continuous_batching else bs, seq_len) example_inputs = { "input_ids": torch.zeros((bs, seq_len), dtype=torch.int64), "position_ids": torch.arange(seq_len, dtype=torch.int64).view(1, seq_len).repeat(bs, 1), @@ -1495,21 +1502,16 @@ def export(self, export_dir: Optional[str] = None) -> str: 2: "ctx_len", } output_names = [] - if self.model.qaic_config is not None and self.model.qaic_config.get("include_sampler", False): - if self.model.qaic_config.get("return_pdfs", False): + if self.qaic_config is not None and self.qaic_config.get("include_sampler", False): + if self.qaic_config.get("return_pdfs", False): output_names.append("probs") output_names.append("next_tokens") else: output_names.append("logits") # TODO Update the get_padding_shape_from_config method to handle the case when the model config has attention_chunk_size or sliding_window and it should return a list of shapes for each layer - if ( - hasattr(self.model.config, "model_type") - and self.model.config.model_type in DYNAMIC_SEQ_LEN_SUPPORTED_MODEL_ARCH - ): - pkv_cache = self.model.get_dummy_pkv_cache( - self.model.config, fbs if self.continuous_batching else bs, seq_len - ) + if hasattr(self.config, "model_type") and self.config.model_type in DYNAMIC_SEQ_LEN_SUPPORTED_MODEL_ARCH: + pkv_cache = self.model.get_dummy_pkv_cache(self.config, fbs if self.continuous_batching else bs, seq_len) for i in range(self.num_layers): for kv in ["key", "value"]: example_inputs["past_key_values"][i].append(torch.zeros(pkv_cache[0][0].shape, dtype=torch.float32)) @@ -1532,7 +1534,7 @@ def export(self, export_dir: Optional[str] = None) -> str: example_inputs["num_logits_to_keep"] = torch.arange(nlk).view(nlk, 1) dynamic_axes["num_logits_to_keep"] = {0: "num_logits_to_keep"} - if self.model.qaic_config is not None and self.model.qaic_config.get("include_sampler", False): + if self.qaic_config is not None and self.qaic_config.get("include_sampler", False): example_inputs, output_names, dynamic_axes = self.get_sampling_inputs_and_outputs( example_inputs=example_inputs, output_names=output_names, @@ -1565,7 +1567,7 @@ def get_sampling_inputs_and_outputs( dynamic_axes["last_accepted_output_tokens"] = {0: "batch_size", 1: "seq_len"} example_inputs["past_repetition_penalty_buffer"] = torch.zeros( - (fbs if self.continuous_batching else bs, self.model.config.vocab_size), dtype=torch.bool + (fbs if self.continuous_batching else bs, self.config.vocab_size), dtype=torch.bool ) dynamic_axes["past_repetition_penalty_buffer"] = { 0: "full_batch_size" if self.continuous_batching else "batch_size", @@ -1578,7 +1580,7 @@ def get_sampling_inputs_and_outputs( dynamic_axes["repetition_penalties"] = {0: 
"batch_size"} example_inputs["past_presence_penalty_buffer"] = torch.zeros( - (fbs if self.continuous_batching else bs, self.model.config.vocab_size), dtype=torch.bool + (fbs if self.continuous_batching else bs, self.config.vocab_size), dtype=torch.bool ) dynamic_axes["past_presence_penalty_buffer"] = { 0: "full_batch_size" if self.continuous_batching else "batch_size", @@ -1595,7 +1597,7 @@ def get_sampling_inputs_and_outputs( ) dynamic_axes["temperatures"] = {0: "batch_size"} - max_top_k_ids = self.model.qaic_config.get("max_top_k_ids", constants.ONNX_EXPORT_EXAMPLE_MAX_TOP_K_IDS) + max_top_k_ids = self.qaic_config.get("max_top_k_ids", constants.ONNX_EXPORT_EXAMPLE_MAX_TOP_K_IDS) example_inputs["top_ks"] = torch.randint(1, max_top_k_ids, size=(bs, 1)).to(torch.int32) dynamic_axes["top_ks"] = {0: "batch_size"} @@ -1724,8 +1726,8 @@ def compile( ) if ( - self.model.qaic_config is not None - and self.model.qaic_config.get("include_sampler", False) + self.qaic_config is not None + and self.qaic_config.get("include_sampler", False) and num_speculative_tokens is not None and num_speculative_tokens > 0 ): @@ -1824,8 +1826,8 @@ def generate( raise NotImplementedError("Only AI_100 runtime is supported right now via generate API") def check_and_get_num_speculative_tokens(self, num_speculative_tokens: Optional[int], prefill_seq_len: int): - if hasattr(self.model.config, "speculative_config"): - num_speculative_tokens_ = self.model.config.speculative_config["num_speculative_tokens"] + if hasattr(self.config, "speculative_config"): + num_speculative_tokens_ = self.config.speculative_config["num_speculative_tokens"] if num_speculative_tokens is not None: logger.warning( f"arg `num_speculative_tokens` is a fixed value of {num_speculative_tokens_} for this model." @@ -1902,7 +1904,7 @@ def __init__(self, model: nn.Module, **kwargs): @property def get_model_config(self) -> dict: - return self.model.config.__dict__ + return self.config.__dict__ def export(self, export_dir: Optional[str] = None) -> str: """ @@ -1957,7 +1959,8 @@ def compile( Returns: :str: Path of the compiled ``qpc`` package. 
""" - specializations, compiler_options = self.model.get_specializations( + # Method 2 + specializations, compiler_options = self.specializations( batch_size, encoder_ctx_len, ctx_len, @@ -1976,7 +1979,9 @@ def compile( if num_speculative_tokens: logger.warning("Speculative decoding is not yet enabled for AutoModelForSpeechSeq2Seq") - output_names = self.model.get_output_names() + # Method 2 + output_names = self.output_names() + # output_names = self.get_output_names() kv_cache_dtype = "float16" custom_io = {} @@ -2039,7 +2044,7 @@ def generate( # add start token id and initial position ids to inputs seq_len = 1 inputs["input_ids"] = ( - torch.ones((self.batch_size, seq_len), dtype=torch.int64) * self.model.config.decoder_start_token_id + torch.ones((self.batch_size, seq_len), dtype=torch.int64) * self.config.decoder_start_token_id ).numpy() inputs["position_ids"] = ( torch.arange(seq_len, dtype=torch.int64).view(1, seq_len).repeat(self.batch_size, 1).numpy() @@ -2050,7 +2055,7 @@ def generate( ) outputs = { - "logits": np.random.randn(self.batch_size, 1, self.model.config.vocab_size).astype(np.float32), + "logits": np.random.randn(self.batch_size, 1, self.config.vocab_size).astype(np.float32), } self.qpc_session.set_buffers(outputs) @@ -2059,8 +2064,8 @@ def generate( outputs = self.qpc_session.run(inputs) # array to hold generated tokens - generated_ids = np.full((self.batch_size, generation_len + 1), self.model.config.eos_token_id) - generated_ids[:, 0] = [self.model.config.decoder_start_token_id] + generated_ids = np.full((self.batch_size, generation_len + 1), self.config.eos_token_id) + generated_ids[:, 0] = [self.config.decoder_start_token_id] logits = outputs["logits"] next_token = logits.argmax(-1) generated_ids[:, 1] = next_token.squeeze(1) @@ -2068,7 +2073,7 @@ def generate( if streamer: streamer.put(next_token) - inputs["input_features"] = np.zeros((self.batch_size, self.model.config.num_mel_bins, 1)).astype(np.float16) + inputs["input_features"] = np.zeros((self.batch_size, self.config.num_mel_bins, 1)).astype(np.float16) loop_start = perf_counter() for num_tokens in range(generation_len): @@ -2077,7 +2082,7 @@ def generate( next_token = logits.argmax(-1) generated_ids[:, num_tokens + 1] = next_token.squeeze(1) - if next_token[0][0] == self.model.config.eos_token_id: + if next_token[0][0] == self.config.eos_token_id: break inputs["input_ids"] = next_token diff --git a/tests/transformers/models/test_speech_seq2seq_models.py b/tests/transformers/models/test_speech_seq2seq_models.py index 4ae8928b7..159aa2a70 100644 --- a/tests/transformers/models/test_speech_seq2seq_models.py +++ b/tests/transformers/models/test_speech_seq2seq_models.py @@ -324,7 +324,7 @@ def check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100( qeff_model.export() - ort_tokens = run_seq2seq_ort(qeff_model.onnx_path, qeff_model.model.config, processor, data, sample_rate, ctx_len) + ort_tokens = run_seq2seq_ort(qeff_model.onnx_path, qeff_model.config, processor, data, sample_rate, ctx_len) assert (pytorch_kv_tokens == ort_tokens).all(), "Tokens don't match for pytorch output and ort output" From 41f8ffff1c2f33dacd779c8f3130c41e55691f1a Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Thu, 7 Aug 2025 11:03:46 +0000 Subject: [PATCH 2/8] Unloading model weights only and keeping its architecture Signed-off-by: Asmita Goswami --- QEfficient/base/modeling_qeff.py | 22 ++--- .../transformers/models/modeling_auto.py | 95 +++++++++---------- .../models/test_speech_seq2seq_models.py | 2 +- 3 files changed, 57 insertions(+), 62 
deletions(-) diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index f5bd40bdc..2f2eb57c8 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -147,6 +147,17 @@ def _export( self.onnx_path = onnx_path return onnx_path + # Storing state_dict to load model's weights if export called again + # if not any(name for name, param in self.model.named_parameters() if param.is_meta): + # self.state_dict = self.model.state_dict() + + # Loading model if weights are in meta state from state_dict + if any(name for name, param in self.model.named_parameters() if param.is_meta): + logger.warning("Export called again, this feature is not supported yet.") + # TODO: Handle weights loading for VLMs + # self.model = self.model.to_empty(device=torch.device("cpu")) + # self.model.load_state_dict(self.state_dict) + tmp_onnx_dir = export_dir / "onnx_tmp" tmp_onnx_path = tmp_onnx_dir / f"{self.model_name}.onnx" tmp_onnx_dir.mkdir(parents=True, exist_ok=True) @@ -217,11 +228,6 @@ def _export( shutil.rmtree(tmp_onnx_dir, ignore_errors=True) self.onnx_path = onnx_path - - # Clear the model to free up memory - self.model = None - gc.collect() - return onnx_path @dump_qconfig @@ -261,12 +267,6 @@ def _compile( if onnx_path is None and self.onnx_path is None: self.export() - # Method 1 - # with init_empty_weights(): - # self.model = self.public_class.from_pretrained( - # pretrained_model_name_or_path=self.config._name_or_path, - # ).model - onnx_path = Path(onnx_path or self.onnx_path) compile_dir = Path(compile_dir or onnx_path.parent) qpc_path = compile_dir / "qpc" diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 9cfa3aaf0..3e50a2783 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -108,12 +108,7 @@ def __new__(cls, *args, **kwargs): def auto_correct_inputs(self, inputs): checked = True - - # Method 2 - inputs_info = self.inputs_info if hasattr(self, "inputs_info") else None - - # Method 1 - # inputs_info = self.model.get_inputs_info() + inputs_info = self.model.get_inputs_info() for valid_input_info in inputs_info: if valid_input_info.name not in inputs: checked = False @@ -235,7 +230,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, pooling=None, *args, **k @property def get_model_config(self) -> dict: - return self.config.__dict__ + return self.model.config.__dict__ def export(self, export_dir: Optional[str] = None) -> str: """ @@ -345,9 +340,7 @@ def generate( return self.cloud_ai_100_feature_generate(inputs=inputs, device_ids=device_ids) # PyTorch runtime else: - return self.pytorch_feature_generate( - model=self.model, inputs=inputs - ) # TODO: Handle this case when self.model = None + return self.pytorch_feature_generate(model=self.model, inputs=inputs) def cloud_ai_100_feature_generate( self, @@ -630,9 +623,9 @@ def compile( if skip_lang and skip_vision: raise ValueError("Expected at least one of 'skip_lang' or 'skip_vision' to be False") - output_names = self.output_names(kv_offload=True) + output_names = self.model.get_output_names(kv_offload=True) - specializations, compiler_options = self.specializations( + specializations, compiler_options = self.model.get_specializations( batch_size=batch_size, prefill_seq_len=prefill_seq_len, ctx_len=ctx_len, @@ -812,7 +805,7 @@ def kv_offload_generate( lang_inputs.pop("attention_mask"), np.arange(padded_len), -1 ) # Need to use -1 as position_ids for invalid 
tokens - not_mllama = hasattr(self.config, "model_type") and self.config.model_type != "mllama" + not_mllama = hasattr(self.model.config, "model_type") and self.model.config.model_type != "mllama" if not_mllama: lang_inputs["image_idx"] = np.array([[0]]) @@ -908,13 +901,11 @@ def __init__( raise NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.") super().__init__(model, **kwargs) - self.public_class = QEFFAutoModelForImageTextToText - # to handle internvl models - if hasattr(self.config, "llm_config") and hasattr(self.config, "vision_config"): - self.config.llm_config.use_cache = True - self.config.llm_config._attn_implementation = "eager" - self.config.vision_config.use_flash_attn = "false" + if hasattr(self.model.config, "llm_config") and hasattr(self.model.config, "vision_config"): + self.model.config.llm_config.use_cache = True + self.model.config.llm_config._attn_implementation = "eager" + self.model.config.vision_config.use_flash_attn = "false" else: self.model.config.text_config.use_cache = True self.hash_params["qeff_auto_class"] = self.__class__.__name__ @@ -938,7 +929,7 @@ def from_pretrained( config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True) config._attn_implementation = "eager" config.vision_config.use_flash_attn = "false" - model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, config=config, *args, **kwargs) + model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, config, *args, **kwargs) return cls(model, pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs) @@ -976,11 +967,11 @@ def compile( f"full_batch_size={full_batch_size}, kv_cache_batch_size={kv_cache_batch_size}, num_speculative_tokens={num_speculative_tokens}, " ) - output_names = self.output_names() + output_names = self.model.get_output_names() # Get specializations from modelling file # TODO: expose this via the auto class as well - specializations, compiler_options = self.specializations( + specializations, compiler_options = self.model.get_specializations( batch_size=batch_size, prefill_seq_len=prefill_seq_len, ctx_len=ctx_len, @@ -1179,7 +1170,7 @@ def model_name(self) -> str: @property def get_model_config(self) -> dict: - return self.config.__dict__ + return self.model.config.__dict__ class QEFFAutoModelForImageTextToText: @@ -1373,7 +1364,7 @@ def __init__( # previous transform function. 
self.model, transformed = SamplerTransform.apply(self.model, qaic_config, **kwargs) if self.is_tlm: - self.qaic_config["return_pdfs"] = True + self.model.qaic_config["return_pdfs"] = True @property def model_name(self) -> str: @@ -1466,7 +1457,7 @@ def from_pretrained( @property def get_model_config(self) -> dict: - return self.config.__dict__ + return self.model.config.__dict__ def export(self, export_dir: Optional[str] = None) -> str: """ @@ -1481,7 +1472,9 @@ def export(self, export_dir: Optional[str] = None) -> str: bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE seq_len: int = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN fbs: int = constants.ONNX_EXPORT_EXAMPLE_FBS - kv_cache_shape = get_padding_shape_from_config(self.config, fbs if self.continuous_batching else bs, seq_len) + kv_cache_shape = get_padding_shape_from_config( + self.model.config, fbs if self.continuous_batching else bs, seq_len + ) example_inputs = { "input_ids": torch.zeros((bs, seq_len), dtype=torch.int64), "position_ids": torch.arange(seq_len, dtype=torch.int64).view(1, seq_len).repeat(bs, 1), @@ -1502,16 +1495,21 @@ def export(self, export_dir: Optional[str] = None) -> str: 2: "ctx_len", } output_names = [] - if self.qaic_config is not None and self.qaic_config.get("include_sampler", False): - if self.qaic_config.get("return_pdfs", False): + if self.model.qaic_config is not None and self.model.qaic_config.get("include_sampler", False): + if self.model.qaic_config.get("return_pdfs", False): output_names.append("probs") output_names.append("next_tokens") else: output_names.append("logits") # TODO Update the get_padding_shape_from_config method to handle the case when the model config has attention_chunk_size or sliding_window and it should return a list of shapes for each layer - if hasattr(self.config, "model_type") and self.config.model_type in DYNAMIC_SEQ_LEN_SUPPORTED_MODEL_ARCH: - pkv_cache = self.model.get_dummy_pkv_cache(self.config, fbs if self.continuous_batching else bs, seq_len) + if ( + hasattr(self.model.config, "model_type") + and self.model.config.model_type in DYNAMIC_SEQ_LEN_SUPPORTED_MODEL_ARCH + ): + pkv_cache = self.model.get_dummy_pkv_cache( + self.model.config, fbs if self.continuous_batching else bs, seq_len + ) for i in range(self.num_layers): for kv in ["key", "value"]: example_inputs["past_key_values"][i].append(torch.zeros(pkv_cache[0][0].shape, dtype=torch.float32)) @@ -1534,7 +1532,7 @@ def export(self, export_dir: Optional[str] = None) -> str: example_inputs["num_logits_to_keep"] = torch.arange(nlk).view(nlk, 1) dynamic_axes["num_logits_to_keep"] = {0: "num_logits_to_keep"} - if self.qaic_config is not None and self.qaic_config.get("include_sampler", False): + if self.model.qaic_config is not None and self.model.qaic_config.get("include_sampler", False): example_inputs, output_names, dynamic_axes = self.get_sampling_inputs_and_outputs( example_inputs=example_inputs, output_names=output_names, @@ -1567,7 +1565,7 @@ def get_sampling_inputs_and_outputs( dynamic_axes["last_accepted_output_tokens"] = {0: "batch_size", 1: "seq_len"} example_inputs["past_repetition_penalty_buffer"] = torch.zeros( - (fbs if self.continuous_batching else bs, self.config.vocab_size), dtype=torch.bool + (fbs if self.continuous_batching else bs, self.model.config.vocab_size), dtype=torch.bool ) dynamic_axes["past_repetition_penalty_buffer"] = { 0: "full_batch_size" if self.continuous_batching else "batch_size", @@ -1580,7 +1578,7 @@ def get_sampling_inputs_and_outputs( dynamic_axes["repetition_penalties"] = {0: 
"batch_size"} example_inputs["past_presence_penalty_buffer"] = torch.zeros( - (fbs if self.continuous_batching else bs, self.config.vocab_size), dtype=torch.bool + (fbs if self.continuous_batching else bs, self.model.config.vocab_size), dtype=torch.bool ) dynamic_axes["past_presence_penalty_buffer"] = { 0: "full_batch_size" if self.continuous_batching else "batch_size", @@ -1597,7 +1595,7 @@ def get_sampling_inputs_and_outputs( ) dynamic_axes["temperatures"] = {0: "batch_size"} - max_top_k_ids = self.qaic_config.get("max_top_k_ids", constants.ONNX_EXPORT_EXAMPLE_MAX_TOP_K_IDS) + max_top_k_ids = self.model.qaic_config.get("max_top_k_ids", constants.ONNX_EXPORT_EXAMPLE_MAX_TOP_K_IDS) example_inputs["top_ks"] = torch.randint(1, max_top_k_ids, size=(bs, 1)).to(torch.int32) dynamic_axes["top_ks"] = {0: "batch_size"} @@ -1726,8 +1724,8 @@ def compile( ) if ( - self.qaic_config is not None - and self.qaic_config.get("include_sampler", False) + self.model.qaic_config is not None + and self.model.qaic_config.get("include_sampler", False) and num_speculative_tokens is not None and num_speculative_tokens > 0 ): @@ -1826,8 +1824,8 @@ def generate( raise NotImplementedError("Only AI_100 runtime is supported right now via generate API") def check_and_get_num_speculative_tokens(self, num_speculative_tokens: Optional[int], prefill_seq_len: int): - if hasattr(self.config, "speculative_config"): - num_speculative_tokens_ = self.config.speculative_config["num_speculative_tokens"] + if hasattr(self.model.config, "speculative_config"): + num_speculative_tokens_ = self.model.config.speculative_config["num_speculative_tokens"] if num_speculative_tokens is not None: logger.warning( f"arg `num_speculative_tokens` is a fixed value of {num_speculative_tokens_} for this model." @@ -1904,7 +1902,7 @@ def __init__(self, model: nn.Module, **kwargs): @property def get_model_config(self) -> dict: - return self.config.__dict__ + return self.model.config.__dict__ def export(self, export_dir: Optional[str] = None) -> str: """ @@ -1959,8 +1957,7 @@ def compile( Returns: :str: Path of the compiled ``qpc`` package. 
""" - # Method 2 - specializations, compiler_options = self.specializations( + specializations, compiler_options = self.model.get_specializations( batch_size, encoder_ctx_len, ctx_len, @@ -1979,9 +1976,7 @@ def compile( if num_speculative_tokens: logger.warning("Speculative decoding is not yet enabled for AutoModelForSpeechSeq2Seq") - # Method 2 - output_names = self.output_names() - # output_names = self.get_output_names() + output_names = self.model.get_output_names() kv_cache_dtype = "float16" custom_io = {} @@ -2044,7 +2039,7 @@ def generate( # add start token id and initial position ids to inputs seq_len = 1 inputs["input_ids"] = ( - torch.ones((self.batch_size, seq_len), dtype=torch.int64) * self.config.decoder_start_token_id + torch.ones((self.batch_size, seq_len), dtype=torch.int64) * self.model.config.decoder_start_token_id ).numpy() inputs["position_ids"] = ( torch.arange(seq_len, dtype=torch.int64).view(1, seq_len).repeat(self.batch_size, 1).numpy() @@ -2055,7 +2050,7 @@ def generate( ) outputs = { - "logits": np.random.randn(self.batch_size, 1, self.config.vocab_size).astype(np.float32), + "logits": np.random.randn(self.batch_size, 1, self.model.config.vocab_size).astype(np.float32), } self.qpc_session.set_buffers(outputs) @@ -2064,8 +2059,8 @@ def generate( outputs = self.qpc_session.run(inputs) # array to hold generated tokens - generated_ids = np.full((self.batch_size, generation_len + 1), self.config.eos_token_id) - generated_ids[:, 0] = [self.config.decoder_start_token_id] + generated_ids = np.full((self.batch_size, generation_len + 1), self.model.config.eos_token_id) + generated_ids[:, 0] = [self.model.config.decoder_start_token_id] logits = outputs["logits"] next_token = logits.argmax(-1) generated_ids[:, 1] = next_token.squeeze(1) @@ -2073,7 +2068,7 @@ def generate( if streamer: streamer.put(next_token) - inputs["input_features"] = np.zeros((self.batch_size, self.config.num_mel_bins, 1)).astype(np.float16) + inputs["input_features"] = np.zeros((self.batch_size, self.model.config.num_mel_bins, 1)).astype(np.float16) loop_start = perf_counter() for num_tokens in range(generation_len): @@ -2082,7 +2077,7 @@ def generate( next_token = logits.argmax(-1) generated_ids[:, num_tokens + 1] = next_token.squeeze(1) - if next_token[0][0] == self.config.eos_token_id: + if next_token[0][0] == self.model.config.eos_token_id: break inputs["input_ids"] = next_token diff --git a/tests/transformers/models/test_speech_seq2seq_models.py b/tests/transformers/models/test_speech_seq2seq_models.py index 159aa2a70..4ae8928b7 100644 --- a/tests/transformers/models/test_speech_seq2seq_models.py +++ b/tests/transformers/models/test_speech_seq2seq_models.py @@ -324,7 +324,7 @@ def check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100( qeff_model.export() - ort_tokens = run_seq2seq_ort(qeff_model.onnx_path, qeff_model.config, processor, data, sample_rate, ctx_len) + ort_tokens = run_seq2seq_ort(qeff_model.onnx_path, qeff_model.model.config, processor, data, sample_rate, ctx_len) assert (pytorch_kv_tokens == ort_tokens).all(), "Tokens don't match for pytorch output and ort output" From 595f3cef135e3ac4153e8bfd27bcada878e05472 Mon Sep 17 00:00:00 2001 From: Rishin Raj Date: Mon, 18 Aug 2025 10:15:19 +0000 Subject: [PATCH 3/8] Moved memory cleanup from decorator to explicit method as it would be better to clean the model before applying onnx transform and code clean up Signed-off-by: Rishin Raj --- QEfficient/base/modeling_qeff.py | 86 ++++++++-- .../transformers/models/modeling_auto.py | 20 ++- 
tests/base/test_export_memory_offload.py | 159 ++++++++++++++++++ 3 files changed, 243 insertions(+), 22 deletions(-) create mode 100644 tests/base/test_export_memory_offload.py diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index 2f2eb57c8..c8c42fa9f 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -62,6 +62,9 @@ def __init__(self, model: torch.nn.Module, **kwargs) -> None: self.model_architecture = ( (arch := getattr(self.model.config, "architectures", None)) and len(arch) > 0 and arch[0] ) or None + + # Flag for checking if weights are offloaded + self._is_weights_offloaded: bool = False # Apply the transformations any_transformed = False @@ -74,6 +77,47 @@ def __init__(self, model: torch.nn.Module, **kwargs) -> None: else: logger.info(f"Pytorch transforms applied to model: {self.model_name}") + + def _offload_model_weights(self, offload_pt_weights) -> bool: + """ + Clear PyTorch weights after export if offload_pt_weights is set to True + + Returns: + bool: True if weights were successfully offloaded, False otherwise + """ + # Check if offloading is enabled and weights are not already offloaded + if offload_pt_weights and not self._is_weights_offloaded: + try: + self.model = self.model.to_empty(device="meta") + self._is_weights_offloaded = True + logger.info("Model weights offloaded to meta device") + + gc.collect() + logger.info("PyTorch weights cleared after export") + return True + + except Exception as e: + logger.error(f"Failed to offload model weights: {e}") + return False + return False + + + def _model_offloaded_check(self) -> None: + """ + Check if the model is in meta state or weights are offloaded. + + Raises: + RuntimeError: If model is in meta state or if weights are offloaded + """ + if self._is_weights_offloaded or any(param.is_meta for param in self.model.parameters()): + error_msg = ( + "Cannot re-export model: weights have been offloaded to save memory. " + "To re-export, please create a new model instance using from_pretrained() method.") + logger.error(error_msg) + raise RuntimeError(error_msg) + + + @property @abstractmethod def model_name(self) -> str: ... @@ -130,9 +174,15 @@ def _export( export_kwargs: Optional[Dict[str, any]] = None, onnx_transform_kwargs: Optional[Dict[str, any]] = None, export_dir: Optional[str] = None, + offload_pt_weights: bool = True, ) -> str: """ - Export the Pytorch model to ONNX. + Export the PyTorch model to ONNX and apply ONNX transforms + + This method: + 1. Exports PyTorch model to ONNX using torch.onnx.export + 2. Clears PyTorch weights after export + 3. Applies ONNX transforms with reduced memory footprint Args: :example_inputs (dict): Sample inputs to trace the model. @@ -141,29 +191,30 @@ def _export( :export_kwargs (dict): Additional arguments to be passed to `torch.onnx.export`. :onnx_transform_kwargs (dict): Additional arguments to be passed to `Transform.apply` for this class. :export_dir (str): Specify the export directory. The export_dir will be suffixed with a hash corresponding to current model. + :offload_pt_weights (bool): If True, offload PyTorch model weights to meta device + after successful export to reduce memory usage. Set to False if you need to + keep weights for further operations. Defaults to True. + Note: + Once weights are offloaded, the model cannot be re-exported. Create a new + instance using from_pretrained() for re-export. 
+ """ onnx_path = export_dir / f"{self.model_name}.onnx" + + # Return early if ONNX already exists if onnx_path.is_file(): self.onnx_path = onnx_path return onnx_path - # Storing state_dict to load model's weights if export called again - # if not any(name for name, param in self.model.named_parameters() if param.is_meta): - # self.state_dict = self.model.state_dict() - - # Loading model if weights are in meta state from state_dict - if any(name for name, param in self.model.named_parameters() if param.is_meta): - logger.warning("Export called again, this feature is not supported yet.") - # TODO: Handle weights loading for VLMs - # self.model = self.model.to_empty(device=torch.device("cpu")) - # self.model.load_state_dict(self.state_dict) + # check if the model is in meta state or weights are offloaded + self._model_offloaded_check() + # Setup temporary paths tmp_onnx_dir = export_dir / "onnx_tmp" tmp_onnx_path = tmp_onnx_dir / f"{self.model_name}.onnx" tmp_onnx_dir.mkdir(parents=True, exist_ok=True) # Create input_names from example_inputs - input_names = [] for param in inspect.signature(self.model.forward).parameters: if param in example_inputs: @@ -199,7 +250,10 @@ def _export( opset_version=constants.ONNX_EXPORT_OPSET, **export_kwargs, ) - logger.info("Pytorch export successful") + logger.info("PyTorch export successful") + + _ = self._offload_model_weights(offload_pt_weights) + model = onnx.load(tmp_onnx_path, load_external_data=False) transform_kwargs = { @@ -211,17 +265,17 @@ def _export( for transform in self._onnx_transforms: model, transformed = transform.apply(model, **transform_kwargs) + model.metadata_props.append( onnx.StringStringEntryProto(key="qeff_transforms", value=",".join(self._transform_names())) ) logger.info("ONNX transforms applied") onnx.save(model, onnx_path) - logger.info("Transformed onnx saved") + logger.info("Transformed ONNX saved") except Exception as e: - logger.error(f"ONNX export (or) ONNXTransforms failed: {e}") - + logger.error(f"ONNX export or transforms failed: {e}") raise e finally: diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 3e50a2783..8d439dfd4 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -433,8 +433,8 @@ def __init__(self, model: nn.modules, **kwargs): self.model = model.get_qeff_vision_encoder() self.hash_params["qeff_auto_class"] = self.__class__.__name__ - def export(self, inputs, output_names, dynamic_axes, export_dir=None): - return self._export(inputs, output_names, dynamic_axes, export_dir=export_dir) + def export(self, inputs, output_names, dynamic_axes, export_dir=None, offload_pt_weights=True): + return self._export(inputs, output_names, dynamic_axes, export_dir=export_dir, offload_pt_weights=offload_pt_weights) def compile( self, @@ -488,8 +488,8 @@ def __init__(self, model, **kwargs): self.model = model.get_qeff_language_decoder() self.hash_params["qeff_auto_class"] = self.__class__.__name__ - def export(self, inputs, output_names, dynamic_axes, export_dir=None): - return self._export(inputs, output_names, dynamic_axes, export_dir=export_dir) + def export(self, inputs, output_names, dynamic_axes, export_dir=None, offload_pt_weights=True): + return self._export(inputs, output_names, dynamic_axes, export_dir=export_dir, offload_pt_weights=offload_pt_weights) def compile( self, @@ -583,14 +583,22 @@ def export( inputs = self.model.get_dummy_inputs(kv_offload=True) dynamic_axes = 
self.model.get_onnx_dynamic_axes(kv_offload=True) output_names = self.model.get_output_names(kv_offload=True) + self.vision_model.export( inputs["vision"], output_names["vision"], dynamic_axes["vision"], export_dir=export_dir, + offload_pt_weights=False ) - - self.lang_model.export(inputs["lang"], output_names["lang"], dynamic_axes["lang"], export_dir=export_dir) + self.lang_model.export( + inputs["lang"], + output_names["lang"], + dynamic_axes["lang"], + export_dir=export_dir, + offload_pt_weights=True + ) + return self.onnx_path def compile( diff --git a/tests/base/test_export_memory_offload.py b/tests/base/test_export_memory_offload.py new file mode 100644 index 000000000..9abdd4fe3 --- /dev/null +++ b/tests/base/test_export_memory_offload.py @@ -0,0 +1,159 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import pytest +import torch +from transformers import AutoConfig, AutoModelForCausalLM + +from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM + + +# Simple test config for memory reduction testing +test_config = AutoConfig.for_model( + "gpt2", + max_position_embeddings=256, + num_hidden_layers=2, + num_attention_heads=4, + hidden_size=128, + intermediate_size=512, + vocab_size=127, + num_key_value_heads=2, +) + +model_kwargs = {"attn_implementation": "eager"} + + +@pytest.fixture +def tmp_cache(tmp_path, monkeypatch): + monkeypatch.setattr("QEfficient.base.modeling_qeff.QEFF_HOME", tmp_path) + yield tmp_path + + +def test_offload_weights_method(): + """Test the _offload_model_weights method with both True and False values.""" + model = AutoModelForCausalLM.from_config(test_config, **model_kwargs) + qeff_model = QEFFAutoModelForCausalLM(model, continuous_batching=False) + + # Initially weights should not be offloaded + assert not qeff_model._is_weights_offloaded + assert not any(param.is_meta for param in qeff_model.model.parameters()) + + # Test with offload_pt_weights=True + success = qeff_model._offload_model_weights(offload_pt_weights=True) + assert success + assert qeff_model._is_weights_offloaded + assert all(param.is_meta for param in qeff_model.model.parameters()) + + # Reset for next test + model2 = AutoModelForCausalLM.from_config(test_config, **model_kwargs) + qeff_model2 = QEFFAutoModelForCausalLM(model2, continuous_batching=False) + + # Test with offload_pt_weights=False + success = qeff_model2._offload_model_weights(offload_pt_weights=False) + assert not success + assert not qeff_model2._is_weights_offloaded + assert not any(param.is_meta for param in qeff_model2.model.parameters()) + + +def test_re_export_behavior_with_offloaded_weights(tmp_cache): + """Test that re-export fails when weights are offloaded.""" + model = AutoModelForCausalLM.from_config(test_config, **model_kwargs) + qeff_model = QEFFAutoModelForCausalLM(model, continuous_batching=False) + + # First export should succeed + first_export_path = qeff_model.export() + assert qeff_model.onnx_path is not None + + # Manually offload weights + qeff_model._offload_model_weights(offload_pt_weights=True) + assert qeff_model._is_weights_offloaded + + # Force a new export by removing the file + import os + os.remove(qeff_model.onnx_path) + qeff_model.onnx_path = None + + # Re-export should fail with RuntimeError due to offloaded weights + with 
pytest.raises(RuntimeError, match="weights have been offloaded"): + qeff_model.export() + + +def test_vlm_dual_qpc_memory_offload_behavior(): + """Test asymmetric memory offload behavior for VLM dual QPC models.""" + # Mock vision model (should NOT offload weights) + class MockVisionModel: + def __init__(self): + self._is_weights_offloaded = False + + def export(self, inputs, output_names, dynamic_axes, export_dir=None, offload_pt_weights=True): + if offload_pt_weights: + self._is_weights_offloaded = True + return "vision_export_path" + + # Mock language model (should offload weights) + class MockLangModel: + def __init__(self): + self._is_weights_offloaded = False + + def export(self, inputs, output_names, dynamic_axes, export_dir=None, offload_pt_weights=True): + if offload_pt_weights: + self._is_weights_offloaded = True + return "lang_export_path" + + # Test dual QPC behavior + vision_model = MockVisionModel() + lang_model = MockLangModel() + + # Simulate dual QPC export behavior + vision_model.export({}, [], {}, offload_pt_weights=False) # Vision model doesn't offload + lang_model.export({}, [], {}, offload_pt_weights=True) # Language model offloads + + # Verify asymmetric behavior + assert not vision_model._is_weights_offloaded # Vision model should NOT be offloaded + assert lang_model._is_weights_offloaded # Language model should be offloaded + + +def test_vlm_single_qpc_memory_offload_behavior(): + """Test memory offload behavior for VLM single QPC models with both True and False.""" + class MockParam: + def __init__(self, is_meta=False): + self.is_meta = is_meta + + class MockModel: + def __init__(self): + self._params = [MockParam(is_meta=False)] + + def parameters(self): + return self._params + + class MockSingleQPCModel: + def __init__(self): + self._is_weights_offloaded = False + self.model = MockModel() + + def _offload_model_weights(self): + self._is_weights_offloaded = True + for param in self.model.parameters(): + param.is_meta = True + return True + + def export(self, export_dir=None, offload_pt_weights=True): + if offload_pt_weights: + self._offload_model_weights() + return "single_qpc_export_path" + + # Test with offload_pt_weights=True + qeff_model = MockSingleQPCModel() + qeff_model.export(offload_pt_weights=True) + assert qeff_model._is_weights_offloaded + assert all(param.is_meta for param in qeff_model.model.parameters()) + + # Test with offload_pt_weights=False + qeff_model2 = MockSingleQPCModel() + qeff_model2.export(offload_pt_weights=False) + assert not qeff_model2._is_weights_offloaded + assert not any(param.is_meta for param in qeff_model2.model.parameters()) From 5b82053225df911dde94bbd8b08be6fde311f2a3 Mon Sep 17 00:00:00 2001 From: Rishin Raj Date: Mon, 18 Aug 2025 10:34:41 +0000 Subject: [PATCH 4/8] lint and format Signed-off-by: Rishin Raj --- QEfficient/base/modeling_qeff.py | 38 ++++++------- .../transformers/models/modeling_auto.py | 20 +++---- tests/base/test_export_memory_offload.py | 53 ++++++++++--------- 3 files changed, 54 insertions(+), 57 deletions(-) diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index c8c42fa9f..fd4370acf 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -77,11 +77,10 @@ def __init__(self, model: torch.nn.Module, **kwargs) -> None: else: logger.info(f"Pytorch transforms applied to model: {self.model_name}") - def _offload_model_weights(self, offload_pt_weights) -> bool: """ Clear PyTorch weights after export if offload_pt_weights is set to True - + 
Returns: bool: True if weights were successfully offloaded, False otherwise """ @@ -91,33 +90,31 @@ def _offload_model_weights(self, offload_pt_weights) -> bool: self.model = self.model.to_empty(device="meta") self._is_weights_offloaded = True logger.info("Model weights offloaded to meta device") - + gc.collect() logger.info("PyTorch weights cleared after export") return True - + except Exception as e: logger.error(f"Failed to offload model weights: {e}") return False return False - def _model_offloaded_check(self) -> None: """ Check if the model is in meta state or weights are offloaded. - + Raises: - RuntimeError: If model is in meta state or if weights are offloaded + RuntimeError: If model is in meta state or if weights are offloaded """ - if self._is_weights_offloaded or any(param.is_meta for param in self.model.parameters()): + if self._is_weights_offloaded or any(param.is_meta for param in self.model.parameters()): error_msg = ( - "Cannot re-export model: weights have been offloaded to save memory. " - "To re-export, please create a new model instance using from_pretrained() method.") + "Cannot re-export model: weights have been offloaded to save memory. " + "To re-export, please create a new model instance using from_pretrained() method." + ) logger.error(error_msg) raise RuntimeError(error_msg) - - - + @property @abstractmethod def model_name(self) -> str: ... @@ -178,7 +175,7 @@ def _export( ) -> str: """ Export the PyTorch model to ONNX and apply ONNX transforms - + This method: 1. Exports PyTorch model to ONNX using torch.onnx.export 2. Clears PyTorch weights after export @@ -191,16 +188,16 @@ def _export( :export_kwargs (dict): Additional arguments to be passed to `torch.onnx.export`. :onnx_transform_kwargs (dict): Additional arguments to be passed to `Transform.apply` for this class. :export_dir (str): Specify the export directory. The export_dir will be suffixed with a hash corresponding to current model. - :offload_pt_weights (bool): If True, offload PyTorch model weights to meta device - after successful export to reduce memory usage. Set to False if you need to + :offload_pt_weights (bool): If True, offload PyTorch model weights to meta device + after successful export to reduce memory usage. Set to False if you need to keep weights for further operations. Defaults to True. Note: - Once weights are offloaded, the model cannot be re-exported. Create a new + Once weights are offloaded, the model cannot be re-exported. Create a new instance using from_pretrained() for re-export. 
- + """ onnx_path = export_dir / f"{self.model_name}.onnx" - + # Return early if ONNX already exists if onnx_path.is_file(): self.onnx_path = onnx_path @@ -253,7 +250,6 @@ def _export( logger.info("PyTorch export successful") _ = self._offload_model_weights(offload_pt_weights) - model = onnx.load(tmp_onnx_path, load_external_data=False) transform_kwargs = { @@ -265,7 +261,7 @@ def _export( for transform in self._onnx_transforms: model, transformed = transform.apply(model, **transform_kwargs) - + model.metadata_props.append( onnx.StringStringEntryProto(key="qeff_transforms", value=",".join(self._transform_names())) ) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 8d439dfd4..b3d27f3a5 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -434,7 +434,9 @@ def __init__(self, model: nn.modules, **kwargs): self.hash_params["qeff_auto_class"] = self.__class__.__name__ def export(self, inputs, output_names, dynamic_axes, export_dir=None, offload_pt_weights=True): - return self._export(inputs, output_names, dynamic_axes, export_dir=export_dir, offload_pt_weights=offload_pt_weights) + return self._export( + inputs, output_names, dynamic_axes, export_dir=export_dir, offload_pt_weights=offload_pt_weights + ) def compile( self, @@ -489,7 +491,9 @@ def __init__(self, model, **kwargs): self.hash_params["qeff_auto_class"] = self.__class__.__name__ def export(self, inputs, output_names, dynamic_axes, export_dir=None, offload_pt_weights=True): - return self._export(inputs, output_names, dynamic_axes, export_dir=export_dir, offload_pt_weights=offload_pt_weights) + return self._export( + inputs, output_names, dynamic_axes, export_dir=export_dir, offload_pt_weights=offload_pt_weights + ) def compile( self, @@ -583,22 +587,18 @@ def export( inputs = self.model.get_dummy_inputs(kv_offload=True) dynamic_axes = self.model.get_onnx_dynamic_axes(kv_offload=True) output_names = self.model.get_output_names(kv_offload=True) - + self.vision_model.export( inputs["vision"], output_names["vision"], dynamic_axes["vision"], export_dir=export_dir, - offload_pt_weights=False + offload_pt_weights=False, ) self.lang_model.export( - inputs["lang"], - output_names["lang"], - dynamic_axes["lang"], - export_dir=export_dir, - offload_pt_weights=True + inputs["lang"], output_names["lang"], dynamic_axes["lang"], export_dir=export_dir, offload_pt_weights=True ) - + return self.onnx_path def compile( diff --git a/tests/base/test_export_memory_offload.py b/tests/base/test_export_memory_offload.py index 9abdd4fe3..2752b366d 100644 --- a/tests/base/test_export_memory_offload.py +++ b/tests/base/test_export_memory_offload.py @@ -6,12 +6,10 @@ # ----------------------------------------------------------------------------- import pytest -import torch from transformers import AutoConfig, AutoModelForCausalLM from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM - # Simple test config for memory reduction testing test_config = AutoConfig.for_model( "gpt2", @@ -37,21 +35,21 @@ def test_offload_weights_method(): """Test the _offload_model_weights method with both True and False values.""" model = AutoModelForCausalLM.from_config(test_config, **model_kwargs) qeff_model = QEFFAutoModelForCausalLM(model, continuous_batching=False) - + # Initially weights should not be offloaded assert not qeff_model._is_weights_offloaded assert not any(param.is_meta for param in 
qeff_model.model.parameters()) - + # Test with offload_pt_weights=True success = qeff_model._offload_model_weights(offload_pt_weights=True) assert success assert qeff_model._is_weights_offloaded assert all(param.is_meta for param in qeff_model.model.parameters()) - + # Reset for next test model2 = AutoModelForCausalLM.from_config(test_config, **model_kwargs) qeff_model2 = QEFFAutoModelForCausalLM(model2, continuous_batching=False) - + # Test with offload_pt_weights=False success = qeff_model2._offload_model_weights(offload_pt_weights=False) assert not success @@ -63,20 +61,21 @@ def test_re_export_behavior_with_offloaded_weights(tmp_cache): """Test that re-export fails when weights are offloaded.""" model = AutoModelForCausalLM.from_config(test_config, **model_kwargs) qeff_model = QEFFAutoModelForCausalLM(model, continuous_batching=False) - + # First export should succeed - first_export_path = qeff_model.export() + _ = qeff_model.export() assert qeff_model.onnx_path is not None - + # Manually offload weights qeff_model._offload_model_weights(offload_pt_weights=True) assert qeff_model._is_weights_offloaded - + # Force a new export by removing the file import os + os.remove(qeff_model.onnx_path) qeff_model.onnx_path = None - + # Re-export should fail with RuntimeError due to offloaded weights with pytest.raises(RuntimeError, match="weights have been offloaded"): qeff_model.export() @@ -84,74 +83,76 @@ def test_re_export_behavior_with_offloaded_weights(tmp_cache): def test_vlm_dual_qpc_memory_offload_behavior(): """Test asymmetric memory offload behavior for VLM dual QPC models.""" + # Mock vision model (should NOT offload weights) class MockVisionModel: def __init__(self): self._is_weights_offloaded = False - + def export(self, inputs, output_names, dynamic_axes, export_dir=None, offload_pt_weights=True): if offload_pt_weights: self._is_weights_offloaded = True return "vision_export_path" - + # Mock language model (should offload weights) class MockLangModel: def __init__(self): self._is_weights_offloaded = False - + def export(self, inputs, output_names, dynamic_axes, export_dir=None, offload_pt_weights=True): if offload_pt_weights: self._is_weights_offloaded = True return "lang_export_path" - + # Test dual QPC behavior vision_model = MockVisionModel() lang_model = MockLangModel() - + # Simulate dual QPC export behavior vision_model.export({}, [], {}, offload_pt_weights=False) # Vision model doesn't offload - lang_model.export({}, [], {}, offload_pt_weights=True) # Language model offloads - + lang_model.export({}, [], {}, offload_pt_weights=True) # Language model offloads + # Verify asymmetric behavior assert not vision_model._is_weights_offloaded # Vision model should NOT be offloaded - assert lang_model._is_weights_offloaded # Language model should be offloaded + assert lang_model._is_weights_offloaded # Language model should be offloaded def test_vlm_single_qpc_memory_offload_behavior(): """Test memory offload behavior for VLM single QPC models with both True and False.""" + class MockParam: def __init__(self, is_meta=False): self.is_meta = is_meta - + class MockModel: def __init__(self): self._params = [MockParam(is_meta=False)] - + def parameters(self): return self._params - + class MockSingleQPCModel: def __init__(self): self._is_weights_offloaded = False self.model = MockModel() - + def _offload_model_weights(self): self._is_weights_offloaded = True for param in self.model.parameters(): param.is_meta = True return True - + def export(self, export_dir=None, offload_pt_weights=True): 
if offload_pt_weights: self._offload_model_weights() return "single_qpc_export_path" - + # Test with offload_pt_weights=True qeff_model = MockSingleQPCModel() qeff_model.export(offload_pt_weights=True) assert qeff_model._is_weights_offloaded assert all(param.is_meta for param in qeff_model.model.parameters()) - + # Test with offload_pt_weights=False qeff_model2 = MockSingleQPCModel() qeff_model2.export(offload_pt_weights=False) From 0fafed67b94addbbb8f8ca04311a6947c7e6f844 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Tue, 19 Aug 2025 08:39:33 +0000 Subject: [PATCH 5/8] Updated deprecated_api test to load model again before another export Signed-off-by: Asmita Goswami --- tests/transformers/models/test_causal_lm_models.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py index 77354ee23..49d2ccf8c 100644 --- a/tests/transformers/models/test_causal_lm_models.py +++ b/tests/transformers/models/test_causal_lm_models.py @@ -282,6 +282,10 @@ def test_causal_lm_export_with_deprecated_api(model_name): tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name) qeff_model = QEFFAutoModelForCausalLM(model, model_name=model_name, pretrained_model_name_or_path=model_name) new_api_onnx_model_path = qeff_model.export() + + # Again loading model since the export moves model to meta device + model, _ = load_causal_lm_model(model_name, n_layer=1) + qeff_model = QEFFAutoModelForCausalLM(model, model_name=model_name, pretrained_model_name_or_path=model_name) _, old_api_onnx_model_path = qualcomm_efficient_converter( model_name=model_name, model_kv=qeff_model, tokenizer=tokenizer ) From 723ff9729ed8864b536430365076fd4eda60e27d Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Tue, 19 Aug 2025 09:11:57 +0000 Subject: [PATCH 6/8] Lint Format Signed-off-by: Asmita Goswami --- QEfficient/base/modeling_qeff.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index fd4370acf..6c9f88d9f 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -5,6 +5,7 @@ # # ---------------------------------------------------------------------------- +import gc import inspect import logging import shutil @@ -62,7 +63,7 @@ def __init__(self, model: torch.nn.Module, **kwargs) -> None: self.model_architecture = ( (arch := getattr(self.model.config, "architectures", None)) and len(arch) > 0 and arch[0] ) or None - + # Flag for checking if weights are offloaded self._is_weights_offloaded: bool = False From 376c72d1c630a3c43cdc488d62d2d0a18a00ceaf Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Thu, 21 Aug 2025 06:44:56 +0000 Subject: [PATCH 7/8] Updated QEFF_HOME path Signed-off-by: Asmita Goswami --- tests/base/test_export_memory_offload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/base/test_export_memory_offload.py b/tests/base/test_export_memory_offload.py index 2752b366d..d1b7a4653 100644 --- a/tests/base/test_export_memory_offload.py +++ b/tests/base/test_export_memory_offload.py @@ -27,7 +27,7 @@ @pytest.fixture def tmp_cache(tmp_path, monkeypatch): - monkeypatch.setattr("QEfficient.base.modeling_qeff.QEFF_HOME", tmp_path) + monkeypatch.setattr("QEfficient.utils._utils.QEFF_HOME", tmp_path) yield tmp_path From 56af5caa0030bcfbcef704d3ceeba672b7731031 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Fri, 22 Aug 2025 06:32:05 +0000 Subject: [PATCH 8/8] 
Updated generation_config tensor from meta to cpu Signed-off-by: Asmita Goswami --- QEfficient/peft/auto.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/QEfficient/peft/auto.py b/QEfficient/peft/auto.py index f1532ad1b..820372561 100644 --- a/QEfficient/peft/auto.py +++ b/QEfficient/peft/auto.py @@ -287,7 +287,7 @@ def generate( generation_config = generation_config or self.model.generation_config generation_config, model_kwargs = self.model._prepare_generation_config(generation_config, **kwargs) - self.model._prepare_special_tokens(generation_config) + self.model._prepare_special_tokens(generation_config, device="cpu") if generation_config.do_sample: raise NotImplementedError("do_sample=True not supported currently") if generation_config.num_beams > 1:
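
A minimal usage sketch of the offload path introduced by this series, using only the classes and flags that appear in the patches above (QEFFAutoModelForCausalLM, export(), offload_pt_weights, _is_weights_offloaded) and the small gpt2 config from tests/base/test_export_memory_offload.py; it is an illustration of the intended behaviour, not part of the patches themselves:

    from transformers import AutoConfig, AutoModelForCausalLM

    from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM

    # Tiny gpt2-style config, mirroring tests/base/test_export_memory_offload.py
    config = AutoConfig.for_model(
        "gpt2",
        max_position_embeddings=256,
        num_hidden_layers=2,
        num_attention_heads=4,
        hidden_size=128,
        intermediate_size=512,
        vocab_size=127,
        num_key_value_heads=2,
    )
    model = AutoModelForCausalLM.from_config(config, attn_implementation="eager")
    qeff_model = QEFFAutoModelForCausalLM(model, continuous_batching=False)

    # _export() runs torch.onnx.export and then _offload_model_weights(), so the
    # PyTorch weights are moved to the meta device once the ONNX artifact exists.
    onnx_path = qeff_model.export()
    assert qeff_model._is_weights_offloaded
    assert all(param.is_meta for param in qeff_model.model.parameters())

    # Compilation only reads the exported ONNX, so it is unaffected:
    # qpc_path = qeff_model.compile(...)

    # Calling export() again returns the cached ONNX while the file exists; if the
    # artifact is removed, _model_offloaded_check() raises RuntimeError and a fresh
    # instance has to be created (e.g. via from_pretrained()).

For the dual-QPC image-text path, PATCH 3/8 applies the same flag asymmetrically: the vision encoder is exported with offload_pt_weights=False while the language decoder is exported with offload_pt_weights=True, as exercised by test_vlm_dual_qpc_memory_offload_behavior.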