From 8a4cf1f3ce5ac21d06243f82f857e5086a05860c Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Tue, 22 Jul 2025 12:59:20 +0000 Subject: [PATCH 1/8] Deleted model after export to save memory Signed-off-by: Asmita Goswami --- QEfficient/base/modeling_qeff.py | 11 +++ .../transformers/models/modeling_auto.py | 95 ++++++++++--------- .../models/test_speech_seq2seq_models.py | 2 +- 3 files changed, 62 insertions(+), 46 deletions(-) diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index 0b07bb6b3..f5bd40bdc 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -217,6 +217,11 @@ def _export( shutil.rmtree(tmp_onnx_dir, ignore_errors=True) self.onnx_path = onnx_path + + # Clear the model to free up memory + self.model = None + gc.collect() + return onnx_path @dump_qconfig @@ -256,6 +261,12 @@ def _compile( if onnx_path is None and self.onnx_path is None: self.export() + # Method 1 + # with init_empty_weights(): + # self.model = self.public_class.from_pretrained( + # pretrained_model_name_or_path=self.config._name_or_path, + # ).model + onnx_path = Path(onnx_path or self.onnx_path) compile_dir = Path(compile_dir or onnx_path.parent) qpc_path = compile_dir / "qpc" diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 3e50a2783..9cfa3aaf0 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -108,7 +108,12 @@ def __new__(cls, *args, **kwargs): def auto_correct_inputs(self, inputs): checked = True - inputs_info = self.model.get_inputs_info() + + # Method 2 + inputs_info = self.inputs_info if hasattr(self, "inputs_info") else None + + # Method 1 + # inputs_info = self.model.get_inputs_info() for valid_input_info in inputs_info: if valid_input_info.name not in inputs: checked = False @@ -230,7 +235,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, pooling=None, *args, **k @property def get_model_config(self) -> dict: - return self.model.config.__dict__ + return self.config.__dict__ def export(self, export_dir: Optional[str] = None) -> str: """ @@ -340,7 +345,9 @@ def generate( return self.cloud_ai_100_feature_generate(inputs=inputs, device_ids=device_ids) # PyTorch runtime else: - return self.pytorch_feature_generate(model=self.model, inputs=inputs) + return self.pytorch_feature_generate( + model=self.model, inputs=inputs + ) # TODO: Handle this case when self.model = None def cloud_ai_100_feature_generate( self, @@ -623,9 +630,9 @@ def compile( if skip_lang and skip_vision: raise ValueError("Expected at least one of 'skip_lang' or 'skip_vision' to be False") - output_names = self.model.get_output_names(kv_offload=True) + output_names = self.output_names(kv_offload=True) - specializations, compiler_options = self.model.get_specializations( + specializations, compiler_options = self.specializations( batch_size=batch_size, prefill_seq_len=prefill_seq_len, ctx_len=ctx_len, @@ -805,7 +812,7 @@ def kv_offload_generate( lang_inputs.pop("attention_mask"), np.arange(padded_len), -1 ) # Need to use -1 as position_ids for invalid tokens - not_mllama = hasattr(self.model.config, "model_type") and self.model.config.model_type != "mllama" + not_mllama = hasattr(self.config, "model_type") and self.config.model_type != "mllama" if not_mllama: lang_inputs["image_idx"] = np.array([[0]]) @@ -901,11 +908,13 @@ def __init__( raise NotImplementedError("Continuous batching is not supported for image-text-to-text models 
yet.") super().__init__(model, **kwargs) + self.public_class = QEFFAutoModelForImageTextToText + # to handle internvl models - if hasattr(self.model.config, "llm_config") and hasattr(self.model.config, "vision_config"): - self.model.config.llm_config.use_cache = True - self.model.config.llm_config._attn_implementation = "eager" - self.model.config.vision_config.use_flash_attn = "false" + if hasattr(self.config, "llm_config") and hasattr(self.config, "vision_config"): + self.config.llm_config.use_cache = True + self.config.llm_config._attn_implementation = "eager" + self.config.vision_config.use_flash_attn = "false" else: self.model.config.text_config.use_cache = True self.hash_params["qeff_auto_class"] = self.__class__.__name__ @@ -929,7 +938,7 @@ def from_pretrained( config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True) config._attn_implementation = "eager" config.vision_config.use_flash_attn = "false" - model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, config, *args, **kwargs) + model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, config=config, *args, **kwargs) return cls(model, pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs) @@ -967,11 +976,11 @@ def compile( f"full_batch_size={full_batch_size}, kv_cache_batch_size={kv_cache_batch_size}, num_speculative_tokens={num_speculative_tokens}, " ) - output_names = self.model.get_output_names() + output_names = self.output_names() # Get specializations from modelling file # TODO: expose this via the auto class as well - specializations, compiler_options = self.model.get_specializations( + specializations, compiler_options = self.specializations( batch_size=batch_size, prefill_seq_len=prefill_seq_len, ctx_len=ctx_len, @@ -1170,7 +1179,7 @@ def model_name(self) -> str: @property def get_model_config(self) -> dict: - return self.model.config.__dict__ + return self.config.__dict__ class QEFFAutoModelForImageTextToText: @@ -1364,7 +1373,7 @@ def __init__( # previous transform function. 
self.model, transformed = SamplerTransform.apply(self.model, qaic_config, **kwargs) if self.is_tlm: - self.model.qaic_config["return_pdfs"] = True + self.qaic_config["return_pdfs"] = True @property def model_name(self) -> str: @@ -1457,7 +1466,7 @@ def from_pretrained( @property def get_model_config(self) -> dict: - return self.model.config.__dict__ + return self.config.__dict__ def export(self, export_dir: Optional[str] = None) -> str: """ @@ -1472,9 +1481,7 @@ def export(self, export_dir: Optional[str] = None) -> str: bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE seq_len: int = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN fbs: int = constants.ONNX_EXPORT_EXAMPLE_FBS - kv_cache_shape = get_padding_shape_from_config( - self.model.config, fbs if self.continuous_batching else bs, seq_len - ) + kv_cache_shape = get_padding_shape_from_config(self.config, fbs if self.continuous_batching else bs, seq_len) example_inputs = { "input_ids": torch.zeros((bs, seq_len), dtype=torch.int64), "position_ids": torch.arange(seq_len, dtype=torch.int64).view(1, seq_len).repeat(bs, 1), @@ -1495,21 +1502,16 @@ def export(self, export_dir: Optional[str] = None) -> str: 2: "ctx_len", } output_names = [] - if self.model.qaic_config is not None and self.model.qaic_config.get("include_sampler", False): - if self.model.qaic_config.get("return_pdfs", False): + if self.qaic_config is not None and self.qaic_config.get("include_sampler", False): + if self.qaic_config.get("return_pdfs", False): output_names.append("probs") output_names.append("next_tokens") else: output_names.append("logits") # TODO Update the get_padding_shape_from_config method to handle the case when the model config has attention_chunk_size or sliding_window and it should return a list of shapes for each layer - if ( - hasattr(self.model.config, "model_type") - and self.model.config.model_type in DYNAMIC_SEQ_LEN_SUPPORTED_MODEL_ARCH - ): - pkv_cache = self.model.get_dummy_pkv_cache( - self.model.config, fbs if self.continuous_batching else bs, seq_len - ) + if hasattr(self.config, "model_type") and self.config.model_type in DYNAMIC_SEQ_LEN_SUPPORTED_MODEL_ARCH: + pkv_cache = self.model.get_dummy_pkv_cache(self.config, fbs if self.continuous_batching else bs, seq_len) for i in range(self.num_layers): for kv in ["key", "value"]: example_inputs["past_key_values"][i].append(torch.zeros(pkv_cache[0][0].shape, dtype=torch.float32)) @@ -1532,7 +1534,7 @@ def export(self, export_dir: Optional[str] = None) -> str: example_inputs["num_logits_to_keep"] = torch.arange(nlk).view(nlk, 1) dynamic_axes["num_logits_to_keep"] = {0: "num_logits_to_keep"} - if self.model.qaic_config is not None and self.model.qaic_config.get("include_sampler", False): + if self.qaic_config is not None and self.qaic_config.get("include_sampler", False): example_inputs, output_names, dynamic_axes = self.get_sampling_inputs_and_outputs( example_inputs=example_inputs, output_names=output_names, @@ -1565,7 +1567,7 @@ def get_sampling_inputs_and_outputs( dynamic_axes["last_accepted_output_tokens"] = {0: "batch_size", 1: "seq_len"} example_inputs["past_repetition_penalty_buffer"] = torch.zeros( - (fbs if self.continuous_batching else bs, self.model.config.vocab_size), dtype=torch.bool + (fbs if self.continuous_batching else bs, self.config.vocab_size), dtype=torch.bool ) dynamic_axes["past_repetition_penalty_buffer"] = { 0: "full_batch_size" if self.continuous_batching else "batch_size", @@ -1578,7 +1580,7 @@ def get_sampling_inputs_and_outputs( dynamic_axes["repetition_penalties"] = {0: 
"batch_size"} example_inputs["past_presence_penalty_buffer"] = torch.zeros( - (fbs if self.continuous_batching else bs, self.model.config.vocab_size), dtype=torch.bool + (fbs if self.continuous_batching else bs, self.config.vocab_size), dtype=torch.bool ) dynamic_axes["past_presence_penalty_buffer"] = { 0: "full_batch_size" if self.continuous_batching else "batch_size", @@ -1595,7 +1597,7 @@ def get_sampling_inputs_and_outputs( ) dynamic_axes["temperatures"] = {0: "batch_size"} - max_top_k_ids = self.model.qaic_config.get("max_top_k_ids", constants.ONNX_EXPORT_EXAMPLE_MAX_TOP_K_IDS) + max_top_k_ids = self.qaic_config.get("max_top_k_ids", constants.ONNX_EXPORT_EXAMPLE_MAX_TOP_K_IDS) example_inputs["top_ks"] = torch.randint(1, max_top_k_ids, size=(bs, 1)).to(torch.int32) dynamic_axes["top_ks"] = {0: "batch_size"} @@ -1724,8 +1726,8 @@ def compile( ) if ( - self.model.qaic_config is not None - and self.model.qaic_config.get("include_sampler", False) + self.qaic_config is not None + and self.qaic_config.get("include_sampler", False) and num_speculative_tokens is not None and num_speculative_tokens > 0 ): @@ -1824,8 +1826,8 @@ def generate( raise NotImplementedError("Only AI_100 runtime is supported right now via generate API") def check_and_get_num_speculative_tokens(self, num_speculative_tokens: Optional[int], prefill_seq_len: int): - if hasattr(self.model.config, "speculative_config"): - num_speculative_tokens_ = self.model.config.speculative_config["num_speculative_tokens"] + if hasattr(self.config, "speculative_config"): + num_speculative_tokens_ = self.config.speculative_config["num_speculative_tokens"] if num_speculative_tokens is not None: logger.warning( f"arg `num_speculative_tokens` is a fixed value of {num_speculative_tokens_} for this model." @@ -1902,7 +1904,7 @@ def __init__(self, model: nn.Module, **kwargs): @property def get_model_config(self) -> dict: - return self.model.config.__dict__ + return self.config.__dict__ def export(self, export_dir: Optional[str] = None) -> str: """ @@ -1957,7 +1959,8 @@ def compile( Returns: :str: Path of the compiled ``qpc`` package. 
""" - specializations, compiler_options = self.model.get_specializations( + # Method 2 + specializations, compiler_options = self.specializations( batch_size, encoder_ctx_len, ctx_len, @@ -1976,7 +1979,9 @@ def compile( if num_speculative_tokens: logger.warning("Speculative decoding is not yet enabled for AutoModelForSpeechSeq2Seq") - output_names = self.model.get_output_names() + # Method 2 + output_names = self.output_names() + # output_names = self.get_output_names() kv_cache_dtype = "float16" custom_io = {} @@ -2039,7 +2044,7 @@ def generate( # add start token id and initial position ids to inputs seq_len = 1 inputs["input_ids"] = ( - torch.ones((self.batch_size, seq_len), dtype=torch.int64) * self.model.config.decoder_start_token_id + torch.ones((self.batch_size, seq_len), dtype=torch.int64) * self.config.decoder_start_token_id ).numpy() inputs["position_ids"] = ( torch.arange(seq_len, dtype=torch.int64).view(1, seq_len).repeat(self.batch_size, 1).numpy() @@ -2050,7 +2055,7 @@ def generate( ) outputs = { - "logits": np.random.randn(self.batch_size, 1, self.model.config.vocab_size).astype(np.float32), + "logits": np.random.randn(self.batch_size, 1, self.config.vocab_size).astype(np.float32), } self.qpc_session.set_buffers(outputs) @@ -2059,8 +2064,8 @@ def generate( outputs = self.qpc_session.run(inputs) # array to hold generated tokens - generated_ids = np.full((self.batch_size, generation_len + 1), self.model.config.eos_token_id) - generated_ids[:, 0] = [self.model.config.decoder_start_token_id] + generated_ids = np.full((self.batch_size, generation_len + 1), self.config.eos_token_id) + generated_ids[:, 0] = [self.config.decoder_start_token_id] logits = outputs["logits"] next_token = logits.argmax(-1) generated_ids[:, 1] = next_token.squeeze(1) @@ -2068,7 +2073,7 @@ def generate( if streamer: streamer.put(next_token) - inputs["input_features"] = np.zeros((self.batch_size, self.model.config.num_mel_bins, 1)).astype(np.float16) + inputs["input_features"] = np.zeros((self.batch_size, self.config.num_mel_bins, 1)).astype(np.float16) loop_start = perf_counter() for num_tokens in range(generation_len): @@ -2077,7 +2082,7 @@ def generate( next_token = logits.argmax(-1) generated_ids[:, num_tokens + 1] = next_token.squeeze(1) - if next_token[0][0] == self.model.config.eos_token_id: + if next_token[0][0] == self.config.eos_token_id: break inputs["input_ids"] = next_token diff --git a/tests/transformers/models/test_speech_seq2seq_models.py b/tests/transformers/models/test_speech_seq2seq_models.py index 4ae8928b7..159aa2a70 100644 --- a/tests/transformers/models/test_speech_seq2seq_models.py +++ b/tests/transformers/models/test_speech_seq2seq_models.py @@ -324,7 +324,7 @@ def check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100( qeff_model.export() - ort_tokens = run_seq2seq_ort(qeff_model.onnx_path, qeff_model.model.config, processor, data, sample_rate, ctx_len) + ort_tokens = run_seq2seq_ort(qeff_model.onnx_path, qeff_model.config, processor, data, sample_rate, ctx_len) assert (pytorch_kv_tokens == ort_tokens).all(), "Tokens don't match for pytorch output and ort output" From 41f8ffff1c2f33dacd779c8f3130c41e55691f1a Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Thu, 7 Aug 2025 11:03:46 +0000 Subject: [PATCH 2/8] Unloading model weights only and keeping its architecture Signed-off-by: Asmita Goswami --- QEfficient/base/modeling_qeff.py | 22 ++--- .../transformers/models/modeling_auto.py | 95 +++++++++---------- .../models/test_speech_seq2seq_models.py | 2 +- 3 files changed, 57 insertions(+), 62 
deletions(-) diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index f5bd40bdc..2f2eb57c8 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -147,6 +147,17 @@ def _export( self.onnx_path = onnx_path return onnx_path + # Storing state_dict to load model's weights if export called again + # if not any(name for name, param in self.model.named_parameters() if param.is_meta): + # self.state_dict = self.model.state_dict() + + # Loading model if weights are in meta state from state_dict + if any(name for name, param in self.model.named_parameters() if param.is_meta): + logger.warning("Export called again, this feature is not supported yet.") + # TODO: Handle weights loading for VLMs + # self.model = self.model.to_empty(device=torch.device("cpu")) + # self.model.load_state_dict(self.state_dict) + tmp_onnx_dir = export_dir / "onnx_tmp" tmp_onnx_path = tmp_onnx_dir / f"{self.model_name}.onnx" tmp_onnx_dir.mkdir(parents=True, exist_ok=True) @@ -217,11 +228,6 @@ def _export( shutil.rmtree(tmp_onnx_dir, ignore_errors=True) self.onnx_path = onnx_path - - # Clear the model to free up memory - self.model = None - gc.collect() - return onnx_path @dump_qconfig @@ -261,12 +267,6 @@ def _compile( if onnx_path is None and self.onnx_path is None: self.export() - # Method 1 - # with init_empty_weights(): - # self.model = self.public_class.from_pretrained( - # pretrained_model_name_or_path=self.config._name_or_path, - # ).model - onnx_path = Path(onnx_path or self.onnx_path) compile_dir = Path(compile_dir or onnx_path.parent) qpc_path = compile_dir / "qpc" diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 9cfa3aaf0..3e50a2783 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -108,12 +108,7 @@ def __new__(cls, *args, **kwargs): def auto_correct_inputs(self, inputs): checked = True - - # Method 2 - inputs_info = self.inputs_info if hasattr(self, "inputs_info") else None - - # Method 1 - # inputs_info = self.model.get_inputs_info() + inputs_info = self.model.get_inputs_info() for valid_input_info in inputs_info: if valid_input_info.name not in inputs: checked = False @@ -235,7 +230,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, pooling=None, *args, **k @property def get_model_config(self) -> dict: - return self.config.__dict__ + return self.model.config.__dict__ def export(self, export_dir: Optional[str] = None) -> str: """ @@ -345,9 +340,7 @@ def generate( return self.cloud_ai_100_feature_generate(inputs=inputs, device_ids=device_ids) # PyTorch runtime else: - return self.pytorch_feature_generate( - model=self.model, inputs=inputs - ) # TODO: Handle this case when self.model = None + return self.pytorch_feature_generate(model=self.model, inputs=inputs) def cloud_ai_100_feature_generate( self, @@ -630,9 +623,9 @@ def compile( if skip_lang and skip_vision: raise ValueError("Expected at least one of 'skip_lang' or 'skip_vision' to be False") - output_names = self.output_names(kv_offload=True) + output_names = self.model.get_output_names(kv_offload=True) - specializations, compiler_options = self.specializations( + specializations, compiler_options = self.model.get_specializations( batch_size=batch_size, prefill_seq_len=prefill_seq_len, ctx_len=ctx_len, @@ -812,7 +805,7 @@ def kv_offload_generate( lang_inputs.pop("attention_mask"), np.arange(padded_len), -1 ) # Need to use -1 as position_ids for invalid 
tokens - not_mllama = hasattr(self.config, "model_type") and self.config.model_type != "mllama" + not_mllama = hasattr(self.model.config, "model_type") and self.model.config.model_type != "mllama" if not_mllama: lang_inputs["image_idx"] = np.array([[0]]) @@ -908,13 +901,11 @@ def __init__( raise NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.") super().__init__(model, **kwargs) - self.public_class = QEFFAutoModelForImageTextToText - # to handle internvl models - if hasattr(self.config, "llm_config") and hasattr(self.config, "vision_config"): - self.config.llm_config.use_cache = True - self.config.llm_config._attn_implementation = "eager" - self.config.vision_config.use_flash_attn = "false" + if hasattr(self.model.config, "llm_config") and hasattr(self.model.config, "vision_config"): + self.model.config.llm_config.use_cache = True + self.model.config.llm_config._attn_implementation = "eager" + self.model.config.vision_config.use_flash_attn = "false" else: self.model.config.text_config.use_cache = True self.hash_params["qeff_auto_class"] = self.__class__.__name__ @@ -938,7 +929,7 @@ def from_pretrained( config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True) config._attn_implementation = "eager" config.vision_config.use_flash_attn = "false" - model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, config=config, *args, **kwargs) + model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, config, *args, **kwargs) return cls(model, pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs) @@ -976,11 +967,11 @@ def compile( f"full_batch_size={full_batch_size}, kv_cache_batch_size={kv_cache_batch_size}, num_speculative_tokens={num_speculative_tokens}, " ) - output_names = self.output_names() + output_names = self.model.get_output_names() # Get specializations from modelling file # TODO: expose this via the auto class as well - specializations, compiler_options = self.specializations( + specializations, compiler_options = self.model.get_specializations( batch_size=batch_size, prefill_seq_len=prefill_seq_len, ctx_len=ctx_len, @@ -1179,7 +1170,7 @@ def model_name(self) -> str: @property def get_model_config(self) -> dict: - return self.config.__dict__ + return self.model.config.__dict__ class QEFFAutoModelForImageTextToText: @@ -1373,7 +1364,7 @@ def __init__( # previous transform function. 
self.model, transformed = SamplerTransform.apply(self.model, qaic_config, **kwargs) if self.is_tlm: - self.qaic_config["return_pdfs"] = True + self.model.qaic_config["return_pdfs"] = True @property def model_name(self) -> str: @@ -1466,7 +1457,7 @@ def from_pretrained( @property def get_model_config(self) -> dict: - return self.config.__dict__ + return self.model.config.__dict__ def export(self, export_dir: Optional[str] = None) -> str: """ @@ -1481,7 +1472,9 @@ def export(self, export_dir: Optional[str] = None) -> str: bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE seq_len: int = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN fbs: int = constants.ONNX_EXPORT_EXAMPLE_FBS - kv_cache_shape = get_padding_shape_from_config(self.config, fbs if self.continuous_batching else bs, seq_len) + kv_cache_shape = get_padding_shape_from_config( + self.model.config, fbs if self.continuous_batching else bs, seq_len + ) example_inputs = { "input_ids": torch.zeros((bs, seq_len), dtype=torch.int64), "position_ids": torch.arange(seq_len, dtype=torch.int64).view(1, seq_len).repeat(bs, 1), @@ -1502,16 +1495,21 @@ def export(self, export_dir: Optional[str] = None) -> str: 2: "ctx_len", } output_names = [] - if self.qaic_config is not None and self.qaic_config.get("include_sampler", False): - if self.qaic_config.get("return_pdfs", False): + if self.model.qaic_config is not None and self.model.qaic_config.get("include_sampler", False): + if self.model.qaic_config.get("return_pdfs", False): output_names.append("probs") output_names.append("next_tokens") else: output_names.append("logits") # TODO Update the get_padding_shape_from_config method to handle the case when the model config has attention_chunk_size or sliding_window and it should return a list of shapes for each layer - if hasattr(self.config, "model_type") and self.config.model_type in DYNAMIC_SEQ_LEN_SUPPORTED_MODEL_ARCH: - pkv_cache = self.model.get_dummy_pkv_cache(self.config, fbs if self.continuous_batching else bs, seq_len) + if ( + hasattr(self.model.config, "model_type") + and self.model.config.model_type in DYNAMIC_SEQ_LEN_SUPPORTED_MODEL_ARCH + ): + pkv_cache = self.model.get_dummy_pkv_cache( + self.model.config, fbs if self.continuous_batching else bs, seq_len + ) for i in range(self.num_layers): for kv in ["key", "value"]: example_inputs["past_key_values"][i].append(torch.zeros(pkv_cache[0][0].shape, dtype=torch.float32)) @@ -1534,7 +1532,7 @@ def export(self, export_dir: Optional[str] = None) -> str: example_inputs["num_logits_to_keep"] = torch.arange(nlk).view(nlk, 1) dynamic_axes["num_logits_to_keep"] = {0: "num_logits_to_keep"} - if self.qaic_config is not None and self.qaic_config.get("include_sampler", False): + if self.model.qaic_config is not None and self.model.qaic_config.get("include_sampler", False): example_inputs, output_names, dynamic_axes = self.get_sampling_inputs_and_outputs( example_inputs=example_inputs, output_names=output_names, @@ -1567,7 +1565,7 @@ def get_sampling_inputs_and_outputs( dynamic_axes["last_accepted_output_tokens"] = {0: "batch_size", 1: "seq_len"} example_inputs["past_repetition_penalty_buffer"] = torch.zeros( - (fbs if self.continuous_batching else bs, self.config.vocab_size), dtype=torch.bool + (fbs if self.continuous_batching else bs, self.model.config.vocab_size), dtype=torch.bool ) dynamic_axes["past_repetition_penalty_buffer"] = { 0: "full_batch_size" if self.continuous_batching else "batch_size", @@ -1580,7 +1578,7 @@ def get_sampling_inputs_and_outputs( dynamic_axes["repetition_penalties"] = {0: 
"batch_size"} example_inputs["past_presence_penalty_buffer"] = torch.zeros( - (fbs if self.continuous_batching else bs, self.config.vocab_size), dtype=torch.bool + (fbs if self.continuous_batching else bs, self.model.config.vocab_size), dtype=torch.bool ) dynamic_axes["past_presence_penalty_buffer"] = { 0: "full_batch_size" if self.continuous_batching else "batch_size", @@ -1597,7 +1595,7 @@ def get_sampling_inputs_and_outputs( ) dynamic_axes["temperatures"] = {0: "batch_size"} - max_top_k_ids = self.qaic_config.get("max_top_k_ids", constants.ONNX_EXPORT_EXAMPLE_MAX_TOP_K_IDS) + max_top_k_ids = self.model.qaic_config.get("max_top_k_ids", constants.ONNX_EXPORT_EXAMPLE_MAX_TOP_K_IDS) example_inputs["top_ks"] = torch.randint(1, max_top_k_ids, size=(bs, 1)).to(torch.int32) dynamic_axes["top_ks"] = {0: "batch_size"} @@ -1726,8 +1724,8 @@ def compile( ) if ( - self.qaic_config is not None - and self.qaic_config.get("include_sampler", False) + self.model.qaic_config is not None + and self.model.qaic_config.get("include_sampler", False) and num_speculative_tokens is not None and num_speculative_tokens > 0 ): @@ -1826,8 +1824,8 @@ def generate( raise NotImplementedError("Only AI_100 runtime is supported right now via generate API") def check_and_get_num_speculative_tokens(self, num_speculative_tokens: Optional[int], prefill_seq_len: int): - if hasattr(self.config, "speculative_config"): - num_speculative_tokens_ = self.config.speculative_config["num_speculative_tokens"] + if hasattr(self.model.config, "speculative_config"): + num_speculative_tokens_ = self.model.config.speculative_config["num_speculative_tokens"] if num_speculative_tokens is not None: logger.warning( f"arg `num_speculative_tokens` is a fixed value of {num_speculative_tokens_} for this model." @@ -1904,7 +1902,7 @@ def __init__(self, model: nn.Module, **kwargs): @property def get_model_config(self) -> dict: - return self.config.__dict__ + return self.model.config.__dict__ def export(self, export_dir: Optional[str] = None) -> str: """ @@ -1959,8 +1957,7 @@ def compile( Returns: :str: Path of the compiled ``qpc`` package. 
""" - # Method 2 - specializations, compiler_options = self.specializations( + specializations, compiler_options = self.model.get_specializations( batch_size, encoder_ctx_len, ctx_len, @@ -1979,9 +1976,7 @@ def compile( if num_speculative_tokens: logger.warning("Speculative decoding is not yet enabled for AutoModelForSpeechSeq2Seq") - # Method 2 - output_names = self.output_names() - # output_names = self.get_output_names() + output_names = self.model.get_output_names() kv_cache_dtype = "float16" custom_io = {} @@ -2044,7 +2039,7 @@ def generate( # add start token id and initial position ids to inputs seq_len = 1 inputs["input_ids"] = ( - torch.ones((self.batch_size, seq_len), dtype=torch.int64) * self.config.decoder_start_token_id + torch.ones((self.batch_size, seq_len), dtype=torch.int64) * self.model.config.decoder_start_token_id ).numpy() inputs["position_ids"] = ( torch.arange(seq_len, dtype=torch.int64).view(1, seq_len).repeat(self.batch_size, 1).numpy() @@ -2055,7 +2050,7 @@ def generate( ) outputs = { - "logits": np.random.randn(self.batch_size, 1, self.config.vocab_size).astype(np.float32), + "logits": np.random.randn(self.batch_size, 1, self.model.config.vocab_size).astype(np.float32), } self.qpc_session.set_buffers(outputs) @@ -2064,8 +2059,8 @@ def generate( outputs = self.qpc_session.run(inputs) # array to hold generated tokens - generated_ids = np.full((self.batch_size, generation_len + 1), self.config.eos_token_id) - generated_ids[:, 0] = [self.config.decoder_start_token_id] + generated_ids = np.full((self.batch_size, generation_len + 1), self.model.config.eos_token_id) + generated_ids[:, 0] = [self.model.config.decoder_start_token_id] logits = outputs["logits"] next_token = logits.argmax(-1) generated_ids[:, 1] = next_token.squeeze(1) @@ -2073,7 +2068,7 @@ def generate( if streamer: streamer.put(next_token) - inputs["input_features"] = np.zeros((self.batch_size, self.config.num_mel_bins, 1)).astype(np.float16) + inputs["input_features"] = np.zeros((self.batch_size, self.model.config.num_mel_bins, 1)).astype(np.float16) loop_start = perf_counter() for num_tokens in range(generation_len): @@ -2082,7 +2077,7 @@ def generate( next_token = logits.argmax(-1) generated_ids[:, num_tokens + 1] = next_token.squeeze(1) - if next_token[0][0] == self.config.eos_token_id: + if next_token[0][0] == self.model.config.eos_token_id: break inputs["input_ids"] = next_token diff --git a/tests/transformers/models/test_speech_seq2seq_models.py b/tests/transformers/models/test_speech_seq2seq_models.py index 159aa2a70..4ae8928b7 100644 --- a/tests/transformers/models/test_speech_seq2seq_models.py +++ b/tests/transformers/models/test_speech_seq2seq_models.py @@ -324,7 +324,7 @@ def check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100( qeff_model.export() - ort_tokens = run_seq2seq_ort(qeff_model.onnx_path, qeff_model.config, processor, data, sample_rate, ctx_len) + ort_tokens = run_seq2seq_ort(qeff_model.onnx_path, qeff_model.model.config, processor, data, sample_rate, ctx_len) assert (pytorch_kv_tokens == ort_tokens).all(), "Tokens don't match for pytorch output and ort output" From 595f3cef135e3ac4153e8bfd27bcada878e05472 Mon Sep 17 00:00:00 2001 From: Rishin Raj Date: Mon, 18 Aug 2025 10:15:19 +0000 Subject: [PATCH 3/8] Moved memory cleanup from decorator to explicit method as it would be better to clean the model before applying onnx transform and code clean up Signed-off-by: Rishin Raj --- QEfficient/base/modeling_qeff.py | 86 ++++++++-- .../transformers/models/modeling_auto.py | 20 ++- 
tests/base/test_export_memory_offload.py | 159 ++++++++++++++++++ 3 files changed, 243 insertions(+), 22 deletions(-) create mode 100644 tests/base/test_export_memory_offload.py diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index 2f2eb57c8..c8c42fa9f 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -62,6 +62,9 @@ def __init__(self, model: torch.nn.Module, **kwargs) -> None: self.model_architecture = ( (arch := getattr(self.model.config, "architectures", None)) and len(arch) > 0 and arch[0] ) or None + + # Flag for checking if weights are offloaded + self._is_weights_offloaded: bool = False # Apply the transformations any_transformed = False @@ -74,6 +77,47 @@ def __init__(self, model: torch.nn.Module, **kwargs) -> None: else: logger.info(f"Pytorch transforms applied to model: {self.model_name}") + + def _offload_model_weights(self, offload_pt_weights) -> bool: + """ + Clear PyTorch weights after export if offload_pt_weights is set to True + + Returns: + bool: True if weights were successfully offloaded, False otherwise + """ + # Check if offloading is enabled and weights are not already offloaded + if offload_pt_weights and not self._is_weights_offloaded: + try: + self.model = self.model.to_empty(device="meta") + self._is_weights_offloaded = True + logger.info("Model weights offloaded to meta device") + + gc.collect() + logger.info("PyTorch weights cleared after export") + return True + + except Exception as e: + logger.error(f"Failed to offload model weights: {e}") + return False + return False + + + def _model_offloaded_check(self) -> None: + """ + Check if the model is in meta state or weights are offloaded. + + Raises: + RuntimeError: If model is in meta state or if weights are offloaded + """ + if self._is_weights_offloaded or any(param.is_meta for param in self.model.parameters()): + error_msg = ( + "Cannot re-export model: weights have been offloaded to save memory. " + "To re-export, please create a new model instance using from_pretrained() method.") + logger.error(error_msg) + raise RuntimeError(error_msg) + + + @property @abstractmethod def model_name(self) -> str: ... @@ -130,9 +174,15 @@ def _export( export_kwargs: Optional[Dict[str, any]] = None, onnx_transform_kwargs: Optional[Dict[str, any]] = None, export_dir: Optional[str] = None, + offload_pt_weights: bool = True, ) -> str: """ - Export the Pytorch model to ONNX. + Export the PyTorch model to ONNX and apply ONNX transforms + + This method: + 1. Exports PyTorch model to ONNX using torch.onnx.export + 2. Clears PyTorch weights after export + 3. Applies ONNX transforms with reduced memory footprint Args: :example_inputs (dict): Sample inputs to trace the model. @@ -141,29 +191,30 @@ def _export( :export_kwargs (dict): Additional arguments to be passed to `torch.onnx.export`. :onnx_transform_kwargs (dict): Additional arguments to be passed to `Transform.apply` for this class. :export_dir (str): Specify the export directory. The export_dir will be suffixed with a hash corresponding to current model. + :offload_pt_weights (bool): If True, offload PyTorch model weights to meta device + after successful export to reduce memory usage. Set to False if you need to + keep weights for further operations. Defaults to True. + Note: + Once weights are offloaded, the model cannot be re-exported. Create a new + instance using from_pretrained() for re-export. 
+ """ onnx_path = export_dir / f"{self.model_name}.onnx" + + # Return early if ONNX already exists if onnx_path.is_file(): self.onnx_path = onnx_path return onnx_path - # Storing state_dict to load model's weights if export called again - # if not any(name for name, param in self.model.named_parameters() if param.is_meta): - # self.state_dict = self.model.state_dict() - - # Loading model if weights are in meta state from state_dict - if any(name for name, param in self.model.named_parameters() if param.is_meta): - logger.warning("Export called again, this feature is not supported yet.") - # TODO: Handle weights loading for VLMs - # self.model = self.model.to_empty(device=torch.device("cpu")) - # self.model.load_state_dict(self.state_dict) + # check if the model is in meta state or weights are offloaded + self._model_offloaded_check() + # Setup temporary paths tmp_onnx_dir = export_dir / "onnx_tmp" tmp_onnx_path = tmp_onnx_dir / f"{self.model_name}.onnx" tmp_onnx_dir.mkdir(parents=True, exist_ok=True) # Create input_names from example_inputs - input_names = [] for param in inspect.signature(self.model.forward).parameters: if param in example_inputs: @@ -199,7 +250,10 @@ def _export( opset_version=constants.ONNX_EXPORT_OPSET, **export_kwargs, ) - logger.info("Pytorch export successful") + logger.info("PyTorch export successful") + + _ = self._offload_model_weights(offload_pt_weights) + model = onnx.load(tmp_onnx_path, load_external_data=False) transform_kwargs = { @@ -211,17 +265,17 @@ def _export( for transform in self._onnx_transforms: model, transformed = transform.apply(model, **transform_kwargs) + model.metadata_props.append( onnx.StringStringEntryProto(key="qeff_transforms", value=",".join(self._transform_names())) ) logger.info("ONNX transforms applied") onnx.save(model, onnx_path) - logger.info("Transformed onnx saved") + logger.info("Transformed ONNX saved") except Exception as e: - logger.error(f"ONNX export (or) ONNXTransforms failed: {e}") - + logger.error(f"ONNX export or transforms failed: {e}") raise e finally: diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 3e50a2783..8d439dfd4 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -433,8 +433,8 @@ def __init__(self, model: nn.modules, **kwargs): self.model = model.get_qeff_vision_encoder() self.hash_params["qeff_auto_class"] = self.__class__.__name__ - def export(self, inputs, output_names, dynamic_axes, export_dir=None): - return self._export(inputs, output_names, dynamic_axes, export_dir=export_dir) + def export(self, inputs, output_names, dynamic_axes, export_dir=None, offload_pt_weights=True): + return self._export(inputs, output_names, dynamic_axes, export_dir=export_dir, offload_pt_weights=offload_pt_weights) def compile( self, @@ -488,8 +488,8 @@ def __init__(self, model, **kwargs): self.model = model.get_qeff_language_decoder() self.hash_params["qeff_auto_class"] = self.__class__.__name__ - def export(self, inputs, output_names, dynamic_axes, export_dir=None): - return self._export(inputs, output_names, dynamic_axes, export_dir=export_dir) + def export(self, inputs, output_names, dynamic_axes, export_dir=None, offload_pt_weights=True): + return self._export(inputs, output_names, dynamic_axes, export_dir=export_dir, offload_pt_weights=offload_pt_weights) def compile( self, @@ -583,14 +583,22 @@ def export( inputs = self.model.get_dummy_inputs(kv_offload=True) dynamic_axes = 
self.model.get_onnx_dynamic_axes(kv_offload=True) output_names = self.model.get_output_names(kv_offload=True) + self.vision_model.export( inputs["vision"], output_names["vision"], dynamic_axes["vision"], export_dir=export_dir, + offload_pt_weights=False ) - - self.lang_model.export(inputs["lang"], output_names["lang"], dynamic_axes["lang"], export_dir=export_dir) + self.lang_model.export( + inputs["lang"], + output_names["lang"], + dynamic_axes["lang"], + export_dir=export_dir, + offload_pt_weights=True + ) + return self.onnx_path def compile( diff --git a/tests/base/test_export_memory_offload.py b/tests/base/test_export_memory_offload.py new file mode 100644 index 000000000..9abdd4fe3 --- /dev/null +++ b/tests/base/test_export_memory_offload.py @@ -0,0 +1,159 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import pytest +import torch +from transformers import AutoConfig, AutoModelForCausalLM + +from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM + + +# Simple test config for memory reduction testing +test_config = AutoConfig.for_model( + "gpt2", + max_position_embeddings=256, + num_hidden_layers=2, + num_attention_heads=4, + hidden_size=128, + intermediate_size=512, + vocab_size=127, + num_key_value_heads=2, +) + +model_kwargs = {"attn_implementation": "eager"} + + +@pytest.fixture +def tmp_cache(tmp_path, monkeypatch): + monkeypatch.setattr("QEfficient.base.modeling_qeff.QEFF_HOME", tmp_path) + yield tmp_path + + +def test_offload_weights_method(): + """Test the _offload_model_weights method with both True and False values.""" + model = AutoModelForCausalLM.from_config(test_config, **model_kwargs) + qeff_model = QEFFAutoModelForCausalLM(model, continuous_batching=False) + + # Initially weights should not be offloaded + assert not qeff_model._is_weights_offloaded + assert not any(param.is_meta for param in qeff_model.model.parameters()) + + # Test with offload_pt_weights=True + success = qeff_model._offload_model_weights(offload_pt_weights=True) + assert success + assert qeff_model._is_weights_offloaded + assert all(param.is_meta for param in qeff_model.model.parameters()) + + # Reset for next test + model2 = AutoModelForCausalLM.from_config(test_config, **model_kwargs) + qeff_model2 = QEFFAutoModelForCausalLM(model2, continuous_batching=False) + + # Test with offload_pt_weights=False + success = qeff_model2._offload_model_weights(offload_pt_weights=False) + assert not success + assert not qeff_model2._is_weights_offloaded + assert not any(param.is_meta for param in qeff_model2.model.parameters()) + + +def test_re_export_behavior_with_offloaded_weights(tmp_cache): + """Test that re-export fails when weights are offloaded.""" + model = AutoModelForCausalLM.from_config(test_config, **model_kwargs) + qeff_model = QEFFAutoModelForCausalLM(model, continuous_batching=False) + + # First export should succeed + first_export_path = qeff_model.export() + assert qeff_model.onnx_path is not None + + # Manually offload weights + qeff_model._offload_model_weights(offload_pt_weights=True) + assert qeff_model._is_weights_offloaded + + # Force a new export by removing the file + import os + os.remove(qeff_model.onnx_path) + qeff_model.onnx_path = None + + # Re-export should fail with RuntimeError due to offloaded weights + with 
pytest.raises(RuntimeError, match="weights have been offloaded"): + qeff_model.export() + + +def test_vlm_dual_qpc_memory_offload_behavior(): + """Test asymmetric memory offload behavior for VLM dual QPC models.""" + # Mock vision model (should NOT offload weights) + class MockVisionModel: + def __init__(self): + self._is_weights_offloaded = False + + def export(self, inputs, output_names, dynamic_axes, export_dir=None, offload_pt_weights=True): + if offload_pt_weights: + self._is_weights_offloaded = True + return "vision_export_path" + + # Mock language model (should offload weights) + class MockLangModel: + def __init__(self): + self._is_weights_offloaded = False + + def export(self, inputs, output_names, dynamic_axes, export_dir=None, offload_pt_weights=True): + if offload_pt_weights: + self._is_weights_offloaded = True + return "lang_export_path" + + # Test dual QPC behavior + vision_model = MockVisionModel() + lang_model = MockLangModel() + + # Simulate dual QPC export behavior + vision_model.export({}, [], {}, offload_pt_weights=False) # Vision model doesn't offload + lang_model.export({}, [], {}, offload_pt_weights=True) # Language model offloads + + # Verify asymmetric behavior + assert not vision_model._is_weights_offloaded # Vision model should NOT be offloaded + assert lang_model._is_weights_offloaded # Language model should be offloaded + + +def test_vlm_single_qpc_memory_offload_behavior(): + """Test memory offload behavior for VLM single QPC models with both True and False.""" + class MockParam: + def __init__(self, is_meta=False): + self.is_meta = is_meta + + class MockModel: + def __init__(self): + self._params = [MockParam(is_meta=False)] + + def parameters(self): + return self._params + + class MockSingleQPCModel: + def __init__(self): + self._is_weights_offloaded = False + self.model = MockModel() + + def _offload_model_weights(self): + self._is_weights_offloaded = True + for param in self.model.parameters(): + param.is_meta = True + return True + + def export(self, export_dir=None, offload_pt_weights=True): + if offload_pt_weights: + self._offload_model_weights() + return "single_qpc_export_path" + + # Test with offload_pt_weights=True + qeff_model = MockSingleQPCModel() + qeff_model.export(offload_pt_weights=True) + assert qeff_model._is_weights_offloaded + assert all(param.is_meta for param in qeff_model.model.parameters()) + + # Test with offload_pt_weights=False + qeff_model2 = MockSingleQPCModel() + qeff_model2.export(offload_pt_weights=False) + assert not qeff_model2._is_weights_offloaded + assert not any(param.is_meta for param in qeff_model2.model.parameters()) From 5b82053225df911dde94bbd8b08be6fde311f2a3 Mon Sep 17 00:00:00 2001 From: Rishin Raj Date: Mon, 18 Aug 2025 10:34:41 +0000 Subject: [PATCH 4/8] lint and format Signed-off-by: Rishin Raj --- QEfficient/base/modeling_qeff.py | 38 ++++++------- .../transformers/models/modeling_auto.py | 20 +++---- tests/base/test_export_memory_offload.py | 53 ++++++++++--------- 3 files changed, 54 insertions(+), 57 deletions(-) diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index c8c42fa9f..fd4370acf 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -77,11 +77,10 @@ def __init__(self, model: torch.nn.Module, **kwargs) -> None: else: logger.info(f"Pytorch transforms applied to model: {self.model_name}") - def _offload_model_weights(self, offload_pt_weights) -> bool: """ Clear PyTorch weights after export if offload_pt_weights is set to True - + 
Returns: bool: True if weights were successfully offloaded, False otherwise """ @@ -91,33 +90,31 @@ def _offload_model_weights(self, offload_pt_weights) -> bool: self.model = self.model.to_empty(device="meta") self._is_weights_offloaded = True logger.info("Model weights offloaded to meta device") - + gc.collect() logger.info("PyTorch weights cleared after export") return True - + except Exception as e: logger.error(f"Failed to offload model weights: {e}") return False return False - def _model_offloaded_check(self) -> None: """ Check if the model is in meta state or weights are offloaded. - + Raises: - RuntimeError: If model is in meta state or if weights are offloaded + RuntimeError: If model is in meta state or if weights are offloaded """ - if self._is_weights_offloaded or any(param.is_meta for param in self.model.parameters()): + if self._is_weights_offloaded or any(param.is_meta for param in self.model.parameters()): error_msg = ( - "Cannot re-export model: weights have been offloaded to save memory. " - "To re-export, please create a new model instance using from_pretrained() method.") + "Cannot re-export model: weights have been offloaded to save memory. " + "To re-export, please create a new model instance using from_pretrained() method." + ) logger.error(error_msg) raise RuntimeError(error_msg) - - - + @property @abstractmethod def model_name(self) -> str: ... @@ -178,7 +175,7 @@ def _export( ) -> str: """ Export the PyTorch model to ONNX and apply ONNX transforms - + This method: 1. Exports PyTorch model to ONNX using torch.onnx.export 2. Clears PyTorch weights after export @@ -191,16 +188,16 @@ def _export( :export_kwargs (dict): Additional arguments to be passed to `torch.onnx.export`. :onnx_transform_kwargs (dict): Additional arguments to be passed to `Transform.apply` for this class. :export_dir (str): Specify the export directory. The export_dir will be suffixed with a hash corresponding to current model. - :offload_pt_weights (bool): If True, offload PyTorch model weights to meta device - after successful export to reduce memory usage. Set to False if you need to + :offload_pt_weights (bool): If True, offload PyTorch model weights to meta device + after successful export to reduce memory usage. Set to False if you need to keep weights for further operations. Defaults to True. Note: - Once weights are offloaded, the model cannot be re-exported. Create a new + Once weights are offloaded, the model cannot be re-exported. Create a new instance using from_pretrained() for re-export. 
- + """ onnx_path = export_dir / f"{self.model_name}.onnx" - + # Return early if ONNX already exists if onnx_path.is_file(): self.onnx_path = onnx_path @@ -253,7 +250,6 @@ def _export( logger.info("PyTorch export successful") _ = self._offload_model_weights(offload_pt_weights) - model = onnx.load(tmp_onnx_path, load_external_data=False) transform_kwargs = { @@ -265,7 +261,7 @@ def _export( for transform in self._onnx_transforms: model, transformed = transform.apply(model, **transform_kwargs) - + model.metadata_props.append( onnx.StringStringEntryProto(key="qeff_transforms", value=",".join(self._transform_names())) ) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 8d439dfd4..b3d27f3a5 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -434,7 +434,9 @@ def __init__(self, model: nn.modules, **kwargs): self.hash_params["qeff_auto_class"] = self.__class__.__name__ def export(self, inputs, output_names, dynamic_axes, export_dir=None, offload_pt_weights=True): - return self._export(inputs, output_names, dynamic_axes, export_dir=export_dir, offload_pt_weights=offload_pt_weights) + return self._export( + inputs, output_names, dynamic_axes, export_dir=export_dir, offload_pt_weights=offload_pt_weights + ) def compile( self, @@ -489,7 +491,9 @@ def __init__(self, model, **kwargs): self.hash_params["qeff_auto_class"] = self.__class__.__name__ def export(self, inputs, output_names, dynamic_axes, export_dir=None, offload_pt_weights=True): - return self._export(inputs, output_names, dynamic_axes, export_dir=export_dir, offload_pt_weights=offload_pt_weights) + return self._export( + inputs, output_names, dynamic_axes, export_dir=export_dir, offload_pt_weights=offload_pt_weights + ) def compile( self, @@ -583,22 +587,18 @@ def export( inputs = self.model.get_dummy_inputs(kv_offload=True) dynamic_axes = self.model.get_onnx_dynamic_axes(kv_offload=True) output_names = self.model.get_output_names(kv_offload=True) - + self.vision_model.export( inputs["vision"], output_names["vision"], dynamic_axes["vision"], export_dir=export_dir, - offload_pt_weights=False + offload_pt_weights=False, ) self.lang_model.export( - inputs["lang"], - output_names["lang"], - dynamic_axes["lang"], - export_dir=export_dir, - offload_pt_weights=True + inputs["lang"], output_names["lang"], dynamic_axes["lang"], export_dir=export_dir, offload_pt_weights=True ) - + return self.onnx_path def compile( diff --git a/tests/base/test_export_memory_offload.py b/tests/base/test_export_memory_offload.py index 9abdd4fe3..2752b366d 100644 --- a/tests/base/test_export_memory_offload.py +++ b/tests/base/test_export_memory_offload.py @@ -6,12 +6,10 @@ # ----------------------------------------------------------------------------- import pytest -import torch from transformers import AutoConfig, AutoModelForCausalLM from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM - # Simple test config for memory reduction testing test_config = AutoConfig.for_model( "gpt2", @@ -37,21 +35,21 @@ def test_offload_weights_method(): """Test the _offload_model_weights method with both True and False values.""" model = AutoModelForCausalLM.from_config(test_config, **model_kwargs) qeff_model = QEFFAutoModelForCausalLM(model, continuous_batching=False) - + # Initially weights should not be offloaded assert not qeff_model._is_weights_offloaded assert not any(param.is_meta for param in 
qeff_model.model.parameters()) - + # Test with offload_pt_weights=True success = qeff_model._offload_model_weights(offload_pt_weights=True) assert success assert qeff_model._is_weights_offloaded assert all(param.is_meta for param in qeff_model.model.parameters()) - + # Reset for next test model2 = AutoModelForCausalLM.from_config(test_config, **model_kwargs) qeff_model2 = QEFFAutoModelForCausalLM(model2, continuous_batching=False) - + # Test with offload_pt_weights=False success = qeff_model2._offload_model_weights(offload_pt_weights=False) assert not success @@ -63,20 +61,21 @@ def test_re_export_behavior_with_offloaded_weights(tmp_cache): """Test that re-export fails when weights are offloaded.""" model = AutoModelForCausalLM.from_config(test_config, **model_kwargs) qeff_model = QEFFAutoModelForCausalLM(model, continuous_batching=False) - + # First export should succeed - first_export_path = qeff_model.export() + _ = qeff_model.export() assert qeff_model.onnx_path is not None - + # Manually offload weights qeff_model._offload_model_weights(offload_pt_weights=True) assert qeff_model._is_weights_offloaded - + # Force a new export by removing the file import os + os.remove(qeff_model.onnx_path) qeff_model.onnx_path = None - + # Re-export should fail with RuntimeError due to offloaded weights with pytest.raises(RuntimeError, match="weights have been offloaded"): qeff_model.export() @@ -84,74 +83,76 @@ def test_re_export_behavior_with_offloaded_weights(tmp_cache): def test_vlm_dual_qpc_memory_offload_behavior(): """Test asymmetric memory offload behavior for VLM dual QPC models.""" + # Mock vision model (should NOT offload weights) class MockVisionModel: def __init__(self): self._is_weights_offloaded = False - + def export(self, inputs, output_names, dynamic_axes, export_dir=None, offload_pt_weights=True): if offload_pt_weights: self._is_weights_offloaded = True return "vision_export_path" - + # Mock language model (should offload weights) class MockLangModel: def __init__(self): self._is_weights_offloaded = False - + def export(self, inputs, output_names, dynamic_axes, export_dir=None, offload_pt_weights=True): if offload_pt_weights: self._is_weights_offloaded = True return "lang_export_path" - + # Test dual QPC behavior vision_model = MockVisionModel() lang_model = MockLangModel() - + # Simulate dual QPC export behavior vision_model.export({}, [], {}, offload_pt_weights=False) # Vision model doesn't offload - lang_model.export({}, [], {}, offload_pt_weights=True) # Language model offloads - + lang_model.export({}, [], {}, offload_pt_weights=True) # Language model offloads + # Verify asymmetric behavior assert not vision_model._is_weights_offloaded # Vision model should NOT be offloaded - assert lang_model._is_weights_offloaded # Language model should be offloaded + assert lang_model._is_weights_offloaded # Language model should be offloaded def test_vlm_single_qpc_memory_offload_behavior(): """Test memory offload behavior for VLM single QPC models with both True and False.""" + class MockParam: def __init__(self, is_meta=False): self.is_meta = is_meta - + class MockModel: def __init__(self): self._params = [MockParam(is_meta=False)] - + def parameters(self): return self._params - + class MockSingleQPCModel: def __init__(self): self._is_weights_offloaded = False self.model = MockModel() - + def _offload_model_weights(self): self._is_weights_offloaded = True for param in self.model.parameters(): param.is_meta = True return True - + def export(self, export_dir=None, offload_pt_weights=True): 
if offload_pt_weights: self._offload_model_weights() return "single_qpc_export_path" - + # Test with offload_pt_weights=True qeff_model = MockSingleQPCModel() qeff_model.export(offload_pt_weights=True) assert qeff_model._is_weights_offloaded assert all(param.is_meta for param in qeff_model.model.parameters()) - + # Test with offload_pt_weights=False qeff_model2 = MockSingleQPCModel() qeff_model2.export(offload_pt_weights=False) From 0fafed67b94addbbb8f8ca04311a6947c7e6f844 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Tue, 19 Aug 2025 08:39:33 +0000 Subject: [PATCH 5/8] Updated deprecated_api test to load model again before another export Signed-off-by: Asmita Goswami --- tests/transformers/models/test_causal_lm_models.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py index 77354ee23..49d2ccf8c 100644 --- a/tests/transformers/models/test_causal_lm_models.py +++ b/tests/transformers/models/test_causal_lm_models.py @@ -282,6 +282,10 @@ def test_causal_lm_export_with_deprecated_api(model_name): tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name) qeff_model = QEFFAutoModelForCausalLM(model, model_name=model_name, pretrained_model_name_or_path=model_name) new_api_onnx_model_path = qeff_model.export() + + # Again loading model since the export moves model to meta device + model, _ = load_causal_lm_model(model_name, n_layer=1) + qeff_model = QEFFAutoModelForCausalLM(model, model_name=model_name, pretrained_model_name_or_path=model_name) _, old_api_onnx_model_path = qualcomm_efficient_converter( model_name=model_name, model_kv=qeff_model, tokenizer=tokenizer ) From 723ff9729ed8864b536430365076fd4eda60e27d Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Tue, 19 Aug 2025 09:11:57 +0000 Subject: [PATCH 6/8] Lint Format Signed-off-by: Asmita Goswami --- QEfficient/base/modeling_qeff.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index fd4370acf..6c9f88d9f 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -5,6 +5,7 @@ # # ---------------------------------------------------------------------------- +import gc import inspect import logging import shutil @@ -62,7 +63,7 @@ def __init__(self, model: torch.nn.Module, **kwargs) -> None: self.model_architecture = ( (arch := getattr(self.model.config, "architectures", None)) and len(arch) > 0 and arch[0] ) or None - + # Flag for checking if weights are offloaded self._is_weights_offloaded: bool = False From 376c72d1c630a3c43cdc488d62d2d0a18a00ceaf Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Thu, 21 Aug 2025 06:44:56 +0000 Subject: [PATCH 7/8] Updated QEFF_HOME path Signed-off-by: Asmita Goswami --- tests/base/test_export_memory_offload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/base/test_export_memory_offload.py b/tests/base/test_export_memory_offload.py index 2752b366d..d1b7a4653 100644 --- a/tests/base/test_export_memory_offload.py +++ b/tests/base/test_export_memory_offload.py @@ -27,7 +27,7 @@ @pytest.fixture def tmp_cache(tmp_path, monkeypatch): - monkeypatch.setattr("QEfficient.base.modeling_qeff.QEFF_HOME", tmp_path) + monkeypatch.setattr("QEfficient.utils._utils.QEFF_HOME", tmp_path) yield tmp_path From 56af5caa0030bcfbcef704d3ceeba672b7731031 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Fri, 22 Aug 2025 06:32:05 +0000 Subject: [PATCH 8/8] 
Updated generation_config tensor from meta to cpu Signed-off-by: Asmita Goswami --- QEfficient/peft/auto.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/QEfficient/peft/auto.py b/QEfficient/peft/auto.py index f1532ad1b..820372561 100644 --- a/QEfficient/peft/auto.py +++ b/QEfficient/peft/auto.py @@ -287,7 +287,7 @@ def generate( generation_config = generation_config or self.model.generation_config generation_config, model_kwargs = self.model._prepare_generation_config(generation_config, **kwargs) - self.model._prepare_special_tokens(generation_config) + self.model._prepare_special_tokens(generation_config, device="cpu") if generation_config.do_sample: raise NotImplementedError("do_sample=True not supported currently") if generation_config.num_beams > 1:
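
A minimal usage sketch of the offload path introduced by this series, using only the classes and flags that appear in the patches above (QEFFAutoModelForCausalLM, export(), offload_pt_weights, _is_weights_offloaded) and the small gpt2 config from tests/base/test_export_memory_offload.py; it is an illustration of the intended behaviour, not part of the patches themselves:

    from transformers import AutoConfig, AutoModelForCausalLM

    from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM

    # Tiny gpt2-style config, mirroring tests/base/test_export_memory_offload.py
    config = AutoConfig.for_model(
        "gpt2",
        max_position_embeddings=256,
        num_hidden_layers=2,
        num_attention_heads=4,
        hidden_size=128,
        intermediate_size=512,
        vocab_size=127,
        num_key_value_heads=2,
    )
    model = AutoModelForCausalLM.from_config(config, attn_implementation="eager")
    qeff_model = QEFFAutoModelForCausalLM(model, continuous_batching=False)

    # _export() runs torch.onnx.export and then _offload_model_weights(), so the
    # PyTorch weights are moved to the meta device once the ONNX artifact exists.
    onnx_path = qeff_model.export()
    assert qeff_model._is_weights_offloaded
    assert all(param.is_meta for param in qeff_model.model.parameters())

    # Compilation only reads the exported ONNX, so it is unaffected:
    # qpc_path = qeff_model.compile(...)

    # Calling export() again returns the cached ONNX while the file exists; if the
    # artifact is removed, _model_offloaded_check() raises RuntimeError and a fresh
    # instance has to be created (e.g. via from_pretrained()).

For the dual-QPC image-text path, PATCH 3/8 applies the same flag asymmetrically: the vision encoder is exported with offload_pt_weights=False while the language decoder is exported with offload_pt_weights=True, as exercised by test_vlm_dual_qpc_memory_offload_behavior.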