From e05a478d53865ab67cdd017dd226e1ae28b4811d Mon Sep 17 00:00:00 2001
From: abhishek-singh591 <sabhis@qti.qualcomm.com>
Date: Tue, 9 Sep 2025 09:51:36 +0000
Subject: [PATCH 1/6] Resolve issue related to custom_io in CLI

Signed-off-by: abhishek-singh591 <sabhis@qti.qualcomm.com>
---
 QEfficient/cloud/custom_yaml.py      | 76 ++++++++++++++++++++++++++++
 QEfficient/cloud/export.py           |  5 +-
 QEfficient/compile/compile_helper.py | 22 +++++---
 3 files changed, 95 insertions(+), 8 deletions(-)
 create mode 100644 QEfficient/cloud/custom_yaml.py

diff --git a/QEfficient/cloud/custom_yaml.py b/QEfficient/cloud/custom_yaml.py
new file mode 100644
index 000000000..23c041633
--- /dev/null
+++ b/QEfficient/cloud/custom_yaml.py
@@ -0,0 +1,76 @@
+from pathlib import Path
+import warnings
+
+def dump_custom_io(custom_io, cache_dir, dtype_suffix):
+    custom_io_yaml = Path(cache_dir) / f"custom_io_{dtype_suffix}.yaml"
+    with open(custom_io_yaml, "w") as fp:
+        for io_name, dtype in custom_io.items():
+            fp.write(f" - IOName: {io_name}\n   Precision: {dtype}\n\n")
+
+def generate_custom_io(qeff_model, cache_dir=".", mxint8_kv_cache=False):
+    model_class_name = type(qeff_model).__name__
+    if not model_class_name == "QEFFAutoModelForCausalLM":
+        output_names = qeff_model.model.get_output_names()
+    kv_cache_dtype = "mxint8" if mxint8_kv_cache else "float16"
+    dtype_suffix = "int8" if mxint8_kv_cache else "fp16"
+
+    custom_io = {}
+
+    # if model_class_name in [
+    #     "QEffCausalLMForTextImageToTextModel",
+    #     "QEffVisionEncoderForTextImageToTextModel"
+    # ]:
+    #     dump_custom_io(custom_io, cache_dir, dtype_suffix)
+    #     warnings.warn(
+    #         f"custom_io generated for these '{model_class_name}' class is empty.",
+    #         UserWarning
+    #     )
+
+    # Dual QPC: generate two YAML files
+    if model_class_name == "_QEFFAutoModelForImageTextToTextDualQPC":
+        custom_io_vision = {}
+        for output_name in output_names.get("vision", []):
+            custom_io_vision[output_name] = kv_cache_dtype if output_name.startswith("past_") else "float16"
+
+        custom_io_lang = {}
+        for output_name in output_names.get("lang", []):
+            if output_name.endswith("_RetainedState"):
+                base_name = output_name[: -len("_RetainedState")]
+                custom_io_lang[base_name] = "float16" if "vision_embeds" in output_name else kv_cache_dtype
+                custom_io_lang[output_name] = "float16" if "vision_embeds" in output_name else kv_cache_dtype
+
+        dump_custom_io(custom_io_vision, cache_dir, f'{dtype_suffix}_vision')
+        dump_custom_io(custom_io_lang, cache_dir, f'{dtype_suffix}_lang')
+        return {**custom_io_vision, **custom_io_lang}
+
+    # Single QPC
+    elif model_class_name == "_QEFFAutoModelForImageTextToTextSingleQPC":
+        for input_name in output_names:
+            if input_name.endswith("_RetainedState"):
+                custom_io[input_name[: -len("_RetainedState")]] = (
+                    "float16" if "pixel_values" in input_name else kv_cache_dtype
+                )
+        for output_name in output_names:
+            if output_name.endswith("_RetainedState"):
+                custom_io[output_name] = "float16" if "pixel_values" in output_name else kv_cache_dtype
+
+    # Causal LM
+    elif model_class_name == "QEFFAutoModelForCausalLM":
+        for suffix in ["", "_RetainedState"]:
+            num_layers = getattr(qeff_model, "num_layers", 12)
+            for i in range(num_layers):
+                for kv in ["key", "value"]:
+                    custom_io[f"past_{kv}.{i}{suffix}"] = kv_cache_dtype
+
+    # Speech Seq2Seq
+    elif model_class_name == "QEFFAutoModelForSpeechSeq2Seq":
+        custom_io["input_features"] = kv_cache_dtype
+        for output_name in output_names:
+            if output_name.endswith("_RetainedState"):
+                custom_io[output_name[: -len("_RetainedState")]] = kv_cache_dtype
+                custom_io[output_name] = kv_cache_dtype
+    else:
+        warnings.warn(f"Unsupported model class: {model_class_name}", UserWarning)
+    
+    dump_custom_io(custom_io, cache_dir, dtype_suffix)
+    return custom_io
\ No newline at end of file
diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py
index 849325c9d..11c78c902 100644
--- a/QEfficient/cloud/export.py
+++ b/QEfficient/cloud/export.py
@@ -12,7 +12,7 @@
 from QEfficient.base.common import QEFFCommonLoader
 from QEfficient.utils import check_and_assign_cache_dir
 from QEfficient.utils.logging_utils import logger
-
+from .custom_yaml import generate_custom_io
 # Specifically for Docker images.
 ROOT_DIR = os.path.dirname(os.path.abspath(""))
 
@@ -45,6 +45,7 @@ def get_onnx_model_path(
         full_batch_size=full_batch_size,
         local_model_dir=local_model_dir,
     )
+    generate_custom_io(qeff_model, cache_dir=".", mxint8_kv_cache=False)
     onnx_model_path = qeff_model.export()
     logger.info(f"Generated onnx_path: {onnx_model_path}")
     return onnx_model_path
@@ -107,4 +108,4 @@ def main(
         help="Set full batch size to enable continuous batching mode, default is None",
     )
     args = parser.parse_args()
-    main(**args.__dict__)
+    main(**args.__dict__)
\ No newline at end of file
diff --git a/QEfficient/compile/compile_helper.py b/QEfficient/compile/compile_helper.py
index 56177cce9..e61e6112e 100644
--- a/QEfficient/compile/compile_helper.py
+++ b/QEfficient/compile/compile_helper.py
@@ -171,6 +171,7 @@ def compile(
     Returns:
         :str: Path to compiled ``qpc`` package.
     """
+    
     if full_batch_size and batch_size != 1:
         raise ValueError("Only either batch_size or full_batch_size should be greater than one")
 
@@ -184,12 +185,21 @@ def compile(
         path=specialization_json_path,
         full_batch_size=full_batch_size,
     )
-
-    # Select the customIO config based on the mx flag.
-    custom_io_file_name = "custom_io_int8.yaml" if mxint8 else "custom_io_fp16.yaml"
-
+    
+    dtype_suffix = "int8" if mxint8 else "fp16"
+    source_path = f"./custom_io_{dtype_suffix}.yaml"
+    destination_path = os.path.join(os.path.dirname(qpc_path), f"custom_io_{dtype_suffix}.yaml")
+
+    # Move the custom YAML file to the cache/qeff_model directory
+    try:
+        shutil.move(source_path, destination_path)
+        print(f"Successfully moved '{source_path}' to '{destination_path}'.")
+    except Exception as e:
+        print(f"Error while moving file '{source_path}': {e}")
+
+    custom_io_file_name = f"custom_io_{dtype_suffix}.yaml"
     if custom_io_file_path is None:
-        custom_io_file_path = os.path.join(os.path.dirname(onnx_path), custom_io_file_name)
+        custom_io_file_path = os.path.join(os.path.dirname(qpc_path), custom_io_file_name)
 
     if not os.path.isfile(custom_io_file_path):
         raise FileNotFoundError(
@@ -234,4 +244,4 @@ def compile(
         else:
             logger.info(f"Compiled QPC files can be found here: {qpc_path}")
 
-    return qpc_path
+    return qpc_path
\ No newline at end of file

From 294d97fcc683f335b02a237c11f135a64ae970c4 Mon Sep 17 00:00:00 2001
From: abhishek-singh591 <sabhis@qti.qualcomm.com>
Date: Wed, 10 Sep 2025 05:05:02 +0000
Subject: [PATCH 2/6] Fixed issue of No custom_IO file found during compile
 through CLI

Signed-off-by: abhishek-singh591 <sabhis@qti.qualcomm.com>
---
 QEfficient/cloud/custom_yaml.py | 264 ++++++++++++++++++++++++--------
 1 file changed, 196 insertions(+), 68 deletions(-)

diff --git a/QEfficient/cloud/custom_yaml.py b/QEfficient/cloud/custom_yaml.py
index 23c041633..ba6da26e2 100644
--- a/QEfficient/cloud/custom_yaml.py
+++ b/QEfficient/cloud/custom_yaml.py
@@ -1,76 +1,204 @@
 from pathlib import Path
 import warnings
 
-def dump_custom_io(custom_io, cache_dir, dtype_suffix):
-    custom_io_yaml = Path(cache_dir) / f"custom_io_{dtype_suffix}.yaml"
-    with open(custom_io_yaml, "w") as fp:
-        for io_name, dtype in custom_io.items():
-            fp.write(f" - IOName: {io_name}\n   Precision: {dtype}\n\n")
-
-def generate_custom_io(qeff_model, cache_dir=".", mxint8_kv_cache=False):
-    model_class_name = type(qeff_model).__name__
-    if not model_class_name == "QEFFAutoModelForCausalLM":
-        output_names = qeff_model.model.get_output_names()
-    kv_cache_dtype = "mxint8" if mxint8_kv_cache else "float16"
-    dtype_suffix = "int8" if mxint8_kv_cache else "fp16"
-
-    custom_io = {}
-
-    # if model_class_name in [
-    #     "QEffCausalLMForTextImageToTextModel",
-    #     "QEffVisionEncoderForTextImageToTextModel"
-    # ]:
-    #     dump_custom_io(custom_io, cache_dir, dtype_suffix)
-    #     warnings.warn(
-    #         f"custom_io generated for these '{model_class_name}' class is empty.",
-    #         UserWarning
-    #     )
-
-    # Dual QPC: generate two YAML files
-    if model_class_name == "_QEFFAutoModelForImageTextToTextDualQPC":
-        custom_io_vision = {}
-        for output_name in output_names.get("vision", []):
-            custom_io_vision[output_name] = kv_cache_dtype if output_name.startswith("past_") else "float16"
+class CustomIOGenerator:
+    """
+    Abstract base class for generating custom IO mappings for different model types.
 
-        custom_io_lang = {}
-        for output_name in output_names.get("lang", []):
-            if output_name.endswith("_RetainedState"):
-                base_name = output_name[: -len("_RetainedState")]
-                custom_io_lang[base_name] = "float16" if "vision_embeds" in output_name else kv_cache_dtype
-                custom_io_lang[output_name] = "float16" if "vision_embeds" in output_name else kv_cache_dtype
-
-        dump_custom_io(custom_io_vision, cache_dir, f'{dtype_suffix}_vision')
-        dump_custom_io(custom_io_lang, cache_dir, f'{dtype_suffix}_lang')
-        return {**custom_io_vision, **custom_io_lang}
+    Args:
+        model (object): The model instance for which IO mappings are to be generated.
+        cache_dir (str): Directory path where the generated YAML files will be saved.
+        mxint8_kv_cache (bool): If True, use 'mxint8' precision for KV cache; otherwise, use 'float16'.
+    """
+
+    def __init__(self, model, cache_dir=".", mxint8_kv_cache=False):
+        self.model = model
+        self.cache_dir = Path(cache_dir)
+        self.kv_cache_dtype = "mxint8" if mxint8_kv_cache else "float16"
+        self.dtype_suffix = "int8" if mxint8_kv_cache else "fp16"
+
+    def dump(self, custom_io: dict, suffix: str):
+        """
+        Writes the custom IO mapping to a YAML file.
+
+        Args:
+            custom_io (dict): Dictionary containing IO names and their precision types.
+            suffix (str): Suffix to append to the output filename.
+        """
+        custom_io_yaml = self.cache_dir / f"custom_io_{suffix}.yaml"
+        with open(custom_io_yaml, "w") as fp:
+            for io_name, dtype in custom_io.items():
+                fp.write(f" - IOName: {io_name}\n   Precision: {dtype}\n\n")
+
+    def generate(self) -> dict:
+        """
+        Abstract method to generate custom IO mappings.
+
+        Returns:
+            dict: A dictionary of IO names and their precision types.
+
+        Raises:
+            NotImplementedError: Must be implemented by subclasses.
+        """
+        raise NotImplementedError("Subclasses must implement this method")
+
+
+class CausalLMIOGenerator(CustomIOGenerator):
+    """
+    IO generator for causal language models.
+    """
 
-    # Single QPC
-    elif model_class_name == "_QEFFAutoModelForImageTextToTextSingleQPC":
-        for input_name in output_names:
-            if input_name.endswith("_RetainedState"):
-                custom_io[input_name[: -len("_RetainedState")]] = (
-                    "float16" if "pixel_values" in input_name else kv_cache_dtype
-                )
-        for output_name in output_names:
-            if output_name.endswith("_RetainedState"):
-                custom_io[output_name] = "float16" if "pixel_values" in output_name else kv_cache_dtype
-
-    # Causal LM
-    elif model_class_name == "QEFFAutoModelForCausalLM":
+    def generate(self) -> dict:
+        """
+        Generates IO mappings for past key/value states in causal language models.
+
+        Returns:
+            dict: Mapping of IO names to precision types.
+        """
+        custom_io = {}
+        num_layers = getattr(self.model, "num_layers", 12)
         for suffix in ["", "_RetainedState"]:
-            num_layers = getattr(qeff_model, "num_layers", 12)
             for i in range(num_layers):
                 for kv in ["key", "value"]:
-                    custom_io[f"past_{kv}.{i}{suffix}"] = kv_cache_dtype
-
-    # Speech Seq2Seq
-    elif model_class_name == "QEFFAutoModelForSpeechSeq2Seq":
-        custom_io["input_features"] = kv_cache_dtype
-        for output_name in output_names:
-            if output_name.endswith("_RetainedState"):
-                custom_io[output_name[: -len("_RetainedState")]] = kv_cache_dtype
-                custom_io[output_name] = kv_cache_dtype
-    else:
-        warnings.warn(f"Unsupported model class: {model_class_name}", UserWarning)
-    
-    dump_custom_io(custom_io, cache_dir, dtype_suffix)
-    return custom_io
\ No newline at end of file
+                    custom_io[f"past_{kv}.{i}{suffix}"] = self.kv_cache_dtype
+        self.dump(custom_io, self.dtype_suffix)
+        return custom_io
+
+
+class DualQPCIOGenerator(CustomIOGenerator):
+    """
+    IO generator for dual QPC models (e.g., vision-language models).
+    """
+
+    def generate(self) -> dict:
+        """
+        Generates IO mappings for both vision and language components.
+
+        Returns:
+            dict: Combined mapping of IO names to precision types for vision and language outputs.
+        """
+        output_names = self.model.model.get_output_names()
+        custom_io_vision = {
+            name: self.kv_cache_dtype if name.startswith("past_") else "float16"
+            for name in output_names.get("vision", [])
+        }
+
+        custom_io_lang = {}
+        for name in output_names.get("lang", []):
+            if name.endswith("_RetainedState"):
+                base = name[:-len("_RetainedState")]
+                dtype = "float16" if "vision_embeds" in name else self.kv_cache_dtype
+                custom_io_lang[base] = dtype
+                custom_io_lang[name] = dtype
+
+        self.dump(custom_io_vision, f"{self.dtype_suffix}_vision")
+        self.dump(custom_io_lang, f"{self.dtype_suffix}_lang")
+        return {**custom_io_vision, **custom_io_lang}
+
+
+class SingleQPCIOGenerator(CustomIOGenerator):
+    """
+    IO generator for single QPC models.
+    """
+
+    def generate(self) -> dict:
+        """
+        Generates IO mappings for retained states in single QPC models.
+
+        Returns:
+            dict: Mapping of IO names to precision types.
+        """
+        output_names = self.model.model.get_output_names()
+        custom_io = {}
+        for name in output_names:
+            if name.endswith("_RetainedState"):
+                base = name[:-len("_RetainedState")]
+                dtype = "float16" if "pixel_values" in name else self.kv_cache_dtype
+                custom_io[base] = dtype
+                custom_io[name] = dtype
+        self.dump(custom_io, self.dtype_suffix)
+        return custom_io
+
+
+class SpeechSeq2SeqIOGenerator(CustomIOGenerator):
+    """
+    IO generator for speech sequence-to-sequence models.
+    """
+
+    def generate(self) -> dict:
+        """
+        Generates IO mappings for input features and retained states in speech models.
+
+        Returns:
+            dict: Mapping of IO names to precision types.
+        """
+        output_names = self.model.model.get_output_names()
+        custom_io = {"input_features": self.kv_cache_dtype}
+        for name in output_names:
+            if name.endswith("_RetainedState"):
+                base = name[:-len("_RetainedState")]
+                custom_io[base] = self.kv_cache_dtype
+                custom_io[name] = self.kv_cache_dtype
+        self.dump(custom_io, self.dtype_suffix)
+        return custom_io
+
+
+class UnsupportedModelIOGenerator(CustomIOGenerator):
+    """
+    Fallback IO generator for unsupported model types.
+    """
+
+    def generate(self) -> dict:
+        """
+        Emits a warning for unsupported model types.
+
+        Returns:
+            dict: Empty dictionary.
+        """
+        warnings.warn(f"Unsupported model class: {type(self.model).__name__}", UserWarning)
+        return {}
+
+
+class CustomIOFactory:
+    """
+    Factory class to instantiate the appropriate IO generator based on model type.
+    """
+
+    @staticmethod
+    def get_generator(model, cache_dir=".", mxint8_kv_cache=False) -> CustomIOGenerator:
+        """
+        Returns the appropriate IO generator instance for the given model.
+
+        Args:
+            model (object): The model instance.
+            cache_dir (str): Directory to store YAML files.
+            mxint8_kv_cache (bool): Flag to use 'mxint8' precision.
+
+        Returns:
+            CustomIOGenerator: An instance of the appropriate subclass.
+        """
+        model_class_name = type(model).__name__
+        mapping = {
+            "QEFFAutoModelForCausalLM": CausalLMIOGenerator,
+            "_QEFFAutoModelForImageTextToTextDualQPC": DualQPCIOGenerator,
+            "_QEFFAutoModelForImageTextToTextSingleQPC": SingleQPCIOGenerator,
+            "QEFFAutoModelForSpeechSeq2Seq": SpeechSeq2SeqIOGenerator,
+        }
+        generator_class = mapping.get(model_class_name, UnsupportedModelIOGenerator)
+        return generator_class(model, cache_dir, mxint8_kv_cache)
+
+
+def generate_custom_io(qeff_model, cache_dir=".", mxint8_kv_cache=False) -> dict:
+    """
+    Generates and returns custom IO mappings for the given QEFF model.
+
+    Args:
+        qeff_model (object): The model instance.
+        cache_dir (str): Directory to store YAML files.
+        mxint8_kv_cache (bool): Flag to use 'mxint8' precision.
+
+    Returns:
+        dict: Custom IO mapping generated by the appropriate generator.
+    """
+    generator = CustomIOFactory.get_generator(qeff_model, cache_dir, mxint8_kv_cache)
+    return generator.generate()

From 33e4b1ac31c106e58336391e395e4f6addcc9654 Mon Sep 17 00:00:00 2001
From: abhishek-singh591 <sabhis@qti.qualcomm.com>
Date: Wed, 10 Sep 2025 05:06:58 +0000
Subject: [PATCH 3/6] Fixed issue of No custom_IO file found during compile
 through CLI

Signed-off-by: abhishek-singh591 <sabhis@qti.qualcomm.com>
---
 QEfficient/cloud/custom_yaml.py      | 9 +++++----
 QEfficient/cloud/export.py           | 4 +++-
 QEfficient/compile/compile_helper.py | 6 +++---
 3 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/QEfficient/cloud/custom_yaml.py b/QEfficient/cloud/custom_yaml.py
index ba6da26e2..364f61320 100644
--- a/QEfficient/cloud/custom_yaml.py
+++ b/QEfficient/cloud/custom_yaml.py
@@ -1,5 +1,6 @@
-from pathlib import Path
 import warnings
+from pathlib import Path
+
 
 class CustomIOGenerator:
     """
@@ -86,7 +87,7 @@ def generate(self) -> dict:
         custom_io_lang = {}
         for name in output_names.get("lang", []):
             if name.endswith("_RetainedState"):
-                base = name[:-len("_RetainedState")]
+                base = name[: -len("_RetainedState")]
                 dtype = "float16" if "vision_embeds" in name else self.kv_cache_dtype
                 custom_io_lang[base] = dtype
                 custom_io_lang[name] = dtype
@@ -112,7 +113,7 @@ def generate(self) -> dict:
         custom_io = {}
         for name in output_names:
             if name.endswith("_RetainedState"):
-                base = name[:-len("_RetainedState")]
+                base = name[: -len("_RetainedState")]
                 dtype = "float16" if "pixel_values" in name else self.kv_cache_dtype
                 custom_io[base] = dtype
                 custom_io[name] = dtype
@@ -136,7 +137,7 @@ def generate(self) -> dict:
         custom_io = {"input_features": self.kv_cache_dtype}
         for name in output_names:
             if name.endswith("_RetainedState"):
-                base = name[:-len("_RetainedState")]
+                base = name[: -len("_RetainedState")]
                 custom_io[base] = self.kv_cache_dtype
                 custom_io[name] = self.kv_cache_dtype
         self.dump(custom_io, self.dtype_suffix)
diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py
index 11c78c902..218dc7c93 100644
--- a/QEfficient/cloud/export.py
+++ b/QEfficient/cloud/export.py
@@ -12,7 +12,9 @@
 from QEfficient.base.common import QEFFCommonLoader
 from QEfficient.utils import check_and_assign_cache_dir
 from QEfficient.utils.logging_utils import logger
+
 from .custom_yaml import generate_custom_io
+
 # Specifically for Docker images.
 ROOT_DIR = os.path.dirname(os.path.abspath(""))
 
@@ -108,4 +110,4 @@ def main(
         help="Set full batch size to enable continuous batching mode, default is None",
     )
     args = parser.parse_args()
-    main(**args.__dict__)
\ No newline at end of file
+    main(**args.__dict__)
diff --git a/QEfficient/compile/compile_helper.py b/QEfficient/compile/compile_helper.py
index e61e6112e..8f79b39c4 100644
--- a/QEfficient/compile/compile_helper.py
+++ b/QEfficient/compile/compile_helper.py
@@ -171,7 +171,7 @@ def compile(
     Returns:
         :str: Path to compiled ``qpc`` package.
     """
-    
+
     if full_batch_size and batch_size != 1:
         raise ValueError("Only either batch_size or full_batch_size should be greater than one")
 
@@ -185,7 +185,7 @@ def compile(
         path=specialization_json_path,
         full_batch_size=full_batch_size,
     )
-    
+
     dtype_suffix = "int8" if mxint8 else "fp16"
     source_path = f"./custom_io_{dtype_suffix}.yaml"
     destination_path = os.path.join(os.path.dirname(qpc_path), f"custom_io_{dtype_suffix}.yaml")
@@ -244,4 +244,4 @@ def compile(
         else:
             logger.info(f"Compiled QPC files can be found here: {qpc_path}")
 
-    return qpc_path
\ No newline at end of file
+    return qpc_path

From 5d9c60f2ebe69ff207621b1043d29803ae2a8594 Mon Sep 17 00:00:00 2001
From: abhishek-singh591 <sabhis@qti.qualcomm.com>
Date: Fri, 12 Sep 2025 05:48:13 +0000
Subject: [PATCH 4/6] Move custom_yaml.py to utils and make related
 changes

Signed-off-by: abhishek-singh591 <sabhis@qti.qualcomm.com>
---
 QEfficient/cloud/export.py                 | 24 +++++++++++++++-------
 QEfficient/{cloud => utils}/custom_yaml.py |  8 ++++++++
 2 files changed, 25 insertions(+), 7 deletions(-)
 rename QEfficient/{cloud => utils}/custom_yaml.py (95%)

diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py
index 218dc7c93..e0c6a9b59 100644
--- a/QEfficient/cloud/export.py
+++ b/QEfficient/cloud/export.py
@@ -11,23 +11,23 @@
 
 from QEfficient.base.common import QEFFCommonLoader
 from QEfficient.utils import check_and_assign_cache_dir
+from QEfficient.utils.custom_yaml import generate_custom_io
 from QEfficient.utils.logging_utils import logger
 
-from .custom_yaml import generate_custom_io
-
 # Specifically for Docker images.
 ROOT_DIR = os.path.dirname(os.path.abspath(""))
 
 
-def get_onnx_model_path(
+def get_onnx_path_and_setup_customIO(
     model_name: str,
     cache_dir: Optional[str] = None,
     hf_token: Optional[str] = None,
     full_batch_size: Optional[int] = None,
     local_model_dir: Optional[str] = None,
+    mxint8_kv_cache: Optional[int] = False,
 ):
     """
-    exports the model to onnx if pre-exported file is not found and returns onnx_model_path
+    Exports the model to ONNX if a pre-exported file is not found, generates the custom_io file, and returns onnx_model_path.
 
     ``Mandatory`` Args:
         :model_name (str): Hugging Face Model Card name, Example: ``gpt2``.
@@ -47,9 +47,11 @@ def get_onnx_model_path(
         full_batch_size=full_batch_size,
         local_model_dir=local_model_dir,
     )
-    generate_custom_io(qeff_model, cache_dir=".", mxint8_kv_cache=False)
     onnx_model_path = qeff_model.export()
     logger.info(f"Generated onnx_path: {onnx_model_path}")
+
+    # Generate the custom IO YAML file(s) needed by the compile step.
+    generate_custom_io(qeff_model, mxint8_kv_cache=mxint8_kv_cache)
     return onnx_model_path
 
 
@@ -59,6 +61,7 @@ def main(
     hf_token: Optional[str] = None,
     local_model_dir: Optional[str] = None,
     full_batch_size: Optional[int] = None,
+    mxint8_kv_cache: Optional[bool] = False,
 ) -> None:
     """
     Helper function used by export CLI app for exporting to ONNX Model.
@@ -71,19 +74,20 @@ def main(
         :hf_token (str): HuggingFace login token to access private repos. ``Defaults to None.``
         :local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.``
         :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Defaults to None.``
-
+        :mxint8_kv_cache (bool): Whether to use MXINT8 precision for the KV cache in the generated custom IO config. ``Defaults to False.``
     .. code-block:: bash
 
         python -m QEfficient.cloud.export OPTIONS
 
     """
     cache_dir = check_and_assign_cache_dir(local_model_dir, cache_dir)
-    get_onnx_model_path(
+    get_onnx_path_and_setup_customIO(
         model_name=model_name,
         cache_dir=cache_dir,
         hf_token=hf_token,
         full_batch_size=full_batch_size,
         local_model_dir=local_model_dir,
+        mxint8_kv_cache=mxint8_kv_cache,
     )
 
 
@@ -109,5 +113,11 @@ def main(
         default=None,
         help="Set full batch size to enable continuous batching mode, default is None",
     )
+    parser.add_argument(
+        "--mxint8_kv_cache",
+        "--mxint8-kv-cache",
+        required=False,
+        help="Compress Present/Past KV to MXINT8 using CustomIO config, default is False",
+    )
     args = parser.parse_args()
     main(**args.__dict__)
diff --git a/QEfficient/cloud/custom_yaml.py b/QEfficient/utils/custom_yaml.py
similarity index 95%
rename from QEfficient/cloud/custom_yaml.py
rename to QEfficient/utils/custom_yaml.py
index 364f61320..2adb656b5 100644
--- a/QEfficient/cloud/custom_yaml.py
+++ b/QEfficient/utils/custom_yaml.py
@@ -1,3 +1,10 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# ----------------------------------------------------------------------------
+
 import warnings
 from pathlib import Path
 
@@ -94,6 +101,7 @@ def generate(self) -> dict:
 
         self.dump(custom_io_vision, f"{self.dtype_suffix}_vision")
         self.dump(custom_io_lang, f"{self.dtype_suffix}_lang")
+        warnings.warn(f"Unsupported model class via CLI: {type(self.model).__name__}", UserWarning)
         return {**custom_io_vision, **custom_io_lang}
 
 

From d23558e3798824992128e79101ac318aa3f86edb Mon Sep 17 00:00:00 2001
From: abhishek-singh591 <sabhis@qti.qualcomm.com>
Date: Fri, 10 Oct 2025 05:14:37 +0000
Subject: [PATCH 5/6] Minor fix for function name error

Signed-off-by: abhishek-singh591 <sabhis@qti.qualcomm.com>
---
 examples/cpp_execution/text_inference_using_cpp.py | 4 ++--
 tests/cloud/test_export_compile_execute.py         | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/cpp_execution/text_inference_using_cpp.py b/examples/cpp_execution/text_inference_using_cpp.py
index 9b0d59c73..072f2c57c 100644
--- a/examples/cpp_execution/text_inference_using_cpp.py
+++ b/examples/cpp_execution/text_inference_using_cpp.py
@@ -14,7 +14,7 @@
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
 
 import QEfficient
-from QEfficient.cloud.export import get_onnx_model_path
+from QEfficient.cloud.export import get_onnx_path_and_setup_customIO
 from QEfficient.generation.text_generation_inference import fix_prompts, get_compilation_dims, get_input_prompts
 from QEfficient.utils import check_and_assign_cache_dir, get_qpc_dir_path, load_hf_tokenizer, qpc_exists
 from QEfficient.utils.logging_utils import logger
@@ -103,7 +103,7 @@ def main(
         logger.info(f"Pre-compiled qpc found at {qpc_dir_path}! Executing with given prompt")
     else:
         # Handle onnx model generation
-        onnx_model_path = get_onnx_model_path(
+        onnx_model_path = get_onnx_path_and_setup_customIO(
             model_name, cache_dir, tokenizer, hf_token, local_model_dir, full_batch_size
         )
         _ = QEfficient.compile(
diff --git a/tests/cloud/test_export_compile_execute.py b/tests/cloud/test_export_compile_execute.py
index 7cac59da7..f1c80a6b0 100644
--- a/tests/cloud/test_export_compile_execute.py
+++ b/tests/cloud/test_export_compile_execute.py
@@ -18,7 +18,7 @@
 
 def check_export_compile_execute(mocker, model_name, full_batch_size=None, enable_qnn=False):
     check_and_assign_cache_dir_spy = mocker.spy(QEfficient.cloud.export, "check_and_assign_cache_dir")
-    get_onnx_model_path_spy = mocker.spy(QEfficient.cloud.export, "get_onnx_model_path")
+    get_onnx_path_and_setup_customIO_spy = mocker.spy(QEfficient.cloud.export, "get_onnx_path_and_setup_customIO")
     load_hf_tokenizer_spy = mocker.spy(QEfficient.cloud.execute, "load_hf_tokenizer")
     cloud_ai_100_exec_kv_spy = mocker.spy(QEfficient.cloud.execute, "cloud_ai_100_exec_kv")
 
@@ -29,9 +29,9 @@ def check_export_compile_execute(mocker, model_name, full_batch_size=None, enabl
     )
 
     check_and_assign_cache_dir_spy.assert_called_once()
-    get_onnx_model_path_spy.assert_called_once()
+    get_onnx_path_and_setup_customIO_spy.assert_called_once()
 
-    onnx_model_path = get_onnx_model_path_spy.spy_return
+    onnx_model_path = get_onnx_path_and_setup_customIO_spy.spy_return
 
     assert os.path.isfile(onnx_model_path)
 

From a018c373e818ed4f4d017b1a56c17ab76d522082 Mon Sep 17 00:00:00 2001
From: abhishek-singh591 <sabhis@qti.qualcomm.com>
Date: Fri, 10 Oct 2025 05:23:20 +0000
Subject: [PATCH 6/6] Minor fix for function name error

Signed-off-by: abhishek-singh591 <sabhis@qti.qualcomm.com>
---
 QEfficient/cloud/export.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py
index a36d5a60d..a5e0b6e19 100644
--- a/QEfficient/cloud/export.py
+++ b/QEfficient/cloud/export.py
@@ -84,7 +84,7 @@ def main(
 
     This function serves as the entry point for exporting a PyTorch model, loaded
     via QEFFCommonLoader, to the ONNX format. It prepares the necessary
-    paths and calls `get_onnx_model_path`.
+    paths and calls `get_onnx_path_and_setup_customIO`.
 
     Parameters
     ----------