46 changes: 22 additions & 24 deletions backends/qualcomm/_passes/build_quant_io.py
@@ -5,9 +5,11 @@
 # LICENSE file in the root directory of this source tree.
 import torch
 from executorch.backends.qualcomm.utils.constants import QCOM_QUANTIZED_IO
+from executorch.exir.delegate import executorch_call_delegate
 
-from executorch.exir.pass_base import ExportPass, PassResult
+from executorch.exir.pass_base import ExportPass, ProxyValue
 from executorch.exir.tensor import TensorSpec
+from torch.utils import _pytree as pytree
 
 
 class BuildQuantIo(ExportPass):
@@ -26,26 +28,22 @@ def _make_spec(self, x):
         else:
             return None
 
-    def _build(self, graph_module: torch.fx.GraphModule) -> torch.fx.GraphModule:
-        # Forcedly update delegate node's meta['spec'] to get correct output
-        # tensor size in runtime
-        call_delegate = [
-            node
-            for node in graph_module.graph.nodes
-            if node.op == "call_function" and node.name == "executorch_call_delegate"
-        ]
-        assert len(call_delegate) == 1
-        for n in graph_module.graph.nodes:
-            if QCOM_QUANTIZED_IO in n.meta:
-                n.meta["val"] = n.meta["val"].to(dtype=n.meta[QCOM_QUANTIZED_IO])
-
-        spec = []
-        for user in list(call_delegate[0].users):
-            spec.append(self._make_spec(user.meta["val"]))
-        call_delegate[0].meta["spec"] = tuple(spec)
-
-    def call(self, graph_module: torch.fx.GraphModule):
-        self._build(graph_module)
-        graph_module.graph.eliminate_dead_code()
-        graph_module.recompile()
-        return PassResult(graph_module, True)
+    def placeholder(self, name: str, arg, meta):
+        if quantized_dtype := meta.data.get(QCOM_QUANTIZED_IO, None):
+            arg = arg.to(dtype=quantized_dtype)
+        meta["spec"] = self._make_spec(arg)
+        return super().placeholder(name, arg, meta)
+
+    def call_getitem(self, value, key: int, meta):
+        meta["spec"] = value.node.meta["spec"][key]
+        return super().call_getitem(value, key, meta)
+
+    def call_delegate(self, lowered_module, args, kwargs, meta):
+        args_data, _ = pytree.tree_map_only(
+            ProxyValue, lambda x: x.data, (args, kwargs)
+        )
+        meta["spec"] = pytree.tree_map(
+            self._make_spec,
+            executorch_call_delegate(lowered_module, *args_data),
+        )
+        return super().call_delegate(lowered_module, args, kwargs, meta)
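
Review note: the rewrite drops the post-hoc `_build`/`call` graph walk (which asserted exactly one `executorch_call_delegate` node) in favor of `ExportPass`'s per-node callbacks, so `meta["spec"]` is rebuilt while the graph is retraced: `placeholder` retags quantized I/O dtypes, `call_getitem` propagates per-output specs, and `call_delegate` derives specs from the delegate's actual outputs, handling any number of delegate calls uniformly. A minimal usage sketch, assuming an already-lowered graph module (`gm` is an illustrative name, not from this PR):

```python
# Hedged sketch: ExportPass instances are callable on a GraphModule and
# return a PassResult; `gm` is assumed to be a lowered fx.GraphModule.
from executorch.backends.qualcomm._passes.build_quant_io import BuildQuantIo

pass_result = BuildQuantIo()(gm)
gm = pass_result.graph_module  # retraced module with per-node meta["spec"] rebuilt
```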
18 changes: 2 additions & 16 deletions backends/qualcomm/quantizer/qconfig.py
@@ -396,7 +396,7 @@ def get_ptq_per_block_quant_config(
     )
 
 
-# TODO merge qat and ptq to a fucntion, and use a bool flag to control it
+# TODO merge qat and ptq to a function, and use a bool flag to control it
 def get_8a8w_qnn_qat_config(
     act_symmetric: bool = False, act_observer=MovingAverageMinMaxObserver
 ) -> QuantizationConfig:
@@ -598,21 +598,7 @@ def get_qat_per_channel_quant_config(
         observer_or_fake_quant_ctr=weight_fake_quant_ctr,
     )
 
-    bias_fake_quant_ctr = FakeQuantize.with_args(
-        dtype=torch.int32,
-        quant_min=torch.iinfo(torch.int32).min,
-        quant_max=torch.iinfo(torch.int32).max,
-        qscheme=torch.per_tensor_symmetric,
-        reduce_range=True,
-        observer=MovingAverageMinMaxObserver,
-    )
-    bias_quantization_spec = QuantizationSpec(
-        dtype=torch.int32,
-        quant_min=torch.iinfo(torch.int32).min,
-        quant_max=torch.iinfo(torch.int32).max,
-        qscheme=torch.per_tensor_symmetric,
-        observer_or_fake_quant_ctr=bias_fake_quant_ctr,
-    )
+    bias_quantization_spec = _derived_bias_quant_spec
 
     quantization_config = QuantizationConfig(
         input_activation=act_quantization_spec,
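
Review note: the hand-rolled int32 per-tensor bias spec is replaced by `_derived_bias_quant_spec`; note the helper itself is assigned (not called), so the bias qparams can be derived per node from the observed activation and weight qparams. A hedged sketch of what such a derived spec typically looks like, built on PyTorch's `DerivedQuantizationSpec`; this is an illustrative reconstruction under the usual `bias_scale = act_scale * weight_scale` convention, not the PR's actual helper:

```python
import torch
from torch.ao.quantization.quantizer import DerivedQuantizationSpec

def derived_bias_quant_spec(node: torch.fx.Node) -> DerivedQuantizationSpec:
    # Illustrative assumption: `node` is a conv/linear whose first two args
    # are the input activation and the weight.
    input_act, weight = node.args[0], node.args[1]

    def derive_qparams(obs_or_fqs):
        act_obs, weight_obs = obs_or_fqs
        act_scale, _ = act_obs.calculate_qparams()
        weight_scale, _ = weight_obs.calculate_qparams()
        # Bias scale = act_scale * weight_scale (zero point 0) lets the int32
        # bias add directly into the integer accumulator.
        scale = (act_scale * weight_scale).flatten().to(torch.float32)
        return scale, torch.zeros_like(scale, dtype=torch.int32)

    return DerivedQuantizationSpec(
        derived_from=[(input_act, node), (weight, node)],
        derive_qparams_fn=derive_qparams,
        dtype=torch.int32,
        quant_min=torch.iinfo(torch.int32).min,
        quant_max=torch.iinfo(torch.int32).max,
        qscheme=torch.per_tensor_symmetric,
    )
```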