NXP Backend: Add pass to remove unnecessary Quantize/Dequantize nodes. (#15148)

MartinPavella · web-flow · commit c00d726be3b3 · 2025-12-03T13:29:52.000+01:00
### Summary This PR adds an edge dialect pre-processing pass to remove some Q/DQ nodes. This enables some non-delegated nodes (which run on the CPU) to run directly in int8 and avoid the QDQ compute overhead. This improves the inference speed by eliminating the need to artificially quantize and de-quantize input and output values. ### Test plan Unit tests provided. cc @robert-kalmar
diff --git a/backends/nxp/backend/edge_helper.py b/backends/nxp/backend/edge_helper.py
@@ -125,3 +125,14 @@ def previous_non_qdq_node(node: Node, input_index: int = 0) -> Node | None:
             current_node = current_node.args[0]
         else:
             return current_node
+
+
+# Type aliases for the quantization parameters returned by `get_quantization_parameters_for()`.
+# Each parameter is either a single value or a list of values.
+Scale = list[float] | float
+ZeroPoint = list[int] | int
+
+
+def get_quantization_parameters_for(node: Node) -> tuple[Scale, ZeroPoint] | None:
+    if "quantize" not in node.target.__name__ or len(node.args) < 3:
+        return None
+
+    return node.args[1], node.args[2]  # Scale and zero_point
diff --git a/backends/nxp/edge_passes/remove_additional_quantize_dequantize_nodes_pass.py b/backends/nxp/edge_passes/remove_additional_quantize_dequantize_nodes_pass.py
@@ -0,0 +1,111 @@
+# Copyright 2025 NXP
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+
+from executorch.backends.nxp.backend.edge_helper import get_quantization_parameters_for
+from executorch.backends.nxp.edge_passes.neutron_edge_pass import NeutronEdgePass
+from executorch.backends.nxp.neutron_partitioner import QDQClusterRecognizer
+from executorch.exir.dialects._ops import ops as exir_ops
+from torch.fx.passes.infra.pass_base import PassResult
+
+
+class RemoveAdditionalQDQClustersPass(NeutronEdgePass):
+    """
+    After delegation of partitions, there may be additional dequantize quantize nodes for QDQ clusters that were
+    not delegated. If dequantize quantize nodes are quantized per tensor and quantization parameters of dequantize
+    and quantize nodes in a QDQ cluster are equal, the nodes can be removed and thus the inner nodes computed in int8.
+
+                                         │
+                            ┌────────────▼──────────┐
+                            │ dequantize_per_tensor │
+                            └────────────┬──────────┘
+                                         │                                    │
+                                     ┌───▼──┐        replace with         ┌───▼──┐
+                                     │ node │       ──────────────►       │ node │
+                                     └───┬──┘                             └───┬──┘
+                                         │                                    ▼
+                             ┌───────────▼─────────┐
+                             │ quantize_per_tensor │
+                             └───────────┬─────────┘
+                                         ▼
+
+    """
+
+    qdq_per_channel_nodes = (
+        exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
+        exir_ops.edge.quantized_decomposed.quantize_per_channel.default,
+    )
+
+    qdq_per_tensor_nodes = (
+        exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+        exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor,
+        exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
+        exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor,
+    )
+
+    def run(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        nodes = list(graph_module.graph.nodes)
+        qdq_clusterer = QDQClusterRecognizer()
+        qdq_clusterer.tag_qdq_clusters(nodes)
+
+        for cluster in qdq_clusterer.cluster_map.values():
+            # For now, enable only permute_copy and cat.
+            if cluster.compute_node.target not in [
+                exir_ops.edge.aten.permute_copy.default,
+                exir_ops.edge.aten.cat.default,
+            ]:
+                continue
+
+            # Ensure cluster doesn't contain dequantize/quantize per channel nodes.
+            if any(
+                node
+                for node in cluster.ops
+                if node.target in self.qdq_per_channel_nodes
+            ):
+                continue
+
+            qdq_nodes = [
+                node for node in cluster.ops if node.target in self.qdq_per_tensor_nodes
+            ]
+
+            qdq_nodes_quant_params = [
+                get_quantization_parameters_for(node) for node in qdq_nodes
+            ]
+
+            equal_quant_scales = [
+                np.allclose(
+                    qdq_nodes_quant_params[idx][0], qdq_nodes_quant_params[idx + 1][0]
+                )
+                for idx in range(len(qdq_nodes_quant_params[:-1]))
+            ]
+
+            equal_quant_zero_points = [
+                np.allclose(
+                    qdq_nodes_quant_params[idx][1], qdq_nodes_quant_params[idx + 1][1]
+                )
+                for idx in range(len(qdq_nodes_quant_params[:-1]))
+            ]
+
+            # Check if all quantization params are equal to ensure that QDQ cluster can be removed.
+            if not all(equal_quant_scales + equal_quant_zero_points):
+                continue
+
+            # Replace the uses of each dequantize/quantize node with its arg node.
+            for qdq_node in qdq_nodes:
+                qdq_node.replace_all_uses_with(qdq_node.args[0])
+                graph_module.graph.erase_node(qdq_node)
+
+            # Remove compute node cluster info from node meta.
+            cluster.compute_node.meta.pop("cluster")
+
+            graph_module = self.recompile_module(graph_module)
+
+            # The graph has now changed, and we cannot keep iterating through it. Return the new graph and the parent
+            #  class will call this pass again.
+            return PassResult(graph_module, True)
+
+        return PassResult(graph_module, False)
diff --git a/backends/nxp/tests/executorch_pipeline.py b/backends/nxp/tests/executorch_pipeline.py
@@ -17,6 +17,9 @@
 from executorch.backends.nxp.edge_passes.neutron_edge_pass_manager import (
     NeutronEdgePassManager,
 )
+from executorch.backends.nxp.edge_passes.remove_additional_quantize_dequantize_nodes_pass import (
+    RemoveAdditionalQDQClustersPass,
+)
 from executorch.backends.nxp.edge_passes.remove_io_quant_ops_pass import (
     RemoveIOQuantOpsPass,
 )
@@ -35,7 +38,6 @@
 from torch.export import export
 from torchao.quantization.pt2e.quantizer import Quantizer
 
-
 neutron_converter_flavor = "SDK_25_09"
 neutron_target_spec = NeutronTargetSpec(
     target="imxrt700", neutron_converter_flavor=neutron_converter_flavor
@@ -64,7 +66,6 @@ def _get_default_quantizer(target_spec: NeutronTargetSpec) -> Quantizer:
 def to_model_input_spec(
     input_spec: tuple[ModelInputSpec, ...] | tuple[int, ...] | list[tuple[int, ...]]
 ) -> tuple[ModelInputSpec, ...]:
-
     if isinstance(input_spec, tuple) and all(
         isinstance(spec, ModelInputSpec) for spec in input_spec
     ):
@@ -139,6 +140,10 @@ def to_quantized_edge_program(
             [RemoveIOQuantOpsPass(edge_program_manager=edge_program_manager)]
         )
 
+    edge_program_manager = edge_program_manager.transform(
+        NeutronEdgePassManager([RemoveAdditionalQDQClustersPass()])
+    )
+
     return edge_program_manager
 
 
diff --git a/backends/nxp/tests/ir/converter/node_converter/test_permute_copy_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_permute_copy_converter.py
@@ -104,7 +104,7 @@ def forward(self, x):
         return torch.permute(x, self.perm)
 
 
-class TestPermuteCopyConversion(kgb.SpyAgency, unittest.TestCase):
+class TestPermuteCopyConversion(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         torch.manual_seed(23)
@@ -302,9 +302,9 @@ def test_permute_copy_non_delegated_conversion__from_permute_4D__quantized(
         edge_program = to_quantized_edge_program(model, input_shape).exported_program()
 
         nodes = list(edge_program.graph.nodes)
-        assert len(nodes) == 10
+        assert len(nodes) == 8
         assert (
-            nodes[6].target == exir_ops.edge.aten.permute_copy.default
+            nodes[5].target == exir_ops.edge.aten.permute_copy.default
         )  # PermuteCopy not delegated.
 
     @parameterized.expand(
@@ -320,7 +320,7 @@ def test_permute_copy_non_delegated_conversion__from_transpose_4D__quantized(
         edge_program = to_quantized_edge_program(model, input_shape).exported_program()
 
         nodes = list(edge_program.graph.nodes)
-        assert len(nodes) == 10
+        assert len(nodes) == 8
         assert (
-            nodes[6].target == exir_ops.edge.aten.permute_copy.default
+            nodes[5].target == exir_ops.edge.aten.permute_copy.default
         )  # PermuteCopy not delegated.
diff --git a/backends/nxp/tests/test_edge_passes.py b/backends/nxp/tests/test_edge_passes.py
@@ -3,33 +3,54 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import copy
 import unittest
 
 import kgb
 import numpy as np
 import torch
 
+from executorch.backends.nxp.backend.custom_delegation_options import (
+    CustomDelegationOptions,
+)
 from executorch.backends.nxp.backend.edge_helper import _is_dequantize, _is_quantize
 from executorch.backends.nxp.backend.edge_program_converter import (
     EdgeProgramToIRConverter,
 )
 from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters import (
     ViewCopyConverter,
 )
+from executorch.backends.nxp.edge_passes.neutron_edge_pass_manager import (
+    NeutronEdgePassManager,
+)
+from executorch.backends.nxp.edge_passes.remove_additional_quantize_dequantize_nodes_pass import (
+    RemoveAdditionalQDQClustersPass,
+)
+from executorch.backends.nxp.neutron_partitioner import NeutronPartitioner
+from executorch.backends.nxp.nxp_backend import generate_neutron_compile_spec
+from executorch.backends.nxp.quantizer.neutron_quantizer import NeutronQuantizer
+from executorch.backends.nxp.quantizer.utils import post_training_quantize
 from executorch.backends.nxp.tests.executorch_pipeline import (
+    get_random_calibration_inputs,
     neutron_target_spec,
+    to_model_input_spec,
     to_quantized_edge_program,
 )
 from executorch.backends.nxp.tests.executors import (
+    compare_output_arrays,
     EdgeProgramExecutor,
     OverrideTargetSupportCheck,
 )
+from executorch.backends.nxp.tests.ir.converter.node_converter.test_permute_copy_converter import (
+    Conv2dPermuteModule,
+)
 from executorch.backends.nxp.tests.models import (
     ConvActivationModule,
     ConvFCFCSoftmaxModuleWithoutReshape,
     LinearActivationModule,
 )
 from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.extension.export_util.utils import export_to_edge
 from parameterized import parameterized
 from torch.export import ExportedProgram
 from torch.fx import Graph, Node
@@ -117,7 +138,6 @@ def test_moving_fusable_activations_into_separate_qdq_clusters__addmm(
             call_original=True,
             owner=EdgeProgramToIRConverter,
         ) as converter_spy:
-
             input_shape = (1, 4)
             model = LinearActivationModule(
                 activation=activation,
@@ -161,7 +181,6 @@ def test_moving_fusable_activations_into_separate_qdq_clusters__mm(
             call_original=True,
             owner=EdgeProgramToIRConverter,
         ) as converter_spy:
-
             input_shape = (1, 4)
             model = LinearActivationModule(
                 activation=activation,
@@ -205,7 +224,6 @@ def test_moving_fusable_activations_into_separate_qdq_clusters__linear(
             call_original=True,
             owner=EdgeProgramToIRConverter,
         ) as converter_spy:
-
             input_shape = (1, 4)
             model = LinearActivationModule(
                 activation=activation,
@@ -249,7 +267,6 @@ def test_moving_fusable_activations_into_separate_qdq_clusters__conv(
             call_original=True,
             owner=EdgeProgramToIRConverter,
         ) as converter_spy:
-
             input_shape = (1, 4, 8, 8)
             model = ConvActivationModule(
                 activation=activation, inplace=True, in_channels=input_shape[1]
@@ -273,3 +290,91 @@ def test_moving_fusable_activations_into_separate_qdq_clusters__conv(
                 nodes[13]
             )
             assert _is_quantize(nodes[14])
+
+    def test_remove_additional_quantize_dequantize_nodes_pass(self):
+        input_shape = (1, 3, 8, 16)
+        new_dims = (3, 2, 1, 0)
+        model = Conv2dPermuteModule(input_shape[1], new_dims)
+        target = "imxrt700"
+        custom_delegation_options = CustomDelegationOptions()
+
+        calibration_inputs = get_random_calibration_inputs(
+            to_model_input_spec(input_shape)
+        )
+
+        example_input = calibration_inputs[0]
+        exir_program_aten = torch.export.export(model, example_input, strict=True)
+
+        exir_program_aten_quant = post_training_quantize(
+            exir_program_aten,
+            calibration_inputs,
+            NeutronQuantizer(neutron_target_spec),
+        )
+        edge_program_manager = export_to_edge(
+            exir_program_aten_quant,
+            example_input,
+        )
+
+        edge_program_manager = edge_program_manager.transform(NeutronEdgePassManager())
+
+        compile_spec = generate_neutron_compile_spec(target, "SDK_25_09")
+        partitioner = NeutronPartitioner(
+            compile_spec, neutron_target_spec, custom_delegation_options
+        )
+
+        edge_program_manager = edge_program_manager.to_backend(partitioner)
+
+        # Make sure QDQ cluster for permute_copy is present.
+        edge_program_with_qdq_cluster = copy.deepcopy(
+            edge_program_manager.exported_program()
+        )
+        nodes = list(edge_program_with_qdq_cluster.graph.nodes)
+        assert len(nodes) == 10
+        assert (
+            nodes[5].target
+            == exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default
+        )
+        assert nodes[6].target == exir_ops.edge.aten.permute_copy.default
+        assert "cluster" in nodes[6].meta
+        assert (
+            nodes[7].target
+            == exir_ops.edge.quantized_decomposed.quantize_per_tensor.default
+        )
+
+        # Run pass for removal of additional QDQ nodes and compute in non-float types where possible
+        edge_program_manager = edge_program_manager.transform(
+            NeutronEdgePassManager([RemoveAdditionalQDQClustersPass()])
+        )
+
+        # Make sure QDQ cluster for permute_copy is removed.
+        edge_program_without_qdq_cluster = edge_program_manager.exported_program()
+        nodes = list(edge_program_without_qdq_cluster.graph.nodes)
+        assert len(nodes) == 8
+        assert nodes[4].name == "getitem"
+        assert nodes[5].target == exir_ops.edge.aten.permute_copy.default
+        assert "cluster" not in nodes[5].meta
+        assert (
+            nodes[6].target
+            == exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default
+        )
+
+        edge_program_executor_without_qdq_cluster = EdgeProgramExecutor(
+            edge_program_without_qdq_cluster
+        )
+        edge_program_executor_with_qdq_cluster = EdgeProgramExecutor(
+            edge_program_with_qdq_cluster
+        )
+
+        input_data = np.random.random(input_shape).astype(np.float32)
+        edge_program_output_without_qdq_cluster = (
+            edge_program_executor_without_qdq_cluster.inference(input_data)
+        )
+        edge_program_output_with_qdq_cluster = (
+            edge_program_executor_with_qdq_cluster.inference(input_data)
+        )
+
+        compare_output_arrays(
+            edge_program_output_without_qdq_cluster,
+            edge_program_output_with_qdq_cluster,
+            "main output",
+        )
diff --git a/examples/nxp/aot_neutron_compile.py b/examples/nxp/aot_neutron_compile.py
@@ -18,6 +18,9 @@
 from executorch.backends.nxp.edge_passes.neutron_edge_pass_manager import (
     NeutronEdgePassManager,
 )
+from executorch.backends.nxp.edge_passes.remove_additional_quantize_dequantize_nodes_pass import (
+    RemoveAdditionalQDQClustersPass,
+)
 from executorch.backends.nxp.edge_passes.remove_io_quant_ops_pass import (
     RemoveIOQuantOpsPass,
 )
@@ -258,6 +261,10 @@ def get_model_and_inputs_from_name(model_name: str):
             [RemoveIOQuantOpsPass(edge_program_manager=edge_program_manager)]
         )
 
+    edge_program_manager = edge_program_manager.transform(
+        NeutronEdgePassManager([RemoveAdditionalQDQClustersPass()])
+    )
+
     logging.debug(f"Lowered graph:\n{edge_program_manager.exported_program().graph}")
 
     # 5. Export to ExecuTorch program

Original file line number	Diff line number	Diff line change
`@@ -18,6 +18,9 @@`
`18`	`18`	`from executorch.backends.nxp.edge_passes.neutron_edge_pass_manager import (`
`19`	`19`	`NeutronEdgePassManager,`
`20`	`20`	`)`
	`21`	`+from executorch.backends.nxp.edge_passes.remove_additional_quantize_dequantize_nodes_pass import (`
	`22`	`+ RemoveAdditionalQDQClustersPass,`
	`23`	`+)`
`21`	`24`	`from executorch.backends.nxp.edge_passes.remove_io_quant_ops_pass import (`
`22`	`25`	`RemoveIOQuantOpsPass,`
`23`	`26`	`)`
`@@ -258,6 +261,10 @@ def get_model_and_inputs_from_name(model_name: str):`
`258`	`261`	`[RemoveIOQuantOpsPass(edge_program_manager=edge_program_manager)]`
`259`	`262`	`)`
`260`	`263`
	`264`	`+ edge_program_manager = edge_program_manager.transform(`
	`265`	`+ NeutronEdgePassManager([RemoveAdditionalQDQClustersPass()])`
	`266`	`+ )`
	`267`	`+`
`261`	`268`	`logging.debug(f"Lowered graph:\n{edge_program_manager.exported_program().graph}")`
`262`	`269`
`263`	`270`	`# 5. Export to ExecuTorch program`