From a825fd151e19697abc1d1899bce752c3b19c29cd Mon Sep 17 00:00:00 2001
From: DannyYuyang-quic
Date: Mon, 7 Apr 2025 16:27:45 +0800
Subject: [PATCH] Qualcomm AI Engine Direct - Support tile op for different I/O rank

Summary:
- Support the case where the rank of the input tensor is less than the rank
  of the output tensor.
- Rename the make_quantizer kwarg callback_qconfig_list to
  submodule_qconfig_list for alignment.
- Remove module.eval() since calling eval() is not supported for exported
  models.
- In the quantized expand test, keep the source module and its QDQ module in
  separate variables so every sample input quantizes the original module
  instead of re-quantizing the previous QDQ result.
---
 .../_passes/expand_broadcast_tensor_shape.py  |  4 ++
 backends/qualcomm/_passes/utils.py            |  2 +-
 backends/qualcomm/tests/test_qnn_delegate.py  | 44 ++++++++++++++-----
 backends/qualcomm/tests/utils.py              |  5 ++-
 examples/qualcomm/utils.py                    |  6 +--
 5 files changed, 45 insertions(+), 16 deletions(-)

diff --git a/backends/qualcomm/_passes/expand_broadcast_tensor_shape.py b/backends/qualcomm/_passes/expand_broadcast_tensor_shape.py
index 277fc9c6ce8..829b3757e06 100644
--- a/backends/qualcomm/_passes/expand_broadcast_tensor_shape.py
+++ b/backends/qualcomm/_passes/expand_broadcast_tensor_shape.py
@@ -22,12 +22,16 @@ def __init__(self):
             exir_ops.edge.aten.sub.Tensor,
             exir_ops.edge.aten.mul.Tensor,
             exir_ops.edge.aten.div.Tensor,
+            # Handle expand_copy when the input tensor rank is less than the output tensor rank.
+            exir_ops.edge.aten.expand_copy.default,
         ]
 
     def traverse_broadcast_node(self, graph_module: torch.fx.GraphModule):
         for node in graph_module.graph.nodes:
             if node.target in self.broadcast_op_targets:
                 for arg in node.args:
+                    if not isinstance(arg, torch.fx.Node):
+                        continue
                     input_rank = len(arg.meta["val"].shape)
                     output_rank = len(node.meta["val"].shape)
                     if input_rank != output_rank:
diff --git a/backends/qualcomm/_passes/utils.py b/backends/qualcomm/_passes/utils.py
index a8eb6b192ee..2b33ee2bbff 100755
--- a/backends/qualcomm/_passes/utils.py
+++ b/backends/qualcomm/_passes/utils.py
@@ -104,7 +104,7 @@ def get_passes_dependency_for_capture_program():
         ConvertConv1dToConv2d: [FoldQDQ],
         DecomposeAny: [RemoveRedundancy],
         DecomposeLinalgVectorNorm: [RemoveRedundancy],
-        ExpandBroadcastTensorShape: [RemoveRedundancy],
+        ExpandBroadcastTensorShape: [FoldQDQ],
         FixedLinearKeepDim: [FoldQDQ],
         FoldQDQ: [AnnotateQuantAttrs, AnnotateStack, AnnotateUnbind],
         I64toI32: [RemoveRedundancy],
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
index 9aba5a059e0..d34b43b9258 100644
--- a/backends/qualcomm/tests/test_qnn_delegate.py
+++ b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -69,7 +69,11 @@
 from collections import defaultdict
 from typing import List
 
-from executorch.backends.qualcomm._passes import FoldQDQ, TagQuantIO
+from executorch.backends.qualcomm._passes import (
+    ExpandBroadcastTensorShape,
+    FoldQDQ,
+    TagQuantIO,
+)
 from executorch.backends.qualcomm.builders.node_visitor import get_node_visitors
 from executorch.backends.qualcomm.debugger.utils import DrawGraph
 from executorch.examples.models.deeplab_v3 import DeepLabV3ResNet101Model
@@ -430,10 +434,20 @@ def test_qnn_backend_equal(self):
 
     def test_qnn_backend_expand(self):
         modules = [ExpandAs(), ExpandCopy()]  # noqa: F405
-        sample_input = (torch.randn([3, 1]),)
-        for i, module in enumerate(modules):
-            with self.subTest(i=i):
-                self.lower_module_and_test_output(module, sample_input)
+        sample_inputs = [
+            (torch.randn([3, 1]),),
+            (torch.randn([4]),),
+        ]
+        passes_job = get_capture_program_passes()
+        passes_job[ExpandBroadcastTensorShape][QCOM_PASS_ACTIVATE_KEY] = True
+        index = 0
+        for module in modules:
+            for sample_input in sample_inputs:
+                with self.subTest(i=index):
+                    self.lower_module_and_test_output(
+                        module, sample_input, passes_job=passes_job
+                    )
+                index += 1
 
     def test_qnn_backend_expm1(self):
         sample_input = (torch.randn(3, 4, 5),)
@@ -1506,11 +1520,21 @@ def test_qnn_backend_equal(self):
 
     def test_qnn_backend_expand(self):
         modules = [ExpandAs(), ExpandCopy()]  # noqa: F405
-        sample_input = (torch.randn([3, 1]),)
-        for i, module in enumerate(modules):
-            with self.subTest(i=i):
-                module = self.get_qdq_module(module, sample_input)
-                self.lower_module_and_test_output(module, sample_input)
+        sample_inputs = [
+            (torch.randn([3, 1]),),
+            (torch.randn([4]),),
+        ]
+        passes_job = get_capture_program_passes()
+        passes_job[ExpandBroadcastTensorShape][QCOM_PASS_ACTIVATE_KEY] = True
+        index = 0
+        for module in modules:
+            for sample_input in sample_inputs:
+                with self.subTest(i=index):
+                    qdq_module = self.get_qdq_module(module, sample_input)
+                    self.lower_module_and_test_output(
+                        qdq_module, sample_input, passes_job=passes_job
+                    )
+                index += 1
 
     def test_qnn_backend_expm1(self):
         sample_input = (torch.randn(3, 4, 5),)
diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py
index 42eec15891c..77cbed5cc99 100644
--- a/backends/qualcomm/tests/utils.py
+++ b/backends/qualcomm/tests/utils.py
@@ -9,7 +9,7 @@
 import subprocess
 import tempfile
 import unittest
-from typing import Callable, Dict, List, Optional, Tuple
+from typing import Callable, Dict, List, Optional, OrderedDict, Tuple
 
 import numpy as np
 import torch
@@ -435,6 +435,7 @@ def lower_module_and_test_output(
         expected_profile_events: int = -1,
         expected_intermediate_events: int = -1,
         assert_output_equal: bool = True,
+        passes_job: Optional[OrderedDict] = None,
         skip_node_id_set: set = None,
         skip_node_op_set: set = None,
         dynamic_shapes: Dict = None,
@@ -444,6 +445,7 @@ def lower_module_and_test_output(
             sample_inputs,
             self.compiler_specs,
             dynamic_shapes=dynamic_shapes,
+            passes_job=passes_job,
             skip_node_id_set=skip_node_id_set,
             skip_node_op_set=skip_node_op_set,
         )
@@ -506,7 +508,6 @@ def get_qdq_module(
         block_size_map: Dict[str, Tuple] = None,
         submodule_qconfig_list: Optional[List[Tuple[Callable, ModuleQConfig]]] = None,
     ) -> torch.fx.GraphModule:
-        module = module.eval()
         m = torch.export.export(
             module, inputs, dynamic_shapes=dynamic_shapes, strict=True
         ).module()
diff --git a/examples/qualcomm/utils.py b/examples/qualcomm/utils.py
index b17bc8f98bd..2ad2fa208ff 100755
--- a/examples/qualcomm/utils.py
+++ b/examples/qualcomm/utils.py
@@ -262,7 +262,7 @@ def make_quantizer(
     per_channel_linear=False,
     act_observer=MovingAverageMinMaxObserver,
     is_qat=False,
-    callback_qconfig_list: Optional[List[Tuple[Callable, ModuleQConfig]]] = None,
+    submodule_qconfig_list: Optional[List[Tuple[Callable, ModuleQConfig]]] = None,
 ):
     quantizer = QnnQuantizer()
     quantizer.add_custom_quant_annotations(custom_annotations)
@@ -273,8 +273,8 @@ def make_quantizer(
         is_linear_per_channel=per_channel_linear,
         act_observer=act_observer,
     )
-    callback_qconfig_list = callback_qconfig_list or []
-    quantizer.set_submodule_qconfig_list(callback_qconfig_list)
+    submodule_qconfig_list = submodule_qconfig_list or []
+    quantizer.set_submodule_qconfig_list(submodule_qconfig_list)
     return quantizer
 
 