From 6e5764c243e35f49fb458be1fc11a6b65baa983e Mon Sep 17 00:00:00 2001 From: Eli Amesefe Date: Thu, 2 Oct 2025 17:50:15 -0700 Subject: [PATCH] Update addmm int16 for Ethos-U85 (#14714) Summary: Adjust op_bmm to allow int16 types with int48 output buffer Note: I am rescaling outputs back to the original int16 dtype output. This is obviously dangerous if done without a properly calibrated quantization parameter, but this is our base assumption. bypass-github-export-checks bypass-github-pytorch-ci-checks bypass-github-executorch-ci-checks Reviewed By: digantdesai Differential Revision: D83627934 --- backends/arm/operators/op_bmm.py | 23 +++++++++++++++++++++++ backends/arm/test/ops/test_addmm.py | 6 ------ 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/backends/arm/operators/op_bmm.py b/backends/arm/operators/op_bmm.py index 2636a08d7c5..9bebc3597ca 100644 --- a/backends/arm/operators/op_bmm.py +++ b/backends/arm/operators/op_bmm.py @@ -79,6 +79,12 @@ def define_node( input1_zp = input_qparams[1].get_zp_per_tensor() bmm_result = tosa_graph.addIntermediate(output.shape, ts.DType.INT32) bmm_output_name = bmm_result.name + elif inputs[0].dtype == ts.DType.INT16: + input_qparams = get_input_qparams(node) + input0_zp = input_qparams[0].get_zp_per_tensor() + input1_zp = input_qparams[1].get_zp_per_tensor() + bmm_result = tosa_graph.addIntermediate(output.shape, ts.DType.INT48) + bmm_output_name = bmm_result.name else: bmm_output_name = output.name input0_zp, input1_zp = 0, 0 @@ -118,3 +124,20 @@ def define_node( output_zp=[output_qparams.get_zp_per_tensor()], rounding_mode=RoundingMode.SINGLE_ROUND, ) + elif output.dtype == ts.DType.INT16: + output_qparams = get_output_qparams(node)[0] + final_output_scale = ( + input_qparams[0].get_scale_per_tensor() * input_qparams[1].get_scale_per_tensor() # type: ignore[possibly-undefined] # pyre-ignore[61] + ) / output_qparams.get_scale_per_tensor() + + build_rescale( + tosa_fb=tosa_graph, + scale=[final_output_scale], + # pyre-ignore[61]: Uninitialized local [61]: Local variable `bmm_result` is undefined, or not always defined. + input_node=bmm_result, # type: ignore[possibly-undefined] + output_name=output.name, + output_type=ts.DType.INT16, + input_zp=[0], + output_zp=[output_qparams.get_zp_per_tensor()], + rounding_mode=RoundingMode.SINGLE_ROUND, + ) diff --git a/backends/arm/test/ops/test_addmm.py b/backends/arm/test/ops/test_addmm.py index b9a891ec740..1170f65dd58 100644 --- a/backends/arm/test/ops/test_addmm.py +++ b/backends/arm/test/ops/test_addmm.py @@ -213,9 +213,6 @@ def get_symmetric_a16w8_addmm_quantizer(per_channel_quantization=False): @common.parametrize("test_data", test_data_suite) -@pytest.mark.xfail( - reason="missing int16 addmm ops support; fails at TOSA reference model with Unsupported operation type or rank. See: https://github.com/pytorch/executorch/issues/13979" -) def test_addmm_16a8w_tosa_INT(test_data: input_t1): """Test addmm (FC layer) operation with 16A8W quantization (16-bit activations, 8-bit weights)""" per_channel_quantization = False @@ -268,9 +265,6 @@ def test_addmm_16a8w_u55_INT16(test_data: input_t1): @common.parametrize("test_data", test_data_suite) @common.XfailIfNoCorstone320 -@pytest.mark.xfail( - reason="Vela compilation fails with 'Invalid arguments' for int16 addmm operations" -) def test_addmm_16a8w_u85_INT16(test_data: input_t1): """Test addmm (FC layer) operation with 16A8W quantization on U85 (16-bit activations, 8-bit weights)""" per_channel_quantization = False