From 0bb87afc3ab65c4d216c270a179cce6f9a204d05 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Wed, 24 Sep 2025 10:42:32 -0700 Subject: [PATCH 01/17] Add torchao conversion --- .../test_torchao_huggingface_checkpoints.sh | 27 ++++++++++++++----- .github/workflows/trunk.yml | 17 +++++++++--- examples/models/llama/export_llama_lib.py | 12 +++++++++ extension/llm/export/config/llm_config.py | 13 +++++++++ third-party/ao | 2 +- 5 files changed, 59 insertions(+), 12 deletions(-) diff --git a/.ci/scripts/test_torchao_huggingface_checkpoints.sh b/.ci/scripts/test_torchao_huggingface_checkpoints.sh index 3c9ac598f8f..9d4d18522ae 100644 --- a/.ci/scripts/test_torchao_huggingface_checkpoints.sh +++ b/.ci/scripts/test_torchao_huggingface_checkpoints.sh @@ -5,6 +5,7 @@ set -euxo pipefail # Args / flags # ------------------------- TEST_WITH_RUNNER=0 +USE_TORCHAO_KERNELS=0 MODEL_NAME="" # Parse args @@ -22,10 +23,14 @@ while [[ $# -gt 0 ]]; do --test_with_runner) TEST_WITH_RUNNER=1 ;; + --use_torchao_kernels) + USE_TORCHAO_KERNELS=1 + ;; -h|--help) - echo "Usage: $0 [--test_with_runner]" + echo "Usage: $0 [--test_with_runner] [--use_torchao_kernels]" echo " model_name: qwen3_4b | phi_4_mini" echo " --test_with_runner: build ET + run llama_main to sanity-check the export" + echo " --use_torchao_kernels: use torchao kernels for linear and tied embedding" exit 0 ;; *) @@ -42,6 +47,13 @@ fi MODEL_OUT=model.pte + +# Default to XNNPACK +BACKEND_ARGS="-X --xnnpack-extended-ops" +if [[ "$USE_TORCHAO_KERNELS" -eq 1 ]]; then + BACKEND_ARGS="--torchao-kernels" +fi + case "$MODEL_NAME" in qwen3_4b) echo "Running Qwen3-4B export..." @@ -58,12 +70,12 @@ case "$MODEL_NAME" in --output_name $MODEL_OUT \ -kv \ --use_sdpa_with_kv_cache \ - -X \ - --xnnpack-extended-ops \ --max_context_length 1024 \ --max_seq_length 1024 \ + --metadata '{"get_bos_id":199999, "get_eos_ids":[200020,199999]}' \ + --verbose \ --dtype fp32 \ - --metadata '{"get_bos_id":199999, "get_eos_ids":[200020,199999]}' + ${BACKEND_ARGS} ;; phi_4_mini) @@ -81,12 +93,12 @@ case "$MODEL_NAME" in --output_name $MODEL_OUT \ -kv \ --use_sdpa_with_kv_cache \ - -X \ - --xnnpack-extended-ops \ --max_context_length 1024 \ --max_seq_length 1024 \ + --metadata '{"get_bos_id":199999, "get_eos_ids":[200020,199999]}' \ + --verbose \ --dtype fp32 \ - --metadata '{"get_bos_id":199999, "get_eos_ids":[200020,199999]}' + ${BACKEND_ARGS} ;; *) @@ -120,6 +132,7 @@ if [[ "$TEST_WITH_RUNNER" -eq 1 ]]; then -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \ -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \ -DEXECUTORCH_BUILD_KERNELS_LLM=ON \ + -DEXECUTORCH_BUILD_KERNELS_TORCHAO=ON \ -Bcmake-out . cmake --build cmake-out -j16 --config Release --target install diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 362df17dc9b..e0754c9723d 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -594,15 +594,24 @@ jobs: strategy: matrix: model: [qwen3_4b, phi_4_mini] + runner: [linux.2xlarge, linux.arm64.2xlarge] + docker-image: [executorch-ubuntu-22.04-clang12, executorch-ubuntu-22.04-gcc11-aarch64] include: - model: qwen3_4b test_with_runner: true - model: phi_4_mini test_with_runner: false + - runner: linux.2xlarge + use_torchao_kernels: false + - runner: linux.arm64.2xlarge + use_torchao_kernels: true + exclude: + - runner: linux.2xlarge + docker-image: executorch-ubuntu-22.04-gcc11-aarch64 + - runner: linux.arm64.2xlarge + docker-image: executorch-ubuntu-22.04-clang12 fail-fast: false with: - runner: linux.2xlarge - docker-image: ci-image:executorch-ubuntu-22.04-clang12 submodules: 'recursive' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 900 @@ -611,10 +620,10 @@ jobs: CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake + PYTHON_EXECUTABLE=python EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_KLEIDIAI=1 bash .ci/scripts/setup-linux.sh --build-tool cmake pip install -U "huggingface_hub[cli]" - bash .ci/scripts/test_torchao_huggingface_checkpoints.sh ${{ matrix.model }} ${{ matrix.test_with_runner && '--test_with_runner' || '' }} + bash .ci/scripts/test_torchao_huggingface_checkpoints.sh ${{ matrix.model }} ${{ matrix.test_with_runner && '--test_with_runner' || '' }} ${{ matrix.use_torchao_kernels && '--use_torchao_kernels' || '' }} test-multimodal-macos: if: ${{ !github.event.pull_request.head.repo.fork }} diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 7192204a141..baa1ebf7b3b 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -417,6 +417,11 @@ def build_args_parser() -> argparse.ArgumentParser: action="store_true", help="Delegate more operators beyond DQLinear to the xnnpack backend. Requires -X or --xnnpack to be set.", ) + parser.add_argument( + "--torchao-kernels", + action="store_true", + help="Delegate tied-embedding and quantized linear ops to torchao kernels", + ) parser.add_argument("-V", "--vulkan", action="store_true") parser.add_argument("--vulkan-force-fp16", action="store_true") parser.add_argument("--mps", action="store_true") @@ -741,6 +746,7 @@ def _prepare_for_llama_export(llm_config: LlmConfig) -> LLMEdgeManager: preq_group_size=llm_config.base.preq_group_size, preq_embedding_quantize=llm_config.base.preq_embedding_quantize, local_global_attention=llm_config.model.local_global_attention, + use_torchao_kernels=llm_config.backend.torchao.enabled, ) ) @@ -1303,6 +1309,7 @@ def _get_source_transforms( # noqa preq_group_size: Optional[int] = None, preq_embedding_quantize: Optional[str] = None, local_global_attention: Optional[List[int]] = None, + use_torchao_kernels: bool = False, ) -> List[Callable[[torch.nn.Module], torch.nn.Module]]: """ Return a list of functions that transform a graph. @@ -1475,6 +1482,11 @@ def _get_source_transforms( # noqa ) ) + if use_torchao_kernels: + from torchao.prototype.tensor_conversion.api import _convert_model_for_aarch64 + + transforms.append(_convert_model_for_aarch64) + return transforms diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py index d756d1886ad..91ca646e3ea 100644 --- a/extension/llm/export/config/llm_config.py +++ b/extension/llm/export/config/llm_config.py @@ -452,6 +452,15 @@ class MPSConfig: enabled: bool = False +@dataclass +class TorchAOKernelsConfig: + """ + Configures the torchao-kernels backend. + """ + + enabled: bool = False + + @dataclass class BackendConfig: """ @@ -464,6 +473,7 @@ class BackendConfig: vulkan: VulkanConfig = field(default_factory=VulkanConfig) qnn: QNNConfig = field(default_factory=QNNConfig) mps: MPSConfig = field(default_factory=MPSConfig) + torchao: TorchAOKernelsConfig = field(default_factory=TorchAOKernelsConfig) ################################################################################ @@ -632,6 +642,9 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig": # noqa: C901 if hasattr(args, "mps"): llm_config.backend.mps.enabled = args.mps + if hasattr(args, "torchao_kernels"): + llm_config.backend.torchao.enabled = args.torchao_kernels + # DebugConfig if hasattr(args, "profile_memory"): llm_config.debug.profile_memory = args.profile_memory diff --git a/third-party/ao b/third-party/ao index b99904b34c0..be4203e80d5 160000 --- a/third-party/ao +++ b/third-party/ao @@ -1 +1 @@ -Subproject commit b99904b34c0fd98f8a63ec57cbc1dc4993f74793 +Subproject commit be4203e80d55e95553eb236e1082b5e079ee35f9 From fc82a58c398f15a444ed75d9d912da0b2f80bdaf Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Wed, 24 Sep 2025 13:23:02 -0700 Subject: [PATCH 02/17] up --- .github/workflows/trunk.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index e0754c9723d..56ded142feb 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -612,6 +612,8 @@ jobs: docker-image: executorch-ubuntu-22.04-clang12 fail-fast: false with: + runner: ${{ matrix.runner }} + docker-image: ci-image:${{ matrix.docker-image }} submodules: 'recursive' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 900 From e484954e3c5c7b53c1800f21fe154010e93b9702 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Wed, 24 Sep 2025 14:03:02 -0700 Subject: [PATCH 03/17] up --- .ci/scripts/test_torchao_huggingface_checkpoints.sh | 6 +++++- .github/workflows/trunk.yml | 7 ++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/.ci/scripts/test_torchao_huggingface_checkpoints.sh b/.ci/scripts/test_torchao_huggingface_checkpoints.sh index 9d4d18522ae..be490f3437d 100644 --- a/.ci/scripts/test_torchao_huggingface_checkpoints.sh +++ b/.ci/scripts/test_torchao_huggingface_checkpoints.sh @@ -116,6 +116,10 @@ if [[ $MODEL_SIZE -gt $EXPECTED_MODEL_SIZE_UPPER_BOUND ]]; then fi # Install ET with CMake +EXECUTORCH_BUILD_KERNELS_TORCHAO="OFF" +if [[ "$USE_TORCHAO_KERNELS" -eq 1 ]]; then + EXECUTORCH_BUILD_KERNELS_TORCHAO="ON" +fi if [[ "$TEST_WITH_RUNNER" -eq 1 ]]; then echo "[runner] Building and testing llama_main ..." cmake -DPYTHON_EXECUTABLE=python \ @@ -132,7 +136,7 @@ if [[ "$TEST_WITH_RUNNER" -eq 1 ]]; then -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \ -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \ -DEXECUTORCH_BUILD_KERNELS_LLM=ON \ - -DEXECUTORCH_BUILD_KERNELS_TORCHAO=ON \ + -DEXECUTORCH_BUILD_KERNELS_TORCHAO=$(EXECUTORCH_BUILD_KERNELS_TORCHAO) \ -Bcmake-out . cmake --build cmake-out -j16 --config Release --target install diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 56ded142feb..84a9602d673 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -622,7 +622,12 @@ jobs: CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" - PYTHON_EXECUTABLE=python EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_KLEIDIAI=1 bash .ci/scripts/setup-linux.sh --build-tool cmake + BUILD_TORCHAO_KERNELS="" + if [[ ${{ matrix.use_torchao_kernels }} == "true" ]]; then + BUILD_TORCHAO_KERNELS="BUILD_TORCHAO_EXPERIMENTAL=1 TORCHAO_BUILD_CPU_AARCH64=1 TORCHAO_ENABLE_ARM_NEON_DOT=1 TORCHAO_BUILD_KLEIDIAI=1" + fi + + PYTHON_EXECUTABLE=python $BUILD_TORCHAO_KERNELS bash .ci/scripts/setup-linux.sh --build-tool cmake pip install -U "huggingface_hub[cli]" bash .ci/scripts/test_torchao_huggingface_checkpoints.sh ${{ matrix.model }} ${{ matrix.test_with_runner && '--test_with_runner' || '' }} ${{ matrix.use_torchao_kernels && '--use_torchao_kernels' || '' }} From 288b86b96776f9dae6726074f54fbd6a77b01d94 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Wed, 24 Sep 2025 14:27:43 -0700 Subject: [PATCH 04/17] up --- .github/workflows/trunk.yml | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 84a9602d673..fb3b1157ecf 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -594,8 +594,21 @@ jobs: strategy: matrix: model: [qwen3_4b, phi_4_mini] - runner: [linux.2xlarge, linux.arm64.2xlarge] - docker-image: [executorch-ubuntu-22.04-clang12, executorch-ubuntu-22.04-gcc11-aarch64] + runner: [linux.2xlarge] + docker-image: [executorch-ubuntu-22.04-clang12] + backend: [xnnpack] + include: + - model: qwen3_4b + runner: linux.arm64.2xlarge + docker-image: executorch-ubuntu-22.04-gcc11-aarch64 + backend: torchao + - model: phi_4_mini + runner: linux.arm64.2xlarge + docker-image: executorch-ubuntu-22.04-gcc11-aarch64 + backend: torchao + + + include: - model: qwen3_4b test_with_runner: true @@ -622,15 +635,21 @@ jobs: CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" - BUILD_TORCHAO_KERNELS="" - if [[ ${{ matrix.use_torchao_kernels }} == "true" ]]; then - BUILD_TORCHAO_KERNELS="BUILD_TORCHAO_EXPERIMENTAL=1 TORCHAO_BUILD_CPU_AARCH64=1 TORCHAO_ENABLE_ARM_NEON_DOT=1 TORCHAO_BUILD_KLEIDIAI=1" + if [[ "${{ matrix.backend }}" == "torchao" ]]; then + PYTHON_EXECUTABLE=python \ + BUILD_TORCHAO_EXPERIMENTAL=1 \ + TORCHAO_BUILD_CPU_AARCH64=1 \ + TORCHAO_ENABLE_ARM_NEON_DOT=1 \ + TORCHAO_BUILD_KLEIDIAI=1 \ + bash .ci/scripts/setup-linux.sh --build-tool cmake + else + PYTHON_EXECUTABLE=python \ + bash .ci/scripts/setup-linux.sh --build-tool cmake fi - PYTHON_EXECUTABLE=python $BUILD_TORCHAO_KERNELS bash .ci/scripts/setup-linux.sh --build-tool cmake pip install -U "huggingface_hub[cli]" - bash .ci/scripts/test_torchao_huggingface_checkpoints.sh ${{ matrix.model }} ${{ matrix.test_with_runner && '--test_with_runner' || '' }} ${{ matrix.use_torchao_kernels && '--use_torchao_kernels' || '' }} + bash .ci/scripts/test_torchao_huggingface_checkpoints.sh ${{ matrix.model }} ${{ matrix.model != 'phi_4_mini' && '--test_with_runner' || '' }} ${{ matrix.backend == 'torchao' && '--use_torchao_kernels' || '' }} test-multimodal-macos: if: ${{ !github.event.pull_request.head.repo.fork }} From 96f98b5bec7d6bfb27f4c231d87a1d9af81ed7ed Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Wed, 24 Sep 2025 14:31:09 -0700 Subject: [PATCH 05/17] up --- .github/workflows/trunk.yml | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index fb3b1157ecf..d3346f8dcca 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -606,23 +606,6 @@ jobs: runner: linux.arm64.2xlarge docker-image: executorch-ubuntu-22.04-gcc11-aarch64 backend: torchao - - - - include: - - model: qwen3_4b - test_with_runner: true - - model: phi_4_mini - test_with_runner: false - - runner: linux.2xlarge - use_torchao_kernels: false - - runner: linux.arm64.2xlarge - use_torchao_kernels: true - exclude: - - runner: linux.2xlarge - docker-image: executorch-ubuntu-22.04-gcc11-aarch64 - - runner: linux.arm64.2xlarge - docker-image: executorch-ubuntu-22.04-clang12 fail-fast: false with: runner: ${{ matrix.runner }} From 4d9e718e449f89fe9ba4ba2d61e2485aa0f44195 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Wed, 24 Sep 2025 15:27:17 -0700 Subject: [PATCH 06/17] up --- .github/workflows/trunk.yml | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index d3346f8dcca..0b2fbe05bd6 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -593,19 +593,23 @@ jobs: contents: read strategy: matrix: - model: [qwen3_4b, phi_4_mini] - runner: [linux.2xlarge] - docker-image: [executorch-ubuntu-22.04-clang12] - backend: [xnnpack] include: - model: qwen3_4b + backend: xnnpack + runner: linux.2xlarge + docker-image: executorch-ubuntu-22.04-clang12 + - model: phi_4_mini + backend: xnnpack + runner: linux.2xlarge + docker-image: executorch-ubuntu-22.04-clang12 + - model: qwen3_4b + backend: torchao runner: linux.arm64.2xlarge docker-image: executorch-ubuntu-22.04-gcc11-aarch64 - backend: torchao - model: phi_4_mini + backend: torchao runner: linux.arm64.2xlarge docker-image: executorch-ubuntu-22.04-gcc11-aarch64 - backend: torchao fail-fast: false with: runner: ${{ matrix.runner }} @@ -618,17 +622,11 @@ jobs: CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake + if [[ "${{ matrix.backend }}" == "torchao" ]]; then - PYTHON_EXECUTABLE=python \ - BUILD_TORCHAO_EXPERIMENTAL=1 \ - TORCHAO_BUILD_CPU_AARCH64=1 \ - TORCHAO_ENABLE_ARM_NEON_DOT=1 \ - TORCHAO_BUILD_KLEIDIAI=1 \ - bash .ci/scripts/setup-linux.sh --build-tool cmake + BUILD_TORCHAO_EXPERIMENTAL=1 TORCHAO_BUILD_CPU_AARCH64=1 TORCHAO_BUILD_KLEIDIAI=1 TORCHAO_ENABLE_ARM_NEON_DOT=1 pip install third-party/ao else - PYTHON_EXECUTABLE=python \ - bash .ci/scripts/setup-linux.sh --build-tool cmake - fi pip install -U "huggingface_hub[cli]" From 61a05bc0b3031e09029e09306a3f8c9d11daa388 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Wed, 24 Sep 2025 17:00:06 -0700 Subject: [PATCH 07/17] up --- .github/workflows/trunk.yml | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 0b2fbe05bd6..d724d6f335b 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -593,23 +593,19 @@ jobs: contents: read strategy: matrix: + model: [qwen3_4b, phi_4_mini] + runner: [linux.2xlarge] + docker-image: [executorch-ubuntu-22.04-clang12] + backend: [xnnpack] include: - model: qwen3_4b - backend: xnnpack - runner: linux.2xlarge - docker-image: executorch-ubuntu-22.04-clang12 - - model: phi_4_mini - backend: xnnpack - runner: linux.2xlarge - docker-image: executorch-ubuntu-22.04-clang12 - - model: qwen3_4b - backend: torchao runner: linux.arm64.2xlarge docker-image: executorch-ubuntu-22.04-gcc11-aarch64 - - model: phi_4_mini backend: torchao + - model: phi_4_mini runner: linux.arm64.2xlarge docker-image: executorch-ubuntu-22.04-gcc11-aarch64 + backend: torchao fail-fast: false with: runner: ${{ matrix.runner }} From 1a5c3f3cd422c5808b91b8400f02d6c7012f2e90 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Wed, 24 Sep 2025 17:01:37 -0700 Subject: [PATCH 08/17] up --- third-party/ao | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third-party/ao b/third-party/ao index be4203e80d5..b47f1a36550 160000 --- a/third-party/ao +++ b/third-party/ao @@ -1 +1 @@ -Subproject commit be4203e80d55e95553eb236e1082b5e079ee35f9 +Subproject commit b47f1a3655004b2b4dd3b4f01a5d8eebff1faa3c From ffd7c1c9d77ae6a10d5723b4868054c93f80d898 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Wed, 24 Sep 2025 17:23:25 -0700 Subject: [PATCH 09/17] up --- examples/models/llama/export_llama_lib.py | 26 +++++++++++++++++++---- extension/llm/export/config/llm_config.py | 14 +++++++++--- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index baa1ebf7b3b..52e1d8c7ed6 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -422,6 +422,16 @@ def build_args_parser() -> argparse.ArgumentParser: action="store_true", help="Delegate tied-embedding and quantized linear ops to torchao kernels", ) + parser.add_argument( + "--torchao-kernels-tied-embedding", + action="store_true", + help="Delegate tied-embedding ops to torchao kernels", + ) + parser.add_argument( + "--torchao-kernels-linear", + action="store_true", + help="Delegate linear ops to torchao kernels", + ) parser.add_argument("-V", "--vulkan", action="store_true") parser.add_argument("--vulkan-force-fp16", action="store_true") parser.add_argument("--mps", action="store_true") @@ -746,7 +756,8 @@ def _prepare_for_llama_export(llm_config: LlmConfig) -> LLMEdgeManager: preq_group_size=llm_config.base.preq_group_size, preq_embedding_quantize=llm_config.base.preq_embedding_quantize, local_global_attention=llm_config.model.local_global_attention, - use_torchao_kernels=llm_config.backend.torchao.enabled, + use_torchao_kernels_linear=llm_config.backend.torchao.linear, + use_torchao_kernels_tied_embedding=llm_config.backend.torchao.tied_embedding, ) ) @@ -1309,7 +1320,8 @@ def _get_source_transforms( # noqa preq_group_size: Optional[int] = None, preq_embedding_quantize: Optional[str] = None, local_global_attention: Optional[List[int]] = None, - use_torchao_kernels: bool = False, + use_torchao_kernels_linear: bool = False, + use_torchao_kernels_tied_embedding: bool = False, ) -> List[Callable[[torch.nn.Module], torch.nn.Module]]: """ Return a list of functions that transform a graph. @@ -1482,10 +1494,16 @@ def _get_source_transforms( # noqa ) ) - if use_torchao_kernels: + if any([use_torchao_kernels_linear, use_torchao_kernels_tied_embedding]): from torchao.prototype.tensor_conversion.api import _convert_model_for_aarch64 - transforms.append(_convert_model_for_aarch64) + transforms.append( + partial( + _convert_model_for_aarch64, + convert_linear=use_torchao_kernels_linear, + convert_tied_embedding=use_torchao_kernels_tied_embedding, + ) + ) return transforms diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py index 91ca646e3ea..327ce3a9e5e 100644 --- a/extension/llm/export/config/llm_config.py +++ b/extension/llm/export/config/llm_config.py @@ -458,7 +458,8 @@ class TorchAOKernelsConfig: Configures the torchao-kernels backend. """ - enabled: bool = False + linear: bool = True + tied_embedding: bool = True @dataclass @@ -642,8 +643,15 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig": # noqa: C901 if hasattr(args, "mps"): llm_config.backend.mps.enabled = args.mps - if hasattr(args, "torchao_kernels"): - llm_config.backend.torchao.enabled = args.torchao_kernels + if hasattr(args, "torchao_kernels") or hasattr(args, "torchao_kernels_linear"): + assert args.torchao_kernels + assert args.torchao_kernels_linear + llm_config.backend.torchao.linear = True + + if hasattr(args, "torchao_kernels") or hasattr(args, "torchao_kernels_tied_embedding"): + assert args.torchao_kernels + assert args.torchao_kernels_tied_embedding + llm_config.backend.torchao.tied_embedding = True # DebugConfig if hasattr(args, "profile_memory"): From f503d2c092644734a57c7b6fb745cbc83a3bff1e Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Wed, 24 Sep 2025 17:23:40 -0700 Subject: [PATCH 10/17] up --- extension/llm/export/config/llm_config.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py index 327ce3a9e5e..2d4a7ba34c0 100644 --- a/extension/llm/export/config/llm_config.py +++ b/extension/llm/export/config/llm_config.py @@ -647,8 +647,10 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig": # noqa: C901 assert args.torchao_kernels assert args.torchao_kernels_linear llm_config.backend.torchao.linear = True - - if hasattr(args, "torchao_kernels") or hasattr(args, "torchao_kernels_tied_embedding"): + + if hasattr(args, "torchao_kernels") or hasattr( + args, "torchao_kernels_tied_embedding" + ): assert args.torchao_kernels assert args.torchao_kernels_tied_embedding llm_config.backend.torchao.tied_embedding = True From 2847aae14c6f4b9f68ae7ac672efd93b38dcff7a Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Thu, 25 Sep 2025 09:51:24 -0700 Subject: [PATCH 11/17] up --- extension/llm/export/config/llm_config.py | 36 +++++++++++++++-------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py index 2d4a7ba34c0..a2aa001363f 100644 --- a/extension/llm/export/config/llm_config.py +++ b/extension/llm/export/config/llm_config.py @@ -458,8 +458,9 @@ class TorchAOKernelsConfig: Configures the torchao-kernels backend. """ - linear: bool = True - tied_embedding: bool = True + enabled: bool = False + convert_linear: bool = True + convert_tied_embedding: bool = True @dataclass @@ -643,17 +644,28 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig": # noqa: C901 if hasattr(args, "mps"): llm_config.backend.mps.enabled = args.mps - if hasattr(args, "torchao_kernels") or hasattr(args, "torchao_kernels_linear"): - assert args.torchao_kernels - assert args.torchao_kernels_linear - llm_config.backend.torchao.linear = True - - if hasattr(args, "torchao_kernels") or hasattr( - args, "torchao_kernels_tied_embedding" + # TorchAoKernels + if any( + hasattr(args, a) + for a in [ + "torchao_kernels", + "torchao_kernels_linear", + "torchao_kernels_tied_embedding", + ] ): - assert args.torchao_kernels - assert args.torchao_kernels_tied_embedding - llm_config.backend.torchao.tied_embedding = True + llm_config.backend.torchao.enabled = True + if hasattr(args, "torchao_kernels") and args.torchao_kernels: + # Enable all conversions if torchao_kernels is specified + llm_config.backend.torchao.convert_linear = True + llm_config.backend.torchao.convert_tied_embedding = True + else: + # Otherwise, only enable the conversions that are specified + llm_config.backend.torchao.convert_linear = getattr( + args, "torchao_kernels_linear", False + ) + llm_config.backend.torchao.convert_tied_embedding = getattr( + args, "torchao_kernels_tied_embedding", False + ) # DebugConfig if hasattr(args, "profile_memory"): From 2c69ccedf71d92dc98dcdd007175628ae6e6179e Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Thu, 25 Sep 2025 10:19:43 -0700 Subject: [PATCH 12/17] up --- .github/workflows/trunk.yml | 2 +- examples/models/llama/export_llama_lib.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index d724d6f335b..106f970085d 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -622,7 +622,7 @@ jobs: if [[ "${{ matrix.backend }}" == "torchao" ]]; then BUILD_TORCHAO_EXPERIMENTAL=1 TORCHAO_BUILD_CPU_AARCH64=1 TORCHAO_BUILD_KLEIDIAI=1 TORCHAO_ENABLE_ARM_NEON_DOT=1 pip install third-party/ao - else + fi pip install -U "huggingface_hub[cli]" diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 52e1d8c7ed6..ec0af36c165 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -756,8 +756,8 @@ def _prepare_for_llama_export(llm_config: LlmConfig) -> LLMEdgeManager: preq_group_size=llm_config.base.preq_group_size, preq_embedding_quantize=llm_config.base.preq_embedding_quantize, local_global_attention=llm_config.model.local_global_attention, - use_torchao_kernels_linear=llm_config.backend.torchao.linear, - use_torchao_kernels_tied_embedding=llm_config.backend.torchao.tied_embedding, + use_torchao_kernels_linear=llm_config.backend.torchao.convert_linear, + use_torchao_kernels_tied_embedding=llm_config.backend.torchao.convert_tied_embedding, ) ) From f7a6e2e5e922d3a41486eb105a8c8090c902ee20 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Thu, 25 Sep 2025 10:58:25 -0700 Subject: [PATCH 13/17] up --- .github/workflows/trunk.yml | 2 +- extension/llm/export/config/llm_config.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 106f970085d..ee2afb7576d 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -621,7 +621,7 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake if [[ "${{ matrix.backend }}" == "torchao" ]]; then - BUILD_TORCHAO_EXPERIMENTAL=1 TORCHAO_BUILD_CPU_AARCH64=1 TORCHAO_BUILD_KLEIDIAI=1 TORCHAO_ENABLE_ARM_NEON_DOT=1 pip install third-party/ao + BUILD_TORCHAO_EXPERIMENTAL=1 TORCHAO_BUILD_CPU_AARCH64=1 TORCHAO_BUILD_KLEIDIAI=1 TORCHAO_ENABLE_ARM_NEON_DOT=1 TORCHAO_PARALLEL_BACKEND=OPENMP pip install third-party/ao fi pip install -U "huggingface_hub[cli]" diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py index a2aa001363f..b45919afd83 100644 --- a/extension/llm/export/config/llm_config.py +++ b/extension/llm/export/config/llm_config.py @@ -458,9 +458,8 @@ class TorchAOKernelsConfig: Configures the torchao-kernels backend. """ - enabled: bool = False - convert_linear: bool = True - convert_tied_embedding: bool = True + convert_linear: bool = False + convert_tied_embedding: bool = False @dataclass @@ -653,7 +652,6 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig": # noqa: C901 "torchao_kernels_tied_embedding", ] ): - llm_config.backend.torchao.enabled = True if hasattr(args, "torchao_kernels") and args.torchao_kernels: # Enable all conversions if torchao_kernels is specified llm_config.backend.torchao.convert_linear = True From a7fa6bdfcee817a09cfc16ec66dd0b4e7aa2a05e Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Thu, 25 Sep 2025 11:12:48 -0700 Subject: [PATCH 14/17] up --- .ci/scripts/test_torchao_huggingface_checkpoints.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/scripts/test_torchao_huggingface_checkpoints.sh b/.ci/scripts/test_torchao_huggingface_checkpoints.sh index be490f3437d..6b84a09ae6f 100644 --- a/.ci/scripts/test_torchao_huggingface_checkpoints.sh +++ b/.ci/scripts/test_torchao_huggingface_checkpoints.sh @@ -136,7 +136,7 @@ if [[ "$TEST_WITH_RUNNER" -eq 1 ]]; then -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \ -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \ -DEXECUTORCH_BUILD_KERNELS_LLM=ON \ - -DEXECUTORCH_BUILD_KERNELS_TORCHAO=$(EXECUTORCH_BUILD_KERNELS_TORCHAO) \ + -DEXECUTORCH_BUILD_KERNELS_TORCHAO=${EXECUTORCH_BUILD_KERNELS_TORCHAO} \ -Bcmake-out . cmake --build cmake-out -j16 --config Release --target install From 51279a4ea78ef363325fb709b4ebdc77a4659dc6 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Mon, 29 Sep 2025 15:11:22 -0700 Subject: [PATCH 15/17] up --- .../test_torchao_huggingface_checkpoints.sh | 2 +- examples/models/llama/export_llama_lib.py | 8 +++---- extension/llm/export/config/llm_config.py | 24 +++++++++---------- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/.ci/scripts/test_torchao_huggingface_checkpoints.sh b/.ci/scripts/test_torchao_huggingface_checkpoints.sh index 6b84a09ae6f..f06c794f88d 100644 --- a/.ci/scripts/test_torchao_huggingface_checkpoints.sh +++ b/.ci/scripts/test_torchao_huggingface_checkpoints.sh @@ -51,7 +51,7 @@ MODEL_OUT=model.pte # Default to XNNPACK BACKEND_ARGS="-X --xnnpack-extended-ops" if [[ "$USE_TORCHAO_KERNELS" -eq 1 ]]; then - BACKEND_ARGS="--torchao-kernels" + BACKEND_ARGS="--use-torchao-kernels" fi case "$MODEL_NAME" in diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index ec0af36c165..20a5dcea56d 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -418,17 +418,17 @@ def build_args_parser() -> argparse.ArgumentParser: help="Delegate more operators beyond DQLinear to the xnnpack backend. Requires -X or --xnnpack to be set.", ) parser.add_argument( - "--torchao-kernels", + "--use-torchao-kernels", action="store_true", help="Delegate tied-embedding and quantized linear ops to torchao kernels", ) parser.add_argument( - "--torchao-kernels-tied-embedding", + "--use-torchao-kernels-tied-embedding", action="store_true", help="Delegate tied-embedding ops to torchao kernels", ) parser.add_argument( - "--torchao-kernels-linear", + "--use-torchao-kernels-linear", action="store_true", help="Delegate linear ops to torchao kernels", ) @@ -756,7 +756,7 @@ def _prepare_for_llama_export(llm_config: LlmConfig) -> LLMEdgeManager: preq_group_size=llm_config.base.preq_group_size, preq_embedding_quantize=llm_config.base.preq_embedding_quantize, local_global_attention=llm_config.model.local_global_attention, - use_torchao_kernels_linear=llm_config.backend.torchao.convert_linear, + use_torchao_kernels_linear=llm_config.backend.torchao.use_torchao_kernels_linear, use_torchao_kernels_tied_embedding=llm_config.backend.torchao.convert_tied_embedding, ) ) diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py index b45919afd83..4fade035e67 100644 --- a/extension/llm/export/config/llm_config.py +++ b/extension/llm/export/config/llm_config.py @@ -458,8 +458,8 @@ class TorchAOKernelsConfig: Configures the torchao-kernels backend. """ - convert_linear: bool = False - convert_tied_embedding: bool = False + use_torchao_kernels_linear: bool = False + use_torchao_kernels_tied_embedding: bool = False @dataclass @@ -647,22 +647,22 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig": # noqa: C901 if any( hasattr(args, a) for a in [ - "torchao_kernels", - "torchao_kernels_linear", - "torchao_kernels_tied_embedding", + "use_torchao_kernels", + "use_torchao_kernels_linear", + "use_torchao_kernels_tied_embedding", ] ): - if hasattr(args, "torchao_kernels") and args.torchao_kernels: + if hasattr(args, "use_torchao_kernels") and args.torchao_kernels: # Enable all conversions if torchao_kernels is specified - llm_config.backend.torchao.convert_linear = True - llm_config.backend.torchao.convert_tied_embedding = True + llm_config.backend.torchao.use_torchao_kernels_linear = True + llm_config.backend.torchao.use_torchao_kernels_tied_embedding = True else: # Otherwise, only enable the conversions that are specified - llm_config.backend.torchao.convert_linear = getattr( - args, "torchao_kernels_linear", False + llm_config.backend.torchao.use_torchao_kernels_linear = getattr( + args, "use_torchao_kernels_linear", False ) - llm_config.backend.torchao.convert_tied_embedding = getattr( - args, "torchao_kernels_tied_embedding", False + llm_config.backend.torchao.use_torchao_kernels_tied_embedding = getattr( + args, "use_torchao_kernels_tied_embedding", False ) # DebugConfig From 9db7b1852253275b4793ab2abf7086792e2be3e4 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Mon, 29 Sep 2025 15:44:33 -0700 Subject: [PATCH 16/17] up --- extension/llm/export/config/llm_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py index 4fade035e67..b13001c005b 100644 --- a/extension/llm/export/config/llm_config.py +++ b/extension/llm/export/config/llm_config.py @@ -652,7 +652,7 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig": # noqa: C901 "use_torchao_kernels_tied_embedding", ] ): - if hasattr(args, "use_torchao_kernels") and args.torchao_kernels: + if hasattr(args, "use_torchao_kernels") and args.use_torchao_kernels: # Enable all conversions if torchao_kernels is specified llm_config.backend.torchao.use_torchao_kernels_linear = True llm_config.backend.torchao.use_torchao_kernels_tied_embedding = True From 96dc88ee61a991649e0bdff042ad718abb7f99ac Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Mon, 29 Sep 2025 17:07:27 -0700 Subject: [PATCH 17/17] up --- examples/models/llama/export_llama_lib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 20a5dcea56d..aa3b157c8da 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -757,7 +757,7 @@ def _prepare_for_llama_export(llm_config: LlmConfig) -> LLMEdgeManager: preq_embedding_quantize=llm_config.base.preq_embedding_quantize, local_global_attention=llm_config.model.local_global_attention, use_torchao_kernels_linear=llm_config.backend.torchao.use_torchao_kernels_linear, - use_torchao_kernels_tied_embedding=llm_config.backend.torchao.convert_tied_embedding, + use_torchao_kernels_tied_embedding=llm_config.backend.torchao.use_torchao_kernels_tied_embedding, ) )