From 0bb87afc3ab65c4d216c270a179cce6f9a204d05 Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Wed, 24 Sep 2025 10:42:32 -0700
Subject: [PATCH 01/17] Add torchao conversion

---
 .../test_torchao_huggingface_checkpoints.sh   | 27 ++++++++++++++-----
 .github/workflows/trunk.yml                   | 17 +++++++++---
 examples/models/llama/export_llama_lib.py     | 12 +++++++++
 extension/llm/export/config/llm_config.py     | 13 +++++++++
 third-party/ao                                |  2 +-
 5 files changed, 59 insertions(+), 12 deletions(-)

diff --git a/.ci/scripts/test_torchao_huggingface_checkpoints.sh b/.ci/scripts/test_torchao_huggingface_checkpoints.sh
index 3c9ac598f8f..9d4d18522ae 100644
--- a/.ci/scripts/test_torchao_huggingface_checkpoints.sh
+++ b/.ci/scripts/test_torchao_huggingface_checkpoints.sh
@@ -5,6 +5,7 @@ set -euxo pipefail
 # Args / flags
 # -------------------------
 TEST_WITH_RUNNER=0
+USE_TORCHAO_KERNELS=0
 MODEL_NAME=""
 
 # Parse args
@@ -22,10 +23,14 @@ while [[ $# -gt 0 ]]; do
     --test_with_runner)
       TEST_WITH_RUNNER=1
       ;;
+    --use_torchao_kernels)
+      USE_TORCHAO_KERNELS=1
+      ;;
     -h|--help)
-      echo "Usage: $0 <model_name> [--test_with_runner]"
+      echo "Usage: $0 <model_name> [--test_with_runner] [--use_torchao_kernels]"
       echo "  model_name: qwen3_4b | phi_4_mini"
       echo "  --test_with_runner: build ET + run llama_main to sanity-check the export"
+      echo "  --use_torchao_kernels: use torchao kernels for linear and tied embedding"
       exit 0
       ;;
     *)
@@ -42,6 +47,13 @@ fi
 
 MODEL_OUT=model.pte
 
+
+# Default to XNNPACK
+BACKEND_ARGS="-X --xnnpack-extended-ops"
+if [[ "$USE_TORCHAO_KERNELS" -eq 1 ]]; then
+  BACKEND_ARGS="--torchao-kernels"
+fi
+
 case "$MODEL_NAME" in
   qwen3_4b)
     echo "Running Qwen3-4B export..."
@@ -58,12 +70,12 @@ case "$MODEL_NAME" in
       --output_name $MODEL_OUT \
       -kv \
       --use_sdpa_with_kv_cache \
-      -X \
-      --xnnpack-extended-ops \
       --max_context_length 1024 \
       --max_seq_length 1024 \
+      --metadata '{"get_bos_id":199999, "get_eos_ids":[200020,199999]}' \
+      --verbose \
       --dtype fp32 \
-      --metadata '{"get_bos_id":199999, "get_eos_ids":[200020,199999]}'
+      ${BACKEND_ARGS}
     ;;
 
   phi_4_mini)
@@ -81,12 +93,12 @@ case "$MODEL_NAME" in
       --output_name $MODEL_OUT \
       -kv \
       --use_sdpa_with_kv_cache \
-      -X \
-      --xnnpack-extended-ops \
       --max_context_length 1024 \
       --max_seq_length 1024 \
+      --metadata '{"get_bos_id":199999, "get_eos_ids":[200020,199999]}' \
+      --verbose \
       --dtype fp32 \
-      --metadata '{"get_bos_id":199999, "get_eos_ids":[200020,199999]}'
+      ${BACKEND_ARGS}
     ;;
 
   *)
@@ -120,6 +132,7 @@ if [[ "$TEST_WITH_RUNNER" -eq 1 ]]; then
         -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
         -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
         -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
+        -DEXECUTORCH_BUILD_KERNELS_TORCHAO=ON \
         -Bcmake-out .
     cmake --build cmake-out -j16 --config Release --target install
 
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index 362df17dc9b..e0754c9723d 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -594,15 +594,24 @@ jobs:
     strategy:
       matrix:
         model: [qwen3_4b, phi_4_mini]
+        runner: [linux.2xlarge, linux.arm64.2xlarge]
+        docker-image: [executorch-ubuntu-22.04-clang12, executorch-ubuntu-22.04-gcc11-aarch64]
         include:
           - model: qwen3_4b
             test_with_runner: true
           - model: phi_4_mini
             test_with_runner: false
+          - runner: linux.2xlarge
+            use_torchao_kernels: false
+          - runner: linux.arm64.2xlarge
+            use_torchao_kernels: true
+        exclude:
+          - runner: linux.2xlarge
+            docker-image: executorch-ubuntu-22.04-gcc11-aarch64
+          - runner: linux.arm64.2xlarge
+            docker-image: executorch-ubuntu-22.04-clang12
       fail-fast: false
     with:
-      runner: linux.2xlarge
-      docker-image: ci-image:executorch-ubuntu-22.04-clang12
       submodules: 'recursive'
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       timeout: 900
@@ -611,10 +620,10 @@ jobs:
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
         conda activate "${CONDA_ENV}"
 
-        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake
+        PYTHON_EXECUTABLE=python EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_KLEIDIAI=1 bash .ci/scripts/setup-linux.sh --build-tool cmake
         pip install -U "huggingface_hub[cli]"
 
-        bash .ci/scripts/test_torchao_huggingface_checkpoints.sh ${{ matrix.model }} ${{ matrix.test_with_runner && '--test_with_runner' || '' }}
+        bash .ci/scripts/test_torchao_huggingface_checkpoints.sh ${{ matrix.model }} ${{ matrix.test_with_runner && '--test_with_runner' || '' }}  ${{ matrix.use_torchao_kernels && '--use_torchao_kernels' || '' }}
 
   test-multimodal-macos:
     if: ${{ !github.event.pull_request.head.repo.fork }}
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 7192204a141..baa1ebf7b3b 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -417,6 +417,11 @@ def build_args_parser() -> argparse.ArgumentParser:
         action="store_true",
         help="Delegate more operators beyond DQLinear to the xnnpack backend. Requires -X or --xnnpack to be set.",
     )
+    parser.add_argument(
+        "--torchao-kernels",
+        action="store_true",
+        help="Delegate tied-embedding and quantized linear ops to torchao kernels",
+    )
     parser.add_argument("-V", "--vulkan", action="store_true")
     parser.add_argument("--vulkan-force-fp16", action="store_true")
     parser.add_argument("--mps", action="store_true")
@@ -741,6 +746,7 @@ def _prepare_for_llama_export(llm_config: LlmConfig) -> LLMEdgeManager:
             preq_group_size=llm_config.base.preq_group_size,
             preq_embedding_quantize=llm_config.base.preq_embedding_quantize,
             local_global_attention=llm_config.model.local_global_attention,
+            use_torchao_kernels=llm_config.backend.torchao.enabled,
         )
     )
 
@@ -1303,6 +1309,7 @@ def _get_source_transforms(  # noqa
     preq_group_size: Optional[int] = None,
     preq_embedding_quantize: Optional[str] = None,
     local_global_attention: Optional[List[int]] = None,
+    use_torchao_kernels: bool = False,
 ) -> List[Callable[[torch.nn.Module], torch.nn.Module]]:
     """
     Return a list of functions that transform a graph.
@@ -1475,6 +1482,11 @@ def _get_source_transforms(  # noqa
             )
         )
 
+    if use_torchao_kernels:
+        from torchao.prototype.tensor_conversion.api import _convert_model_for_aarch64
+
+        transforms.append(_convert_model_for_aarch64)
+
     return transforms
 
 
diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py
index d756d1886ad..91ca646e3ea 100644
--- a/extension/llm/export/config/llm_config.py
+++ b/extension/llm/export/config/llm_config.py
@@ -452,6 +452,15 @@ class MPSConfig:
     enabled: bool = False
 
 
+@dataclass
+class TorchAOKernelsConfig:
+    """
+    Configures the torchao-kernels backend.
+    """
+
+    enabled: bool = False
+
+
 @dataclass
 class BackendConfig:
     """
@@ -464,6 +473,7 @@ class BackendConfig:
     vulkan: VulkanConfig = field(default_factory=VulkanConfig)
     qnn: QNNConfig = field(default_factory=QNNConfig)
     mps: MPSConfig = field(default_factory=MPSConfig)
+    torchao: TorchAOKernelsConfig = field(default_factory=TorchAOKernelsConfig)
 
 
 ################################################################################
@@ -632,6 +642,9 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig":  # noqa: C901
         if hasattr(args, "mps"):
             llm_config.backend.mps.enabled = args.mps
 
+        if hasattr(args, "torchao_kernels"):
+            llm_config.backend.torchao.enabled = args.torchao_kernels
+
         # DebugConfig
         if hasattr(args, "profile_memory"):
             llm_config.debug.profile_memory = args.profile_memory
diff --git a/third-party/ao b/third-party/ao
index b99904b34c0..be4203e80d5 160000
--- a/third-party/ao
+++ b/third-party/ao
@@ -1 +1 @@
-Subproject commit b99904b34c0fd98f8a63ec57cbc1dc4993f74793
+Subproject commit be4203e80d55e95553eb236e1082b5e079ee35f9

From fc82a58c398f15a444ed75d9d912da0b2f80bdaf Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Wed, 24 Sep 2025 13:23:02 -0700
Subject: [PATCH 02/17] Pass matrix runner and docker-image to the CI job in trunk.yml

---
 .github/workflows/trunk.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index e0754c9723d..56ded142feb 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -612,6 +612,8 @@ jobs:
             docker-image: executorch-ubuntu-22.04-clang12
       fail-fast: false
     with:
+      runner: ${{ matrix.runner }}
+      docker-image: ci-image:${{ matrix.docker-image }}
       submodules: 'recursive'
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       timeout: 900

From e484954e3c5c7b53c1800f21fe154010e93b9702 Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Wed, 24 Sep 2025 14:03:02 -0700
Subject: [PATCH 03/17] Build torchao kernels in CI only when use_torchao_kernels is set

---
 .ci/scripts/test_torchao_huggingface_checkpoints.sh | 6 +++++-
 .github/workflows/trunk.yml                         | 7 ++++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/.ci/scripts/test_torchao_huggingface_checkpoints.sh b/.ci/scripts/test_torchao_huggingface_checkpoints.sh
index 9d4d18522ae..be490f3437d 100644
--- a/.ci/scripts/test_torchao_huggingface_checkpoints.sh
+++ b/.ci/scripts/test_torchao_huggingface_checkpoints.sh
@@ -116,6 +116,10 @@ if [[ $MODEL_SIZE -gt $EXPECTED_MODEL_SIZE_UPPER_BOUND ]]; then
 fi
 
 # Install ET with CMake
+EXECUTORCH_BUILD_KERNELS_TORCHAO="OFF"
+if [[ "$USE_TORCHAO_KERNELS" -eq 1 ]]; then
+  EXECUTORCH_BUILD_KERNELS_TORCHAO="ON"
+fi
 if [[ "$TEST_WITH_RUNNER" -eq 1 ]]; then
   echo "[runner] Building and testing llama_main ..."
     cmake -DPYTHON_EXECUTABLE=python \
@@ -132,7 +136,7 @@ if [[ "$TEST_WITH_RUNNER" -eq 1 ]]; then
         -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
         -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
         -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
-        -DEXECUTORCH_BUILD_KERNELS_TORCHAO=ON \
+        -DEXECUTORCH_BUILD_KERNELS_TORCHAO=$(EXECUTORCH_BUILD_KERNELS_TORCHAO) \
         -Bcmake-out .
     cmake --build cmake-out -j16 --config Release --target install
 
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index 56ded142feb..84a9602d673 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -622,7 +622,12 @@ jobs:
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
         conda activate "${CONDA_ENV}"
 
-        PYTHON_EXECUTABLE=python EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_KLEIDIAI=1 bash .ci/scripts/setup-linux.sh --build-tool cmake
+        BUILD_TORCHAO_KERNELS=""
+        if [[ ${{ matrix.use_torchao_kernels }} == "true" ]]; then
+          BUILD_TORCHAO_KERNELS="BUILD_TORCHAO_EXPERIMENTAL=1 TORCHAO_BUILD_CPU_AARCH64=1 TORCHAO_ENABLE_ARM_NEON_DOT=1 TORCHAO_BUILD_KLEIDIAI=1"
+        fi
+
+        PYTHON_EXECUTABLE=python $BUILD_TORCHAO_KERNELS bash .ci/scripts/setup-linux.sh --build-tool cmake
         pip install -U "huggingface_hub[cli]"
 
         bash .ci/scripts/test_torchao_huggingface_checkpoints.sh ${{ matrix.model }} ${{ matrix.test_with_runner && '--test_with_runner' || '' }}  ${{ matrix.use_torchao_kernels && '--use_torchao_kernels' || '' }}

From 288b86b96776f9dae6726074f54fbd6a77b01d94 Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Wed, 24 Sep 2025 14:27:43 -0700
Subject: [PATCH 04/17] Replace use_torchao_kernels matrix flag with a backend matrix entry

---
 .github/workflows/trunk.yml | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index 84a9602d673..fb3b1157ecf 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -594,8 +594,21 @@ jobs:
     strategy:
       matrix:
         model: [qwen3_4b, phi_4_mini]
-        runner: [linux.2xlarge, linux.arm64.2xlarge]
-        docker-image: [executorch-ubuntu-22.04-clang12, executorch-ubuntu-22.04-gcc11-aarch64]
+        runner: [linux.2xlarge]
+        docker-image: [executorch-ubuntu-22.04-clang12]
+        backend: [xnnpack]
+        include:
+          - model: qwen3_4b
+            runner: linux.arm64.2xlarge
+            docker-image: executorch-ubuntu-22.04-gcc11-aarch64
+            backend: torchao
+          - model: phi_4_mini
+            runner: linux.arm64.2xlarge
+            docker-image: executorch-ubuntu-22.04-gcc11-aarch64
+            backend: torchao
+
+
+
         include:
           - model: qwen3_4b
             test_with_runner: true
@@ -622,15 +635,21 @@ jobs:
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
         conda activate "${CONDA_ENV}"
 
-        BUILD_TORCHAO_KERNELS=""
-        if [[ ${{ matrix.use_torchao_kernels }} == "true" ]]; then
-          BUILD_TORCHAO_KERNELS="BUILD_TORCHAO_EXPERIMENTAL=1 TORCHAO_BUILD_CPU_AARCH64=1 TORCHAO_ENABLE_ARM_NEON_DOT=1 TORCHAO_BUILD_KLEIDIAI=1"
+        if [[ "${{ matrix.backend }}" == "torchao" ]]; then
+          PYTHON_EXECUTABLE=python \
+              BUILD_TORCHAO_EXPERIMENTAL=1 \
+              TORCHAO_BUILD_CPU_AARCH64=1 \
+              TORCHAO_ENABLE_ARM_NEON_DOT=1 \
+              TORCHAO_BUILD_KLEIDIAI=1 \
+              bash .ci/scripts/setup-linux.sh --build-tool cmake
+        else
+          PYTHON_EXECUTABLE=python \
+              bash .ci/scripts/setup-linux.sh --build-tool cmake
         fi
 
-        PYTHON_EXECUTABLE=python $BUILD_TORCHAO_KERNELS bash .ci/scripts/setup-linux.sh --build-tool cmake
         pip install -U "huggingface_hub[cli]"
 
-        bash .ci/scripts/test_torchao_huggingface_checkpoints.sh ${{ matrix.model }} ${{ matrix.test_with_runner && '--test_with_runner' || '' }}  ${{ matrix.use_torchao_kernels && '--use_torchao_kernels' || '' }}
+        bash .ci/scripts/test_torchao_huggingface_checkpoints.sh ${{ matrix.model }} ${{ matrix.model != 'phi_4_mini' && '--test_with_runner' || '' }}  ${{ matrix.backend == 'torchao' && '--use_torchao_kernels' || '' }}
 
   test-multimodal-macos:
     if: ${{ !github.event.pull_request.head.repo.fork }}

From 96f98b5bec7d6bfb27f4c231d87a1d9af81ed7ed Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Wed, 24 Sep 2025 14:31:09 -0700
Subject: [PATCH 05/17] Remove leftover duplicate include/exclude matrix entries in trunk.yml

---
 .github/workflows/trunk.yml | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index fb3b1157ecf..d3346f8dcca 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -606,23 +606,6 @@ jobs:
             runner: linux.arm64.2xlarge
             docker-image: executorch-ubuntu-22.04-gcc11-aarch64
             backend: torchao
-
-
-
-        include:
-          - model: qwen3_4b
-            test_with_runner: true
-          - model: phi_4_mini
-            test_with_runner: false
-          - runner: linux.2xlarge
-            use_torchao_kernels: false
-          - runner: linux.arm64.2xlarge
-            use_torchao_kernels: true
-        exclude:
-          - runner: linux.2xlarge
-            docker-image: executorch-ubuntu-22.04-gcc11-aarch64
-          - runner: linux.arm64.2xlarge
-            docker-image: executorch-ubuntu-22.04-clang12
       fail-fast: false
     with:
       runner: ${{ matrix.runner }}

From 4d9e718e449f89fe9ba4ba2d61e2485aa0f44195 Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Wed, 24 Sep 2025 15:27:17 -0700
Subject: [PATCH 06/17] List CI matrix entries explicitly and pip install third-party/ao for torchao

---
 .github/workflows/trunk.yml | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index d3346f8dcca..0b2fbe05bd6 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -593,19 +593,23 @@ jobs:
       contents: read
     strategy:
       matrix:
-        model: [qwen3_4b, phi_4_mini]
-        runner: [linux.2xlarge]
-        docker-image: [executorch-ubuntu-22.04-clang12]
-        backend: [xnnpack]
         include:
           - model: qwen3_4b
+            backend: xnnpack
+            runner: linux.2xlarge
+            docker-image: executorch-ubuntu-22.04-clang12
+          - model: phi_4_mini
+            backend: xnnpack
+            runner: linux.2xlarge
+            docker-image: executorch-ubuntu-22.04-clang12
+          - model: qwen3_4b
+            backend: torchao
             runner: linux.arm64.2xlarge
             docker-image: executorch-ubuntu-22.04-gcc11-aarch64
-            backend: torchao
           - model: phi_4_mini
+            backend: torchao
             runner: linux.arm64.2xlarge
             docker-image: executorch-ubuntu-22.04-gcc11-aarch64
-            backend: torchao
       fail-fast: false
     with:
       runner: ${{ matrix.runner }}
@@ -618,17 +622,11 @@ jobs:
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
         conda activate "${CONDA_ENV}"
 
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake
+
         if [[ "${{ matrix.backend }}" == "torchao" ]]; then
-          PYTHON_EXECUTABLE=python \
-              BUILD_TORCHAO_EXPERIMENTAL=1 \
-              TORCHAO_BUILD_CPU_AARCH64=1 \
-              TORCHAO_ENABLE_ARM_NEON_DOT=1 \
-              TORCHAO_BUILD_KLEIDIAI=1 \
-              bash .ci/scripts/setup-linux.sh --build-tool cmake
+          BUILD_TORCHAO_EXPERIMENTAL=1 TORCHAO_BUILD_CPU_AARCH64=1 TORCHAO_BUILD_KLEIDIAI=1 TORCHAO_ENABLE_ARM_NEON_DOT=1 pip install third-party/ao
         else
-          PYTHON_EXECUTABLE=python \
-              bash .ci/scripts/setup-linux.sh --build-tool cmake
-        fi
 
         pip install -U "huggingface_hub[cli]"
 

From 61a05bc0b3031e09029e09306a3f8c9d11daa388 Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Wed, 24 Sep 2025 17:00:06 -0700
Subject: [PATCH 07/17] Restore base matrix dimensions for the xnnpack CI entries

---
 .github/workflows/trunk.yml | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index 0b2fbe05bd6..d724d6f335b 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -593,23 +593,19 @@ jobs:
       contents: read
     strategy:
       matrix:
+        model: [qwen3_4b, phi_4_mini]
+        runner: [linux.2xlarge]
+        docker-image: [executorch-ubuntu-22.04-clang12]
+        backend: [xnnpack]
         include:
           - model: qwen3_4b
-            backend: xnnpack
-            runner: linux.2xlarge
-            docker-image: executorch-ubuntu-22.04-clang12
-          - model: phi_4_mini
-            backend: xnnpack
-            runner: linux.2xlarge
-            docker-image: executorch-ubuntu-22.04-clang12
-          - model: qwen3_4b
-            backend: torchao
             runner: linux.arm64.2xlarge
             docker-image: executorch-ubuntu-22.04-gcc11-aarch64
-          - model: phi_4_mini
             backend: torchao
+          - model: phi_4_mini
             runner: linux.arm64.2xlarge
             docker-image: executorch-ubuntu-22.04-gcc11-aarch64
+            backend: torchao
       fail-fast: false
     with:
       runner: ${{ matrix.runner }}

From 1a5c3f3cd422c5808b91b8400f02d6c7012f2e90 Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Wed, 24 Sep 2025 17:01:37 -0700
Subject: [PATCH 08/17] Bump third-party/ao submodule to b47f1a36550

---
 third-party/ao | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third-party/ao b/third-party/ao
index be4203e80d5..b47f1a36550 160000
--- a/third-party/ao
+++ b/third-party/ao
@@ -1 +1 @@
-Subproject commit be4203e80d55e95553eb236e1082b5e079ee35f9
+Subproject commit b47f1a3655004b2b4dd3b4f01a5d8eebff1faa3c

From ffd7c1c9d77ae6a10d5723b4868054c93f80d898 Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Wed, 24 Sep 2025 17:23:25 -0700
Subject: [PATCH 09/17] Add separate torchao kernel flags for linear and tied embedding

---
 examples/models/llama/export_llama_lib.py | 26 +++++++++++++++++++----
 extension/llm/export/config/llm_config.py | 14 +++++++++---
 2 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index baa1ebf7b3b..52e1d8c7ed6 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -422,6 +422,16 @@ def build_args_parser() -> argparse.ArgumentParser:
         action="store_true",
         help="Delegate tied-embedding and quantized linear ops to torchao kernels",
     )
+    parser.add_argument(
+        "--torchao-kernels-tied-embedding",
+        action="store_true",
+        help="Delegate tied-embedding ops to torchao kernels",
+    )
+    parser.add_argument(
+        "--torchao-kernels-linear",
+        action="store_true",
+        help="Delegate linear ops to torchao kernels",
+    )
     parser.add_argument("-V", "--vulkan", action="store_true")
     parser.add_argument("--vulkan-force-fp16", action="store_true")
     parser.add_argument("--mps", action="store_true")
@@ -746,7 +756,8 @@ def _prepare_for_llama_export(llm_config: LlmConfig) -> LLMEdgeManager:
             preq_group_size=llm_config.base.preq_group_size,
             preq_embedding_quantize=llm_config.base.preq_embedding_quantize,
             local_global_attention=llm_config.model.local_global_attention,
-            use_torchao_kernels=llm_config.backend.torchao.enabled,
+            use_torchao_kernels_linear=llm_config.backend.torchao.linear,
+            use_torchao_kernels_tied_embedding=llm_config.backend.torchao.tied_embedding,
         )
     )
 
@@ -1309,7 +1320,8 @@ def _get_source_transforms(  # noqa
     preq_group_size: Optional[int] = None,
     preq_embedding_quantize: Optional[str] = None,
     local_global_attention: Optional[List[int]] = None,
-    use_torchao_kernels: bool = False,
+    use_torchao_kernels_linear: bool = False,
+    use_torchao_kernels_tied_embedding: bool = False,
 ) -> List[Callable[[torch.nn.Module], torch.nn.Module]]:
     """
     Return a list of functions that transform a graph.
@@ -1482,10 +1494,16 @@ def _get_source_transforms(  # noqa
             )
         )
 
-    if use_torchao_kernels:
+    if any([use_torchao_kernels_linear, use_torchao_kernels_tied_embedding]):
         from torchao.prototype.tensor_conversion.api import _convert_model_for_aarch64
 
-        transforms.append(_convert_model_for_aarch64)
+        transforms.append(
+            partial(
+                _convert_model_for_aarch64,
+                convert_linear=use_torchao_kernels_linear,
+                convert_tied_embedding=use_torchao_kernels_tied_embedding,
+            )
+        )
 
     return transforms
 
diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py
index 91ca646e3ea..327ce3a9e5e 100644
--- a/extension/llm/export/config/llm_config.py
+++ b/extension/llm/export/config/llm_config.py
@@ -458,7 +458,8 @@ class TorchAOKernelsConfig:
     Configures the torchao-kernels backend.
     """
 
-    enabled: bool = False
+    linear: bool = True
+    tied_embedding: bool = True
 
 
 @dataclass
@@ -642,8 +643,15 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig":  # noqa: C901
         if hasattr(args, "mps"):
             llm_config.backend.mps.enabled = args.mps
 
-        if hasattr(args, "torchao_kernels"):
-            llm_config.backend.torchao.enabled = args.torchao_kernels
+        if hasattr(args, "torchao_kernels") or hasattr(args, "torchao_kernels_linear"):
+            assert args.torchao_kernels
+            assert args.torchao_kernels_linear
+            llm_config.backend.torchao.linear = True
+        
+        if hasattr(args, "torchao_kernels") or hasattr(args, "torchao_kernels_tied_embedding"):
+            assert args.torchao_kernels
+            assert args.torchao_kernels_tied_embedding
+            llm_config.backend.torchao.tied_embedding = True
 
         # DebugConfig
         if hasattr(args, "profile_memory"):

From f503d2c092644734a57c7b6fb745cbc83a3bff1e Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Wed, 24 Sep 2025 17:23:40 -0700
Subject: [PATCH 10/17] Reformat torchao hasattr check in llm_config.py

---
 extension/llm/export/config/llm_config.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py
index 327ce3a9e5e..2d4a7ba34c0 100644
--- a/extension/llm/export/config/llm_config.py
+++ b/extension/llm/export/config/llm_config.py
@@ -647,8 +647,10 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig":  # noqa: C901
             assert args.torchao_kernels
             assert args.torchao_kernels_linear
             llm_config.backend.torchao.linear = True
-        
-        if hasattr(args, "torchao_kernels") or hasattr(args, "torchao_kernels_tied_embedding"):
+
+        if hasattr(args, "torchao_kernels") or hasattr(
+            args, "torchao_kernels_tied_embedding"
+        ):
             assert args.torchao_kernels
             assert args.torchao_kernels_tied_embedding
             llm_config.backend.torchao.tied_embedding = True

From 2847aae14c6f4b9f68ae7ac672efd93b38dcff7a Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Thu, 25 Sep 2025 09:51:24 -0700
Subject: [PATCH 11/17] Rework TorchAOKernelsConfig into enabled/convert_linear/convert_tied_embedding

---
 extension/llm/export/config/llm_config.py | 36 +++++++++++++++--------
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py
index 2d4a7ba34c0..a2aa001363f 100644
--- a/extension/llm/export/config/llm_config.py
+++ b/extension/llm/export/config/llm_config.py
@@ -458,8 +458,9 @@ class TorchAOKernelsConfig:
     Configures the torchao-kernels backend.
     """
 
-    linear: bool = True
-    tied_embedding: bool = True
+    enabled: bool = False
+    convert_linear: bool = True
+    convert_tied_embedding: bool = True
 
 
 @dataclass
@@ -643,17 +644,28 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig":  # noqa: C901
         if hasattr(args, "mps"):
             llm_config.backend.mps.enabled = args.mps
 
-        if hasattr(args, "torchao_kernels") or hasattr(args, "torchao_kernels_linear"):
-            assert args.torchao_kernels
-            assert args.torchao_kernels_linear
-            llm_config.backend.torchao.linear = True
-
-        if hasattr(args, "torchao_kernels") or hasattr(
-            args, "torchao_kernels_tied_embedding"
+        # TorchAoKernels
+        if any(
+            hasattr(args, a)
+            for a in [
+                "torchao_kernels",
+                "torchao_kernels_linear",
+                "torchao_kernels_tied_embedding",
+            ]
         ):
-            assert args.torchao_kernels
-            assert args.torchao_kernels_tied_embedding
-            llm_config.backend.torchao.tied_embedding = True
+            llm_config.backend.torchao.enabled = True
+            if hasattr(args, "torchao_kernels") and args.torchao_kernels:
+                # Enable all conversions if torchao_kernels is specified
+                llm_config.backend.torchao.convert_linear = True
+                llm_config.backend.torchao.convert_tied_embedding = True
+            else:
+                # Otherwise, only enable the conversions that are specified
+                llm_config.backend.torchao.convert_linear = getattr(
+                    args, "torchao_kernels_linear", False
+                )
+                llm_config.backend.torchao.convert_tied_embedding = getattr(
+                    args, "torchao_kernels_tied_embedding", False
+                )
 
         # DebugConfig
         if hasattr(args, "profile_memory"):

From 2c69ccedf71d92dc98dcdd007175628ae6e6179e Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Thu, 25 Sep 2025 10:19:43 -0700
Subject: [PATCH 12/17] Close unterminated if in trunk.yml and use torchao convert_* config fields

---
 .github/workflows/trunk.yml               | 2 +-
 examples/models/llama/export_llama_lib.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index d724d6f335b..106f970085d 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -622,7 +622,7 @@ jobs:
 
         if [[ "${{ matrix.backend }}" == "torchao" ]]; then
           BUILD_TORCHAO_EXPERIMENTAL=1 TORCHAO_BUILD_CPU_AARCH64=1 TORCHAO_BUILD_KLEIDIAI=1 TORCHAO_ENABLE_ARM_NEON_DOT=1 pip install third-party/ao
-        else
+        fi
 
         pip install -U "huggingface_hub[cli]"
 
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 52e1d8c7ed6..ec0af36c165 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -756,8 +756,8 @@ def _prepare_for_llama_export(llm_config: LlmConfig) -> LLMEdgeManager:
             preq_group_size=llm_config.base.preq_group_size,
             preq_embedding_quantize=llm_config.base.preq_embedding_quantize,
             local_global_attention=llm_config.model.local_global_attention,
-            use_torchao_kernels_linear=llm_config.backend.torchao.linear,
-            use_torchao_kernels_tied_embedding=llm_config.backend.torchao.tied_embedding,
+            use_torchao_kernels_linear=llm_config.backend.torchao.convert_linear,
+            use_torchao_kernels_tied_embedding=llm_config.backend.torchao.convert_tied_embedding,
         )
     )
 

From f7a6e2e5e922d3a41486eb105a8c8090c902ee20 Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Thu, 25 Sep 2025 10:58:25 -0700
Subject: [PATCH 13/17] Drop torchao enabled flag, default conversions to False, build torchao with OpenMP

---
 .github/workflows/trunk.yml               | 2 +-
 extension/llm/export/config/llm_config.py | 6 ++----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index 106f970085d..ee2afb7576d 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -621,7 +621,7 @@ jobs:
         PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake
 
         if [[ "${{ matrix.backend }}" == "torchao" ]]; then
-          BUILD_TORCHAO_EXPERIMENTAL=1 TORCHAO_BUILD_CPU_AARCH64=1 TORCHAO_BUILD_KLEIDIAI=1 TORCHAO_ENABLE_ARM_NEON_DOT=1 pip install third-party/ao
+          BUILD_TORCHAO_EXPERIMENTAL=1 TORCHAO_BUILD_CPU_AARCH64=1 TORCHAO_BUILD_KLEIDIAI=1 TORCHAO_ENABLE_ARM_NEON_DOT=1 TORCHAO_PARALLEL_BACKEND=OPENMP pip install third-party/ao
         fi
 
         pip install -U "huggingface_hub[cli]"
diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py
index a2aa001363f..b45919afd83 100644
--- a/extension/llm/export/config/llm_config.py
+++ b/extension/llm/export/config/llm_config.py
@@ -458,9 +458,8 @@ class TorchAOKernelsConfig:
     Configures the torchao-kernels backend.
     """
 
-    enabled: bool = False
-    convert_linear: bool = True
-    convert_tied_embedding: bool = True
+    convert_linear: bool = False
+    convert_tied_embedding: bool = False
 
 
 @dataclass
@@ -653,7 +652,6 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig":  # noqa: C901
                 "torchao_kernels_tied_embedding",
             ]
         ):
-            llm_config.backend.torchao.enabled = True
             if hasattr(args, "torchao_kernels") and args.torchao_kernels:
                 # Enable all conversions if torchao_kernels is specified
                 llm_config.backend.torchao.convert_linear = True

From a7fa6bdfcee817a09cfc16ec66dd0b4e7aa2a05e Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Thu, 25 Sep 2025 11:12:48 -0700
Subject: [PATCH 14/17] Fix EXECUTORCH_BUILD_KERNELS_TORCHAO variable expansion in torchao CI script

---
 .ci/scripts/test_torchao_huggingface_checkpoints.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.ci/scripts/test_torchao_huggingface_checkpoints.sh b/.ci/scripts/test_torchao_huggingface_checkpoints.sh
index be490f3437d..6b84a09ae6f 100644
--- a/.ci/scripts/test_torchao_huggingface_checkpoints.sh
+++ b/.ci/scripts/test_torchao_huggingface_checkpoints.sh
@@ -136,7 +136,7 @@ if [[ "$TEST_WITH_RUNNER" -eq 1 ]]; then
         -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
         -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
         -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
-        -DEXECUTORCH_BUILD_KERNELS_TORCHAO=$(EXECUTORCH_BUILD_KERNELS_TORCHAO) \
+        -DEXECUTORCH_BUILD_KERNELS_TORCHAO=${EXECUTORCH_BUILD_KERNELS_TORCHAO} \
         -Bcmake-out .
     cmake --build cmake-out -j16 --config Release --target install
 

From 51279a4ea78ef363325fb709b4ebdc77a4659dc6 Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Mon, 29 Sep 2025 15:11:22 -0700
Subject: [PATCH 15/17] Rename torchao kernel CLI flags and config fields to use_torchao_kernels*

---
 .../test_torchao_huggingface_checkpoints.sh   |  2 +-
 examples/models/llama/export_llama_lib.py     |  8 +++----
 extension/llm/export/config/llm_config.py     | 24 +++++++++----------
 3 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/.ci/scripts/test_torchao_huggingface_checkpoints.sh b/.ci/scripts/test_torchao_huggingface_checkpoints.sh
index 6b84a09ae6f..f06c794f88d 100644
--- a/.ci/scripts/test_torchao_huggingface_checkpoints.sh
+++ b/.ci/scripts/test_torchao_huggingface_checkpoints.sh
@@ -51,7 +51,7 @@ MODEL_OUT=model.pte
 # Default to XNNPACK
 BACKEND_ARGS="-X --xnnpack-extended-ops"
 if [[ "$USE_TORCHAO_KERNELS" -eq 1 ]]; then
-  BACKEND_ARGS="--torchao-kernels"
+  BACKEND_ARGS="--use-torchao-kernels"
 fi
 
 case "$MODEL_NAME" in
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index ec0af36c165..20a5dcea56d 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -418,17 +418,17 @@ def build_args_parser() -> argparse.ArgumentParser:
         help="Delegate more operators beyond DQLinear to the xnnpack backend. Requires -X or --xnnpack to be set.",
     )
     parser.add_argument(
-        "--torchao-kernels",
+        "--use-torchao-kernels",
         action="store_true",
         help="Delegate tied-embedding and quantized linear ops to torchao kernels",
     )
     parser.add_argument(
-        "--torchao-kernels-tied-embedding",
+        "--use-torchao-kernels-tied-embedding",
         action="store_true",
         help="Delegate tied-embedding ops to torchao kernels",
     )
     parser.add_argument(
-        "--torchao-kernels-linear",
+        "--use-torchao-kernels-linear",
         action="store_true",
         help="Delegate linear ops to torchao kernels",
     )
@@ -756,7 +756,7 @@ def _prepare_for_llama_export(llm_config: LlmConfig) -> LLMEdgeManager:
             preq_group_size=llm_config.base.preq_group_size,
             preq_embedding_quantize=llm_config.base.preq_embedding_quantize,
             local_global_attention=llm_config.model.local_global_attention,
-            use_torchao_kernels_linear=llm_config.backend.torchao.convert_linear,
+            use_torchao_kernels_linear=llm_config.backend.torchao.use_torchao_kernels_linear,
             use_torchao_kernels_tied_embedding=llm_config.backend.torchao.convert_tied_embedding,
         )
     )
diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py
index b45919afd83..4fade035e67 100644
--- a/extension/llm/export/config/llm_config.py
+++ b/extension/llm/export/config/llm_config.py
@@ -458,8 +458,8 @@ class TorchAOKernelsConfig:
     Configures the torchao-kernels backend.
     """
 
-    convert_linear: bool = False
-    convert_tied_embedding: bool = False
+    use_torchao_kernels_linear: bool = False
+    use_torchao_kernels_tied_embedding: bool = False
 
 
 @dataclass
@@ -647,22 +647,22 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig":  # noqa: C901
         if any(
             hasattr(args, a)
             for a in [
-                "torchao_kernels",
-                "torchao_kernels_linear",
-                "torchao_kernels_tied_embedding",
+                "use_torchao_kernels",
+                "use_torchao_kernels_linear",
+                "use_torchao_kernels_tied_embedding",
             ]
         ):
-            if hasattr(args, "torchao_kernels") and args.torchao_kernels:
+            if hasattr(args, "use_torchao_kernels") and args.torchao_kernels:
                 # Enable all conversions if torchao_kernels is specified
-                llm_config.backend.torchao.convert_linear = True
-                llm_config.backend.torchao.convert_tied_embedding = True
+                llm_config.backend.torchao.use_torchao_kernels_linear = True
+                llm_config.backend.torchao.use_torchao_kernels_tied_embedding = True
             else:
                 # Otherwise, only enable the conversions that are specified
-                llm_config.backend.torchao.convert_linear = getattr(
-                    args, "torchao_kernels_linear", False
+                llm_config.backend.torchao.use_torchao_kernels_linear = getattr(
+                    args, "use_torchao_kernels_linear", False
                 )
-                llm_config.backend.torchao.convert_tied_embedding = getattr(
-                    args, "torchao_kernels_tied_embedding", False
+                llm_config.backend.torchao.use_torchao_kernels_tied_embedding = getattr(
+                    args, "use_torchao_kernels_tied_embedding", False
                 )
 
         # DebugConfig

From 9db7b1852253275b4793ab2abf7086792e2be3e4 Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Mon, 29 Sep 2025 15:44:33 -0700
Subject: [PATCH 16/17] Fix leftover args.torchao_kernels reference in LlmConfig.from_args

---
 extension/llm/export/config/llm_config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py
index 4fade035e67..b13001c005b 100644
--- a/extension/llm/export/config/llm_config.py
+++ b/extension/llm/export/config/llm_config.py
@@ -652,7 +652,7 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig":  # noqa: C901
                 "use_torchao_kernels_tied_embedding",
             ]
         ):
-            if hasattr(args, "use_torchao_kernels") and args.torchao_kernels:
+            if hasattr(args, "use_torchao_kernels") and args.use_torchao_kernels:
                 # Enable all conversions if torchao_kernels is specified
                 llm_config.backend.torchao.use_torchao_kernels_linear = True
                 llm_config.backend.torchao.use_torchao_kernels_tied_embedding = True

From 96dc88ee61a991649e0bdff042ad718abb7f99ac Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Mon, 29 Sep 2025 17:07:27 -0700
Subject: [PATCH 17/17] Fix leftover convert_tied_embedding reference in _prepare_for_llama_export

---
 examples/models/llama/export_llama_lib.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 20a5dcea56d..aa3b157c8da 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -757,7 +757,7 @@ def _prepare_for_llama_export(llm_config: LlmConfig) -> LLMEdgeManager:
             preq_embedding_quantize=llm_config.base.preq_embedding_quantize,
             local_global_attention=llm_config.model.local_global_attention,
             use_torchao_kernels_linear=llm_config.backend.torchao.use_torchao_kernels_linear,
-            use_torchao_kernels_tied_embedding=llm_config.backend.torchao.convert_tied_embedding,
+            use_torchao_kernels_tied_embedding=llm_config.backend.torchao.use_torchao_kernels_tied_embedding,
         )
     )