In [None]:
# Copyright 2025 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Ethos-U delegate flow example

This guide demonstrates the full flow for running a module on Arm Ethos-U using ExecuTorch. 
Tested on Linux x86_64 and macOS aarch64. If something is not working for you, please raise a GitHub issue and tag Arm.

Before you begin:
1. (In a clean virtual environment with a compatible Python version) Install executorch using `./install_executorch.sh`
2. Install Arm cross-compilation toolchain and simulators using `examples/arm/setup.sh --i-agree-to-the-contained-eula`
3. Add Arm cross-compilation toolchain and simulators to PATH using `examples/arm/ethos-u-scratch/setup_path.sh` 

With all commands executed from the base `executorch` folder.



*Some scripts in this notebook produce long output logs. Configuring the 'Customizing Notebook Layout' settings to enable 'Output:scrolling' and setting 'Output:Text Line Limit' makes this more manageable.*

## AOT Flow

The first step is creating the PyTorch module and exporting it. Exporting converts the Python code in the module into a graph structure. The result is still runnable Python code, which can be displayed by printing the `graph_module` of the exported program.

In [None]:
import torch

class Add(torch.nn.Module):
    """Minimal example module whose forward pass adds its two inputs."""

    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        """Return the sum ``x + y``."""
        result = x + y
        return result

# One all-ones tensor per forward() argument
example_inputs = (torch.ones(1, 1, 1, 1), torch.ones(1, 1, 1, 1))

# Instantiate the module and put it in eval mode before exporting
model = Add().eval()

# Export the module and get the graph module of the exported program
exported_program = torch.export.export_for_training(model, example_inputs)
graph_module = exported_program.module()

# Print the graph; the return value of print_readable() is discarded
_ = graph_module.print_readable()

To run on Ethos-U the `graph_module` must be quantized using the `arm_quantizer`. Quantization can be done in multiple ways and it can be customized for different parts of the graph; shown here is the recommended path for the EthosUBackend. Quantization also requires calibrating the module with example inputs.

Printing the module again, it can be seen that the quantization wraps the node in quantization/dequantization nodes which contain the computed quantization parameters.

In [None]:
from executorch.backends.arm.arm_backend import ArmCompileSpecBuilder
from executorch.backends.arm.quantizer.arm_quantizer import (
    EthosUQuantizer,
    get_symmetric_quantization_config,
)
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e

target = "ethos-u55-128"

# Build the compile spec describing the target; it is used to configure the quantizer below.
# Several of the arguments are used later in the example by the Arm Vela graph compiler. See the Arm Vela
# documentation for what its flags mean:
# https://gitlab.arm.com/artificial-intelligence/ethos-u/ethos-u-vela/-/blob/main/OPTIONS.md
spec_builder = ArmCompileSpecBuilder().ethosu_compile_spec(
    target,
    system_config="Ethos_U55_High_End_Embedded",
    memory_mode="Shared_Sram",
    extra_flags="--output-format=raw --debug-force-regor",
)
compile_spec = spec_builder.build()

# Set up the quantizer with one symmetric (not per-channel) config applied globally to all nodes
quantizer = EthosUQuantizer(compile_spec)
operator_config = get_symmetric_quantization_config(is_per_channel=False)
quantizer.set_global(operator_config)

# Post training quantization: prepare -> calibrate -> convert
prepared_graph_module = prepare_pt2e(graph_module, quantizer)
prepared_graph_module(*example_inputs)  # Calibration pass using the example inputs
quantized_graph_module = convert_pt2e(prepared_graph_module)

# Print the quantized graph; the return value of print_readable() is discarded
_ = quantized_graph_module.print_readable()

# Export the quantized graph module again to get a new exported program
quantized_exported_program = torch.export.export_for_training(
    quantized_graph_module, example_inputs
)

The quantization nodes created in the previous cell are not built by default with ExecuTorch but must be included in the .pte-file, and so they need to be built separately. `backends/arm/scripts/build_quantized_ops_aot_lib.sh` is a utility script which does this. 

In [None]:
import subprocess 
import os 

# Setup paths
# et_dir is taken to be two directories above the current working directory; the Arm helper scripts are
# looked up in backends/arm/scripts below it.
cwd_dir = os.getcwd()
et_dir = os.path.join(cwd_dir, "..", "..")
et_dir = os.path.abspath(et_dir)
script_dir = os.path.join(et_dir, "backends", "arm", "scripts")

# Run build_quantized_ops_aot_lib.sh
# check=True raises CalledProcessError if the script exits with a non-zero status, so a failed build stops
# the notebook here instead of being silently ignored.
subprocess.run(
    os.path.join(script_dir, "build_quantized_ops_aot_lib.sh"),
    shell=True,
    cwd=et_dir,
    check=True,
)

The lowering in the EthosUBackend happens in five steps:

1. **Lowering to core Aten operator set**: Transform module to use a subset of operators applicable to edge devices. 
2. **Partitioning**: Find subgraphs which are supported for running on Ethos-U
3. **Lowering to TOSA compatible operator set**: Perform transforms to make the Ethos-U subgraph(s) compatible with TOSA 
4. **Serialization to TOSA**: Compiles the graph module into a TOSA graph 
5. **Compilation to NPU**: Compiles the TOSA graph into an EthosU command stream using the Arm Vela graph compiler. This makes use of the `compile_spec` created earlier.
Step 5 also prints a Network summary for each processed subgraph.

All of this happens behind the scenes in `to_edge_transform_and_lower`. Printing the graph module shows that what is left in the graph is two quantization nodes for `x` and `y` going into an `executorch_call_delegate` node, followed by a dequantization node.

In [None]:
from executorch.backends.arm.ethosu_partitioner import EthosUPartitioner
from executorch.exir import (
    EdgeCompileConfig,
    ExecutorchBackendConfig,
    to_edge_transform_and_lower,
)
from executorch.extension.export_util.utils import save_pte_program
import platform 

# Build the partitioner from the compile spec created earlier
partitioner = EthosUPartitioner(compile_spec)

# Lower the quantized exported program to the Ethos-U backend, with the IR validity check turned off
edge_compile_config = EdgeCompileConfig(_check_ir_validity=False)
edge_program_manager = to_edge_transform_and_lower(
    quantized_exported_program,
    partitioner=[partitioner],
    compile_config=edge_compile_config,
)

# Load quantization ops library
# The file name of the shared library depends on the operating system reported by platform.system().
os_aot_lib_names = {"Darwin" : "libquantized_ops_aot_lib.dylib", 
                "Linux"  : "libquantized_ops_aot_lib.so", 
                "Windows": "libquantized_ops_aot_lib.dll"}
aot_lib_name = os_aot_lib_names[platform.system()]

libquantized_ops_aot_lib_path = os.path.join(et_dir, "cmake-out-aot-lib", "kernels", "quantized", aot_lib_name)
# Fail early with an actionable message if the library has not been built
assert os.path.exists(libquantized_ops_aot_lib_path), (
    f"Quantized ops library not found at {libquantized_ops_aot_lib_path}; "
    "run backends/arm/scripts/build_quantized_ops_aot_lib.sh first"
)
torch.ops.load_library(libquantized_ops_aot_lib_path)

# Convert edge program to executorch
executorch_program_manager = edge_program_manager.to_executorch(
    config=ExecutorchBackendConfig(extract_delegate_segments=False)
)

# Print the final graph; the return value of print_readable() is discarded, as in the earlier cells
_ = executorch_program_manager.exported_program().module().print_readable()

# Save pte file
pte_base_name = "simple_example"
pte_name = pte_base_name + ".pte"
pte_path = os.path.join(cwd_dir, pte_name)
# Save to the absolute path so that the file written is the same file that is checked below and later
# passed to the runner build, regardless of the current working directory.
save_pte_program(executorch_program_manager, pte_path)
assert os.path.exists(pte_path), "Build failed; no .pte-file found"

## Build executor runtime

After the AOT compilation flow is done, the runtime can be cross compiled and linked to the produced .pte-file using the Arm cross-compilation toolchain. This is done in three steps:
1. Build the executorch library and EthosUDelegate.
2. Build any external kernels required. In this example this is not needed as the graph is fully delegated, but it is included for completeness.
3. Build and link the `arm_executor_runner`.

In [None]:
# Each build step uses check=True so that a non-zero exit status raises CalledProcessError immediately,
# instead of continuing with a missing or stale build output.

# Build executorch 
subprocess.run(os.path.join(script_dir, "build_executorch.sh"), shell=True, cwd=et_dir, check=True)

# Build portable kernels
subprocess.run(os.path.join(script_dir, "build_portable_kernels.sh"), shell=True, cwd=et_dir, check=True)

# Build executorch runner
args = f"--pte={pte_path} --target={target}"
subprocess.run(
    os.path.join(script_dir, "build_executorch_runner.sh") + " " + args,
    shell=True,
    cwd=et_dir,
    check=True,
)

# The runner binary is looked up under <cwd_dir>/<pte_base_name>/cmake-out
elf_path = os.path.join(cwd_dir, pte_base_name, "cmake-out", "arm_executor_runner")
assert os.path.exists(elf_path), "Build failed; no .elf-file found"

## Run on simulated model

We can finally use the `backends/arm/scripts/run_fvp.sh` utility script to run the .elf-file on simulated Arm hardware. This script runs the model with an input of ones, so the expected result of the addition should be close to 2.

In [None]:
# Run the built runner with run_fvp.sh
args = f"--elf={elf_path}  --target={target}"
# check=True raises CalledProcessError if the script exits with a non-zero status
subprocess.run(
    os.path.join(script_dir, "run_fvp.sh") + " " + args,
    shell=True,
    cwd=et_dir,
    check=True,
)