In [None]:
# Copyright 2026 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Cortex-M backend flow example - Running MobileNetV2

This guide demonstrates the full flow for running a module on Arm Cortex-M using ExecuTorch. 
**Note that this backend is currently WIP and things may change / break without notice.**

Before you begin:
1. (In a clean virtual environment with a compatible Python version) Install executorch using `./install_executorch.sh`
2. Install Arm cross-compilation toolchain and simulators using `examples/arm/setup.sh --i-agree-to-the-contained-eula`
3. Add Arm cross-compilation toolchain and simulators to PATH using `examples/arm/arm-scratch/setup_path.sh` 
4. Install the following pip packages: `pip install matplotlib`

With all commands executed from the base `executorch` folder.



*Some scripts in this notebook produce long output logs. Configuring the 'Customizing Notebook Layout' settings to enable 'Output:scrolling' and setting 'Output:Text Line Limit' makes this more manageable.*


## Setup the dataset 

This example uses the [imagenette dataset](https://huggingface.co/datasets/frgfm/imagenette) from huggingface. MobileNetV2 requires some transforms to be applied to the images before inference, and additionally the Cortex-M backend has requirements on the data being of rank 4 and having a channels-last memory format.

These transforms are applied in the `sample_generator` to ensure a compatible dataset. After that, plot one image to sanity check the input.

In [None]:
import torch
from datasets import load_dataset
from torchvision.models import MobileNet_V2_Weights

# Fetch the Imagenette validation split from Hugging Face and shuffle it with a
# fixed seed so the sample order is the same on every run.
dataset = load_dataset(
    "frgfm/imagenette",
    "full_size",
    split="validation",
).shuffle(seed=0)

# Preprocessing transforms provided by the default MobileNetV2 weights.
weights = MobileNet_V2_Weights.DEFAULT
preprocess = weights.transforms()

# Class names, looked up by each sample's "label" value.
label_names = dataset.features["label"].names

def sample_generator(max_samples=200):
    """Yield ``(transformed_img, label_name, original_img)`` tuples from ``dataset``.

    Each image is converted to RGB, run through ``preprocess``, given a leading
    batch dimension and put in channels-last memory format. The third tuple
    element is the sample's image exactly as stored in the dataset.

    Args:
        max_samples: Stop after this many samples; ``None`` iterates the whole
            dataset.
    """
    unlimited = max_samples is None
    for index, entry in enumerate(dataset):
        if not unlimited and index >= max_samples:
            return
        rgb_image = entry["image"].convert("RGB")
        model_input = preprocess(rgb_image).unsqueeze(0)
        model_input = model_input.to(memory_format=torch.channels_last)
        yield model_input, label_names[entry["label"]], entry["image"]


In [None]:
import matplotlib.pyplot as plt

# Display sample image before and after transform
transformed_img, label, original_img = next(sample_generator())

fig, axes = plt.subplots(1, 2)
axes[0].imshow(original_img)
axes[0].set_title("Original image")

# Drop the batch dimension added in sample_generator, permute to the axis order
# imshow expects (presumably H, W, C) and rescale the values to [0, 1] so the
# normalized tensor plots well.
plottable_img = (transformed_img.squeeze(0).permute(1, 2, 0) - transformed_img.min()) / (transformed_img.max() - transformed_img.min())
axes[1].imshow(plottable_img)
axes[1].set_title("Transformed image")
fig.suptitle(label)
# pyplot.show() displays all open figures and does not take a figure argument;
# passing one positionally is misinterpreted (or rejected) depending on the backend.
plt.show()


## AOT Flow

The first step is creating the PyTorch module and exporting it. Exporting converts the python code in the module into a graph structure. The result is still runnable python code, which can be displayed by printing the `graph_module` of the exported program. 


In [None]:
from torchvision.models import mobilenet_v2

# Instantiate MobileNetV2 with its default weights and switch it to eval mode.
model = mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).eval()

# Category names from the weights' metadata; used to map an output index to a label.
imagenet_labels = MobileNet_V2_Weights.DEFAULT.meta["categories"]

# Test model. The output is a likeness-score for each class in ImageNet, pick top-1 for result
def print_result(output_data, true_label, labels=None):
    """Print the top-1 prediction and report whether it matches ``true_label``.

    Args:
        output_data: Class scores, either a tensor or a (nested) list of floats.
            The index of the largest value over the flattened data is used.
        true_label: Expected class name.
        labels: Optional sequence mapping class index to class name. Defaults
            to the notebook-level ``imagenet_labels``.

    Returns:
        True if the predicted label equals ``true_label``, otherwise False.

    Raises:
        ValueError: If ``output_data`` contains no scores.
    """
    if labels is None:
        labels = imagenet_labels
    # as_tensor accepts both tensors and Python lists without going through the
    # legacy torch.Tensor constructor.
    scores = torch.as_tensor(output_data)
    if scores.numel() == 0:
        raise ValueError("print_result received no output scores to classify.")
    index = scores.argmax().item()
    predicted_label = labels[index]
    print(f"True label: {true_label}. Model output: {predicted_label}")

    return true_label == predicted_label

# Run the float model on the sample image. Gradients are not needed for
# inference, so skip autograd bookkeeping.
with torch.no_grad():
    output_data = model(transformed_img)
print_result(output_data, label)


In [None]:
# Export the eager model, using the sample image as the example input.
exported_program = torch.export.export(model, (transformed_img,))
# Callable module view of the exported program; this is what gets quantized later.
graph_module = exported_program.module()

# Print the generated graph code. The return value is bound to `_`, presumably to
# keep it out of the cell's displayed output.
_ = graph_module.print_readable()


You can also get a visual representation of exported programs using model-explorer:
```
$ pip install ai-edge-model-explorer
$ model-explorer
```

In [None]:
from executorch.devtools.visualization import visualize
# Visualize the exported program (see the model-explorer install note above this cell).
visualize(exported_program)

To run on Cortex-M the `graph_module` must be quantized using the CortexMQuantizer. Quantization also requires calibrating the module with example inputs to ensure optimal quantization parameters given the dataset.

Again printing the module, it can be seen that the quantization wraps operators in quantization/dequantization operators which contain the computed quantization parameters. Note that there are no int8-operators used at this point, those are introduced in the pass lowering by replacing sequences of (int8 dequantization + fp32 operator + int8 quantization) operators with quantized operators. This also means that operators not supported by the backend will run in fp32.

In [None]:
from executorch.backends.cortex_m.quantizer.quantizer import CortexMQuantizer
from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e

# Create the Cortex-M quantizer with its default settings (no extra configuration
# is applied here).
quantizer = CortexMQuantizer()

# Post training quantization: prepare the graph, then calibrate it on dataset samples.
quantized_graph_module = prepare_pt2e(graph_module, quantizer)
N_CALIBRATION_SAMPLES = 100
# Calibration only needs forward passes, so run it without autograd tracking.
with torch.no_grad():
    for i, (calibration_img, _, _) in enumerate(sample_generator(N_CALIBRATION_SAMPLES)):
        quantized_graph_module(calibration_img)
        print(f"{i+1}/{N_CALIBRATION_SAMPLES} samples calibrated.", end='\r')
# Terminate the carriage-return progress line so later output does not overwrite it.
print()

quantized_graph_module = convert_pt2e(quantized_graph_module)

_ = quantized_graph_module.print_readable()

# Create a new exported program using the quantized_graph_module
quantized_exported_program = torch.export.export(quantized_graph_module, (transformed_img,))


In [None]:
# Test the quantized model: run the quantized exported program on the sample image
# and compare its top-1 prediction with the true label.
output_data = quantized_exported_program.module()(transformed_img)
print_result(output_data, label)

The lowering to the Cortex-M backend is done by first lowering to the edge-dialect using `to_edge` with a custom `EdgeCompileConfig`, followed by running passes from the `CortexMPassManager`. Once lowered, the model is serialized using `to_executorch` into the flatbuffer which is loaded and run on the embedded device.



In [None]:
import os
from executorch.exir import (
    EdgeCompileConfig,
    ExecutorchBackendConfig,
    to_edge,
)
from executorch.backends.cortex_m.passes.cortex_m_pass_manager import CortexMPassManager

# Edge compile configuration used for the Cortex-M lowering.
config = EdgeCompileConfig(
    preserve_ops=[
        torch.ops.aten.linear.default,
        torch.ops.aten.hardsigmoid.default,
        torch.ops.aten.hardsigmoid_.default,
        torch.ops.aten.hardswish.default,
        torch.ops.aten.hardswish_.default,
    ],
    _check_ir_validity=False,
    _core_aten_ops_exception_list=[torch.ops.aten.max_pool2d.default],
)

# Lower the quantized program to the edge dialect. to_edge is used rather than
# to_edge_transform_and_lower, which is currently required to use preserve_ops
# without a partitioner.
edge_program_manager = to_edge(quantized_exported_program, compile_config=config)

# Run the Cortex-M pass manager on the "forward" program. pass_manager.transform()
# is used over edge_program_manager.transform(), which is currently required so that
# the passes can modify the exported_program and not only the graph_module.
pass_manager = CortexMPassManager(edge_program_manager.exported_program())
edge_program_manager._edge_programs["forward"] = pass_manager.transform()

# Test the converted edge program, which runs the Python implementations of the
# cortex-m dialect.
output_data = edge_program_manager.exported_program().module()(transformed_img)
print_result(output_data, label)



In [None]:
# It can be a good idea to visualize the final exported program to check the final
# graph structure after all transformations (here: after to_edge and the Cortex-M passes).
visualize(edge_program_manager)

In [None]:
# Convert the lowered edge program into its serializable ExecuTorch form.
executorch_program_manager = edge_program_manager.to_executorch(
    config=ExecutorchBackendConfig(extract_delegate_segments=False)
)

To run the model with custom data packed into the model, it can be bundled together into what is called a bundled pte, or `.bpte`. 
This can be used for building your runtime with a full testsuite to be validated when running, but here we only use it 
to run custom data. 

In [None]:
from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite
from executorch.devtools import BundledProgram
from executorch.devtools.bundled_program.serialize import serialize_from_bundled_program_to_flatbuffer

# Take the first sample again and use it as the single test input for "forward".
transformed_img, label, original_img = next(sample_generator())
test_case = MethodTestCase(inputs=transformed_img, expected_outputs=None)
test_suite = MethodTestSuite(method_name="forward", test_cases=[test_case])

# Bundle the program with the test suite and serialize it to a flatbuffer.
bundled_program = BundledProgram(executorch_program_manager, [test_suite])
bundled_program_buffer = serialize_from_bundled_program_to_flatbuffer(bundled_program)

# Write the .bpte file into the current working directory.
cwd_dir = os.getcwd()
pte_base_name = "cortex_m_mv2_example"
pte_name = pte_base_name + ".bpte"
pte_path = os.path.join(cwd_dir, pte_name)
with open(pte_path, "wb") as bpte_file:
    bpte_file.write(bundled_program_buffer)


## Build executor runtime

After the AOT compilation flow is done, the runtime can be cross compiled and linked to the produced .bpte-file using the Arm cross-compilation toolchain. 

This is done in three steps:
1. Build the executorch library and Cortex-M ops/kernels.
2. Build any external kernels required. In this example this is not needed as the graph is fully lowered to Cortex-M, but it's included for completeness.
3. Build and link the `arm_executor_runner`.




In [None]:
%%bash

# Configure and build the executorch libraries, cross-compiled for Arm baremetal,
# into executorch/cmake-out-arm.
BUILD_DIR=../../cmake-out-arm
cmake --preset arm-baremetal \
      -DCMAKE_BUILD_TYPE=Release \
      -DEXECUTORCH_BUILD_DEVTOOLS=ON \
      -B"${BUILD_DIR}" \
      ../..
cmake --build "${BUILD_DIR}" --target install -j"$(nproc)"


In [None]:
%%bash
# Build the example executor runner application to examples/arm/cortex_m_mv2_example.
# Note that this is the same runner as used in the Ethos-U example, creating some
# overlap in the config even though the Ethos-U is not used.
# The toolchain path is quoted so a working directory containing spaces still works.
cmake -DCMAKE_TOOLCHAIN_FILE="$(pwd)/ethos-u-setup/arm-none-eabi-gcc.cmake" \
      -DCMAKE_BUILD_TYPE=Release \
      -DET_PTE_FILE_PATH=cortex_m_mv2_example.bpte \
      -DTARGET_CPU=cortex-m55 \
      -DETHOSU_TARGET_NPU_CONFIG=ethos-u55-128 \
      -DMEMORY_MODE=Shared_Sram \
      -DET_BUNDLE_IO=ON \
      -DSYSTEM_CONFIG=Ethos_U55_High_End_Embedded \
      -Bcortex_m_mv2_example \
      executor_runner
cmake --build cortex_m_mv2_example -j"$(nproc)" -- arm_executor_runner

# Run on simulated model

We can finally use the `backends/arm/scripts/run_fvp.sh` utility script to run the .elf-file on simulated Arm Cortex-M hardware. This is expected to take a couple of minutes and should produce the expected "golf ball" label as top-1 result.



In [None]:
import subprocess

# Run the built runner through run_fvp.sh. stdout and stderr are captured (as bytes)
# so the scores can be parsed from the output afterwards.
output = subprocess.run(
    "../../backends/arm/scripts/run_fvp.sh --elf=cortex_m_mv2_example/arm_executor_runner --target=ethos-u55-128",
    shell=True,
    capture_output=True,
)
# Surface failures instead of silently continuing with empty or partial output.
if output.returncode != 0:
    print(f"run_fvp.sh exited with code {output.returncode}:")
    print(output.stderr.decode(errors="replace"))

In [None]:
import re

# Decode the captured output and extract every number printed with six decimals.
# The pattern is a raw string (so "\d" is not an invalid string escape) and accepts
# multi-digit integer parts, so a value such as -12.345678 keeps its sign and magnitude.
stdout_text = output.stdout.decode(errors="replace")
scores = [float(x) for x in re.findall(r"-?\d+\.\d{6}", stdout_text)]
if not scores:
    raise RuntimeError(
        "No output scores found in the run_fvp.sh output; check output.stderr for errors."
    )
result = [scores]
print_result(result, label)
