From 9dd2292acf5e65d750a117189fc6114a6102812b Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Mon, 13 Oct 2025 21:23:05 -0400 Subject: [PATCH] [Samsung] Docs template Summary: Title says it all! Add docs for the Samsung backend based on the template introduced in https://github.com/pytorch/executorch/pull/14873. --- .../samsung/samsung-op-support-table.csv | 45 +++++++ .../backends/samsung/samsung-op-support.rst | 11 ++ .../backends/samsung/samsung-overview.md | 117 ++++++++++++++++++ .../backends/samsung/samsung-partitioner.md | 29 +++++ .../backends/samsung/samsung-quantization.md | 60 +++++++++ 5 files changed, 262 insertions(+) create mode 100644 docs/source/backends/samsung/samsung-op-support-table.csv create mode 100644 docs/source/backends/samsung/samsung-op-support.rst create mode 100644 docs/source/backends/samsung/samsung-overview.md create mode 100644 docs/source/backends/samsung/samsung-partitioner.md create mode 100644 docs/source/backends/samsung/samsung-quantization.md diff --git a/docs/source/backends/samsung/samsung-op-support-table.csv b/docs/source/backends/samsung/samsung-op-support-table.csv new file mode 100644 index 00000000000..7d925c43400 --- /dev/null +++ b/docs/source/backends/samsung/samsung-op-support-table.csv @@ -0,0 +1,45 @@ +Operator,Quantization,Constraints +add,static int8, +avg_pool2d,static int8,"ceil_mode=False, divisor_override=pooling_region" +batch_norm,static int8, +bmm,static int8, +cat,static int8,at most 1 constant tensor +clamp,static int8, +constant_pad_nd,static int8,padding_value=0.0 only +conv2d,static int8,constant weights +dequantize_per_channel,, +dequantize_per_tensor,, +div,static int8, +embedding,static int8, +expand_copy,,"expanding at most one axis, new dimensions must be size 1" +gelu,static int8, +getitem,, +hardsigmoid,static int8, +hardswish,static int8, +hardtanh,static int8, +layer_norm,static int8,norm at last axis only +leaky_relu,static int8, +linear,static int8,constant weights +log_softmax,static int8, +max_pool2d,static int8,"ceil_mode=False, indices not supported" +maximum,, +mean_dim,static int8, +minimum,, +mul,static int8, +permute,static int8, +pixel_shuffle,, +quantize_per_channel,, +quantize_per_tensor,, +relu,static int8, +reshape,static int8, +rsqrt,static int8, +select,static int8, +slice_copy,static int8, +softmax,static int8, +sqrt,static int8, +squeeze,static int8, +sub,static int8, +to_copy,,memory_format=contiguous only +unsqueeze,static int8, +upsample_bilinear2d,static int8, +upsample_nearest2d,static int8, diff --git a/docs/source/backends/samsung/samsung-op-support.rst b/docs/source/backends/samsung/samsung-op-support.rst new file mode 100644 index 00000000000..ecccd565021 --- /dev/null +++ b/docs/source/backends/samsung/samsung-op-support.rst @@ -0,0 +1,11 @@ +================ +Operator Support +================ + +This page lists the PyTorch operators currently supported by the Samsung Exynos backend. + +.. csv-table:: Operator Support + :file: samsung-op-support-table.csv + :header-rows: 1 + :widths: 25 15 55 + :align: center diff --git a/docs/source/backends/samsung/samsung-overview.md b/docs/source/backends/samsung/samsung-overview.md new file mode 100644 index 00000000000..9bdc4eb4289 --- /dev/null +++ b/docs/source/backends/samsung/samsung-overview.md @@ -0,0 +1,117 @@ +# Samsung Exynos Backend + +ExecuTorch's Samsung Exynos backend enables the execution of ExecuTorch models on +Samsung SoCs via the NPU/DSP. 
+The delegate is built on top of the
+[Samsung Exynos AI Litecore SDK](https://soc-developer.semiconductor.samsung.com/global/development/ai-litecore).
+
+## Features
+
+- Wide range of operator support
+- Supported inference precisions:
+  - FP16
+  - 8-bit statically quantized (int8/uint8)
+  - 16-bit statically quantized (int16/uint16)
+
+## Target Requirements
+
+Currently, the Samsung Exynos backend supports only devices with the following
+chipsets:
+
+- Exynos 2500 (E9955)
+
+## Development Requirements
+
+The [Samsung Exynos AI Litecore SDK](https://soc-developer.semiconductor.samsung.com/global/development/ai-litecore)
+is required both to build the Exynos backend from source and to export models to the
+Exynos delegate.
+
+----
+
+## Using the Samsung Exynos Backend
+
+To target the Exynos backend during the export and lowering process, pass an instance of
+`EnnPartitioner` to `to_edge_transform_and_lower`. The example below demonstrates this
+process using the MobileNet V2 model from torchvision.
+
+```python
+import torch
+import torchvision.models as models
+from torchvision.models.mobilenetv2 import MobileNet_V2_Weights
+from executorch.backends.samsung.partition.enn_partitioner import EnnPartitioner
+from executorch.backends.samsung.serialization.compile_options import (
+    gen_samsung_backend_compile_spec,
+)
+from executorch.exir import to_edge_transform_and_lower
+
+mobilenet_v2 = models.mobilenetv2.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).eval()
+sample_inputs = (torch.randn(1, 3, 224, 224),)
+
+chipset = "E9955"
+compile_specs = [gen_samsung_backend_compile_spec(chipset)]
+
+et_program = to_edge_transform_and_lower(
+    torch.export.export(mobilenet_v2, sample_inputs),
+    partitioner=[EnnPartitioner(compile_specs)],
+).to_executorch()
+
+with open("mv2_exynos.pte", "wb") as file:
+    et_program.write_to_file(file)
+```
+
+See [Partitioner API](/backends/samsung/samsung-partitioner) for a reference on available
+partitioner options.
+
+----
+
+## Quantization
+
+The Samsung Exynos backend supports statically quantized models with 8-bit and 16-bit
+integer types.
+
+See [Samsung Exynos Quantization](/backends/samsung/samsung-quantization) for more
+information on available quantization schemes and APIs.
+
+----
+
+## Runtime Integration
+
+To run the model on-device, use the standard ExecuTorch runtime APIs.
+
+The Exynos backend is currently not available in any of ExecuTorch's published packages.
+To use it, build ExecuTorch from source and pass `-DEXECUTORCH_BUILD_EXYNOS=ON` when
+configuring the CMake build. See [Running on Device](/getting-started.md#running-on-device)
+for more information.
+
+Then, to link against the backend, add the `executorch_backends` CMake target as a build
+dependency.
+
+```
+# CMakeLists.txt
+add_subdirectory("executorch")
+...
+target_link_libraries(
+    my_target
+    PRIVATE executorch
+            executorch_backends
+            ...
+)
+```
+
+No additional steps are necessary to use the backend beyond linking the target. Any
+Exynos-delegated .pte file will automatically run on the registered backend.
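+
+Since operators that are not delegated fall back to ExecuTorch's portable CPU kernels at
+runtime, it can be useful to verify how much of the model was actually lowered to the
+Exynos backend before deploying. Below is a minimal sketch using ExecuTorch's
+general-purpose delegation debugging utility (not specific to the Exynos backend),
+assuming the `mobilenet_v2`, `sample_inputs`, and `compile_specs` values from the export
+example above.
+
+```python
+from executorch.devtools.backend_debug import get_delegation_info
+
+# Lower as before, but keep the intermediate edge program so it can be inspected
+# before serializing.
+edge_program = to_edge_transform_and_lower(
+    torch.export.export(mobilenet_v2, sample_inputs),
+    partitioner=[EnnPartitioner(compile_specs)],
+)
+delegation_info = get_delegation_info(edge_program.exported_program().graph_module)
+print(delegation_info.get_summary())  # delegated vs. non-delegated node counts
+
+et_program = edge_program.to_executorch()
+```
+
+If the summary reports a large number of non-delegated operators, check them against the
+[Operator Support](/backends/samsung/samsung-op-support) table.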
+
+## Reference
+
+**→{doc}`samsung-partitioner` — Partitioner options.**
+
+**→{doc}`samsung-quantization` — Supported quantization schemes.**
+
+**→{doc}`samsung-op-support` — Supported operators.**
+
+```{toctree}
+:maxdepth: 2
+:hidden:
+:caption: Exynos Backend
+
+samsung-partitioner
+samsung-quantization
+samsung-op-support
+```
diff --git a/docs/source/backends/samsung/samsung-partitioner.md b/docs/source/backends/samsung/samsung-partitioner.md
new file mode 100644
index 00000000000..eb84a795551
--- /dev/null
+++ b/docs/source/backends/samsung/samsung-partitioner.md
@@ -0,0 +1,29 @@
+# Partitioner API
+
+The `EnnPartitioner` API is the primary entry point when exporting a model to the Samsung
+Exynos backend. The partitioner determines which parts of the model are lowered to the
+backend and provides an interface for configuring the backend's behavior.
+
+Currently, the configuration options for `EnnPartitioner` can be generated automatically
+using the `gen_samsung_backend_compile_spec` API. For instance,
+
+```python
+from executorch.backends.samsung.partition.enn_partitioner import EnnPartitioner
+from executorch.backends.samsung.serialization.compile_options import (
+    gen_samsung_backend_compile_spec,
+)
+
+from executorch.exir import to_edge_transform_and_lower
+
+chipset = "E9955"
+compile_specs = [gen_samsung_backend_compile_spec(chipset)]
+
+# exported_program is the output of torch.export.export on the source model.
+et_program = to_edge_transform_and_lower(
+    exported_program,
+    partitioner=[EnnPartitioner(compile_specs)],
+).to_executorch()
+```
+
+At the moment, only `"E9955"` is supported as a valid chipset name, which corresponds to
+the Exynos 2500 SoC. Support for additional chipsets will be added in the future.
diff --git a/docs/source/backends/samsung/samsung-quantization.md b/docs/source/backends/samsung/samsung-quantization.md
new file mode 100644
index 00000000000..ad4b50cb93d
--- /dev/null
+++ b/docs/source/backends/samsung/samsung-quantization.md
@@ -0,0 +1,60 @@
+# Quantization
+
+The Exynos backend currently supports executing statically quantized 8-bit models.
+
+## 8-bit quantization with the PT2E quantization flow
+
+To perform 8-bit quantization with the PT2E flow, perform the following steps prior to exporting the model:
+
+1) Create an instance of the `EnnQuantizer` class and set the desired quantization behavior.
+2) Use `torch.export.export` to obtain a graph module representation of the source model.
+3) Use `prepare_pt2e` to prepare the model for quantization.
+4) Execute the prepared model with representative samples to calibrate the activation tensor quantization ranges.
+5) Use `convert_pt2e` to quantize the model.
+6) Export and lower the model using the standard export flow.
+
+The output of `convert_pt2e` is a PyTorch model which can be exported and lowered using
+the same export flow as non-quantized models. As it is a regular PyTorch model, it can
+also be used to evaluate the accuracy of the quantized model using standard PyTorch
+techniques.
+
+The example below shows how to quantize a MobileNetV2 model using the PT2E quantization flow.
+
+```python
+import torch
+import torchvision.models as models
+from torchvision.models.mobilenetv2 import MobileNet_V2_Weights
+
+from executorch.backends.samsung.partition.enn_partitioner import EnnPartitioner
+from executorch.backends.samsung.quantizer.quantizer import EnnQuantizer
+from executorch.backends.samsung.serialization.compile_options import (
+    gen_samsung_backend_compile_spec,
+)
+
+from executorch.exir import to_edge_transform_and_lower
+from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e
+
+model = models.mobilenetv2.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).eval()
+sample_inputs = (torch.randn(1, 3, 224, 224),)
+
+# Currently, "A8W8" (8-bit activations, 8-bit weights) is the only supported
+# precision mode.
+precision = "A8W8"
+is_per_channel = True
+is_qat = False
+
+quantizer = EnnQuantizer()
+quantizer.set_quant_params(precision, is_per_channel, is_qat)  # (1)
+
+graph_module = torch.export.export(model, sample_inputs).module()  # (2)
+prepared_model = prepare_pt2e(graph_module, quantizer)  # (3)
+
+for cal_sample in [torch.randn(1, 3, 224, 224)]:  # Replace with representative model inputs
+    prepared_model(cal_sample)  # (4) Calibrate
+
+quantized_model = convert_pt2e(prepared_model)  # (5)
+
+compile_specs = [gen_samsung_backend_compile_spec("E9955")]
+et_program = to_edge_transform_and_lower(  # (6)
+    torch.export.export(quantized_model, sample_inputs),
+    partitioner=[EnnPartitioner(compile_specs)],
+).to_executorch()
+```
+
+See [PyTorch 2 Export Post Training Quantization](https://docs.pytorch.org/ao/main/tutorials_source/pt2e_quant_ptq.html)
+for more information.
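+
+Because the output of `convert_pt2e` is a regular PyTorch module, a quick accuracy
+sanity check can be run before lowering. Below is a minimal sketch, assuming the
+`model`, `sample_inputs`, and `quantized_model` values from the example above; the
+SQNR metric used here is illustrative, not a backend requirement.
+
+```python
+import torch
+
+# Compare float and quantized outputs on a representative input.
+with torch.no_grad():
+    float_out = model(*sample_inputs)
+    quant_out = quantized_model(*sample_inputs)
+
+# Signal-to-quantization-noise ratio (in dB); higher means the quantized
+# model tracks the float reference more closely.
+sqnr = 20 * torch.log10(float_out.norm() / (float_out - quant_out).norm())
+print(f"SQNR: {sqnr.item():.1f} dB")
+```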