From d6b9d614de51e2e2d45cb499307b55cda1415a6d Mon Sep 17 00:00:00 2001 From: Gregory Comer Date: Mon, 7 Jul 2025 21:06:32 -0700 Subject: [PATCH] Update [ghstack-poisoned] --- backends/test/harness/tester.py | 1 + backends/test/operators/facto_specs.py | 16 +- backends/test/operators/test_facto.py | 127 ++++++----- backends/test/runner/CMakeLists.txt | 16 ++ backends/test/runner/test_runner.cpp | 260 +++++++++++++++++++++++ backends/xnnpack/test/tester/__init__.py | 7 +- pytest.ini | 5 + 7 files changed, 359 insertions(+), 73 deletions(-) create mode 100644 backends/test/runner/CMakeLists.txt create mode 100644 backends/test/runner/test_runner.cpp diff --git a/backends/test/harness/tester.py b/backends/test/harness/tester.py index f1dfeb23531..3f717d824bc 100644 --- a/backends/test/harness/tester.py +++ b/backends/test/harness/tester.py @@ -361,6 +361,7 @@ def _assert_outputs_equal(model_output, ref_output, atol=1e-03, rtol=1e-03): ref, atol=atol, rtol=rtol, + equal_nan=True, ), ( f"Output {i} does not match reference output.\n" f"\tGiven atol: {atol}, rtol: {rtol}.\n" diff --git a/backends/test/operators/facto_specs.py b/backends/test/operators/facto_specs.py index 96fe86b2ea7..3427c302f6a 100644 --- a/backends/test/operators/facto_specs.py +++ b/backends/test/operators/facto_specs.py @@ -2,14 +2,20 @@ import torch from facto.inputgen.argument.type import ArgType -from facto.inputgen.specs.model import ConstraintProducer as cp, InPosArg, OutArg, Spec +from facto.inputgen.specs.model import ( + ConstraintProducer as cp, + InKwArg, + InPosArg, + OutArg, + Spec, +) """ This file contains FACTO operator specs for ops not in the standard FACTO db. This mainly includes ops not in the Core ATen op set and preserved by a backend, such as linear. """ -LiNEAR_DEFAULT_SPEC = Spec( +LINEAR_DEFAULT_SPEC = Spec( op="linear.default", # (Tensor input, Tensor weight, Tensor? bias=None) -> Tensor inspec=[ InPosArg( @@ -53,7 +59,9 @@ ) _extra_specs = [ - LiNEAR_DEFAULT_SPEC, + LINEAR_DEFAULT_SPEC, ] -ExtraSpecDB: dict[str, Spec] = {s.op: s for s in _extra_specs} +ExtraSpecDB: dict[str, Spec] = { + s.op: s for s in _extra_specs +} \ No newline at end of file diff --git a/backends/test/operators/test_facto.py b/backends/test/operators/test_facto.py index 208aaa042a9..ec4459f6086 100644 --- a/backends/test/operators/test_facto.py +++ b/backends/test/operators/test_facto.py @@ -4,29 +4,21 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -# pyre-unsafe - -# -# This file contains logic to run generated operator tests using the FACTO -# library (https://github.com/pytorch-labs/FACTO). To run the tests, first -# clone and install FACTO by running pip install . from the FACTO source -# directory. 
Then, from the executorch root directory, run the following: -# -# python -m unittest backends.test.operators.test_facto.FactoTestsXNNPACK -# +# pyre-strict import copy import functools import traceback +from typing import Any, Callable, List, OrderedDict, Sequence, Tuple import unittest -from typing import Any, Callable, Sequence import torch from executorch.backends.test.harness.tester import Tester as TesterBase -from executorch.backends.xnnpack.test.tester.tester import Tester as XnnpackTester +from executorch.backends.xnnpack.test.tester.tester import ToEdgeTransformAndLower, Tester as XnnpackTester from facto.inputgen.argtuple.gen import ArgumentTupleGenerator -from facto.inputgen.specs.model import ConstraintProducer as cp, Spec +from facto.inputgen.specs.model import Constraint, ConstraintProducer as cp, Spec from facto.inputgen.utils.random_manager import random_manager +from facto.inputgen.variable.type import ScalarDtype from facto.specdb.db import SpecDictDB from torch._ops import OpOverload @@ -35,9 +27,9 @@ CombinedSpecDB = SpecDictDB | ExtraSpecDB COMMON_TENSOR_CONSTRAINTS = [ - cp.Rank.Ge(lambda deps: 1), # Avoid zero and high rank tensors. + cp.Rank.Ge(lambda deps: 1), cp.Rank.Le(lambda deps: 4), - cp.Size.Ge(lambda deps, r, d: 1), # Keep sizes reasonable. + cp.Size.Ge(lambda deps, r, d: 1), cp.Size.Le(lambda deps, r, d: 2**9), ] @@ -54,7 +46,6 @@ "other", } - def _patch_spec(spec: Spec) -> Spec: spec = copy.deepcopy(spec) for inspec in spec.inspec: @@ -64,18 +55,16 @@ def _patch_spec(spec: Spec) -> Spec: inspec.constraints.extend(COMMON_SCALAR_CONSTRAINS) return spec - class OpModel(torch.nn.Module): """ Wraps a single torch operator in an nn.Module. """ - def __init__( - self, - op: OpOverload, - runtime_input_count: int, + self, + op: OpOverload, + runtime_input_count: int, fixed_args: Sequence[Any], - fixed_kwargs: dict[str, Any], + fixed_kwargs: dict[str, Any] ): super().__init__() self.op = op @@ -99,12 +88,9 @@ def __init__( def forward(self, *args, **kwargs): return self.op(*(args + self.fixed_args), **(kwargs | self.fixed_kwargs)) - class ConvModel(OpModel): def forward(self, *args, **kwargs): - weight, bias, stride, padding, dilation, transposed, output_padding, groups = ( - self.fixed_args - ) + weight, bias, stride, padding, dilation, transposed, output_padding, groups = self.fixed_args if not transposed: if len(weight.shape) == 3: @@ -113,7 +99,7 @@ def forward(self, *args, **kwargs): op = torch.nn.functional.conv2d elif len(weight.shape) == 5: op = torch.nn.functional.conv3d - + return op(args[0], weight, bias, stride, padding, dilation, groups) else: if len(weight.shape) == 3: @@ -122,11 +108,8 @@ def forward(self, *args, **kwargs): op = torch.nn.functional.conv_transpose2d elif len(weight.shape) == 5: op = torch.nn.functional.conv_transpose3d - - return op( - args[0], weight, bias, stride, padding, output_padding, groups, dilation - ) - + + return op(args[0], weight, bias, stride, padding, output_padding, groups, dilation) def get_module_for_op(op: OpOverload): if op == torch.ops.aten.convolution.default: @@ -134,7 +117,6 @@ def get_module_for_op(op: OpOverload): else: return OpModel - class FactoTestsBase(unittest.TestCase): def __init__(self, tester_factory: Callable[[], TesterBase], *args, **kwargs): super().__init__(*args, **kwargs) @@ -147,37 +129,36 @@ def _generate_test(op_name: str) -> None: torch_op = functools.reduce(getattr, sections, torch.ops.aten) test_name = "test_" + op_name.replace(".", "_") - - def test_body(self): - 
self._test_op(torch_op)
+        test_body = lambda self: self._test_op(torch_op)
 
         setattr(FactoTestsBase, test_name, test_body)
 
     @staticmethod
     def get_runtime_input_count(spec: Spec):
         # Determine which inputs are fixed at tracing time (weights, for example),
         # vs inputs to the runtime graph. We currently assume that the runtime graph
         # inputs start at the beginning of the arg list and are contiguous.
         #
         # Args are considered to be runtime inputs if they are positional and are named
         # one of RUNTIME_INPUT_NAMES. If none match, we assume only the first arg is a
         # runtime input.
         runtime_input_count = 0
         for inspec in spec.inspec:
             is_runtime_input = (
-                inspec.type.is_tensor() and inspec.name.lower() in RUNTIME_INPUT_NAMES
+                inspec.type.is_tensor() and
+                inspec.name.lower() in RUNTIME_INPUT_NAMES
             )
             if is_runtime_input:
                 runtime_input_count += 1
             else:
                 break
 
         return max(1, runtime_input_count)
 
     def setUp(self):
         torch.set_printoptions(threshold=3)
 
-    def _test_op(self, op: OpOverload) -> None:  # noqa: C901
+    def _test_op(self, op: OpOverload) -> None:
         random_manager.seed(0)
 
         # Strip namespace
         op_name = op.name().split("::")[-1]
@@ -186,15 +167,15 @@ def _test_op(self, op: OpOverload) -> None:  # noqa: C901
 
         # Default to .default overload
         if "." not in op_name:
             op_name += ".default"
 
         # Find and patch op spec
-        if op_name not in CombinedSpecDB:
+        if not op_name in CombinedSpecDB:
             raise ValueError(f"Operator {op_name} not found in SpecDictDB.")
         spec = _patch_spec(CombinedSpecDB[op_name])
 
         runtime_input_count = FactoTestsBase.get_runtime_input_count(spec)
 
         print(f"Op: {op_name}, {runtime_input_count} runtime inputs")
 
         # Run test cases
         success_count_delegated = 0
@@ -207,14 +188,18 @@ def _test_op(self, op: OpOverload) -> None:  # noqa: C901
 
             try:
                 if isinstance(posargs[0], torch.Tensor):
-                    # Temporary for getting around XNN crashes
-                    if posargs[0].dtype not in {torch.float32, torch.float16}:
-                        print("SKIPPING NON FLOAT CASE")
+                    # Temporary for getting around XNN crashes (https://github.com/pytorch/executorch/issues/10960).
+                    # TODO: Re-enable when resolved.
+                    if posargs[0].dtype in {torch.int8, torch.uint8}:
+                        print("Skipping (u)int8 case.")
                         continue
 
                 module_cls = get_module_for_op(op)
                 model = module_cls(
-                    op, runtime_input_count, posargs[runtime_input_count:], inkwargs
+                    op,
+                    runtime_input_count,
+                    posargs[runtime_input_count:],
+                    inkwargs
                 )
 
                 # Sanity check to make sure it runs in eager. This can present nicer error
@@ -225,13 +210,20 @@ def _test_op(self, op: OpOverload) -> None:  # noqa: C901
                     print(f"Eager execution failed: {e}")
                     continue
 
-                tester = (
-                    self._tester_factory(model, tuple(posargs[:runtime_input_count]))
-                    .export()
-                    .dump_artifact()
-                    .to_edge_transform_and_lower()
+                tester = self._tester_factory(
+                    model,
+                    tuple(posargs[:runtime_input_count])
                 )
 
+                # Dynamo will also fail to handle some patterns that are valid in eager.
+                try:
+                    tester.export()
+                except Exception as e:
+                    print(f"Export failed: {e}")
+                    continue
+
+                tester.to_edge_transform_and_lower()
+
                 is_delegated = any(
                     n.target == torch._higher_order_ops.executorch_call_delegate
                     for n in tester.stages[tester.cur].graph_module.graph.nodes
@@ -241,19 +233,20 @@ def _test_op(self, op: OpOverload) -> None:  # noqa: C901
 
                 # Only run the runtime test if the op was delegated.
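+                # If nothing was delegated there is no backend behavior to exercise, so skip
+                # the (slow) serialize-and-run step and just record the case as undelegated.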
                 if is_delegated:
                     (
-                        tester.to_executorch()
+                        tester
+                        .to_executorch()
                         .serialize()
                         .run_method_and_compare_outputs()
                     )
 
                 if is_delegated:
                     success_count_delegated += 1
                 else:
                     success_count_undelegated += 1
             except Exception as e:
                 fail_count += 1
-                print(f"Error: {e}")
-                print("Args:")
+                print(f"Args:")
                 for arg in posargs:
                     if isinstance(arg, torch.Tensor):
                         print(f"  {arg.dtype} {arg.shape}")
@@ -262,20 +255,22 @@ def _test_op(self, op: OpOverload) -> None:  # noqa: C901
 
                 traceback.print_exc()
 
-        print(
-            f"{success_count_delegated + success_count_undelegated} PASS, {fail_count} FAIL"
-        )
-        print(
-            f"  {success_count_delegated} DELEGATED, {success_count_undelegated} UNDELEGATED"
-        )
-
+        print(f"{success_count_delegated + success_count_undelegated} PASS, {fail_count} FAIL")
+        print(f"  {success_count_delegated} DELEGATED, {success_count_undelegated} UNDELEGATED")
 
 # Programmatically generate tests for each operator.
 for op_name in CombinedSpecDB.keys():
     FactoTestsBase._generate_test(op_name)
 
-
 # TODO Figure out where to put these
 class FactoTestsXNNPACK(FactoTestsBase):
     def __init__(self, *args, **kwargs):
         super().__init__(XnnpackTester, *args, **kwargs)
+
+
+try:
+    from executorch.backends.apple.coreml.test.tester import CoreMLTester
+
+    class FactoTestsCoreML(FactoTestsBase):
+        def __init__(self, *args, **kwargs):
+            super().__init__(CoreMLTester, *args, **kwargs)
+except ImportError:
+    print("Skipping Core ML FACTO tests as Core ML AOT is not available.")
diff --git a/backends/test/runner/CMakeLists.txt b/backends/test/runner/CMakeLists.txt
new file mode 100644
index 00000000000..d0ee29f8d6a
--- /dev/null
+++ b/backends/test/runner/CMakeLists.txt
@@ -0,0 +1,16 @@
+add_executable(executorch-test-runner
+    test_runner.cpp
+    # TODO
+    ../../../runtime/platform/runtime.cpp
+)
+
+target_link_libraries(
+    executorch-test-runner
+    PRIVATE executorch
+            gflags
+            extension_flat_tensor
+            extension_flat_tensor_serialize
+            extension_module
+            extension_tensor
+            optimized_native_cpu_ops_lib
+            xnnpack_backend)
diff --git a/backends/test/runner/test_runner.cpp b/backends/test/runner/test_runner.cpp
new file mode 100644
index 00000000000..e17a4f91a55
--- /dev/null
+++ b/backends/test/runner/test_runner.cpp
@@ -0,0 +1,260 @@
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <optional>
+#include <string>
+#include <tuple>
+#include <vector>
+
+#include <executorch/extension/data_loader/file_data_loader.h>
+#include <executorch/extension/flat_tensor/flat_tensor_data_map.h>
+#include <executorch/extension/flat_tensor/serialize/serialize.h>
+#include <executorch/extension/module/module.h>
+#include <executorch/extension/tensor/tensor.h>
+#include <executorch/runtime/platform/runtime.h>
+
+#include <gflags/gflags.h>
+
+/*
+ * This runner is intended to be built and run as part of the backend test flow. It takes a
+ * set of inputs from a flat_tensor-format file, runs each case, and then serializes the
+ * outputs to a file, also in flat_tensor format.
+ */
+
+DEFINE_string(
+    model_path,
+    "model.pte",
+    "Model serialized in flatbuffer format.");
+
+DEFINE_string(
+    input_path,
+    "inputs.ptd",
+    "Input tensors in flat tensor (ptd) format.");
+
+DEFINE_string(
+    output_path,
+    "outputs.ptd",
+    "Path to write output tensors in flat tensor (ptd) format.");
+
+DEFINE_string(
+    method,
+    "forward",
+    "The model method to run.");
+
+using executorch::aten::Tensor;
+using executorch::runtime::Error;
+using executorch::runtime::EValue;
+using executorch::runtime::Result;
+using executorch::extension::FileDataLoader;
+using executorch::extension::FlatTensorDataMap;
+using executorch::extension::Module;
+using executorch::extension::TensorPtr;
+using executorch::ET_RUNTIME_NAMESPACE::TensorLayout;
+
+// Contains method inputs for a single run.
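+// Inputs are keyed by positional index; keys in the input .ptd file are expected to have the
+// form "<test case name>:<input index>" (see parse_key below).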
+struct TestCase {
+  std::map<int, TensorPtr> inputs;
+};
+
+std::map<std::string, TestCase> collect_test_cases(FlatTensorDataMap& input_map);
+TensorPtr create_tensor(TensorLayout& layout, std::unique_ptr<char, decltype(&free)> buffer);
+Result<FlatTensorDataMap> load_input_data(FileDataLoader& loader);
+std::optional<std::tuple<std::string, int>> parse_key(const std::string& key);
+Result<std::vector<EValue>> run_test_case(Module& module, TestCase& test_case);
+void store_outputs(
+    std::map<std::string, TensorPtr>& output_map,
+    const std::string& case_name,
+    const std::vector<EValue>& outputs);
+
+const int TensorAlignment = 16;
+
+int main(int argc, char** argv) {
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+  executorch::runtime::runtime_init();
+
+  // Load the model.
+  Module model(FLAGS_model_path.c_str());
+  auto load_method_error = model.load_method(FLAGS_method.c_str());
+  if (load_method_error != Error::Ok) {
+    std::cerr << "Failed to load method \"" << FLAGS_method << "\": "
+              << static_cast<int>(load_method_error) << std::endl;
+    return -1;
+  }
+
+  // Load the input tensor data. Note that the data loader has to live as long as the flat
+  // tensor data map does.
+  auto input_loader_result = FileDataLoader::from(FLAGS_input_path.c_str());
+  if (!input_loader_result.ok()) {
+    std::cerr << "Failed to open input file: error "
+              << static_cast<int>(input_loader_result.error()) << std::endl;
+    return -1;
+  }
+
+  auto load_result = load_input_data(*input_loader_result);
+  if (!load_result.ok()) {
+    return -1;
+  }
+  auto input_map = std::move(load_result.get());
+
+  auto cases = collect_test_cases(input_map);
+  std::map<std::string, TensorPtr> output_map;
+
+  // Run each case and store the outputs.
+  for (auto& [name, test_case] : cases) {
+    auto result = run_test_case(model, test_case);
+    if (!result.ok()) {
+      std::cerr << "Failed to run test case \"" << name << "\": "
+                << static_cast<int>(result.error()) << std::endl;
+      return -1;
+    }
+
+    store_outputs(output_map, name, result.get());
+  }
+
+  // Create a map of Tensor (unowned), rather than TensorPtr (owned).
+  std::map<std::string, Tensor> output_map_tensors;
+  for (auto& [key, value] : output_map) {
+    output_map_tensors.emplace(key, *value);
+  }
+
+  // Write the output data in .ptd format.
+  auto save_result = executorch::extension::flat_tensor::save_ptd(
+      FLAGS_output_path.c_str(),
+      output_map_tensors,
+      TensorAlignment);
+
+  if (save_result != Error::Ok) {
+    std::cerr << "Failed to save outputs: " << static_cast<int>(save_result) << std::endl;
+    return -1;
+  }
+
+  std::cout << "Successfully wrote output tensors to " << FLAGS_output_path << "." << std::endl;
+  return 0;
+}
+
+// Group inputs by test case and build tensors.
+std::map<std::string, TestCase> collect_test_cases(FlatTensorDataMap& input_map) {
+  std::map<std::string, TestCase> cases;
+
+  for (auto i = 0u; i < input_map.get_num_keys().get(); i++) {
+    auto key = input_map.get_key(i).get();
+
+    // Split the key into test case name and input index.
+    auto [test_case_name, input_index] = *parse_key(key);
+
+    // Get or create the test case instance.
+    auto& test_case = cases[test_case_name];
+
+    // Create a tensor from the layout and data.
+    auto tensor_layout = input_map.get_tensor_layout(key).get();
+    auto tensor_data = std::unique_ptr<char, decltype(&free)>(
+        static_cast<char*>(malloc(tensor_layout.nbytes())), free);
+    auto load_result = input_map.load_data_into(key, tensor_data.get(), tensor_layout.nbytes());
+    if (load_result != Error::Ok) {
+      std::cerr << "Load failed: " << static_cast<int>(load_result) << std::endl;
+      exit(-1);
+    }
+
+    auto input_tensor = create_tensor(tensor_layout, std::move(tensor_data));
+    test_case.inputs[input_index] = std::move(input_tensor);
+  }
+
+  return cases;
+}
+
+// Create a tensor from a layout and data blob.
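+// The returned TensorPtr takes ownership of the data: the buffer is released from the
+// unique_ptr and freed by the custom deleter passed to make_tensor_ptr.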
+TensorPtr create_tensor(TensorLayout& layout, std::unique_ptr<char, decltype(&free)> buffer) {
+  // Sizes and dim order have different types in TensorLayout vs Tensor.
+  std::vector<executorch::aten::SizesType> sizes;
+  for (auto x : layout.sizes()) {
+    sizes.push_back(x);
+  }
+  std::vector<executorch::aten::DimOrderType> dim_order;
+  for (auto x : layout.dim_order()) {
+    dim_order.push_back(x);
+  }
+
+  auto raw_data = buffer.release();
+
+  return executorch::extension::make_tensor_ptr(
+      sizes,
+      raw_data,
+      dim_order,
+      {}, // Strides - infer from sizes + dim order.
+      layout.scalar_type(),
+      exec_aten::TensorShapeDynamism::STATIC,
+      [](void* ptr) {
+        free(ptr);
+      });
+}
+
+// Load the input data (in .ptd file format) from the given path.
+Result<FlatTensorDataMap> load_input_data(FileDataLoader& loader) {
+  auto input_data_map_load_result = FlatTensorDataMap::load(&loader);
+  if (!input_data_map_load_result.ok()) {
+    std::cerr << "Failed to load input data map: error "
+              << static_cast<int>(input_data_map_load_result.error()) << std::endl;
+  }
+
+  return input_data_map_load_result;
+}
+
+// Parse a string key of the form "test_case:input index". Returns a tuple of the test case name
+// and input index.
+std::optional<std::tuple<std::string, int>> parse_key(const std::string& key) {
+  auto delimiter = key.find(":");
+  if (delimiter == std::string::npos) {
+    return std::nullopt;
+  }
+
+  auto test_case = key.substr(0, delimiter);
+  auto index_str = key.substr(delimiter + 1);
+  auto index = std::stoi(index_str);
+
+  return {{test_case, index}};
+}
+
+// Run a given test case and return the resulting output values.
+Result<std::vector<EValue>> run_test_case(Module& module, TestCase& test_case) {
+  for (auto& [index, value] : test_case.inputs) {
+    auto set_input_error = module.set_input(FLAGS_method, value, index);
+    if (set_input_error != Error::Ok) {
+      std::cerr << "Failed to set input " << index << ": "
+                << static_cast<int>(set_input_error) << "." << std::endl;
+    }
+  }
+
+  return module.execute(FLAGS_method.c_str());
+}
+
+// Store output tensors into the named data map.
+void store_outputs(
+    std::map<std::string, TensorPtr>& output_map,
+    const std::string& case_name,
+    const std::vector<EValue>& outputs) {
+  // Because the outputs are likely memory planned, we need to clone the tensor
+  // here to avoid having the data clobbered by the next run.
+
+  for (auto i = 0u; i < outputs.size(); i++) {
+    if (!outputs[i].isTensor()) {
+      continue;
+    }
+
+    auto key_name = case_name + ":" + std::to_string(i);
+    auto& tensor = outputs[i].toTensor();
+
+    // Copy tensor storage.
+    auto tensor_memory = malloc(tensor.nbytes());
+    memcpy(tensor_memory, tensor.const_data_ptr(), tensor.nbytes());
+
+    // Copy tensor metadata.
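+    // The sizes and dim order spans point into the module's memory-planned tensor, so copy
+    // them into std::vectors that the cloned output tensor owns alongside the copied storage.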
+    std::vector<executorch::aten::SizesType> sizes(
+        tensor.sizes().begin(),
+        tensor.sizes().end()
+    );
+
+    std::vector<executorch::aten::DimOrderType> dim_order(
+        tensor.dim_order().begin(),
+        tensor.dim_order().end()
+    );
+
+    output_map.emplace(key_name, executorch::extension::make_tensor_ptr(
+        sizes,
+        tensor_memory,
+        dim_order,
+        {}, // Strides - implicit.
+        tensor.scalar_type(),
+        exec_aten::TensorShapeDynamism::STATIC,
+        [](void* ptr) {
+          free(ptr);
+        }
+    ));
+  }
+}
diff --git a/backends/xnnpack/test/tester/__init__.py b/backends/xnnpack/test/tester/__init__.py
index a4527d9edc8..5d1dcfe84cd 100644
--- a/backends/xnnpack/test/tester/__init__.py
+++ b/backends/xnnpack/test/tester/__init__.py
@@ -6,6 +6,7 @@
 from executorch.backends.xnnpack.test.tester.tester import (
     Export,
+    ToEdge,
     Partition,
     Quantize,
     RunPasses,
@@ -18,12 +19,12 @@
 
 __all__ = [
     "Export",
-    "ToEdge",
     "Partition",
     "Quantize",
     "RunPasses",
-    "ToEdgeTransformAndLower",
-    "Tester",
     "Serialize",
+    "Tester",
+    "ToEdge",
+    "ToEdgeTransformAndLower",
     "ToExecutorch",
 ]
diff --git a/pytest.ini b/pytest.ini
index 557a307bdf2..de7d932b946 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -39,6 +39,11 @@ addopts =
     # but maybe it is a bit of anti-pattern
     --ignore=kernels/quantized/test/test_quant_dequant_per_token.py
     kernels/test/test_case_gen.py
+    # backends/test
+    # This effort is WIP and will be enabled in CI once testing infra
+    # is stable and the signal-to-noise ratio is good (no irrelevant failures).
+    # See https://github.com/pytorch/executorch/discussions/11140
+    --ignore=backends/test
     # backends/xnnpack
     backends/xnnpack/test/ops
     --ignore=backends/xnnpack/test/ops/test_bmm.py