From e49880005e7826c134af05dfa0e4027881052cfb Mon Sep 17 00:00:00 2001
From: "Yanan Cao (PyTorch)"
Date: Mon, 31 Mar 2025 22:25:13 -0700
Subject: [PATCH] executorch (#9661)

Summary:
Pass strict=True explicitly at every export_for_training call site, so these
exports keep strict-mode semantics regardless of the upstream default.

Pull Request resolved: https://github.com/pytorch/executorch/pull/9661

Reviewed By: digantdesai

Differential Revision: D71520535
---
 .../coreml/test/test_coreml_quantizer.py      |  4 ++-
 backends/apple/mps/test/test_mps_utils.py     |  2 +-
 backends/cadence/aot/compiler.py              |  2 +-
 .../aot/tests/test_remove_ops_passes.py       |  2 +-
 backends/example/test_example_delegate.py     |  8 +++--
 backends/mediatek/quantizer/annotator.py      |  6 ++--
 backends/qualcomm/tests/utils.py              |  2 +-
 .../test_duplicate_dynamic_quant_chain.py     |  5 +--
 .../test/quantizer/test_pt2e_quantization.py  | 35 +++++++++----------
 .../test/quantizer/test_representation.py     |  5 +--
 .../test/quantizer/test_xnnpack_quantizer.py  | 18 +++-------
 backends/xnnpack/test/test_xnnpack_utils.py   |  5 +--
 backends/xnnpack/test/tester/tester.py        |  2 +-
 .../export-to-executorch-tutorial.py          |  6 ++--
 examples/apple/mps/scripts/mps_example.py     |  4 ++-
 examples/arm/aot_arm_compiler.py              |  9 +++--
 16 files changed, 55 insertions(+), 60 deletions(-)

diff --git a/backends/apple/coreml/test/test_coreml_quantizer.py b/backends/apple/coreml/test/test_coreml_quantizer.py
index 8dfb46cbbdc..db75631dbc8 100644
--- a/backends/apple/coreml/test/test_coreml_quantizer.py
+++ b/backends/apple/coreml/test/test_coreml_quantizer.py
@@ -32,7 +32,9 @@ def quantize_and_compare(
     ) -> None:
         assert quantization_type in {"PTQ", "QAT"}
 
-        pre_autograd_aten_dialect = export_for_training(model, example_inputs).module()
+        pre_autograd_aten_dialect = export_for_training(
+            model, example_inputs, strict=True
+        ).module()
 
         quantization_config = LinearQuantizerConfig.from_dict(
             {
diff --git a/backends/apple/mps/test/test_mps_utils.py b/backends/apple/mps/test/test_mps_utils.py
index a39583c66c6..674a4b0ba62 100644
--- a/backends/apple/mps/test/test_mps_utils.py
+++ b/backends/apple/mps/test/test_mps_utils.py
@@ -207,7 +207,7 @@ def lower_module_and_test_output(
         expected_output = model(*sample_inputs)
 
         model = torch.export.export_for_training(
-            model, sample_inputs, dynamic_shapes=dynamic_shapes
+            model, sample_inputs, dynamic_shapes=dynamic_shapes, strict=True
         ).module()
 
         edge_program = export_to_edge(
diff --git a/backends/cadence/aot/compiler.py b/backends/cadence/aot/compiler.py
index 93b3f85c529..3c7d94f8fe1 100644
--- a/backends/cadence/aot/compiler.py
+++ b/backends/cadence/aot/compiler.py
@@ -82,7 +82,7 @@ def convert_pt2(
     remove_decompositions(decomp_table, ops_to_keep)
     # Export with dynamo
     model_gm = (
-        torch.export.export_for_training(model, inputs)
+        torch.export.export_for_training(model, inputs, strict=True)
         .run_decompositions(decomp_table)
         .module()
     )
diff --git a/backends/cadence/aot/tests/test_remove_ops_passes.py b/backends/cadence/aot/tests/test_remove_ops_passes.py
index 42f4b87bdcb..dba4f711864 100644
--- a/backends/cadence/aot/tests/test_remove_ops_passes.py
+++ b/backends/cadence/aot/tests/test_remove_ops_passes.py
@@ -474,7 +474,7 @@ def forward(self, x):
         # Run the standard quant/convert steps, but without fusing
         # this leaves two redundant quant/dequant pairs to test with
         quantizer = CadenceDefaultQuantizer()
-        model_exp = export_for_training(M(), (inp,)).module()
+        model_exp = export_for_training(M(), (inp,), strict=True).module()
         prepared_model = prepare_pt2e(model_exp, quantizer)
         prepared_model(inp)
         converted_model = convert_pt2e(prepared_model)
diff --git a/backends/example/test_example_delegate.py b/backends/example/test_example_delegate.py
index 9e2b4e458cf..a382273af07 100644
--- a/backends/example/test_example_delegate.py
+++ b/backends/example/test_example_delegate.py
@@ -46,7 +46,9 @@ def get_example_inputs():
         )
         m = model.eval()
 
-        m = torch.export.export_for_training(m, copy.deepcopy(example_inputs)).module()
+        m = torch.export.export_for_training(
+            m, copy.deepcopy(example_inputs), strict=True
+        ).module()
         # print("original model:", m)
         quantizer = ExampleQuantizer()
         # quantizer = XNNPACKQuantizer()
@@ -82,7 +84,9 @@ def test_delegate_mobilenet_v2(self):
         )
         m = model.eval()
 
-        m = torch.export.export_for_training(m, copy.deepcopy(example_inputs)).module()
+        m = torch.export.export_for_training(
+            m, copy.deepcopy(example_inputs), strict=True
+        ).module()
 
         quantizer = ExampleQuantizer()
         m = prepare_pt2e(m, quantizer)
diff --git a/backends/mediatek/quantizer/annotator.py b/backends/mediatek/quantizer/annotator.py
index e315599cf7f..d250b774af8 100644
--- a/backends/mediatek/quantizer/annotator.py
+++ b/backends/mediatek/quantizer/annotator.py
@@ -44,7 +44,6 @@ def annotate(graph: Graph, quant_config: QuantizationConfig) -> None:
 
 
 def register_annotator(ops: List[OpOverload]):
-
     def decorator(annotator_fn: Callable):
         for op in ops:
             OP_TO_ANNOTATOR[op] = annotator_fn
@@ -147,7 +146,6 @@ def _annotate_fused_activation_pattern(
 
 
 def _annotate_rmsnorm_pattern(graph: Graph, quant_config: QuantizationConfig) -> None:
-
     class ExecuTorchPattern(torch.nn.Module):
         def forward(self, x):
             norm = x * torch.rsqrt((x * x).mean(-1, keepdim=True) + 1e-6)
@@ -159,7 +157,9 @@ def forward(self, x):
             return norm, {}
 
     for pattern_cls in (ExecuTorchPattern, MTKPattern):
-        pattern_gm = export_for_training(pattern_cls(), (torch.randn(3, 3),)).module()
+        pattern_gm = export_for_training(
+            pattern_cls(), (torch.randn(3, 3),), strict=True
+        ).module()
         matcher = SubgraphMatcherWithNameNodeMap(
             pattern_gm, ignore_literals=True, remove_overlapping_matches=False
         )
diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py
index 769f24ba0d8..cf3dc1d528b 100644
--- a/backends/qualcomm/tests/utils.py
+++ b/backends/qualcomm/tests/utils.py
@@ -567,7 +567,7 @@ def get_prepared_qat_module(
         custom_quant_annotations: Tuple[Callable] = (),
         quant_dtype: QuantDtype = QuantDtype.use_8a8w,
     ) -> torch.fx.GraphModule:
-        m = torch.export.export_for_training(module, inputs).module()
+        m = torch.export.export_for_training(module, inputs, strict=True).module()
 
         quantizer = make_quantizer(
             quant_dtype=quant_dtype,
diff --git a/backends/transforms/test/test_duplicate_dynamic_quant_chain.py b/backends/transforms/test/test_duplicate_dynamic_quant_chain.py
index 91d2ddc916a..ab965dd347d 100644
--- a/backends/transforms/test/test_duplicate_dynamic_quant_chain.py
+++ b/backends/transforms/test/test_duplicate_dynamic_quant_chain.py
@@ -58,10 +58,7 @@ def _test_duplicate_chain(
 
         # program capture
         m = copy.deepcopy(m_eager)
-        m = torch.export.export_for_training(
-            m,
-            example_inputs,
-        ).module()
+        m = torch.export.export_for_training(m, example_inputs, strict=True).module()
         m = prepare_pt2e(m, quantizer)
 
         # Calibrate
diff --git a/backends/xnnpack/test/quantizer/test_pt2e_quantization.py b/backends/xnnpack/test/quantizer/test_pt2e_quantization.py
index ea6116a6f0a..34b6f745044 100644
--- a/backends/xnnpack/test/quantizer/test_pt2e_quantization.py
+++ b/backends/xnnpack/test/quantizer/test_pt2e_quantization.py
@@ -326,7 +326,7 @@ def test_disallow_eval_train(self) -> None:
         m.train()
 
         # After export: this is not OK
-        m = export_for_training(m, example_inputs).module()
+        m = export_for_training(m, example_inputs, strict=True).module()
         with self.assertRaises(NotImplementedError):
             m.eval()
         with self.assertRaises(NotImplementedError):
@@ -380,7 +380,7 @@ def forward(self, x):
         m = M().train()
         example_inputs = (torch.randn(1, 3, 3, 3),)
         bn_train_op, bn_eval_op = self._get_bn_train_eval_ops()  # pyre-ignore[23]
-        m = export_for_training(m, example_inputs).module()
+        m = export_for_training(m, example_inputs, strict=True).module()
 
         def _assert_ops_are_correct(m: torch.fx.GraphModule, train: bool) -> None:
             bn_op = bn_train_op if train else bn_eval_op
@@ -449,10 +449,7 @@ def forward(self, x):
         quantizer.set_global(operator_config)
         example_inputs = (torch.randn(2, 2),)
         m = M().eval()
-        m = export_for_training(
-            m,
-            example_inputs,
-        ).module()
+        m = export_for_training(m, example_inputs, strict=True).module()
         weight_meta = None
         for n in m.graph.nodes:  # pyre-ignore[16]
             if (
@@ -481,7 +478,7 @@ def test_reentrant(self) -> None:
             get_symmetric_quantization_config(is_per_channel=True, is_qat=True)
         )
         m.conv_bn_relu = export_for_training(  # pyre-ignore[8]
-            m.conv_bn_relu, example_inputs
+            m.conv_bn_relu, example_inputs, strict=True
         ).module()
         m.conv_bn_relu = prepare_qat_pt2e(m.conv_bn_relu, quantizer)  # pyre-ignore[6,8]
         m(*example_inputs)
@@ -490,7 +487,7 @@ def test_reentrant(self) -> None:
         quantizer = XNNPACKQuantizer().set_module_type(
             torch.nn.Linear, get_symmetric_quantization_config(is_per_channel=False)
         )
-        m = export_for_training(m, example_inputs).module()
+        m = export_for_training(m, example_inputs, strict=True).module()
         m = prepare_pt2e(m, quantizer)  # pyre-ignore[6]
         m = convert_pt2e(m)
 
@@ -553,7 +550,7 @@ def check_nn_module(node: torch.fx.Node) -> None:
         )
 
         m.conv_bn_relu = export_for_training(  # pyre-ignore[8]
-            m.conv_bn_relu, example_inputs
+            m.conv_bn_relu, example_inputs, strict=True
         ).module()
         for node in m.conv_bn_relu.graph.nodes:  # pyre-ignore[16]
             if node.op not in ["placeholder", "output", "get_attr"]:
@@ -568,7 +565,7 @@ def test_speed(self) -> None:
 
         def dynamic_quantize_pt2e(model, example_inputs) -> torch.fx.GraphModule:
             torch._dynamo.reset()
-            model = export_for_training(model, example_inputs).module()
+            model = export_for_training(model, example_inputs, strict=True).module()
             # Per channel quantization for weight
             # Dynamic quantization for activation
             # Please read a detail: https://fburl.com/code/30zds51q
@@ -625,7 +622,7 @@ def forward(self, x):
 
         example_inputs = (torch.randn(1, 3, 5, 5),)
         m = M()
-        m = export_for_training(m, example_inputs).module()
+        m = export_for_training(m, example_inputs, strict=True).module()
         quantizer = XNNPACKQuantizer().set_global(
             get_symmetric_quantization_config(),
         )
@@ -701,7 +698,6 @@ def test_save_load(self) -> None:
 
 
 class TestNumericDebugger(TestCase):
-
     def _extract_debug_handles(self, model) -> Dict[str, int]:
         debug_handle_map: Dict[str, int] = {}
 
@@ -731,7 +727,7 @@ def _assert_node_has_debug_handle(node: torch.fx.Node) -> None:
     def test_quantize_pt2e_preserve_handle(self) -> None:
         m = TestHelperModules.Conv2dThenConv1d()
         example_inputs = m.example_inputs()
-        ep = export_for_training(m, example_inputs)
+        ep = export_for_training(m, example_inputs, strict=True)
         generate_numeric_debug_handle(ep)
         m = ep.module()
 
@@ -761,7 +757,7 @@ def test_quantize_pt2e_preserve_handle(self) -> None:
     def test_extract_results_from_loggers(self) -> None:
         m = TestHelperModules.Conv2dThenConv1d()
         example_inputs = m.example_inputs()
-        ep = export_for_training(m, example_inputs)
+        ep = export_for_training(m, example_inputs, strict=True)
         generate_numeric_debug_handle(ep)
         m = ep.module()
         m_ref_logger = prepare_for_propagation_comparison(m)  # pyre-ignore[6]
@@ -779,18 +775,20 @@ def test_extract_results_from_loggers(self) -> None:
         ref_results = extract_results_from_loggers(m_ref_logger)
         quant_results = extract_results_from_loggers(m_quant_logger)
         comparison_results = compare_results(
-            ref_results, quant_results  # pyre-ignore[6]
+            ref_results,
+            quant_results,  # pyre-ignore[6]
         )
         for node_summary in comparison_results.values():
             if len(node_summary.results) > 0:
                 self.assertGreaterEqual(
-                    node_summary.results[0].sqnr, 35  # pyre-ignore[6]
+                    node_summary.results[0].sqnr,
+                    35,  # pyre-ignore[6]
                 )
 
     def test_extract_results_from_loggers_list_output(self) -> None:
         m = TestHelperModules.Conv2dWithSplit()
         example_inputs = m.example_inputs()
-        ep = export_for_training(m, example_inputs)
+        ep = export_for_training(m, example_inputs, strict=True)
         generate_numeric_debug_handle(ep)
         m = ep.module()
         m_ref_logger = prepare_for_propagation_comparison(m)  # pyre-ignore[6]
@@ -808,7 +806,8 @@ def test_extract_results_from_loggers_list_output(self) -> None:
         ref_results = extract_results_from_loggers(m_ref_logger)
         quant_results = extract_results_from_loggers(m_quant_logger)
         comparison_results = compare_results(
-            ref_results, quant_results  # pyre-ignore[6]
+            ref_results,
+            quant_results,  # pyre-ignore[6]
         )
         for node_summary in comparison_results.values():
             if len(node_summary.results) > 0:
diff --git a/backends/xnnpack/test/quantizer/test_representation.py b/backends/xnnpack/test/quantizer/test_representation.py
index 83cecaec5ad..e52bbbd7ae7 100644
--- a/backends/xnnpack/test/quantizer/test_representation.py
+++ b/backends/xnnpack/test/quantizer/test_representation.py
@@ -33,10 +33,7 @@ def _test_representation(
     ) -> None:
         # resetting dynamo cache
         torch._dynamo.reset()
-        model = export_for_training(
-            model,
-            example_inputs,
-        ).module()
+        model = export_for_training(model, example_inputs, strict=True).module()
         model_copy = copy.deepcopy(model)
 
         model = prepare_pt2e(model, quantizer)  # pyre-ignore[6]
diff --git a/backends/xnnpack/test/quantizer/test_xnnpack_quantizer.py b/backends/xnnpack/test/quantizer/test_xnnpack_quantizer.py
index 57aacf55263..856030755af 100644
--- a/backends/xnnpack/test/quantizer/test_xnnpack_quantizer.py
+++ b/backends/xnnpack/test/quantizer/test_xnnpack_quantizer.py
@@ -361,7 +361,7 @@ def forward(self, x):
         )
         example_inputs = (torch.randn(2, 2),)
         m = M().eval()
-        m = export_for_training(m, example_inputs).module()
+        m = export_for_training(m, example_inputs, strict=True).module()
         m = prepare_pt2e(m, quantizer)  # pyre-ignore[6]
         # Use a linear count instead of names because the names might change, but
         # the order should be the same.
@@ -497,10 +497,7 @@ def test_propagate_annotation(self):
         example_inputs = (torch.randn(1, 3, 5, 5),)
 
         # program capture
-        m = export_for_training(
-            m,
-            example_inputs,
-        ).module()
+        m = export_for_training(m, example_inputs, strict=True).module()
         m = prepare_pt2e(m, quantizer)
         m(*example_inputs)
 
@@ -766,8 +763,7 @@ def forward(self, input_tensor, hidden_tensor):
 
         with torchdynamo.config.patch(allow_rnn=True):
             model_graph = export_for_training(
-                model_graph,
-                example_inputs,
+                model_graph, example_inputs, strict=True
             ).module()
             quantizer = XNNPACKQuantizer()
             quantization_config = get_symmetric_quantization_config(
@@ -829,8 +825,7 @@ def forward(self, input_tensor, hidden_tensor):
 
         with torchdynamo.config.patch(allow_rnn=True):
             model_graph = export_for_training(
-                model_graph,
-                example_inputs,
+                model_graph, example_inputs, strict=True
             ).module()
             quantizer = XNNPACKQuantizer()
             quantization_config = get_symmetric_quantization_config(
@@ -1039,10 +1034,7 @@ def test_resnet18(self):
         m = torchvision.models.resnet18().eval()
         m_copy = copy.deepcopy(m)
         # program capture
-        m = export_for_training(
-            m,
-            example_inputs,
-        ).module()
+        m = export_for_training(m, example_inputs, strict=True).module()
 
         quantizer = XNNPACKQuantizer()
         quantization_config = get_symmetric_quantization_config(is_per_channel=True)
diff --git a/backends/xnnpack/test/test_xnnpack_utils.py b/backends/xnnpack/test/test_xnnpack_utils.py
index f11075cf261..3ff2f0e4c1e 100644
--- a/backends/xnnpack/test/test_xnnpack_utils.py
+++ b/backends/xnnpack/test/test_xnnpack_utils.py
@@ -317,10 +317,7 @@ def quantize_and_test_model_with_quantizer(
         module.eval()
 
         # program capture
-        m = export_for_training(
-            module,
-            example_inputs,
-        ).module()
+        m = export_for_training(module, example_inputs, strict=True).module()
 
         quantizer = XNNPACKQuantizer()
         quantization_config = get_symmetric_quantization_config()
diff --git a/backends/xnnpack/test/tester/tester.py b/backends/xnnpack/test/tester/tester.py
index a82688cd52c..cbce817cf4b 100644
--- a/backends/xnnpack/test/tester/tester.py
+++ b/backends/xnnpack/test/tester/tester.py
@@ -166,7 +166,7 @@ def run(
         self, artifact: torch.nn.Module, inputs: Optional[Tuple[torch.Tensor]]
     ) -> None:
         assert inputs is not None
-        captured_graph = export_for_training(artifact, inputs).module()
+        captured_graph = export_for_training(artifact, inputs, strict=True).module()
         assert isinstance(captured_graph, torch.fx.GraphModule)
 
         prepared = prepare_pt2e(captured_graph, self.quantizer)
diff --git a/docs/source/tutorials_source/export-to-executorch-tutorial.py b/docs/source/tutorials_source/export-to-executorch-tutorial.py
index 86a816f1435..de42cb51bce 100644
--- a/docs/source/tutorials_source/export-to-executorch-tutorial.py
+++ b/docs/source/tutorials_source/export-to-executorch-tutorial.py
@@ -190,7 +190,9 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
 from torch.export import export_for_training
 
 example_args = (torch.randn(1, 3, 256, 256),)
-pre_autograd_aten_dialect = export_for_training(SimpleConv(), example_args).module()
+pre_autograd_aten_dialect = export_for_training(
+    SimpleConv(), example_args, strict=True
+).module()
 print("Pre-Autograd ATen Dialect Graph")
 print(pre_autograd_aten_dialect)
 
@@ -555,7 +557,7 @@ def forward(self, x):
 
 
 example_args = (torch.randn(3, 4),)
-pre_autograd_aten_dialect = export_for_training(M(), example_args).module()
+pre_autograd_aten_dialect = export_for_training(M(), example_args, strict=True).module()
 # Optionally do quantization:
 # pre_autograd_aten_dialect = convert_pt2e(prepare_pt2e(pre_autograd_aten_dialect, CustomBackendQuantizer))
 aten_dialect = export(pre_autograd_aten_dialect, example_args, strict=True)
diff --git a/examples/apple/mps/scripts/mps_example.py b/examples/apple/mps/scripts/mps_example.py
index fb1c4e971f9..c1a2e150286 100644
--- a/examples/apple/mps/scripts/mps_example.py
+++ b/examples/apple/mps/scripts/mps_example.py
@@ -166,7 +166,9 @@ def get_model_config(args):
 
     # pre-autograd export. eventually this will become torch.export
     with torch.no_grad():
-        model = torch.export.export_for_training(model, example_inputs).module()
+        model = torch.export.export_for_training(
+            model, example_inputs, strict=True
+        ).module()
         edge: EdgeProgramManager = export_to_edge(
             model,
             example_inputs,
diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py
index 5fb12342a2d..446d1a4eca4 100644
--- a/examples/arm/aot_arm_compiler.py
+++ b/examples/arm/aot_arm_compiler.py
@@ -224,7 +224,6 @@ def forward(self, x):
 
 
 class MultipleOutputsModule(torch.nn.Module):
-
     def forward(self, x: torch.Tensor, y: torch.Tensor):
         return (x * y, x.sum(dim=-1, keepdim=True))
 
@@ -648,7 +647,9 @@ def to_edge_TOSA_delegate(
         )
         model_int8 = model
         # Wrap quantized model back into an exported_program
-        exported_program = torch.export.export_for_training(model, example_inputs)
+        exported_program = torch.export.export_for_training(
+            model, example_inputs, strict=True
+        )
 
     if args.intermediates:
         os.makedirs(args.intermediates, exist_ok=True)
@@ -681,7 +682,9 @@ def to_edge_TOSA_delegate(
 
     # export_for_training under the assumption we quantize, the exported form also works
     # in to_edge if we don't quantize
-    exported_program = torch.export.export_for_training(model, example_inputs)
+    exported_program = torch.export.export_for_training(
+        model, example_inputs, strict=True
+    )
     model = exported_program.module()
     model_fp32 = model
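---

Every hunk above applies the same mechanical change. As a minimal sketch of
the resulting call-site pattern (DemoModule and its example inputs are
hypothetical stand-ins, not code from this patch):

    import torch
    from torch.export import export_for_training


    class DemoModule(torch.nn.Module):
        def forward(self, x: torch.Tensor) -> torch.Tensor:
            return torch.nn.functional.relu(x)


    example_inputs = (torch.randn(1, 3, 8, 8),)

    # Previously these call sites left `strict` to the framework default.
    # Pinning strict=True keeps TorchDynamo-based (strict) export semantics
    # even if that default changes upstream.
    pre_autograd_module = export_for_training(
        DemoModule(), example_inputs, strict=True
    ).module()
    assert isinstance(pre_autograd_module, torch.fx.GraphModule)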