diff --git a/backends/apple/coreml/test/test_coreml_quantizer.py b/backends/apple/coreml/test/test_coreml_quantizer.py
index 461044f4d53..8dfb46cbbdc 100644
--- a/backends/apple/coreml/test/test_coreml_quantizer.py
+++ b/backends/apple/coreml/test/test_coreml_quantizer.py
@@ -15,12 +15,12 @@
 )
 
 from executorch.backends.apple.coreml.quantizer import CoreMLQuantizer
-from torch._export import capture_pre_autograd_graph
 from torch.ao.quantization.quantize_pt2e import (
     convert_pt2e,
     prepare_pt2e,
     prepare_qat_pt2e,
 )
+from torch.export import export_for_training
 
 
 class TestCoreMLQuantizer:
@@ -32,7 +32,7 @@ def quantize_and_compare(
     ) -> None:
         assert quantization_type in {"PTQ", "QAT"}
 
-        pre_autograd_aten_dialect = capture_pre_autograd_graph(model, example_inputs)
+        pre_autograd_aten_dialect = export_for_training(model, example_inputs).module()
 
         quantization_config = LinearQuantizerConfig.from_dict(
             {
diff --git a/backends/apple/mps/test/test_mps_utils.py b/backends/apple/mps/test/test_mps_utils.py
index 6f7d00d7b09..39ce5df5115 100644
--- a/backends/apple/mps/test/test_mps_utils.py
+++ b/backends/apple/mps/test/test_mps_utils.py
@@ -209,9 +209,9 @@ def lower_module_and_test_output(
 
         expected_output = model(*sample_inputs)
 
-        model = torch._export.capture_pre_autograd_graph(
+        model = torch.export.export_for_training(
             model, sample_inputs, dynamic_shapes=dynamic_shapes
-        )
+        ).module()
 
         edge_program = export_to_edge(
             model,
diff --git a/backends/mediatek/quantizer/annotator.py b/backends/mediatek/quantizer/annotator.py
index dcbaf58833a..e315599cf7f 100644
--- a/backends/mediatek/quantizer/annotator.py
+++ b/backends/mediatek/quantizer/annotator.py
@@ -7,8 +7,6 @@
 from typing import Callable, List
 
 import torch
-
-from torch._export import capture_pre_autograd_graph
 from torch._ops import OpOverload
 from torch._subclasses import FakeTensor
 
@@ -17,6 +15,8 @@
     _annotate_input_qspec_map,
     _annotate_output_qspec,
 )
+
+from torch.export import export_for_training
 from torch.fx import Graph, Node
 from torch.fx.passes.utils.matcher_with_name_node_map_utils import (
     SubgraphMatcherWithNameNodeMap,
@@ -159,7 +159,7 @@ def forward(self, x):
             return norm, {}
 
     for pattern_cls in (ExecuTorchPattern, MTKPattern):
-        pattern_gm = capture_pre_autograd_graph(pattern_cls(), (torch.randn(3, 3),))
+        pattern_gm = export_for_training(pattern_cls(), (torch.randn(3, 3),)).module()
         matcher = SubgraphMatcherWithNameNodeMap(
             pattern_gm, ignore_literals=True, remove_overlapping_matches=False
         )
diff --git a/backends/transforms/test/test_duplicate_dynamic_quant_chain.py b/backends/transforms/test/test_duplicate_dynamic_quant_chain.py
index 7d1ef169c8c..637ce807c18 100644
--- a/backends/transforms/test/test_duplicate_dynamic_quant_chain.py
+++ b/backends/transforms/test/test_duplicate_dynamic_quant_chain.py
@@ -8,7 +8,6 @@
 import unittest
 
 import torch
-import torch._export as export
 from executorch.backends.transforms.duplicate_dynamic_quant_chain import (
     DuplicateDynamicQuantChainPass,
 )
@@ -59,10 +58,10 @@ def _test_duplicate_chain(
 
         # program capture
         m = copy.deepcopy(m_eager)
-        m = export.capture_pre_autograd_graph(
+        m = torch.export.export_for_training(
             m,
             example_inputs,
-        )
+        ).module()
 
         m = prepare_pt2e(m, quantizer)
         # Calibrate
diff --git a/examples/llm_manual/export_nanogpt.py b/examples/llm_manual/export_nanogpt.py
index cf29a69c080..2d69c50ec99 100644
--- a/examples/llm_manual/export_nanogpt.py
+++ b/examples/llm_manual/export_nanogpt.py
@@ -15,8 +15,7 @@
 from executorch.exir import to_edge
 
 from model import GPT
-from torch._export import capture_pre_autograd_graph
-from torch.export import export
+from torch.export import export, export_for_training
 from torch.nn.attention import sdpa_kernel, SDPBackend
 
 model = GPT.from_pretrained("gpt2")  # use gpt2 weight as pretrained weight
@@ -28,7 +27,9 @@
 # Trace the model, converting it to a portable intermediate representation.
 # The torch.no_grad() call tells PyTorch to exclude training-specific logic.
 with sdpa_kernel([SDPBackend.MATH]), torch.no_grad():
-    m = capture_pre_autograd_graph(model, example_inputs, dynamic_shapes=dynamic_shape)
+    m = export_for_training(
+        model, example_inputs, dynamic_shapes=dynamic_shape
+    ).module()
     traced_model = export(m, example_inputs, dynamic_shapes=dynamic_shape)
 
 # Convert the model into a runnable ExecuTorch program.
diff --git a/examples/mediatek/aot_utils/oss_utils/utils.py b/examples/mediatek/aot_utils/oss_utils/utils.py
index 8b4de4aac3a..cb55822b9de 100755
--- a/examples/mediatek/aot_utils/oss_utils/utils.py
+++ b/examples/mediatek/aot_utils/oss_utils/utils.py
@@ -30,7 +30,7 @@ def build_executorch_binary(
         if quant_dtype not in Precision:
            raise AssertionError(f"No support for Precision {quant_dtype}.")
 
-        captured_model = torch._export.capture_pre_autograd_graph(model, inputs)
+        captured_model = torch.export.export_for_training(model, inputs).module()
         annotated_model = prepare_pt2e(captured_model, quantizer)
         print("Quantizing the model...")
         # calibration
diff --git a/examples/mediatek/model_export_scripts/llama.py b/examples/mediatek/model_export_scripts/llama.py
index b2fef26a4cf..180195ee2c7 100644
--- a/examples/mediatek/model_export_scripts/llama.py
+++ b/examples/mediatek/model_export_scripts/llama.py
@@ -318,9 +318,9 @@ def export_to_et_ir(
         max_num_token, max_cache_size, True
     )
     print("Getting pre autograd ATen Dialect Graph")
-    pre_autograd_aten_dialect = torch._export.capture_pre_autograd_graph(
+    pre_autograd_aten_dialect = torch.export.export_for_training(
         model, example_inputs, dynamic_shapes=dynamic_shapes
-    )  # NOTE: Will be replaced with export
+    ).module()  # NOTE: Will be replaced with export
     quantizer = NeuropilotQuantizer()
     quantizer.setup_precision(getattr(Precision, precision))
     prepared_graph = prepare_pt2e(pre_autograd_aten_dialect, quantizer)
diff --git a/exir/tests/test_quantization.py b/exir/tests/test_quantization.py
index ebe94775221..269a9ee11bc 100644
--- a/exir/tests/test_quantization.py
+++ b/exir/tests/test_quantization.py
@@ -51,9 +51,9 @@ def test_resnet(self) -> None:
         m = torchvision.models.resnet18().eval()
         m_copy = copy.deepcopy(m)
         # program capture
-        m = torch._export.capture_pre_autograd_graph(
+        m = torch.export.export_for_training(
             m, copy.deepcopy(example_inputs)
-        )
+        ).module()
 
         quantizer = XNNPACKQuantizer()
         operator_config = get_symmetric_quantization_config(is_per_channel=True)
diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py
index ee54fe3660d..71588f44ac9 100644
--- a/extension/llm/export/builder.py
+++ b/extension/llm/export/builder.py
@@ -29,10 +29,10 @@
 from executorch.extension.export_util.utils import export_to_edge, save_pte_program
 from executorch.extension.llm.tokenizer.utils import get_tokenizer
 
-from torch._export import capture_pre_autograd_graph
 from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
 from torch.ao.quantization.quantizer import Quantizer
 from torch.ao.quantization.quantizer.composable_quantizer import ComposableQuantizer
+from torch.export import export_for_training
 from torch.nn.attention import SDPBackend
 
 FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
@@ -190,9 +190,9 @@ def capture_pre_autograd_graph(self) -> "LLMEdgeManager":
                     strict=True,
                 ).module()
             else:
-                self.pre_autograd_graph_module = capture_pre_autograd_graph(
+                self.pre_autograd_graph_module = export_for_training(
                     self.model, self.example_inputs, dynamic_shapes=dynamic_shape
-                )
+                ).module()
 
         return self
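
For context (not part of the diff): a minimal sketch of the capture pattern these changes migrate to. The TinyModel module and its example inputs below are hypothetical; the point is that torch.export.export_for_training returns an ExportedProgram, and calling .module() on it yields the pre-autograd GraphModule that prepare_pt2e/convert_pt2e expect, which capture_pre_autograd_graph used to return directly.

# Minimal sketch (assumed toy model, not taken from any file in this diff).
import torch
from torch.export import export_for_training


class TinyModel(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.linear = torch.nn.Linear(8, 4)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.relu(self.linear(x))


model = TinyModel().eval()
example_inputs = (torch.randn(2, 8),)

# Before: m = torch._export.capture_pre_autograd_graph(model, example_inputs)
# After: capture an ExportedProgram, then unwrap the pre-autograd graph module.
captured = export_for_training(model, example_inputs).module()
print(captured.graph)  # a GraphModule graph suitable for prepare_pt2e/convert_pt2e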