run_train.sh (1 addition, 1 deletion)
@@ -19,6 +19,6 @@ TORCHFT_LIGHTHOUSE=${TORCHFT_LIGHTHOUSE:-"http://localhost:29510"}

PYTORCH_ALLOC_CONF="expandable_segments:True" \
TORCHFT_LIGHTHOUSE=${TORCHFT_LIGHTHOUSE} \
-torchrun --nproc_per_node=${NGPU} --rdzv_backend c10d --rdzv_endpoint="localhost:0" \
+torchrun --virtual-local-rank --nproc_per_node=${NGPU} --rdzv_backend c10d --rdzv_endpoint="localhost:0" \
--local-ranks-filter ${LOG_RANK} --role rank --tee 3 \
-m ${TRAIN_FILE} --job.config_file ${CONFIG_FILE} "$@"
torchtitan/config/job_config.py (5 additions, 0 deletions)
@@ -660,7 +660,12 @@ class Compile:
        default_factory=lambda: ["model", "loss"]
    )
    """Which components to compile"""

+    backend: str = "inductor"
+    """Which backend to compile with"""
+
+    enable_precompilation: bool = False
+    """Enable AOT precompilation to save compiled function to disk"""


@dataclass
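For context, a minimal sketch of how these two knobs are consumed; it is not part of the diff, the `components` field name and the scaffolding around it are assumptions, and the enable_aot_compile toggle mirrors the train.py change further below.

# Illustrative sketch only, not torchtitan code.
from dataclasses import dataclass, field

import torch


@dataclass
class Compile:
    components: list[str] = field(default_factory=lambda: ["model", "loss"])
    backend: str = "inductor"
    enable_precompilation: bool = False


cfg = Compile(enable_precompilation=True)
if cfg.enable_precompilation:
    # Mirrors Trainer.__init__ in this PR: flip Dynamo's AOT-compile switch
    # before the model is compiled so the compiled function can be saved to disk.
    torch._dynamo.config.enable_aot_compile = True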
torchtitan/experiments/simple_fsdp/simple_fsdp.py (14 additions, 1 deletion)
@@ -4,6 +4,7 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

+import sys
from collections.abc import Sequence
from contextlib import contextmanager
from dataclasses import dataclass
@@ -159,6 +160,8 @@ def _distribute_dtensor(
    )


+# global counter to ensure unique class names for precompilation
+_wrap_class_counter = 0
def _register_parametrization(
    module: nn.Module, param_names: list[str], parametrization: nn.Module
) -> None:
@@ -169,18 +172,28 @@ def _register_parametrization(
    TODO: In checkpoint saving/loading, avoid parametrization calls when calling
    get_model_state_dict func in torchtitan's torchtitan/components/checkpoint.py.
    """
+    global _wrap_class_counter
+    _wrap_class_counter += 1
    param_name_to_property = {
        param_name: property(
            lambda self, pn=param_name: parametrization(self._parameters[pn])
        )
        for param_name in param_names
    }
    module_cls = type(
-        f"SimpleFSDP{module.__class__.__name__}",
+        f"SimpleFSDP{module.__class__.__name__}_{_wrap_class_counter}",
        (module.__class__,),
        param_name_to_property,
    )
    module.__class__ = module_cls
+    module_cls.__module__ = module.__class__.__module__
+    # Expose the dynamically created subclass as a real, importable symbol in its module.
+    # This is necessary for precompilation to work.
+    setattr(
+        sys.modules[module_cls.__module__],
+        module_cls.__name__,
+        module_cls,
+    )


def fsdp_policy():
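An aside on the setattr above: a standalone sketch, in plain Python rather than torchtitan code, of why a type()-created subclass must be published on its defining module before any machinery that resolves classes by name (pickling stands in here for the precompiled artifact's class lookup) can find it.

import pickle
import sys


class Linear:  # stands in for the wrapped nn.Module subclass
    pass


_wrap_class_counter = 1  # unique suffix, as in _register_parametrization
cls = type(f"SimpleFSDPLinear_{_wrap_class_counter}", (Linear,), {})
cls.__module__ = __name__
# Without this setattr, pickle.dumps below raises
# "Can't pickle ...: attribute lookup ... failed".
setattr(sys.modules[__name__], cls.__name__, cls)

restored = pickle.loads(pickle.dumps(cls()))
assert type(restored).__name__ == "SimpleFSDPLinear_1"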
torchtitan/train.py (58 additions, 2 deletions)
@@ -74,6 +74,9 @@ def __init__(self, job_config: JobConfig):

        self.job_config = job_config

+        if job_config.compile.enable_precompilation:
Contributor:
qq: Is this simplefsdp-only, or does it also work with fsdp2 + block-level compile?

Maybe you want to add this config to apply_compile here for fsdp2:

def apply_compile(model: nn.Module, compile_config: CompileConfig):
    """
    Apply torch.compile to each TransformerBlock, which makes compilation efficient due to
    repeated structure. Alternatively one can compile the whole model (after applying DP).
    """
    for layer_id, transformer_block in model.layers.named_children():
        transformer_block = torch.compile(
            transformer_block, backend=compile_config.backend, fullgraph=True
        )
        model.layers.register_module(layer_id, transformer_block)
    logger.info("Compiling each TransformerBlock with torch.compile")

and here for simplefsdp: https://github.com/pytorch/torchtitan/blob/main/torchtitan/experiments/simple_fsdp/llama3/parallelize.py#L151-L152?

Contributor (author):
Currently only simplefsdp but this should work with fsdp2+block-level compile with some additional work.

+            torch._dynamo.config.enable_aot_compile = True
+
        logger.info(f"Starting job: {job_config.job.description}")

        if job_config.experimental.custom_import:
@@ -380,6 +383,53 @@ def init_distributed(self) -> ParallelDims:
            world_size=world_size,
        )

+    def _get_precompiled_function_path(self) -> str:
+        """
+        Generate a unique path for the precompiled function based on model configuration.
+
+        Returns:
+            Path to the precompiled function file.
+        """
+        rank = int(os.environ["RANK"])
+        model_name = self.job_config.model.name
+        model_flavor = self.job_config.model.flavor
+
+        # Create a unique filename based on model configuration and rank
+        filename = f"compiled_fn_{model_name}_{model_flavor}_rank_{rank}.pt"
+        return os.path.join("/tmp", filename)
Contributor:
This isn't a realistic file path for training on FB infra, as /tmp is cleared if you restart training.

Contributor (author):
Agreed. For FB infra, we would either package the artifact into the conda or fbpkg build, or place it in oilfs and keep a reference to it. For Torchtitan, using /tmp seemed acceptable, though I can make the location configurable through an environment variable. Did you have a different approach in mind?
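A possible shape for that follow-up, sketched here purely as an illustration; the TORCHTITAN_PRECOMPILE_DIR variable name is made up and not part of this PR.

import os


def _get_precompiled_function_dir() -> str:
    # Fall back to /tmp (the current behavior) unless overridden.
    return os.environ.get("TORCHTITAN_PRECOMPILE_DIR", "/tmp")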


+    def _load_or_compile_model(
+        self,
+        model: torch.nn.Module,
+        inputs: torch.Tensor,
+        extra_inputs: dict[str, torch.Tensor],
+        extra_kwargs: dict[str, Any],
+    ) -> torch.Tensor:
+        """
+        Load a precompiled model function or compile and save it if not available.
+
+        Args:
+            model: The model to compile.
+            inputs: Main input tensor.
+            extra_inputs: Additional input tensors.
+            extra_kwargs: Additional keyword arguments.
+
+        Returns:
+            Model output predictions.
+        """
+        compiled_fn_path = self._get_precompiled_function_path()
+
+        if not os.path.exists(compiled_fn_path):
+            logger.info(f"Compiling model and saving to {compiled_fn_path}")
+            model.forward \
+                .aot_compile(((inputs,), {**extra_inputs, **extra_kwargs})) \
+                .save_compiled_function(compiled_fn_path)
+
+        with open(compiled_fn_path, "rb") as f:
+            return torch.compiler.load_compiled_function(f)(
+                model._orig_mod, inputs, **extra_inputs, **extra_kwargs
+            )
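To make the caching behavior concrete, a small worked example of the path keying; "llama3" and "8B" are only sample values, not something this PR pins down. On the first step without an existing artifact the model is compiled and the file below is written; every later step, and any restart that still has the file, takes only the load branch.

import os


def precompiled_path(model_name: str, model_flavor: str, rank: int) -> str:
    # Same keying as _get_precompiled_function_path above.
    return os.path.join(
        "/tmp", f"compiled_fn_{model_name}_{model_flavor}_rank_{rank}.pt"
    )


assert precompiled_path("llama3", "8B", 0) == "/tmp/compiled_fn_llama3_8B_rank_0.pt"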

    def batch_generator(
        self, data_iterable: Iterable[tuple[dict[str, torch.Tensor], torch.Tensor]]
    ) -> Iterable[tuple[dict[str, torch.Tensor], torch.Tensor]]:
@@ -524,8 +574,14 @@ def forward_backward_step(
        with self.train_context(optional_context_parallel_ctx):
            assert len(model_parts) == 1
            with self.maybe_enable_amp:
-                pred = model_parts[0](inputs, **extra_inputs, **extra_kwargs)
-                loss = self.loss_fn(pred, labels)
+                if self.job_config.compile.enable_precompilation:
+                    pred = self._load_or_compile_model(
+                        model_parts[0], inputs, extra_inputs, extra_kwargs
+                    )
+                    loss = self.loss_fn(pred, labels)
+                else:
+                    pred = model_parts[0](inputs, **extra_inputs, **extra_kwargs)
+                    loss = self.loss_fn(pred, labels)
            # need to free pred before bwd to avoid peaking memory
            del pred
            loss.backward()