
Commit 93cadab

anshul-sietaf authored and committed
[FSDP][Replicate] final version integrating 1D device mesh replicate into fsdp (#166433)
**Summary:** I have created a new composable replicate api that's integrated into FSDP's codebase with minimal changes. The key changes I made are when we use DDPMeshInfo, we use Replicate placements, prevent initial sharding of parameters, set worldsize to 1 to skip allgathers and reducescatter. **Test Cases** 1. pytest test/distributed/_composable/test_replicate_training.py 2. pytest test_pp_composability.py 3. pytest test_replicate_with_fsdp.py Pull Request resolved: #166433 Approved by: https://github.com/weifengpy
1 parent ce91b40 · commit 93cadab
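For orientation, here is a minimal usage sketch of the API after this change, distilled from the updated tests below. The import path for `replicate` and the 8-rank, single-node setup are assumptions for illustration, not part of the PR.

```python
# Minimal sketch, assuming torch.distributed is already initialized across 8 ranks
# and that `replicate` is importable from the composable namespace (assumed path).
import torch
import torch.nn as nn
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed._composable.replicate_with_fsdp import replicate  # assumed import

# A 1D "replicate" mesh replaces the previous 2D ("replicate", "shard") mesh.
mesh = init_device_mesh("cuda", mesh_shape=(8,), mesh_dim_names=("replicate",))

model = nn.Linear(16, 16, device="cuda")
replicate(model, device_mesh=mesh)

# Parameters become DTensors with a single Replicate() placement
# instead of (Replicate(), Shard(dim=0)) as before this PR.
for p in model.parameters():
    print(p.placements)  # (Replicate(),)
```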

7 files changed (+116 −77 lines)


test/distributed/_composable/test_composability/test_pp_composability.py

Lines changed: 6 additions & 6 deletions
@@ -392,11 +392,11 @@ def test_replicate_pp(self, ScheduleClass, MixedPrecisionParam):
         replicate_size = self.world_size // (pp_size)
         device_mesh = init_device_mesh(
             device_type,
-            mesh_shape=(replicate_size, 1, pp_size),
-            mesh_dim_names=("replicate", "shard", "pp"),
+            mesh_shape=(replicate_size, pp_size),
+            mesh_dim_names=("replicate", "pp"),
         )
         torch.manual_seed(42)
-        dp_mesh = device_mesh["replicate", "shard"]
+        dp_mesh = device_mesh["replicate"]
         pp_mesh = device_mesh["pp"]
         pp_group = device_mesh["pp"].get_group()

@@ -582,11 +582,11 @@ def test_replicate_pp_grads(self, ScheduleClass):
         replicate_size = self.world_size // (pp_size)
         device_mesh = init_device_mesh(
             device_type,
-            mesh_shape=(replicate_size, 1, pp_size),
-            mesh_dim_names=("replicate", "shard", "pp"),
+            mesh_shape=(replicate_size, pp_size),
+            mesh_dim_names=("replicate", "pp"),
         )
         torch.manual_seed(42)
-        dp_mesh = device_mesh["replicate", "shard"]
+        dp_mesh = device_mesh["replicate"]
         pp_mesh = device_mesh["pp"]
         pp_group = device_mesh["pp"].get_group()
         dp_group = device_mesh["replicate"].get_group()

test/distributed/_composable/test_replicate_training.py

Lines changed: 15 additions & 17 deletions
@@ -108,7 +108,7 @@ def test_param_registration_after_forward(self):
         """Tests the parameter registration after forward."""
         device = torch.device(device_type.type, 0)
         # Single Replicate group
-        for reshard_after_forward in (True, False, None):
+        for reshard_after_forward in (False,):
             torch.manual_seed(42)
             model = MLP(3, device)
             # Since seed is per process, not per thread, we broadcast to ensure
@@ -131,7 +131,7 @@ def test_param_registration_after_forward(self):
             self._assert_same_params(model.parameters(), ref_model.parameters())

         # Multiple Replicate groups
-        for reshard_after_forward in (True, False, None):
+        for reshard_after_forward in (False,):
             torch.manual_seed(42)
             model = nn.Sequential(MLP(3, device), MLP(3, device))
             for param in model.parameters():
@@ -405,8 +405,8 @@ def _test_train_parity_multi_group(
         ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2)
         mesh = init_device_mesh(
             test_device_type,
-            (self.world_size, 1),
-            mesh_dim_names=("replicate", "shard"),
+            (self.world_size,),
+            mesh_dim_names=("replicate",),
         )
         fully_shard_fn = functools.partial(
             replicate,
@@ -740,8 +740,8 @@ def _test_train_parity_with_activation_checkpointing(
         # Apply Replicate
         device_mesh = init_device_mesh(
             test_device_type,
-            (self.world_size, 1),
-            mesh_dim_names=("replicate", "shard"),
+            (self.world_size,),
+            mesh_dim_names=("replicate",),
         )
         fsdp_kwargs = {
             "reshard_after_forward": reshard_after_forward,
@@ -868,11 +868,11 @@ def test_gradient_accumulation(self):
         with/without resharding after backward.
         """

-        shard_size, replicate_size = 1, self.world_size
+        replicate_size = self.world_size
         meshes = init_device_mesh(
             device_type.type,
-            (replicate_size, shard_size),
-            mesh_dim_names=("replicate", "shard"),
+            (replicate_size,),
+            mesh_dim_names=("replicate",),
         )
         self.run_subtests(
             {
@@ -1145,8 +1145,8 @@ def world_size(self) -> int:
     def init_global_mesh(self) -> DeviceMesh:
         return init_device_mesh(
             device_type.type,
-            (2, 1, 2),
-            mesh_dim_names=("dp_replicate", "dp_shard", "tp"),
+            (2, 2),
+            mesh_dim_names=("dp_replicate", "tp"),
         )

     @skip_if_lt_x_gpu(8)
@@ -1170,7 +1170,7 @@ def _test_replicate_tp(
         mlp_dim: int,
         foreach: bool,
     ):
-        dp_mesh, tp_mesh = global_mesh["dp_replicate", "dp_shard"], global_mesh["tp"]
+        dp_mesh, tp_mesh = global_mesh["dp_replicate"], global_mesh["tp"]
         dp_pg = dp_mesh._flatten().get_group()  # used for `replicate()`

         torch.manual_seed(42)
@@ -1229,11 +1229,9 @@ def _test_replicate_tp(

         for _, p in model.named_parameters():
             self.assertIsInstance(p, DTensor)
-            self.assertEqual(p.device_mesh.ndim, 3)
-            self.assertEqual(len(p.placements), 3)
-            self.assertEqual(
-                p.device_mesh.mesh_dim_names, ("dp_replicate", "dp_shard", "tp")
-            )
+            self.assertEqual(p.device_mesh.ndim, 2)
+            self.assertEqual(len(p.placements), 2)
+            self.assertEqual(p.device_mesh.mesh_dim_names, ("dp_replicate", "tp"))


 if __name__ == "__main__":

test/distributed/_composable/test_replicate_with_fsdp.py

Lines changed: 5 additions & 8 deletions
@@ -120,7 +120,7 @@ def _test_replicate_transformer(self, sharding_strategy):
             if i % 2 == 0:
                 self.assertTrue("replicate" in _get_registry(layer))
                 for parameter in layer.parameters():
-                    self.assertEqual(parameter.placements, (Replicate(), Shard(dim=0)))
+                    self.assertEqual(parameter.placements, (Replicate(),))
             elif i % 2 == 1:
                 self.assertTrue("fully_shard" in _get_registry(layer))
                 for parameter in layer.parameters():
@@ -197,14 +197,14 @@ def test_replicate_tp_device_mesh(self):
         ]

         global_mesh = self.init_replicate_tp_mesh()
-        replicate_mesh = global_mesh["replicate", "shard"]
+        replicate_mesh = global_mesh["replicate"]

         for layer in layers:
             replicate(layer, device_mesh=replicate_mesh)

             for parameter in layer.parameters():
-                self.assertEqual(parameter.device_mesh.shape, (2, 1))
-                self.assertEqual(parameter.placements, (Replicate(), Shard(dim=0)))
+                self.assertEqual(parameter.device_mesh.shape, (2,))
+                self.assertEqual(parameter.placements, (Replicate(),))

     @skip_if_lt_x_gpu(2)
     def test_train_replicate_fsdp(self):
@@ -263,7 +263,6 @@ def test_train_parity_2d_mlp(self):
         run_subtests(
             self,
             {
-                "reshard_after_forward": [False, True],
                 "use_activation_checkpointing": [False, True],
                 "mlp_dim": [3, 16, 17],
             },
@@ -273,7 +272,6 @@ def test_train_parity_2d_mlp(self):
     def _test_train_parity_2d_mlp(
         self,
         global_mesh: DeviceMesh,
-        reshard_after_forward: bool,
         use_activation_checkpointing: bool,
         mlp_dim: int,
     ):
@@ -287,13 +285,12 @@ def _test_train_parity_2d_mlp(
         torch.manual_seed(42)
         model = MLPStack(mlp_dim)
         ref_model = copy.deepcopy(model).cuda()
-        replicate(ref_model, device_mesh=replicate_shard_mesh)
+        replicate(ref_model, device_mesh=replicate_mesh)
         ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2, foreach=False)
         model.parallelize(
             tp_mesh,
             replicate_shard_mesh,
             use_activation_checkpointing,
-            reshard_after_forward=reshard_after_forward,
         )
         optim = torch.optim.Adam(model.parameters(), lr=1e-2, foreach=False)

torch/distributed/_composable/replicate_with_fsdp.py

Lines changed: 9 additions & 14 deletions
@@ -14,13 +14,12 @@
     OffloadPolicy,
 )
 from torch.distributed.fsdp._fully_shard._fsdp_common import (
+    DDPMeshInfo,
     detect_compiled_autograd,
-    HSDPMeshInfo,
 )
 from torch.distributed.fsdp._fully_shard._fsdp_init import (
     _get_device_from_mesh,
     _get_managed_states,
-    _get_post_forward_mesh_info,
     _init_default_fully_shard_mesh,
     _move_states_to_device,
 )
@@ -184,23 +183,19 @@ def replicate_impl(
         )

         mesh = mesh or _init_default_fully_shard_mesh()
-        if mesh.ndim != 2:
-            raise ValueError(f"replicate expects a 2D DeviceMesh but got {mesh}")
+        if mesh.ndim != 1:
+            raise ValueError(f"replicate expects a 1D DeviceMesh but got {mesh}")

     else:
         if mesh.mesh_dim_names is None:
             raise AssertionError(
                 "Please init the 2D mesh for HSDP with mesh_dim_names specified"
             )
-    mesh_info = HSDPMeshInfo(mesh, shard_mesh_dim=1, replicate_mesh_dim=0)
+    mesh_info = DDPMeshInfo(mesh, replicate_mesh_dim=0)
     device = _get_device_from_mesh(mesh)
     auto_reshard_after_forward = reshard_after_forward is None
-    # If the user does not provide ``reshard_after_forward``, we set it to True.
-    # During lazy_init, we identify which module is the root and override its value to False
-    post_forward_mesh_info = _get_post_forward_mesh_info(
-        reshard_after_forward if not auto_reshard_after_forward else True,  # type: ignore[arg-type]
-        mesh_info,
-    )
+
+    post_forward_mesh_info = None

     arg_module = module
     modules = (
@@ -217,7 +212,7 @@ def replicate_impl(
         state._fsdp_param_group = FSDPParamGroup(
             params,
             modules,
-            mesh_info,
+            mesh_info,  # type: ignore[arg-type]
             post_forward_mesh_info,
             device,
             shard_placement_fn,
@@ -341,8 +336,8 @@ def replicate_mesh():
     device = torch._C._get_accelerator()
     mesh = init_device_mesh(
         device.type,
-        mesh_shape=(default_pg.size(), 1),
-        mesh_dim_names=("replicate", "shard"),
+        mesh_shape=(default_pg.size(),),
+        mesh_dim_names=("replicate",),
     )
     return mesh
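To make the control flow above easier to follow, here is a hedged, standalone sketch of how the mesh handling now behaves: accept or build a 1D mesh and wrap it in `DDPMeshInfo`, with no post-forward mesh. The helper name `build_replicate_mesh_info` is hypothetical and not part of this PR.

```python
# Hypothetical helper mirroring the mesh handling in replicate_impl above; a sketch, not the PR's code.
# Assumes torch.distributed is already initialized.
from typing import Optional

import torch
import torch.distributed as dist
from torch.distributed.device_mesh import DeviceMesh, init_device_mesh
from torch.distributed.fsdp._fully_shard._fsdp_common import DDPMeshInfo


def build_replicate_mesh_info(mesh: Optional[DeviceMesh] = None) -> DDPMeshInfo:
    if mesh is None:
        # Default: a 1D mesh spanning the default process group, as in replicate_mesh().
        mesh = init_device_mesh(
            torch._C._get_accelerator().type,
            mesh_shape=(dist.get_world_size(),),
            mesh_dim_names=("replicate",),
        )
    if mesh.ndim != 1:
        raise ValueError(f"replicate expects a 1D DeviceMesh but got {mesh}")
    # Every rank in the single mesh dim holds a full replica; there is no shard dim,
    # so no post-forward (resharded) mesh is needed either.
    return DDPMeshInfo(mesh, replicate_mesh_dim=0)
```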

torch/distributed/fsdp/_fully_shard/_fsdp_collectives.py

Lines changed: 11 additions & 4 deletions
@@ -492,7 +492,11 @@ def foreach_reduce(
             force_sum_reduction_for_comms,
         )
     )
-    world_size = reduce_scatter_group.size()
+
+    if reduce_scatter_group is None:
+        world_size = 1
+    else:
+        world_size = reduce_scatter_group.size()
     device_handle = _get_device_handle(device.type)
     current_stream = device_handle.current_stream()

@@ -547,7 +551,7 @@ def foreach_reduce(
         reduce_output.copy_(reduce_scatter_input)
         reduce_scatter_event = reduce_scatter_stream.record_event()
         post_reduce_stream = reduce_scatter_stream
-        if all_reduce_group is not None:  # HSDP
+        if all_reduce_group is not None:  # HSDP or DDP/replicate
            # Accumulations must run in the reduce-scatter stream
            if not all_reduce_grads:
                if partial_reduce_output is not None:
@@ -690,7 +694,7 @@ def _get_all_gather_input_metadatas(


 def _get_gradient_divide_factors(
-    reduce_scatter_group: dist.ProcessGroup,
+    reduce_scatter_group: Optional[dist.ProcessGroup],
     all_reduce_group: Optional[dist.ProcessGroup],
     reduce_dtype: torch.dtype,
     device_type: str = "",
@@ -709,8 +713,11 @@ def _get_gradient_divide_factors(
     # For fp32/bf16, we do not need to worry about overflow/underflow, so we
     # use NCCL's built-in division to avoid separate div kernels
     overflow_risk = reduce_dtype not in (torch.float32, torch.bfloat16)
+    if reduce_scatter_group is not None:
+        data_parallel_size = reduce_scatter_group.size()
+    else:
+        data_parallel_size = 1

-    data_parallel_size = reduce_scatter_group.size()
     if all_reduce_group is not None:
         data_parallel_size *= all_reduce_group.size()
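The net effect of making `reduce_scatter_group` optional shows up in the gradient-division factor: with pure replicate there is no shard group, so only the all-reduce group contributes to the data-parallel size. Below is a standalone sketch of that arithmetic; the function is illustrative, not the library's `_get_gradient_divide_factors`.

```python
# Illustrative sketch of the effective data-parallel size used for gradient averaging;
# mirrors the Optional[reduce_scatter_group] handling in the hunks above.
from typing import Optional


def effective_data_parallel_size(
    reduce_scatter_world: Optional[int], all_reduce_world: Optional[int]
) -> int:
    # Replicate/DDP: no reduce-scatter group, so the shard factor is treated as 1.
    size = reduce_scatter_world if reduce_scatter_world is not None else 1
    # HSDP replicas (or all ranks for pure replicate) still contribute via the all-reduce group.
    if all_reduce_world is not None:
        size *= all_reduce_world
    return size


# 8-rank pure replicate: no reduce-scatter group, 8-way all-reduce -> divide by 8.
assert effective_data_parallel_size(None, 8) == 8
# 2x4 HSDP: 4-way reduce-scatter within a replica, 2-way all-reduce across replicas -> 8.
assert effective_data_parallel_size(4, 2) == 8
```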

torch/distributed/fsdp/_fully_shard/_fsdp_param.py

Lines changed: 44 additions & 21 deletions
@@ -11,6 +11,7 @@
 from torch._prims_common import make_contiguous_strides_for
 from torch.distributed._functional_collectives import AsyncCollectiveTensor
 from torch.distributed.device_mesh import DeviceMesh
+from torch.distributed.fsdp._fully_shard._fsdp_common import DDPMeshInfo
 from torch.distributed.tensor import DTensor, Replicate, Shard
 from torch.distributed.tensor._dtensor_spec import DTensorSpec, TensorMeta
 from torch.distributed.tensor.placement_types import _StridedShard, Placement
@@ -306,22 +307,29 @@ def _init_sharded_param(
                    f"or 4 (HSDP+EP+TP) but got {self._spmd_mesh.ndim}."
                )
            self._spmd_placements: tuple[Placement, ...]
-            dp_shard_tp_placement = (
-                (
-                    _StridedShard(shard_dim, split_factor=split_factor)
-                    if split_factor > 1
-                    else fsdp_placement
-                ),
-                *self._tp_spec.placements,
-            )
-            if dp_mesh.ndim == 1:  # FSDP
-                self._spmd_placements = dp_shard_tp_placement
-            else:  # HSDP
+            if isinstance(self.mesh_info, FSDPMeshInfo):  # FSDP or HSDP
+                dp_shard_tp_placement = (
+                    (
+                        _StridedShard(shard_dim, split_factor=split_factor)
+                        if split_factor > 1
+                        else fsdp_placement
+                    ),
+                    *self._tp_spec.placements,
+                )
+            else:  # DDP
+                dp_shard_tp_placement = (
+                    (Replicate()),
+                    *self._tp_spec.placements,
+                )
+            if isinstance(self.mesh_info, HSDPMeshInfo):  # HSDP
                if self.mesh_info.replicate_mesh_dim != 0:
                    raise AssertionError(
                        f"Expected replicate_mesh_dim to be 0, got {self.mesh_info.replicate_mesh_dim}"
                    )
                self._spmd_placements = (Replicate(),) + dp_shard_tp_placement
+            else:  # FSDP or DDP
+                self._spmd_placements = dp_shard_tp_placement
+
            self._sharding_spec = DTensorSpec(
                self._spmd_mesh,
                self._spmd_placements,
@@ -330,10 +338,12 @@ def _init_sharded_param(
            param_data = cast(DTensor, param)._local_tensor
        else:
            self._spmd_mesh = self.mesh_info.mesh
-            if isinstance(self.mesh_info, HSDPMeshInfo):
+            if isinstance(self.mesh_info, HSDPMeshInfo):  # HSDP
                self._spmd_placements = (Replicate(), fsdp_placement)
-            else:
+            elif isinstance(self.mesh_info, FSDPMeshInfo):  # FSDP
                self._spmd_placements = (fsdp_placement,)
+            elif isinstance(self.mesh_info, DDPMeshInfo):  # DDP
+                self._spmd_placements = (Replicate(),)
            self._sharding_spec = DTensorSpec(
                self._spmd_mesh,
                self._spmd_placements,
@@ -351,8 +361,13 @@ def _init_sharded_param(
        )
        self._orig_size = param_data.size()
        self._contiguous_orig_stride = make_contiguous_strides_for(self._orig_size)
-        shard_rank = self.mesh_info.shard_mesh_rank
-        shard_world_size = self.mesh_info.shard_mesh_size
+        if isinstance(self.mesh_info, FSDPMeshInfo):  # FSDP or HSDP
+            shard_rank = self.mesh_info.shard_mesh_rank
+            shard_world_size = self.mesh_info.shard_mesh_size
+        else:  # DDP
+            shard_rank = 0
+            shard_world_size = 1
+
        if shard_dim > 0 and param_data.size(shard_dim) % shard_world_size != 0:
            # If sharding on nonzero dim, require even sharding for now because
            # the uneven sharding (1) requires extra copies before/after FSDP
@@ -401,12 +416,20 @@ def _init_sharded_post_forward_param_metadata(self, param: torch.Tensor) -> None
        if mesh_info is None:
            raise AssertionError("Expected post_forward_mesh_info to not be None")
        param_data = param._local_tensor if isinstance(param, DTensor) else param
-        chunks = _chunk_with_empty(param_data, mesh_info.shard_mesh_size, dim=0)
-        self.sharded_post_forward_size = _get_dim_chunked_size(
-            chunks[mesh_info.shard_mesh_rank],
-            param_data.size(),
-            dim=self.fsdp_placement.dim,
-        )
+        if isinstance(mesh_info, FSDPMeshInfo):
+            chunks = _chunk_with_empty(param_data, mesh_info.shard_mesh_size, dim=0)
+            self.sharded_post_forward_size = _get_dim_chunked_size(
+                chunks[mesh_info.shard_mesh_rank],
+                param_data.size(),
+                dim=self.fsdp_placement.dim,
+            )
+        else:  # DDP
+            chunks = _chunk_with_empty(param_data, 1, dim=0)
+            self.sharded_post_forward_size = _get_dim_chunked_size(
+                chunks[0],
+                param_data.size(),
+                dim=self.fsdp_placement.dim,
+            )
        self.contiguous_sharded_post_forward_stride = make_contiguous_strides_for(
            self.sharded_post_forward_size
        )
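The placement selection in `_init_sharded_param` now branches on the mesh-info type rather than the mesh's rank. Below is a compact, hedged restatement of that decision for the no-TP case; `data_parallel_placements` is a hypothetical helper, and dim-0 sharding is assumed for the FSDP placement.

```python
# Hypothetical helper restating the placement choice per mesh-info type (no-TP case),
# as in the _init_sharded_param hunks above; dim-0 sharding assumed for FSDP/HSDP.
from torch.distributed.fsdp._fully_shard._fsdp_common import (
    DDPMeshInfo,
    FSDPMeshInfo,
    HSDPMeshInfo,
)
from torch.distributed.tensor import Replicate, Shard


def data_parallel_placements(mesh_info) -> tuple:
    # HSDP is checked first; per the hunks above, FSDPMeshInfo covers both FSDP and HSDP.
    if isinstance(mesh_info, HSDPMeshInfo):
        # HSDP: replicate across replica groups, shard within each group.
        return (Replicate(), Shard(0))
    if isinstance(mesh_info, FSDPMeshInfo):
        # FSDP: shard parameters on dim 0 across the 1D mesh.
        return (Shard(0),)
    if isinstance(mesh_info, DDPMeshInfo):
        # DDP/replicate: every rank keeps a full copy; no sharding at all.
        return (Replicate(),)
    raise TypeError(f"unexpected mesh info type: {type(mesh_info)}")
```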
