[FSDP2] Changed grad acc test to use data parallel ref model (#126161)
This simplifies the test a bit.

**Context**
Option 1: The ref model is data parallel. Each rank's ref model receives its local batch. We manually all-reduce the gradients and divide them by the world size to match DDP/FSDP semantics.
Option 2: The ref model is not data parallel. Each rank's ref model receives the same global batch, so every rank holds an identical ref model with identical gradients. We manually divide the ref model's gradients by the world size to match DDP/FSDP semantics.

All of our other unit tests follow Option 1, which is simpler and more directly checks our claimed semantics. This PR rewrites the gradient accumulation test to follow Option 1 instead of Option 2 (a sketch of both options follows).
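
As a rough illustration of the two options (a minimal sketch, not the test code itself; `ref_model`, the input tensors, and `world_size` are hypothetical stand-ins, and an initialized process group is assumed):

```python
import torch
import torch.distributed as dist


def option1_backward(
    ref_model: torch.nn.Module, local_inp: torch.Tensor, world_size: int
) -> None:
    # Option 1: the ref model is data parallel, and each rank runs its own
    # local batch, so per-rank gradients differ and must be averaged.
    ref_model(local_inp).sum().backward()
    for param in ref_model.parameters():
        if param.grad is not None:
            dist.all_reduce(param.grad)  # default ReduceOp is SUM
            param.grad.div_(world_size)  # sum / world_size -> mean


def option2_backward(
    ref_model: torch.nn.Module, global_inp: torch.Tensor, world_size: int
) -> None:
    # Option 2: every rank runs the identical ref model on the identical
    # global batch, so gradients are already replicated across ranks; only
    # the 1/world_size scaling is needed to match DDP/FSDP semantics.
    ref_model(global_inp).sum().backward()
    for param in ref_model.parameters():
        if param.grad is not None:
            param.grad.div_(world_size)
```

With Option 1, each rank can feed the same local input to both the ref model and the FSDP model and compare losses directly, which is what the new version of the test below does.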

Pull Request resolved: #126161
Approved by: https://github.com/wanchaol
ghstack dependencies: #126067, #126070
awgu authored and pytorchmergebot committed May 14, 2024
1 parent 4ded666 commit 3892e86
Showing 1 changed file with 9 additions and 20 deletions.

test/distributed/_composable/fsdp/test_fully_shard_training.py
@@ -694,8 +694,7 @@ def _test_gradient_accumulation(
             return  # skip since not common
 
         torch.manual_seed(42)
-        local_batch_size, lin_dim, num_mlps, num_microbatches = (2, 32, 3, 3)
-        global_batch_size = local_batch_size * self.world_size
+        batch_size, lin_dim, num_mlps, num_microbatches = (2, 32, 3, 3)
         if mode == "some_mlps":
             num_mlps_to_disable_reduce_scatter = 2
         modules = [nn.Linear(lin_dim, lin_dim)]
@@ -714,7 +713,7 @@ def _test_gradient_accumulation(
         ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2)
         optim = torch.optim.Adam(model.parameters(), lr=1e-2)
 
-        torch.manual_seed(1)  # same on all ranks
+        torch.manual_seed(42 + self.rank + 1)
         for iter_idx in range(5):
             with CommDebugMode() as comm_mode:
                 for microbatch_idx in range(num_microbatches):
@@ -737,17 +736,11 @@ def _test_gradient_accumulation(
                                 is_last_microbatch, recurse=False
                             )
 
-                    global_inp = torch.rand((global_batch_size, lin_dim), device="cuda")
-                    local_inp = global_inp[
-                        self.rank
-                        * local_batch_size : (self.rank + 1)
-                        * local_batch_size
-                    ].detach()
+                    inp = torch.randn(batch_size, lin_dim, device="cuda")
                     losses: List[torch.Tensor] = []
-                    for _model, inp in ((ref_model, global_inp), (model, local_inp)):
+                    for _model in (ref_model, model):
                         losses.append(_model(inp).sum())
                         losses[-1].backward()
-                    dist.all_reduce(losses[1])  # partial -> replicated
                     self.assertEqual(losses[0], losses[1])
 
             comm_counts = comm_mode.get_comm_counts()
@@ -768,12 +761,10 @@ def _test_gradient_accumulation(
                 # Expect additional reduce-scatters for all MLPs
                 expected_reduce_scatter_count += (num_mlps) * (num_microbatches - 1)
             self.assertEqual(reduce_scatter_count, expected_reduce_scatter_count)
-            # Exclude the loss all-reduce per microbatch in our training loop
-            all_reduce_count -= num_microbatches
-            if mesh.ndim == 2:
-                self.assertEqual(all_reduce_count, expected_reduce_scatter_count)
-            else:
-                self.assertEqual(all_reduce_count, 0)
+            expected_all_reduce_count = (
+                expected_reduce_scatter_count if mesh.ndim == 2 else 0
+            )
+            self.assertEqual(all_reduce_count, expected_all_reduce_count)
 
             # Expect one all-gather per MLP plus one for the root's linear in
             # the first microbatch's forward
@@ -797,11 +788,9 @@ def _test_gradient_accumulation(
                 expected_all_gather_count += num_mlps * (num_microbatches - 1)
             self.assertEqual(all_gather_count, expected_all_gather_count)
 
-            # Average the ref model's gradients over the world size to match
-            # data parallel semantics
             for param in ref_model.parameters():
                 if param.grad is not None:
-                    param.grad.div_(self.world_size)
+                    dist.all_reduce(param.grad, op=dist.ReduceOp.AVG)
             check_sharded_parity(self, ref_model, model)
             for _optim in (optim, ref_optim):
                 _optim.step()
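
For reference, the `ReduceOp.AVG` all-reduce in the new version fuses the "all-reduce, then divide by world size" pattern from the PR summary into a single collective. A minimal sketch of the equivalence (hypothetical helper names; an initialized process group is assumed):

```python
import torch
import torch.distributed as dist


def average_grads_two_step(model: torch.nn.Module, world_size: int) -> None:
    # Two-step form: all-reduce with the default SUM op, then divide.
    for param in model.parameters():
        if param.grad is not None:
            dist.all_reduce(param.grad)
            param.grad.div_(world_size)


def average_grads_one_step(model: torch.nn.Module) -> None:
    # One-step form: ReduceOp.AVG sums across ranks and divides by the
    # world size in a single collective.
    for param in model.parameters():
        if param.grad is not None:
            dist.all_reduce(param.grad, op=dist.ReduceOp.AVG)
```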
