
Commit 43fb39c

rohan-varma authored and facebook-github-bot committed
[DDP] Make uneven inputs work with comm. hook (#61020)
Summary:
Pull Request resolved: #61020

Makes uneven input support with the `join` context manager work with custom communication hooks, so that the two features compose cleanly. Added unit tests covering the allreduce and powerSGD hooks. Instead of calling `allreduce` directly, the join manager now calls into `_run_comm_hook`, which runs whatever hook is installed.

ghstack-source-id: 132950108

Test Plan: CI

Reviewed By: SciPioneer

Differential Revision: D29480028

fbshipit-source-id: c91dc467a62c5f1e0ec702a2944ae3deb10f93f4
1 parent 94b7306 commit 43fb39c
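
At the API level, the effect of this commit is that any comm hook registered on a DDP model is also honored while ranks are joined under the uneven-inputs context manager. Below is a minimal sketch of that user-facing combination; the process group setup, toy model, and per-rank batch counts are illustrative assumptions, not part of this commit.

```python
import torch
import torch.distributed as dist
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed.algorithms.ddp_comm_hooks import powerSGD_hook as powerSGD


def train(rank, world_size):
    # Illustrative setup; any init method supported by your cluster works.
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)
    model = DDP(nn.Linear(10, 10).to(rank), device_ids=[rank])

    # Register a custom communication hook. With this change, the join
    # context manager runs the same hook for already-joined ranks instead
    # of falling back to a plain allreduce.
    state = powerSGD.PowerSGDState(process_group=None, matrix_approximation_rank=1)
    model.register_comm_hook(state, powerSGD.powerSGD_hook)

    # Ranks process different numbers of batches; join() keeps the
    # scheduled collectives matched so no rank hangs.
    num_batches = 5 + rank
    with model.join():
        for _ in range(num_batches):
            out = model(torch.randn(20, 10, device=rank))
            out.sum().backward()
```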

File tree

3 files changed: +53 -9 lines

torch/csrc/distributed/c10d/init.cpp
torch/nn/parallel/distributed.py
torch/testing/_internal/distributed/distributed_test.py

torch/csrc/distributed/c10d/init.cpp

Lines changed: 10 additions & 1 deletion
@@ -399,6 +399,15 @@ An enum-like class for built-in communication hooks: ``ALLREDUCE`` and ``FP16_CO
           "_delay_all_reduce",
           &::c10d::Reducer::delay_all_reduce,
           py::call_guard<py::gil_scoped_release>())
+      .def(
+          "_run_comm_hook",
+          [](::c10d::Reducer& reducer, ::c10d::GradBucket& bucket)
+              -> std::shared_ptr<jit::PythonFutureWrapper> {
+            c10::intrusive_ptr<c10::ivalue::Future> fut =
+                reducer.run_comm_hook(bucket);
+            return std::make_shared<jit::PythonFutureWrapper>(fut);
+          },
+          py::call_guard<py::gil_scoped_release>())
       .def(
           "set_logger",
           [](::c10d::Reducer& reducer,
@@ -1472,7 +1481,7 @@ Example::
       .def(
           "get_future",
           [](::c10d::ProcessGroup::Work& work)
-             -> std::shared_ptr<jit::PythonFutureWrapper> {
+              -> std::shared_ptr<jit::PythonFutureWrapper> {
             return std::make_shared<jit::PythonFutureWrapper>(work.getFuture());
           },
           R"(

torch/nn/parallel/distributed.py

Lines changed: 6 additions & 7 deletions
@@ -1014,11 +1014,11 @@ def _sync_final_model(self, is_last_joiner):
         )
         self._sync_params_and_buffers(authoritative_rank=self._authoritative_rank)
 
-    # Schedule allreduce ops to match those scheduled in the reducer's backward
+    # Schedule comm ops to match those scheduled in the reducer's backward
     # pass.
     def _match_all_reduce_for_bwd_pass(self):
-        allreduce_work = []
-        # Schedule allreduce in the same order as Reducer schedules them, i.e.
+        comm_work = []
+        # Schedule comm in the same order as Reducer schedules them, i.e.
         # the order of the buckets. Retrieving the bucket order from the reducer
         # ensures that we keep the same order in join mode, such as when bucket
         # order is rebuilt dynamically.
@@ -1031,10 +1031,9 @@ def _match_all_reduce_for_bwd_pass(self):
             # divide_by_initial_world_size=True, we divide grads by the static
             # world size, if not, the dividing factor is reduced by the number
             # of joined processes.
-            zero_tensor = grad_bucket.get_tensor()
-            work = self.process_group.allreduce(zero_tensor)
-            allreduce_work.append(work)
-        for work in allreduce_work:
+            work = self.reducer._run_comm_hook(grad_bucket)
+            comm_work.append(work)
+        for work in comm_work:
             work.wait()
 
     # Allreduces the used parameter mapping across ranks.
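
With `_run_comm_hook` in place, whatever callable was registered via `register_comm_hook` is what joined ranks execute on their zero-filled buckets, keeping their collectives in lockstep with the active ranks' backward pass. A hedged sketch of such a hook follows, only to show the `hook(state, bucket) -> Future` shape the join manager now relies on; the call counting and the delegation to the stock allreduce hook are illustrative, not part of this commit.

```python
from torch.distributed.algorithms.ddp_comm_hooks import default_hooks as default


def counting_allreduce_hook(state, bucket):
    """Toy comm hook: count invocations, then defer to the built-in allreduce.

    `state` is whatever object was passed to register_comm_hook (assumed here
    to be a mutable dict). `bucket` is the GradBucket handed in by the reducer,
    or, for a joined rank, its zero-filled shadow bucket. The returned future
    is what both the backward pass and the join manager wait on.
    """
    state["calls"] = state.get("calls", 0) + 1
    # None -> use the default (WORLD) process group inside the stock hook.
    return default.allreduce_hook(None, bucket)


# Usage, assuming `model` is an initialized DistributedDataParallel instance:
# model.register_comm_hook({"calls": 0}, counting_allreduce_hook)
```

The next file's tests take the same approach: they plug the built-in allreduce and powerSGD hooks into the existing uneven-inputs test harness rather than defining new model fixtures.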

torch/testing/_internal/distributed/distributed_test.py

Lines changed: 37 additions & 1 deletion
@@ -11,7 +11,7 @@
 from contextlib import contextmanager, suppress
 from datetime import timedelta
 from functools import reduce
-from typing import Union, NamedTuple
+from typing import Union, NamedTuple, Callable, Any
 
 import torch
 import torch.cuda
@@ -183,6 +183,8 @@ class DDPUnevenTestInput(NamedTuple):
     inp: Union[torch.tensor, tuple]
     sync_interval: int
     throw_on_early_termination: bool = False
+    hook: Callable = None
+    state: Any = None
 
 
 class _FC2(nn.Module):
@@ -5384,6 +5386,11 @@ def _run_uneven_inputs_test(
             bucket_cap_mb=1,
             find_unused_parameters=find_unused_params,
         )
+        # Register hook if specified
+        if test_case.hook is not None:
+            net.register_comm_hook(test_case.state, test_case.hook)
+            print(f"registered hook {test_case.hook}")
+
 
         # Determine num iters for this rank via the passed in mapping.
         num_iters = iteration_mapping[rank]
@@ -5602,6 +5609,35 @@ def forward(self, x, rank):
             ),
         ]
 
+        # Test models that have a hook installed.
+        models_with_hook = [
+            DDPUnevenTestInput(
+                name="small_model_allreduce_hook",
+                model=small_model,
+                hook=default.allreduce_hook,
+                state=None,
+                inp=torch.ones(batch, dim, device=rank),
+                sync_interval=1,
+            ),
+            DDPUnevenTestInput(
+                name="small_model_power_sgd_hook",
+                model=small_model,
+                hook=powerSGD.powerSGD_hook,
+                state=powerSGD.PowerSGDState(
+                    process_group=None,
+                    matrix_approximation_rank=1,
+                    # Config so that powerSGD runs immediately instead of
+                    # allreduce.
+                    start_powerSGD_iter=1,
+                    warm_start=False,
+                    use_error_feedback=False,
+                ),
+                inp=torch.ones(batch, dim, device=rank),
+                sync_interval=1,
+            ),
+        ]
+        models_to_test.extend(models_with_hook)
+
         # Add resnet model if we have torchvision installed.
         if HAS_TORCHVISION:
             resnet_model = torchvision.models.resnet50()
