Commit b9a32d3

[DDP] Add a test case to test a larger model (#4085)
Summary: This commit adds a test case that exercises a larger model, one large enough to trigger multiple all_reduces instead of a single one. It also fixes an xmp issue that occurs when the launcher is reused to run consecutive multi-process (mp) experiments.

Test Plan:
XRT: MASTER_ADDR=localhost MASTER_PORT=6000 python test/test_ddp.py TestXrtDistributedDataParallel.test_ddp_correctness_large_net
PJRT: PJRT_DEVICE=TPU python test/pjrt/test_ddp.py TestPjRtDistributedDataParallel.test_ddp_correctness_large_net
1 parent 4a267b4 commit b9a32d3
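
Why a larger model matters here: DDP groups gradients into buckets of roughly bucket_cap_mb megabytes and launches one all_reduce per bucket, so the original nn.Linear(10, 10) test model always fit into a single bucket and exercised only one all_reduce. The sketch below is not part of the commit (approx_bucket_count is a hypothetical helper); it only estimates how many 1 MB buckets a model's gradients would span, since real bucket assignment also depends on parameter registration order and on DDP's single-bucket first iteration (pytorch#73732).

import torch.nn as nn

def approx_bucket_count(model, bucket_cap_mb=1):
  # Total gradient payload in bytes; gradients match the parameters' dtype and shape.
  grad_bytes = sum(p.nelement() * p.element_size() for p in model.parameters())
  return max(1, -(-grad_bytes // (bucket_cap_mb * 1024**2)))  # ceiling division

small = nn.Linear(10, 10)  # the model the old test used: well under 1 MB
large = nn.Sequential(     # roughly the shape of the new LargeNet: ~2M params, ~7.7 MB
    nn.Linear(10, 1000), nn.Linear(1000, 1000), nn.Linear(1000, 1000),
    nn.Linear(1000, 10))
print(approx_bucket_count(small), approx_bucket_count(large))  # 1 bucket vs. roughly 8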

5 files changed: +124, -20 lines changed

test/args_parse.py

Lines changed: 1 addition & 0 deletions
@@ -33,6 +33,7 @@ def parse_common_options(datadir=None,
   parser.add_argument('--tidy', action='store_true')
   parser.add_argument('--metrics_debug', action='store_true')
   parser.add_argument('--async_closures', action='store_true')
+  parser.add_argument('--debug', action='store_true')
   if opts:
     for name, aopts in opts:
       parser.add_argument(name, **aopts)

test/distributed_util.py

Lines changed: 80 additions & 11 deletions
@@ -1,4 +1,3 @@
-from absl import logging
 import copy
 import torch
 import torch.distributed as dist
@@ -7,7 +6,61 @@
 from torch.nn.parallel import DistributedDataParallel as DDP
 import torch_xla.core.xla_model as xm
 import torch_xla.distributed.xla_backend
-from torch_xla.experimental import pjrt
+
+
+# The following are helpers useful for debugging purposes.
+def comp_hook(state: object,
+              bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]:
+  """
+  Debug util. Please refer to DistributedDataParallel.register_comm_hook to learn
+  how to use it.
+  """
+  print("comp_hook called.")
+  fut = torch.futures.Future()
+  fut.set_result(bucket.buffer())
+  return fut
+
+
+def calculate_model_size(model):
+  """
+  Debug util. Calculate the given model's size in MB.
+  """
+  param_size = 0
+  for param in model.parameters():
+    param_size += param.nelement() * param.element_size()
+  buffer_size = 0
+  for buffer in model.buffers():
+    buffer_size += buffer.nelement() * buffer.element_size()
+
+  size_all_mb = (param_size + buffer_size) / 1024**2
+  print('model size: {:.3f}MB'.format(size_all_mb))
+
+
+class LargeNet(nn.Module):
+
+  def __init__(self):
+    super(LargeNet, self).__init__()
+    self.net1 = nn.Linear(10, 1000)
+    self.net2 = nn.Linear(1000, 1000)
+    self.net3 = nn.Linear(1000, 1000)
+    self.relu = nn.ReLU()
+    self.net4 = nn.Linear(1000, 10)
+
+  def forward(self, x):
+    output1 = self.relu(self.net1(x))
+    output2 = self.relu(self.net2(output1))
+    output3 = self.relu(self.net3(output2))
+    return self.net4(output3)
+
+
+class SmallNet(nn.Module):
+
+  def __init__(self):
+    super(SmallNet, self).__init__()
+    self.net = nn.Linear(10, 10)
+
+  def forward(self, x):
+    return self.net(x)
 
 
 def init_xla_backend(init_file: str):
@@ -40,17 +93,32 @@ def train_step(model, inputs, labels, optimizer, loss_fn):
   return loss
 
 
-def ddp_correctness(init_file: str):
+def ddp_correctness(init_file: str,
+                    *,
+                    use_large_net: bool = False,
+                    debug: bool = False):
   rank, world_size = init_xla_backend(init_file)
 
   device = xm.xla_device()
 
   # To make nn.Linear init same parameters across devices.
   torch.manual_seed(2022)
-  cpu_model = nn.Linear(10, 10)
+  # A lower range probably makes sense too. Anyway, stick to 100 as in the original PoC.
+  steps = 100
+  cpu_model = SmallNet()
+  if use_large_net:
+    steps = 5  # To save test time.
+    cpu_model = LargeNet()
+
   # TODO(@alanwaketan): Investigate whether we can omit the gradient_as_bucket_view option.
+  # bucket_cap_mb is set to 1 MB such that we can still have multiple all_reduces while avoiding
+  # models that are too large (25 MB).
+  # Note that DDP currently uses one bucket for the first iteration. See pytorch#73732.
   ddp_model = DDP(
-      copy.deepcopy(cpu_model).to(device), gradient_as_bucket_view=True)
+      copy.deepcopy(cpu_model).to(device),
+      gradient_as_bucket_view=True,
+      bucket_cap_mb=1)
+  # ddp_model.register_comm_hook(state=None, hook=comp_hook)
 
   cpu_optimizer = optim.SGD(cpu_model.parameters(), lr=1e-100)
   ddp_optimizer = optim.SGD(ddp_model.parameters(), lr=1e-100)
@@ -59,8 +127,7 @@ def ddp_correctness(init_file: str):
   local_batch_size = 2
   global_batch_size = local_batch_size * world_size
   offset = rank * local_batch_size
-  # Lower range probably makes sense too. Anyway, stick to 100 as the original PoC.
-  for step in range(100):
+  for step in range(steps):
     # To make torch.randn produce same results across devices.
     torch.manual_seed(2022 + step)
 
@@ -82,7 +149,9 @@ def ddp_correctness(init_file: str):
     # TODO(@alanwaketan): Investigate why the atol here is this low.
     assert torch.allclose(cpu_loss, ddp_loss, atol=1e-02)
     assert_all_close(cpu_model.parameters(), ddp_model.parameters())
-    # To display the below messages, set '--verbosity=1'.
-    logging.debug(
-        "iteration %d: cpu_loss = %f, ddp_loss = %f, cpu_model.parameters() ~= ddp_model.parameters()",
-        step, cpu_loss, ddp_loss)
+    # To display the messages below, pass '--debug'.
+    # We don't use FLAGS.debug here because this function often runs in different processes than the launcher.
+    if debug:
+      print(
+          f"iteration {step}: cpu_loss = {cpu_loss}, ddp_loss = {ddp_loss}, cpu_model.parameters() ~= ddp_model.parameters()"
+      )
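
The commented-out register_comm_hook call above can be enabled to observe the bucketing directly. Below is a hedged, self-contained sketch (not from the commit; counting_hook is a stand-in shaped like comp_hook) that runs the same idea on CPU with the gloo backend and a single process. Note that a DDP comm hook replaces the built-in allreduce, so this is only useful for counting how many buckets (and hence all_reduces) fire per backward pass, not for correctness runs.

import os
import torch
import torch.distributed as dist
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP

def counting_hook(state, bucket):
  # Same shape as comp_hook in the diff: observe the bucket, then hand its buffer back.
  print("bucket reduced, numel =", bucket.buffer().numel())
  fut = torch.futures.Future()
  fut.set_result(bucket.buffer())
  return fut

os.environ.setdefault("MASTER_ADDR", "localhost")
os.environ.setdefault("MASTER_PORT", "12355")
dist.init_process_group("gloo", rank=0, world_size=1)

model = nn.Sequential(nn.Linear(10, 1000), nn.Linear(1000, 1000), nn.Linear(1000, 10))
ddp_model = DDP(model, bucket_cap_mb=1)
ddp_model.register_comm_hook(state=None, hook=counting_hook)

for step in range(2):  # DDP uses a single bucket on the first iteration (pytorch#73732)
  ddp_model(torch.randn(2, 10)).sum().backward()

dist.destroy_process_group()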

test/pjrt/test_ddp.py

Lines changed: 14 additions & 2 deletions
@@ -10,8 +10,11 @@
 xla_test_folder = os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0])))
 sys.path.append(xla_test_folder)
 
+import args_parse
 import distributed_util as util
 
+FLAGS = args_parse.parse_common_options()
+
 
 class TestPjRtDistributedDataParallel(parameterized.TestCase):
 
@@ -27,8 +30,17 @@ def test_ddp_init(self):
     pjrt._run_multiprocess(self._ddp_init, self.create_tempfile().full_path)
 
   def test_ddp_correctness(self):
-    pjrt._run_multiprocess(util.ddp_correctness,
-                           self.create_tempfile().full_path)
+    pjrt._run_multiprocess(
+        util.ddp_correctness,
+        self.create_tempfile().full_path,
+        debug=FLAGS.debug)
+
+  def test_ddp_correctness_large_net(self):
+    pjrt._run_multiprocess(
+        util.ddp_correctness,
+        self.create_tempfile().full_path,
+        use_large_net=True,
+        debug=FLAGS.debug)
 
 
 if __name__ == "__main__":
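
The debug value is forwarded as an explicit keyword argument because ddp_correctness executes in spawned worker processes, where the launcher's module-level FLAGS is not necessarily available. A minimal, generic illustration of that plumbing pattern (hypothetical names, not from the commit):

import argparse

def worker(init_file, *, use_large_net=False, debug=False):
  # In the real tests this would be util.ddp_correctness running inside a child process.
  if debug:
    print(f"worker: init_file={init_file}, use_large_net={use_large_net}")

if __name__ == "__main__":
  parser = argparse.ArgumentParser()
  parser.add_argument("--debug", action="store_true")
  flags, _ = parser.parse_known_args()
  # Pass the parsed value along as an explicit argument rather than relying on a global.
  worker("/tmp/ddp_init", use_large_net=True, debug=flags.debug)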

test/test_ddp.py

Lines changed: 9 additions & 3 deletions
@@ -3,13 +3,16 @@
 import torch_xla.core.xla_model as xm
 import torch_xla.distributed.xla_multiprocessing as xmp
 
+import args_parse
 import distributed_util as util
 
+FLAGS = args_parse.parse_common_options()
+
 
 class TestXrtDistributedDataParallel(parameterized.TestCase):
 
   @staticmethod
-  def _ddp_correctness(rank):
+  def _ddp_correctness(rank, use_large_net: bool, debug: bool):
     # We cannot run this guard before XMP,
     # see API_GUIDE.md#running-on-multiple-xla-devices-with-multi-processing.
     device = xm.xla_device()
@@ -19,10 +22,13 @@ def _ddp_correctness(rank):
           'Default device {} is not a TPU device'.format(device),
           file=sys.stderr)
       return
-    util.ddp_correctness(None)
+    util.ddp_correctness(None, use_large_net=use_large_net, debug=debug)
 
   def test_ddp_correctness(self):
-    xmp.spawn(self._ddp_correctness, args=())
+    xmp.spawn(self._ddp_correctness, args=(False, FLAGS.debug))
+
+  def test_ddp_correctness_large_net(self):
+    xmp.spawn(self._ddp_correctness, args=(True, FLAGS.debug))
 
 
 if __name__ == "__main__":
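
For the XRT tests, xmp.spawn prepends the process index when invoking the target, so args=(True, FLAGS.debug) reaches the static method as _ddp_correctness(rank, True, FLAGS.debug). A small stand-in showing the same calling convention (hypothetical function, assumes an environment with XLA devices configured; not from the commit):

import torch_xla.distributed.xla_multiprocessing as xmp

def _stand_in(index, use_large_net, debug):
  # Mirrors _ddp_correctness above: index is supplied by xmp.spawn, the rest come from args.
  print(f"process {index}: use_large_net={use_large_net}, debug={debug}")

if __name__ == "__main__":
  xmp.spawn(_stand_in, args=(True, False))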

torch_xla/distributed/xla_multiprocessing.py

Lines changed: 20 additions & 4 deletions
@@ -32,6 +32,10 @@ def _is_xla_config():
   return False
 
 
+# TODO: Some usages of this function are to calculate the number of hosts (a TPU concept),
+# and some are to calculate the number of processes within a world (which can span multiple
+# hosts). The latter is really what this function is supposed to do; it's confusing and we
+# should improve it.
 def _get_world_size():
   # We cannot use the xla_model.py API here, as the features used in that module
   # needs the setup provided by this one.
@@ -145,10 +149,12 @@ def _get_mp_device_ordinal(index, gindex):
   return index if xenv.HOST_ORDINAL in os.environ else gindex
 
 
-def _setup_workers(num_devices):
+# TODO: Consolidate this with _setup_gpu_worker.
+def _setup_gpu_workers(num_devices):
   world_size = _get_world_size()
   workers_env = os.environ.get(xenv.WORKERS, None)
   workers = []
+  # TODO: Is this path actually being used? It seems to support multi-host GPUs (is that a thing at all?).
   if workers_env is not None:
     wcfg = _parse_workers_config(workers_env)
     assert world_size == len(
@@ -209,7 +215,7 @@ def _pre_fork_setup(num_devices):
                                        socket.getfqdn(),
                                        xu.get_free_tcp_ports()[0])
   if dev_kind == 'GPU':
-    _setup_workers(num_devices)
+    _setup_gpu_workers(num_devices)
     _create_gpu_devices(num_devices)
   elif dev_kind == 'CPU':
     _pre_fork_cpu_setup(num_devices)
@@ -377,7 +383,7 @@ def spawn(fn,
   Returns:
     The same object returned by the `torch.multiprocessing.spawn` API. If
     `nprocs` is 1 the `fn` function will be called directly, and the API will
-    not return.
+    return None.
   """
   if pjrt.using_pjrt():
     return pjrt.spawn(fn, args)
@@ -387,17 +393,27 @@ def spawn(fn,
     return _run_direct(fn, args, nprocs, join, daemon, start_method)
 
   pf_cfg = _pre_fork_setup(nprocs)
+  result = None
   if pf_cfg.num_devices == 1:
     _start_fn(0, pf_cfg, fn, args)
   else:
-    return torch.multiprocessing.start_processes(
+    result = torch.multiprocessing.start_processes(
         _mp_start_fn,
         args=(pf_cfg, fn, args),
        nprocs=pf_cfg.num_devices,
        join=join,
        daemon=daemon,
        start_method=start_method)
 
+  # For GPU, xenv.WORKERS is set in the launcher and then carried over to the children.
+  # However, if the launcher is reused for another multi-process experiment, _setup_gpu_workers
+  # would mistake the stale xenv.WORKERS for a multi-host configuration, with each worker
+  # representing a host. Therefore, reset it after launching all children.
+  if pf_cfg.dev_kind == 'GPU':
+    os.environ.pop(xenv.WORKERS)
+
+  return result
+
 
 class MpModelWrapper(object):
   """Wraps a model to minimize host memory usage when `fork` method is used.
