[10/N] Update barrier with CPU/CUDA implementations #86368
Changes from all commits: d7b2fdb, 3c69c7c, 6679593, 51f3b75
@@ -1464,7 +1464,9 @@ def _call_collective_with_varying_tensors(self, backend, collective, *args):
             # ensure supported devices (cpu, cuda) succeeds during dispatch call
             tensor = torch.zeros(2, 2, device=torch.device(device))
             # multi tensor collectives
-            if collective == dist.all_gather:
+            if collective == dist.barrier:
+                collective()
+            elif collective == dist.all_gather:
                 collective([tensor], tensor, *args)
             elif collective == dist.reduce_scatter:
                 if backend != "gloo":
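For context, the helper calls barrier with no arguments because, unlike the tensor collectives, dist.barrier() takes no input or output tensors. A minimal single-rank sketch of that distinction (mine, not the PR's test code; it assumes a gloo backend, world_size=1, and an env:// rendezvous with MASTER_ADDR/MASTER_PORT set):

    import os
    import torch
    import torch.distributed as dist

    # single-rank group so every collective completes locally
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("gloo", rank=0, world_size=1)

    tensor = torch.zeros(2, 2)
    dist.all_reduce(tensor)   # tensor collectives take tensor arguments
    dist.barrier()            # barrier takes none, mirroring the bare collective() call above

    dist.destroy_process_group()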
@@ -1488,6 +1490,7 @@ def _test_collectives(self, backend):
             (dist.all_reduce,),
             (dist.all_gather,),
             (dist.reduce_scatter,),
+            (dist.barrier,),
         ]
         for collective, *args in collectives_and_args:
             with self.subTest(collective=collective, args=args):

Review comment: Sorry, I may have missed this earlier. Would passing self.rank here cause this to hang?

Reply: Passing self.rank causes the broadcast operation to be sourced from that rank, but it does not hang waiting for other ranks to ACK the broadcast it sent. The same logic applies to reduce, so that's why I believe it is not hanging.
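To illustrate the reply about self.rank, here is a hedged single-rank sketch (my own, not the PR's test): a broadcast sourced from the only rank, and a reduce targeting it, return immediately because there is no peer whose acknowledgement is awaited.

    import os
    import torch
    import torch.distributed as dist

    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29501")
    dist.init_process_group("gloo", rank=0, world_size=1)

    rank = dist.get_rank()
    tensor = torch.ones(2, 2)
    dist.broadcast(tensor, src=rank)  # sourced from this rank; no peer ACK to wait on
    dist.reduce(tensor, dst=rank)     # same reasoning applies to reduce

    dist.destroy_process_group()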
@@ -219,6 +219,22 @@ reduce_scatter_cuda_(
       output_tensors, work);
 }
 
+c10::intrusive_ptr<Work> barrier_cpu(
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    const std::vector<int64_t>& device_ids,
+    int64_t timeout) {
+  return process_group->barrier(
+      BarrierOptions{device_ids, std::chrono::milliseconds(timeout)});
+}
+
+c10::intrusive_ptr<Work> barrier_cuda(
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    const std::vector<int64_t>& device_ids,
+    int64_t timeout) {
+  return process_group->barrier(
+      BarrierOptions{device_ids, std::chrono::milliseconds(timeout)});
+}
+
 // register functions to dispatcher
 namespace {
 TORCH_LIBRARY_IMPL(c10d, CPU, m) {
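As a rough Python-level counterpart (an assumption about how these options surface in the public API, not code from this PR): torch.distributed.barrier accepts an optional device_ids list, which together with the group's timeout roughly corresponds to the BarrierOptions built above.

    import os
    import torch.distributed as dist

    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29502")
    dist.init_process_group("gloo", rank=0, world_size=1)

    # With gloo this exercises the CPU path; under NCCL one could additionally pass
    # device_ids, e.g. dist.barrier(device_ids=[torch.cuda.current_device()]).
    dist.barrier()

    dist.destroy_process_group()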
@@ -286,6 +302,15 @@ TORCH_LIBRARY_IMPL(c10d, CPU, m) {
 TORCH_LIBRARY_IMPL(c10d, CUDA, m) {
   m.impl("reduce_scatter_", reduce_scatter_cuda_);
 }
+
+TORCH_LIBRARY_IMPL(c10d, CPU, m) {
+  m.impl("barrier", barrier_cpu);
+}
+
+TORCH_LIBRARY_IMPL(c10d, CUDA, m) {
+  m.impl("barrier", barrier_cuda);
+}
 
 } // namespace
 
 } // namespace ops

Review comment (on m.impl("barrier", barrier_cpu)): How do you decide if there should be a trailing underscore?

Reply: The convention for PT operators (https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/native_functions.yaml) is that if the tensor is modified in place, the operator name should be appended with _.

Reply: Thank you Professor Huang!
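To make the naming convention in the reply concrete, here is a generic ATen-level illustration (not c10d code): in-place operators carry the trailing underscore, which is why barrier, which mutates no tensor, is registered without one.

    import torch

    t = torch.ones(3)
    out = torch.add(t, 1)  # out-of-place: returns a new tensor, t is unchanged
    t.add_(1)              # in-place variant: mutates t, hence the trailing underscore
    # barrier modifies no tensor in place, so the op is registered as "barrier", not "barrier_".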
Review comment: Maybe the test would be easier to read if we just write out each test call in _test_collectives.

Review comment: Is collective([tensor], tensor, *args) a correct format for all_gather? i.e., it will have a list of only one tensor for the output. Or are we testing the dispatching functionality with WORLD_SIZE=1 here? (If so, the code makes sense.)
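Regarding the all_gather format question, a hedged sketch under the WORLD_SIZE=1 assumption (not the PR's code): the output argument is a list with one tensor per rank, so a single-element list is the expected shape when only the dispatch path is being exercised.

    import os
    import torch
    import torch.distributed as dist

    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29503")
    dist.init_process_group("gloo", rank=0, world_size=1)

    tensor = torch.zeros(2, 2)
    # one output slot per rank; with world_size=1 this is a one-element list
    output = [torch.empty_like(tensor) for _ in range(dist.get_world_size())]
    dist.all_gather(output, tensor)
    assert torch.equal(output[0], tensor)

    dist.destroy_process_group()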