Make FakeProcessGroup traceable #113314

Closed
wants to merge 2 commits
Changes from 1 commit
15 changes: 15 additions & 0 deletions test/distributed/test_functional_api.py
@@ -15,6 +15,7 @@

from functorch import make_fx
from torch.testing import FileCheck
from torch.testing._internal.distributed.fake_pg import FakeStore
from torch.utils._triton import has_triton

if not dist.is_available():
@@ -575,6 +576,20 @@ def allreduce(t, pg):
compiled_allreduce = torch.compile(allreduce, fullgraph=True)
compiled_allreduce(torch.randn(8, device=self.device), self.process_group)

@unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
def test_tracing_with_fakepg(self):
def allreduce(t, pg):
return ft_c.all_reduce(t, "sum", pg)

compiled_allreduce = torch.compile(allreduce, fullgraph=True)
dist.init_process_group(
backend="fake",
rank=0,
world_size=8,
store=FakeStore(),
)
compiled_allreduce(torch.randn(8, device=self.device), pg=dist.group.WORLD)


class TestOpWaitiness(MultiThreadedTestCase):
@property
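For reference, a minimal end-to-end sketch of what the new test exercises (not part of the diff; it assumes the "fake" backend is registered as a side effect of importing torch.testing._internal.distributed.fake_pg, which is what the test relies on):

import torch
import torch.distributed as dist
import torch.distributed._functional_collectives as ft_c
from torch.testing._internal.distributed.fake_pg import FakeStore

# A single process pretends to be rank 0 of an 8-rank job; no other processes,
# NCCL/Gloo backend, or real communication is involved.
dist.init_process_group(backend="fake", rank=0, world_size=8, store=FakeStore())

def allreduce(t, pg):
    return ft_c.all_reduce(t, "sum", pg)

# With this PR, dynamo recognizes the FakeProcessGroup argument instead of
# failing on an unsupported variable type.
compiled = torch.compile(allreduce, fullgraph=True)
# The test gates on triton and a recent GPU, so a CUDA tensor is assumed here.
out = compiled(torch.randn(8, device="cuda"), dist.group.WORLD)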
7 changes: 7 additions & 0 deletions torch/_dynamo/variables/builder.py
@@ -92,6 +92,7 @@
)
from .distributed import (
DeviceMeshVariable,
FakeProcessGroupVariable,
PlacementClassVariable,
PlacementVariable,
ProcessGroupVariable,
@@ -660,6 +661,12 @@ def index_source(key):
source=self.source,
guards=self.make_guards(GuardBuilder.ID_MATCH),
)
elif FakeProcessGroupVariable.is_process_group(value):
Contributor:
Could we make this check directly inside ProcessGroupVariable? I think FakePG is just another type of PG, so we should try to make that work directly.

Contributor Author:
Good point. I guess we won't actually do anything special for FakePG. Let me change it.

return FakeProcessGroupVariable(
value,
source=self.source,
guards=self.make_guards(GuardBuilder.ID_MATCH),
)
elif DeviceMeshVariable.is_device_mesh(value):
# TODO: see if we need to add custom guard instead
# of a simple ID_MATCH
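A minimal sketch of the consolidation suggested in the review thread above (an assumed follow-up revision of torch/_dynamo/variables/distributed.py, not code in this commit; other methods of the class are unchanged and DistributedVariable/istype come from that module): since istype accepts a tuple of types, ProcessGroupVariable.is_process_group can recognize both classes, and the separate FakeProcessGroupVariable branch in builder.py becomes unnecessary.

class ProcessGroupVariable(DistributedVariable):
    @staticmethod
    def is_process_group(value):
        if not DistributedVariable.is_available():
            return False

        from torch._C._distributed_c10d import ProcessGroup
        from torch.testing._internal.distributed.fake_pg import FakeProcessGroup

        # istype() accepts a tuple and does an exact-type match against each entry,
        # so this covers both real and fake process groups without subclass tricks.
        return istype(value, (ProcessGroup, FakeProcessGroup))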
11 changes: 11 additions & 0 deletions torch/_dynamo/variables/distributed.py
@@ -213,3 +213,14 @@ def is_process_group(value):
from torch._C._distributed_c10d import ProcessGroup

return istype(value, ProcessGroup)


class FakeProcessGroupVariable(ProcessGroupVariable):
@staticmethod
def is_process_group(value):
if not DistributedVariable.is_available():
return False

from torch.testing._internal.distributed.fake_pg import FakeProcessGroup

return istype(value, FakeProcessGroup)
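For context on why the existing check misses FakePG (my reading of the code, not stated in the diff): istype in torch._dynamo.utils matches the exact type, and FakeProcessGroup is a Python subclass of dist.ProcessGroup, so istype(value, ProcessGroup) returns False for it. A tiny sketch of the distinction:

import torch.distributed as dist
from torch._C._distributed_c10d import ProcessGroup
from torch._dynamo.utils import istype
from torch.testing._internal.distributed.fake_pg import FakeProcessGroup

fake = FakeProcessGroup(rank=0, world_size=8)

assert isinstance(fake, ProcessGroup)    # FakePG subclasses ProcessGroup
assert not istype(fake, ProcessGroup)    # exact-type check misses the subclass
assert istype(fake, FakeProcessGroup)    # hence the dedicated check above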
9 changes: 7 additions & 2 deletions torch/_dynamo/variables/torch.py
@@ -42,7 +42,12 @@
NullContextVariable,
TorchFunctionDisableVariable,
)
from .distributed import is_constant_pg_functions, is_from_local, ProcessGroupVariable
from .distributed import (
FakeProcessGroupVariable,
is_constant_pg_functions,
is_from_local,
ProcessGroupVariable,
)
from .higher_order_ops import TorchHigherOrderOperatorVariable
from .lists import ListVariable, TupleVariable
from .torch_function import can_dispatch_torch_function, dispatch_torch_function
@@ -584,7 +589,7 @@ def call_function(
# We desugar it at trace-time into ranks by directly calling util
# bake the result into the trace
assert len(args) == 1, "Expected one arg (pg)"
assert isinstance(args[0], ProcessGroupVariable)
assert isinstance(args[0], (ProcessGroupVariable, FakeProcessGroupVariable))

invocation_result = self.value(args[0].as_python_constant())
# Note - while we *could* cook up sources around invocations, like a FunctionSource
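To illustrate the trace-time desugaring that the widened assert guards (a sketch, not part of the PR; it assumes the fake process group set up as in the test above): the constant-PG util is evaluated eagerly during tracing and its result is baked into the graph as a Python constant.

import torch
import torch.distributed as dist
from torch.testing._internal.distributed.fake_pg import FakeStore

dist.init_process_group(backend="fake", rank=0, world_size=8, store=FakeStore())

def fn(t, pg):
    # get_process_group_ranks is one of the constant-PG functions; dynamo calls it
    # while tracing and embeds the resulting list of ranks as a constant, so the
    # compiled graph only sees len(ranks) == 8, not a call into c10d.
    ranks = dist.get_process_group_ranks(pg)
    return t * len(ranks)

compiled = torch.compile(fn, fullgraph=True)
out = compiled(torch.ones(4), dist.group.WORLD)  # expected: tensor([8., 8., 8., 8.])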
3 changes: 3 additions & 0 deletions torch/testing/_internal/distributed/fake_pg.py
@@ -41,6 +41,9 @@ def __init__(self, rank, world_size):
def allreduce(self, tensor_list, opts=AllreduceOptions()):
return ret_work(tensor_list)

def allreduce_coalesced(self, tensor_list, opts=AllreduceOptions()):
return ret_work(tensor_list)

def allgather(self, output_tensors, input_tensor, opts=AllgatherOptions()):
# NOTE: in general it's not good form to try to make FakePG work with 'real data',
# but the reasoning here is that we want FakePG to work with DeviceMesh's init
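The new allreduce_coalesced override mirrors the existing allreduce stub: it performs no communication and just wraps the input tensors in an already-completed work object via ret_work. A minimal usage sketch (an assumption about how the override gets exercised, not code from the PR):

import torch
import torch.distributed as dist
from torch.testing._internal.distributed.fake_pg import FakeStore

dist.init_process_group(backend="fake", rank=0, world_size=8, store=FakeStore())

# Routed to FakeProcessGroup.allreduce_coalesced; the tensors come back unchanged.
tensors = [torch.ones(2), torch.zeros(3)]
work = dist.all_reduce_coalesced(tensors, op=dist.ReduceOp.SUM, async_op=True)
work.wait()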