pytorch · wanchaol · Oct 5, 2023 · Oct 5, 2023
diff --git a/test/distributed/_tensor/test_dtensor_ops.py b/test/distributed/_tensor/test_dtensor_ops.py
@@ -259,15 +259,10 @@ def wrapped(fn):
     xfail("masked.argmin"),
     xfail("masked.cumprod"),
     xfail("masked.cumsum"),
-    xfail("masked.log_softmax"),
-    xfail("masked.logaddexp"),
     xfail("masked.logsumexp"),
     xfail("masked.median"),
     xfail("masked.norm"),
     xfail("masked.prod"),
-    xfail("masked.softmin"),
-    xfail("masked.softmax"),
-    xfail("masked.sum"),
     xfail("matrix_exp"),
     xfail("max", "binary"),
     xfail("max", "reduction_no_dim"),
@@ -288,7 +283,6 @@ def wrapped(fn):
     xfail("nanquantile"),
     xfail("nansum"),
     xfail("native_batch_norm"),
-    xfail("native_dropout_backward"),
     xfail("native_layer_norm"),
     xfail("narrow_copy"),
     xfail("ne"),
@@ -363,7 +357,6 @@ def wrapped(fn):
     xfail("nn.functional.multilabel_soft_margin_loss"),
     xfail("nn.functional.nll_loss"),
     xfail("nn.functional.normalize"),
-    xfail("nn.functional.pad", "circular"),
     xfail("nn.functional.pad", "constant"),
     xfail("nn.functional.pad", "reflect"),
     xfail("nn.functional.pad", "replicate"),
@@ -500,7 +493,6 @@ def wrapped(fn):
     xfail("vdot"),
     xfail("view_copy"),
     xfail("view_as_complex"),
-    xfail("where"),
     xfail("zeros"),
     # ops inside this might even fail without dtensor
     # tests, as we rescale op db common test size factor (i.e. L, M, S)

diff --git a/test/distributed/_tensor/test_tensor_ops.py b/test/distributed/_tensor/test_tensor_ops.py
@@ -380,6 +380,18 @@ def test_index(self):
                 torch.randint(5, (12, 8, 12)),
             )
 
+    @with_comms
+    def test_where_type_promotion(self):
+        mesh = DeviceMesh(self.device_type, list(range(self.world_size)))  # 1D mesh over all ranks
+
+        specs = [[Shard(0)], [Replicate()]]
+        for spec in specs:
+            global_tensor = torch.randn(12, 8)
+            mat = distribute_tensor(global_tensor, mesh, spec)
+            res = torch.where(mat > 0, 1, 0)  # Python scalar branches, distributed condition
+            ref = torch.where(global_tensor > 0, 1, 0)  # same op on the undistributed tensor
+            self.assertEqual(res.redistribute(placements=[Replicate()]).to_local(), ref)
+
 
 if __name__ == "__main__":
     run_tests()
diff --git a/torch/distributed/_tensor/dispatch.py b/torch/distributed/_tensor/dispatch.py
@@ -17,7 +17,7 @@
     OutputSharding,
     OutputSpecType,
 )
-from torch.distributed._tensor.placement_types import DTensorSpec
+from torch.distributed._tensor.placement_types import DTensorSpec, Replicate, TensorMeta
 from torch.distributed._tensor.random import is_rng_supported_mesh
 from torch.distributed._tensor.redistribute import redistribute_local_tensor
 from torch.distributed._tensor.sharding_prop import ShardingPropagator
@@ -150,10 +150,23 @@ def _operator_dispatch(
             else:
                 mesh = arg.device_mesh
         elif isinstance(arg, torch.Tensor):
-            raise RuntimeError(
-                f"{op_call}: got mixed torch.Tensor and DTensor, need to convert all"
-                " torch.Tensor to DTensor before calling distributed operators!"
-            )
+            if arg.ndim == 0 and mesh is not None:
+                # 0-dim (scalar) tensor: safe to treat as replicated on every mesh dim
+                args_schema.append(
+                    DTensorSpec(
+                        mesh,
+                        (Replicate(),) * mesh.ndim,
+                        tensor_meta=TensorMeta(
+                            shape=arg.shape, stride=arg.stride(), dtype=arg.dtype
+                        ),
+                    )
+                )
+                local_args.append(arg)
+            else:
+                raise RuntimeError(
+                    f"{op_call}: got mixed torch.Tensor and DTensor, need to convert all"
+                    " torch.Tensor to DTensor before calling distributed operators!"
+                )
         else:
             args_schema.append(arg)
             local_args.append(arg)

diff --git a/torch/distributed/_tensor/ops/pointwise_ops.py b/torch/distributed/_tensor/ops/pointwise_ops.py
@@ -350,6 +350,7 @@
     aten.trunc.out,
     aten.trunc_.default,
     aten.where.self,
+    aten.where.self_out,
     aten.xlogy.OutScalar_Self,
     aten.xlogy.OutScalar_Other,
     aten.xlogy.OutTensor,

diff --git a/torch/testing/_internal/distributed/_tensor/common_dtensor.py b/torch/testing/_internal/distributed/_tensor/common_dtensor.py
@@ -336,21 +336,11 @@ def to_dist_tensor(
         if type(t) is torch.Tensor or type(t) is torch.nn.Parameter:
             if self.is_supported_tensor(t):
                 self.hit += 1
-                # We cannot use distribute_tensor for bool tensors as c10d
-                # collectives does not support the dtype, we assume op with
-                # bool tensor args the same tensor so we don't need to broadcast
-                # TODO: add bool tensor dtype support in c10d collective
-                if t.dtype == torch.bool:
-                    r = DTensor(
-                        t,
-                        mesh,
-                        tuple(placements),
-                        size=t.size(),
-                        dtype=torch.bool,
-                        requires_grad=t.requires_grad,
-                        stride=t.stride()
-                    )
+                if t.ndim == 0:
+                    # 0-dim (scalar) tensors are always replicated, ignoring `placements`
+                    r = distribute_tensor(t, mesh, [Replicate()] * mesh.ndim)
                 else:
+                    # distribute non-scalar tensors
                     r = distribute_tensor(t, mesh, placements)
                 if type(t) is torch.nn.Parameter:
                     r = torch.nn.Parameter(  # type: ignore[assignment]