
Commit fc30bd3

Revert "[dtensor] rewrite embedding ops using op strategy (#118079)"
This reverts commit e599a08. Reverted #118079 on behalf of https://github.com/DanilBaibak because it broke an internal build ([comment](#118079 (comment))).
1 parent bfb5e76 commit fc30bd3

5 files changed, +99 -206 lines changed


test/distributed/_tensor/test_embedding_ops.py

Lines changed: 23 additions & 38 deletions
@@ -3,14 +3,13 @@
 import sys
 
 import torch
-from torch.distributed._tensor import (
-    distribute_module,
-    distribute_tensor,
-    DTensor,
-    Replicate,
-    Shard,
+from torch.distributed._tensor import DTensor
+from torch.distributed._tensor.placement_types import Replicate
+from torch.distributed.tensor.parallel import (
+    ColwiseParallel,
+    parallelize_module,
+    RowwiseParallel,
 )
-from torch.distributed._tensor.debug import CommDebugMode
 from torch.testing._internal.common_utils import run_tests, TEST_WITH_DEV_DBG_ASAN
 from torch.testing._internal.distributed._tensor.common_dtensor import (
     DTensorTestBase,
@@ -54,16 +53,12 @@ def _run_embedding_op_test(
         sharded_embedding.weight = torch.nn.Parameter(
             local_embedding.weight.clone().detach()
         )
-
-        def shard_embedding_fn(name, module, device_mesh):
-            for name, param in module.named_parameters():
-                dist_param = torch.nn.Parameter(
-                    distribute_tensor(param, device_mesh, [Shard(shard_dim)])
-                )
-                module.register_parameter(name, dist_param)
-
-        sharded_embedding = distribute_module(
-            sharded_embedding, device_mesh, shard_embedding_fn
+        parallelize_module(
+            module=sharded_embedding,
+            device_mesh=device_mesh,
+            parallelize_plan=ColwiseParallel(output_layouts=Replicate())
+            if shard_dim == 1
+            else RowwiseParallel(),
         )
 
         # Run sharded computation
@@ -74,14 +69,8 @@ def shard_embedding_fn(name, module, device_mesh):
         target = torch.empty(
             *inp.size(), embedding_dim, dtype=torch.float, device=self.device_type
         ).random_(0, 1)
-        dist_inp = distribute_tensor(inp, device_mesh, [Replicate()])
-
-        # fwd computation, ensure no comm happened
-        with CommDebugMode() as fwd_mode:
-            dist_output = sharded_embedding(dist_inp)
-            self.assertEqual(fwd_mode.get_total_counts(), 0)
+        output = sharded_embedding(inp)
 
-        output = dist_output.full_tensor()
         # Run local computation
         local_output = local_embedding(inp)
 
@@ -90,24 +79,20 @@ def shard_embedding_fn(name, module, device_mesh):
 
         # Use a sample cross entry loss to verify backward and grad computation.
         loss = torch.nn.CrossEntropyLoss()
-        emb_loss = loss(
+        attn_loss = loss(
             output,
             target,
         )
-        emb_dup_loss = loss(
+        attn_dup_loss = loss(
             local_output,
             target,
         )
+        attn_loss.backward()
+        attn_dup_loss.backward()
 
-        # local embedding backward
-        emb_dup_loss.backward()
-
-        # sharded embedding bwd computation, ensure no comm happened
-        with CommDebugMode() as bwd_mode:
-            emb_loss.backward()
-            self.assertEqual(bwd_mode.get_total_counts(), 0)
-
-        gradient = sharded_embedding.weight.grad.full_tensor()
+        gradient = sharded_embedding.weight.grad.redistribute(
+            device_mesh, [Replicate()]
+        ).to_local()
 
         local_grad = local_embedding.weight.grad
 
@@ -138,10 +123,10 @@ def test_sharded_embedding_colwise(self):
         self._run_embedding_op_test(1, [8, 6, 5, 4], 23, 13, padding_idx=12)
 
     @with_comms
-    def test_sharded_embedding_colwise_max_norm_errors(self):
+    def test_sharded_embedding_colwise_errors(self):
        with self.assertRaisesRegex(
            NotImplementedError,
-            "aten.embedding_renorm_.default does not have a sharding strategy registered.",
+            "DTensor does not support sharded embedding operation with max_norm yet!",
        ):
            self._run_embedding_op_test(
                1, [8, 6, 5, 4], 23, 13, padding_idx=12, max_norm=2.0
@@ -151,7 +136,7 @@ def test_sharded_embedding_colwise_max_norm_errors(self):
     def test_sharded_embedding_rowwise(self):
         with self.assertRaisesRegex(
             NotImplementedError,
-            "row-wise sharded embedding operation yet",
+            "RowwiseParallel currently only support nn.Linear!",
         ):
             self._run_embedding_op_test(0, [5, 12], 16, 22)

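Context for the change above: after the revert, the test shards the embedding through the tensor-parallel style API instead of a hand-rolled distribute_module partition function. The snippet below is a minimal standalone sketch of that pattern, not code from this commit; it assumes a process group started via torchrun, and the gloo/CPU backend, module sizes, and variable names are illustrative.

# Sketch: shard an nn.Embedding column-wise with the TP style API.
# Assumption: launched with `torchrun --nproc-per-node=2 sketch.py`.
import torch
import torch.distributed as dist
from torch.distributed._tensor import Replicate
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor.parallel import ColwiseParallel, parallelize_module

dist.init_process_group("gloo")
mesh = init_device_mesh("cpu", (dist.get_world_size(),))

# Column-wise sharding splits the embedding weight along embedding_dim (dim 1);
# output_layouts=Replicate() gathers the per-rank columns so callers see the
# full lookup result, which is what the test compares against the local module.
emb = torch.nn.Embedding(16, 8)
parallelize_module(emb, mesh, ColwiseParallel(output_layouts=Replicate()))

tokens = torch.arange(4).reshape(2, 2)
out = emb(tokens)  # full (2, 2, 8) lookup result on every rank
print(out.shape)

dist.destroy_process_group()
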
test/distributed/tensor/parallel/test_tp_style.py

Lines changed: 0 additions & 21 deletions
@@ -67,27 +67,6 @@ def test_colwise_parallel_style(self):
         self.assertEqual(comm_mode.get_comm_counts()[c10d_functional.reduce_scatter_tensor], 1)
         self.assertEqual(comm_mode.get_total_counts(), 2)
 
-    @with_comms
-    def test_colwise_parallel_embedding(self):
-        mesh = init_device_mesh(self.device_type, (self.world_size,))
-
-        comm_mode = CommDebugMode()
-        tensor = torch.arange(8, device=self.device_type).reshape(4, 2)
-        model = nn.Embedding(16, 16, device=self.device_type)
-
-        default_col_parallel = ColwiseParallel()
-        with comm_mode:
-            colwise_mod = parallelize_module(deepcopy(model), mesh, default_col_parallel)
-            out = colwise_mod(tensor)
-            # ensure output shard on the last dim
-            self.assertEqual(out.shape, (4, 2, 16 // self.world_size))
-            # ensure no communication happened in fwd
-            self.assertEqual(comm_mode.get_total_counts(), 0)
-
-            out.sum().backward()
-            # no comm in bwd
-            self.assertEqual(comm_mode.get_total_counts(), 0)
-
     @with_comms
     def test_rowwise_parallel_style(self):
         mesh = init_device_mesh(self.device_type, (self.world_size,))
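The deleted test above measured communication with CommDebugMode. A minimal sketch of that measurement pattern, again assuming a torchrun-launched process group; the Linear module and the printed breakdown are illustrative, not from this commit:

# Sketch: count collectives issued while running a parallelized module.
import torch
import torch.distributed as dist
from torch.distributed._tensor.debug import CommDebugMode
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor.parallel import ColwiseParallel, parallelize_module

dist.init_process_group("gloo")
mesh = init_device_mesh("cpu", (dist.get_world_size(),))

model = torch.nn.Linear(8, 8)
parallelize_module(model, mesh, ColwiseParallel())

comm_mode = CommDebugMode()
with comm_mode:
    out = model(torch.randn(4, 8))
    out.sum().backward()

# get_total_counts() sums all collectives; get_comm_counts() breaks them
# down per functional collective op.
print(comm_mode.get_total_counts())
print(comm_mode.get_comm_counts())

dist.destroy_process_group()
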

torch/distributed/_tensor/op_schema.py

Lines changed: 1 addition & 1 deletion
@@ -246,7 +246,7 @@ def __str__(self) -> str:
                 args_sharding.append(str(arg))
             else:
                 args_sharding.append(str(arg))
-        return f"Op(op={self.op}, args_sharding={', '.join(args_sharding)} @ mesh: {mesh_shape})"
+        return f"Op(op={self.op}, args_sharding={', '.join(args_sharding)}@ mesh: {mesh_shape})"
 
     def __post_init__(self) -> None:
         has_symints = False
Lines changed: 74 additions & 145 deletions
@@ -1,168 +1,97 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
 # implement matrix related ops for distributed tensor
-import itertools
-from typing import cast, List
 
 import torch
-from torch.distributed._tensor.op_schema import (
-    OpSchema,
-    OpStrategy,
-    PlacementStrategy,
-    StrategyType,
-)
-from torch.distributed._tensor.ops.utils import (
-    generate_redistribute_costs,
-    is_tensor_shardable,
-    register_op_strategy,
-)
+from torch.distributed._tensor.op_schema import OpSchema, OutputSharding
+from torch.distributed._tensor.ops.utils import register_prop_rule
 
 from torch.distributed._tensor.placement_types import (
     _Partial,
     DTensorSpec,
-    Placement,
     Replicate,
     Shard,
 )
 
-from torch.distributed.device_mesh import DeviceMesh
-
 aten = torch.ops.aten
 
 
-@register_op_strategy(aten.embedding.default)
-def embedding_strategy(mesh: DeviceMesh, op_schema: OpSchema) -> StrategyType:
-    """
-    This strategy handles embedding op. We have two possible embedding shardings:
-    rowwise and colwise
-    # TODO: implement rowwise sharding
-    """
-    weight_strategy = cast(OpStrategy, op_schema.args_schema[0])
-    indices_strategy = cast(OpStrategy, op_schema.args_schema[1])
-
-    weight_shape = weight_strategy.output_shape
-    indices_shape = indices_strategy.output_shape
-    output_emd_dim = len(indices_shape)
-
-    # guard rowwise sharding not implemented for now
-    weight_spec = weight_strategy.strategies[0].output_spec
+# TODO: Enable BWD for embedding op.
+@register_prop_rule(aten.embedding.default)
+def embedding_rules(op_schema: OpSchema) -> OutputSharding:
+    weight_spec, inp_spec = op_schema.args_spec
     if any(placement.is_shard(0) for placement in weight_spec.placements):
         raise NotImplementedError(
             "DTensor does not support row-wise sharded embedding operation yet!"
         )
 
-    all_mesh_dim_strategies = []
-
-    for mesh_dim in range(mesh.ndim):
-        single_mesh_dim_strategies = []
-
-        # placement list stores placements of [output, weight, input_indices]
-        # first we always have replicate all for inputs and output
-        all_replicate: List[Placement] = [Replicate()] * 3
-        single_mesh_dim_strategies.append(all_replicate)
-
-        # colwise sharding, output shard on last dim, weight shard on dim 1, input replicate
-        colwise_sharding = [Shard(output_emd_dim), Shard(1), Replicate()]
-        single_mesh_dim_strategies.append(colwise_sharding)
-
-        # batch dim sharding, weight replicated, input can shard on any dim, output follows input
-        for input_dim in range(len(indices_shape)):
-            batch_sharding = [Shard(input_dim), Replicate(), Shard(input_dim)]
-            single_mesh_dim_strategies.append(batch_sharding)
-
-        all_mesh_dim_strategies.append(single_mesh_dim_strategies)
-
-    strategy_combs = itertools.product(*all_mesh_dim_strategies)
-
-    all_strategies = []
-    for strategy_comb in strategy_combs:
-        spec_list = []
-        for specs in zip(*strategy_comb):
-            spec_list.append(DTensorSpec(mesh, tuple(specs)))
-
-        if is_tensor_shardable(weight_shape, spec_list[1]) and is_tensor_shardable(
-            indices_shape, spec_list[2]
-        ):
-            # only add to the strategy list when both weight and indices are shardable
-            weight_spec, indices_spec = spec_list[1:]
-            redistribute_cost = [
-                generate_redistribute_costs(weight_strategy, weight_spec),
-                generate_redistribute_costs(indices_strategy, indices_spec),
-            ]
-            strat = PlacementStrategy(
-                output_specs=spec_list[0],
-                input_specs=spec_list[1:],
-                redistribute_cost=redistribute_cost,
-            )
-            all_strategies.append(strat)
-
-    return OpStrategy(all_strategies)
-
-
-@register_op_strategy(aten.embedding_dense_backward.default)
-def embedding_dense_backward_strategy(
-    mesh: DeviceMesh, op_schema: OpSchema
-) -> StrategyType:
-    """
-    This strategy handles embedding op. We have two possible embedding shardings:
-    rowwise and colwise
-    # TODO: implement rowwise sharding backward
-    """
-    grad_out_strategy = cast(OpStrategy, op_schema.args_schema[0])
-    indices_strategy = cast(OpStrategy, op_schema.args_schema[1])
-
-    grad_out_shape = grad_out_strategy.output_shape
-    indices_shape = indices_strategy.output_shape
-    grad_out_ndim = len(grad_out_shape)
-
-    all_mesh_dim_strategies = []
-
-    for mesh_dim in range(mesh.ndim):
-        single_mesh_dim_strategies = []
-
-        # placement list stores placements of [output, weight, input_indices]
-        # first we always have replicate all for inputs and output
-        all_replicate: List[Placement] = [Replicate()] * 3
-        single_mesh_dim_strategies.append(all_replicate)
-
-        # colwise sharding backward, grad_out shard on last dim, input replicate,
-        # weight grad shard colwise
-        colwise_sharding = [Shard(1), Shard(grad_out_ndim - 1), Replicate()]
-        single_mesh_dim_strategies.append(colwise_sharding)
-
-        # batch dim sharding, weight replicated, grad_out/input have same sharding
-        # that can shard on any dim, weight grad partial
-        for input_dim in range(len(indices_shape)):
-            batch_sharding = [_Partial(), Shard(input_dim), Shard(input_dim)]
-            single_mesh_dim_strategies.append(batch_sharding)
-
-        # grad_out partial, input replicate, weight grad keep partial
-        partial_sharding = [_Partial(), _Partial(), Replicate()]
-        single_mesh_dim_strategies.append(partial_sharding)
-
-        all_mesh_dim_strategies.append(single_mesh_dim_strategies)
-
-    strategy_combs = itertools.product(*all_mesh_dim_strategies)
+    if weight_spec.is_replicated() and inp_spec.placements == [Shard(0)]:
+        # Embedding table is replicated, input ids are sharded along batch
+        # dimension. Output lookups should match input sharding spec in this case.
+        return OutputSharding(
+            output_spec=DTensorSpec(mesh=inp_spec.mesh, placements=inp_spec.placements)
+        )
 
-    all_strategies = []
-    for strategy_comb in strategy_combs:
-        spec_list = []
-        for specs in zip(*strategy_comb):
-            spec_list.append(DTensorSpec(mesh, tuple(specs)))
+    if inp_spec.is_replicated():
+        weight_dim_map = weight_spec.dim_map
+        output_dim_map = inp_spec.dim_map
+        output_dim_map.append(weight_dim_map[1])
+        return OutputSharding(
+            output_spec=DTensorSpec.from_dim_map(inp_spec.mesh, output_dim_map, [])
+        )
 
-        if is_tensor_shardable(grad_out_shape, spec_list[1]) and is_tensor_shardable(
-            indices_shape, spec_list[2]
-        ):
-            # only add to the strategy list when both grad_out and indices are shardable
-            grad_out_spec, indices_spec = spec_list[1:]
-            redistribute_cost = [
-                generate_redistribute_costs(grad_out_strategy, grad_out_spec),
-                generate_redistribute_costs(indices_strategy, indices_spec),
-            ]
-            strat = PlacementStrategy(
-                output_specs=spec_list[0],
-                input_specs=spec_list[1:],
-                redistribute_cost=redistribute_cost,
+    return OutputSharding(
+        output_spec=None,
+        schema_suggestions=[
+            OpSchema(
+                op=op_schema.op,
+                args_schema=(
+                    weight_spec,
+                    DTensorSpec(
+                        mesh=inp_spec.mesh,
+                        placements=tuple([Replicate()] * len(inp_spec.placements)),
+                        tensor_meta=inp_spec.tensor_meta,
+                    ),
+                ),
+                kwargs_schema=op_schema.kwargs_schema,
             )
-            all_strategies.append(strat)
-
-    return OpStrategy(all_strategies)
+        ],
+    )
+
+
+@register_prop_rule(aten.embedding_renorm_.default)
+def embedding_renorm_rules(op_schema: OpSchema) -> OutputSharding:
+    raise NotImplementedError(
+        "DTensor does not support sharded embedding operation with max_norm yet!"
+    )
+
+
+@register_prop_rule(aten.embedding_dense_backward.default)
+def embedding_dense_backward_rules(op_schema: OpSchema) -> OutputSharding:
+    grad_output, indices = op_schema.args_schema[:2]
+    assert isinstance(grad_output, DTensorSpec)
+    assert isinstance(indices, DTensorSpec)
+    if grad_output.placements == indices.placements:
+        # The embedding table is replicated, and input/oupput activations are
+        # sharded. In this case, gradients for the embedding table should be
+        # Partial.
+        return OutputSharding(
+            output_spec=DTensorSpec(mesh=indices.mesh, placements=(_Partial(),))
+        )
+    elif grad_output.placements == [_Partial()] and indices.placements == [Replicate()]:
+        # The embedding table is replicated and the indices is also replicated
+        # (local is a more precise term). This is postional embedding. In this
+        # case, gradients for the embmedding table should be Partial.
+        return OutputSharding(
+            output_spec=DTensorSpec(mesh=indices.mesh, placements=(_Partial(),))
+        )
+    elif all(placement.is_replicate() for placement in indices.placements):
+        # BWD for colwise sharding case
+        return OutputSharding(
+            output_spec=DTensorSpec(mesh=indices.mesh, placements=(Shard(1),))
+        )
+    else:
+        raise NotImplementedError(
+            "Unsupported embedding dense backward schema:\n"
+            f"grad_output - {grad_output}\n"
+            f"indices - {indices}"
+        )
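Both the removed strategy and the restored prop rules encode the same identity for the column-wise case: looking up rows of a dim-1 shard of the weight yields exactly that shard of the full lookup, so a Shard(1) weight produces an output sharded on the last dim with no collectives in the forward. A single-process sketch that checks this identity (the sizes and the two-way split are illustrative, not from this commit):

# Check: F.embedding(idx, W[:, cols]) == F.embedding(idx, W)[..., cols]
import torch
import torch.nn.functional as F

torch.manual_seed(0)
num_embeddings, embedding_dim, world_size = 16, 8, 2
weight = torch.randn(num_embeddings, embedding_dim)
idx = torch.randint(num_embeddings, (4, 3))

full = F.embedding(idx, weight)           # (4, 3, 8) full lookup
shards = weight.chunk(world_size, dim=1)  # dim-1 ("column-wise") shards

for rank, shard in enumerate(shards):
    local = F.embedding(idx, shard)       # lookup against the local shard only
    lo = rank * embedding_dim // world_size
    hi = lo + embedding_dim // world_size
    # Each rank's local lookup equals its slice of the full output, which is
    # why a Shard(1) weight gives a last-dim-sharded output without comm.
    assert torch.equal(local, full[..., lo:hi])
print("column-wise embedding lookup matches the sliced full lookup")
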
