Commit ce1054f

Apply sharding when creating tensors data in data loader (#4995)
1 parent ede1ad0 commit ce1054f

4 files changed (+99, -24 lines)

test/spmd/test_xla_sharding.py

Lines changed: 25 additions & 0 deletions
@@ -9,6 +9,7 @@
 import torch.optim as optim
 import torch_xla
 import torch_xla.core.xla_model as xm
+import torch_xla.debug.metrics as met
 import torch_xla.experimental.xla_sharding as xs
 from torch_xla.experimental.xla_sharded_tensor import XLAShardedTensor
 import test_xla_sharding_base
@@ -170,6 +171,30 @@ def test_transfer_sharded_data_to_host(self):
     t1 = xt1.cpu()
     self.assertTrue(torch.allclose(t1, torch.ones(16, 16)))

+  def test_send_cpu_data_to_device_with_sharding(self):
+    xm.mark_step()  # Execute pending graph to avoid contaminating metrics
+    met.clear_all()
+    tensor = torch.arange(16, dtype=torch.float32).reshape(4, 4)
+    mesh = self._get_mesh((1, self.n_devices))
+
+    # Create a ShardingSpec and use it to shard the tensor while sending to device
+    sharding_spec = xs.ShardingSpec(mesh, (0, 1))
+    self.assertTrue(sharding_spec.can_apply(tensor))
+    xtensors = xm.send_cpu_data_to_device([tensor],
+                                          xm.xla_device(),
+                                          input_sharding=sharding_spec)
+    self.assertEqual(len(xtensors), 1)
+    outbound = met.metric_data("OutboundData")[1]
+    self.assertEqual(outbound, tensor.element_size() * tensor.nelement())
+
+    # Verify the resulting sharding annotation matches an explicit `mark_sharding` call
+    xt = xtensors[0]
+    explicit_xt = tensor.to(xm.xla_device())
+    xs.mark_sharding(explicit_xt, mesh, (0, 1))
+    self.assertEqual(
+        torch_xla._XLAC._get_xla_sharding_spec(xt),
+        torch_xla._XLAC._get_xla_sharding_spec(explicit_xt))
+

 if __name__ == '__main__':
   test = unittest.main()

torch_xla/core/xla_model.py

Lines changed: 4 additions & 4 deletions
@@ -967,11 +967,11 @@ def send_cpu_data_to_device(data, device, input_sharding=None):

   def convert_fn(tensors):
     devices = [str(device)] * len(tensors)
-    xtensors = torch_xla._XLAC._xla_tensors_from_aten(tensors, devices)
+    shardings = None
     if input_sharding:
-      for xtensor in xtensors:
-        if input_sharding.can_apply(xtensor):
-          input_sharding.apply(xtensor)
+      shardings = [input_sharding.xla_spec(t) for t in tensors]
+    xtensors = torch_xla._XLAC._xla_tensors_from_aten(tensors, devices,
+                                                      shardings)
     return xtensors

   def select_fn(v):
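
The data loader path now builds sharding annotations on the CPU side: `convert_fn` maps `input_sharding` to one `XlaShardingSpec` (or `None`) per tensor and passes the list into `_xla_tensors_from_aten`, so tensors are created on device with their sharding already attached instead of being annotated after an unsharded transfer. A minimal usage sketch of the new path, mirroring the test above; the 1 x N mesh construction here is an assumption (the test builds its mesh through a `_get_mesh` helper):

import numpy as np
import torch
import torch_xla.core.xla_model as xm
import torch_xla.experimental.xla_sharding as xs

# Assumed: a 1 x N logical mesh over all addressable devices.
num_devices = len(xm.get_xla_supported_devices())
mesh = xs.Mesh(np.arange(num_devices), (1, num_devices))

# Shard dim 1 of the batch across the mesh while sending it to the device.
batch = torch.randn(4, 4)
spec = xs.ShardingSpec(mesh, (0, 1))
(xbatch,) = xm.send_cpu_data_to_device([batch],
                                       xm.xla_device(),
                                       input_sharding=spec)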

torch_xla/csrc/init_python_bindings.cpp

Lines changed: 45 additions & 18 deletions
@@ -410,13 +410,25 @@ std::ptrdiff_t GetTensorId(const at::Tensor& tensor) {

 std::vector<at::Tensor> GetXlaTensorsFromAten(
     const std::vector<at::Tensor>& aten_tensors,
-    const std::vector<std::string>& devices) {
-  auto data_handles = CreateTensorsData(aten_tensors, GetXlaDevices(devices));
+    const std::vector<std::string>& devices,
+    const std::optional<std::vector<XLATensor::ShardingSpecPtr>>
+        sharding_specs) {
+  std::vector<std::shared_ptr<torch::lazy::BackendData>> data_handles;
+  if (sharding_specs.has_value()) {
+    data_handles = CreateTensorsData(aten_tensors, sharding_specs.value(),
+                                     GetXlaDevices(devices));
+  } else {
+    data_handles = CreateTensorsData(aten_tensors, GetXlaDevices(devices));
+  }

   std::vector<at::Tensor> xla_tensors;
   xla_tensors.reserve(data_handles.size());
-  for (auto& data_handle : data_handles) {
+  for (int i = 0; i < data_handles.size(); i++) {
+    auto& data_handle = data_handles[i];
     XLATensorPtr xla_tensor = XLATensor::Create(std::move(data_handle));
+    if (sharding_specs.has_value() && sharding_specs.value()[i] != nullptr) {
+      xla_tensor->SetShardingSpec(*sharding_specs.value()[i]);
+    }
     xla_tensors.push_back(bridge::AtenFromXlaTensor(std::move(xla_tensor)));
   }
   return xla_tensors;
@@ -904,21 +916,36 @@ void InitXlaModuleBindings(py::module m) {
         [](const std::vector<at::Tensor>& tensors) -> std::string {
          return GetTensorsHloGraph(tensors);
         });
-  m.def("_xla_tensors_from_aten", [](const std::vector<at::Tensor>& tensors,
-                                     const std::vector<std::string>& devices) {
-    std::vector<at::Tensor> result;
-    {
-      NoGilSection nogil;
-      std::vector<at::Tensor> xla_tensors =
-          GetXlaTensorsFromAten(tensors, devices);
-      result.reserve(xla_tensors.size());
-      for (size_t i = 0; i < xla_tensors.size(); ++i) {
-        result.push_back(torch::autograd::make_variable(
-            xla_tensors[i], /*requires_grad=*/tensors.at(i).requires_grad()));
-      }
-    }
-    return result;
-  });
+  py::class_<XLATensor::ShardingSpec, XLATensor::ShardingSpecPtr>(
+      m, "XlaShardingSpec")
+      .def(py::init([](at::Tensor tensor, py::list& tile_assignment,
+                       bool replicated, bool manual) {
+        auto op_sharding =
+            ShardingUtil::CreateOpSharding(tile_assignment, replicated, manual);
+        auto shape = CreateComputationShapeFromTensor(tensor, nullptr);
+        return std::make_shared<XLATensor::ShardingSpec>(op_sharding, shape);
+      }));
+  m.def("_xla_tensors_from_aten",
+        [](const std::vector<at::Tensor>& tensors,
+           const std::vector<std::string>& devices,
+           const std::optional<std::vector<XLATensor::ShardingSpecPtr>>&
+               shardings) {
+          std::vector<at::Tensor> result;
+          {
+            NoGilSection nogil;
+            std::vector<at::Tensor> xla_tensors =
+                GetXlaTensorsFromAten(tensors, devices, shardings);
+            result.reserve(xla_tensors.size());
+            for (size_t i = 0; i < xla_tensors.size(); ++i) {
+              result.push_back(torch::autograd::make_variable(
+                  xla_tensors[i],
+                  /*requires_grad=*/tensors.at(i).requires_grad()));
+            }
+          }
+          return result;
+        },
+        py::arg("tensors"), py::arg("devices"),
+        py::arg("shardings") = py::none());
   m.def("_xla_get_cpu_tensors", [](const std::vector<at::Tensor>& tensors) {
     std::vector<at::Tensor> result;
     {
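
The binding keeps the old call shape working: `_xla_tensors_from_aten` gains an optional third `shardings` argument that defaults to `py::none()`, and each list entry may be an `XlaShardingSpec` or `None` (entries left as `None` are created without a sharding annotation). A short sketch of both call shapes from Python; the tensors and devices here are illustrative only:

import torch
import torch_xla
import torch_xla.core.xla_model as xm

tensors = [torch.ones(4, 4)]
devices = [str(xm.xla_device())] * len(tensors)

# Old call shape: `shardings` falls back to py::none(), no sharding applied.
xtensors = torch_xla._XLAC._xla_tensors_from_aten(tensors, devices)

# New call shape: one sharding (or None) per tensor, as produced by
# ShardingSpec.xla_spec(t); None entries are transferred unsharded.
xtensors = torch_xla._XLAC._xla_tensors_from_aten(tensors, devices,
                                                  [None for _ in tensors])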

torch_xla/experimental/xla_sharding.py

Lines changed: 25 additions & 2 deletions
@@ -1,6 +1,6 @@
 import os
 from collections import OrderedDict
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 import torch
 import torch_xla
 import torch_xla.core.xla_model as xm
@@ -78,7 +78,7 @@ def mark_sharding(t: Union[torch.Tensor, XLAShardedTensor], mesh: Mesh,
   Annotates the tensor provided with XLA partition spec. Internally,
   it annotates the corresponding XLATensor as sharded for the XLA SpmdPartitioner pass.
   Args:
-        t (Union[torch.Tensor, XLAShardedTensor]): input tensor to be annotated with partition_sepc.
+        t (Union[torch.Tensor, XLAShardedTensor]): input tensor to be annotated with partition_spec.

         mesh (Mesh): describes the logical XLA device topology and the underlying device IDs.

@@ -148,6 +148,29 @@ class ShardingSpec:
   mesh: Mesh
   partition_spec: Tuple[Union[int, None]]

+  # Derived fields
+  _tile_assignment: List[int] = field(init=False)
+  _replicated: bool = field(init=False)
+  _partial: bool = field(init=False)
+
+  def __post_init__(self):
+    self._tile_assignment = self.mesh.get_logical_mesh().tolist()
+    self._replicated = all(d is None for d in self.partition_spec)
+    self._partial = not self._replicated and any(
+        d is None for d in self.partition_spec)
+    # TODO(yeounoh) support partially replicated sharding.
+    assert not self._partial, "Partial replication is currently not supported"
+
+  def xla_spec(self, t: torch.Tensor) -> Union['XlaShardingSpec', None]:
+    """
+    Create an XlaShardingSpec for the given tensor. If the tensor is
+    incompatible with the ShardingSpec, returns None.
+    """
+    if not self.can_apply(t):
+      return None
+    return torch_xla._XLAC.XlaShardingSpec(t, self._tile_assignment,
+                                           self._replicated, False)
+
   def can_apply(self, t: torch.Tensor) -> bool:
     """
     Test whether the ShardingSpec is compatible with the given torch.Tensor.
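
`ShardingSpec` now derives its tile assignment and replication flags once in `__post_init__` and exposes them through `xla_spec(t)`, which wraps them into the pybind-level `XlaShardingSpec` consumed by the binding above; a mix of sharded and `None` dimensions (partial replication) is still rejected by the assert. A short sketch of the derived behavior, assuming a hypothetical 2 x 2 mesh over four devices:

import numpy as np
import torch
import torch_xla.experimental.xla_sharding as xs

mesh = xs.Mesh(np.arange(4), (2, 2))  # assumed 2 x 2 mesh over 4 devices

tiled = xs.ShardingSpec(mesh, (0, 1))             # both dims sharded: not replicated
replicated = xs.ShardingSpec(mesh, (None, None))  # no dims sharded: fully replicated

t = torch.zeros(4, 4)
xla_spec = tiled.xla_spec(t)  # XlaShardingSpec if tiled.can_apply(t), else None

# Mixing sharded and unsharded dims is partial replication, which
# __post_init__ currently rejects.
try:
  xs.ShardingSpec(mesh, (0, None))
except AssertionError:
  pass  # "Partial replication is currently not supported"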
