Skip to content

Commit e42fffa

Browse files
authored
[SPMD][Virtual Device]All tensors should be in SPMD:0 C++ device (#5284)
* Move all tensors to SPMD:0 C++ device under spmd context * fix load shards * fix test_mark_sharding_2d by not creating placeholder for virtual device * fix the waitdeviceop for spmd case * Fix test_shard_hashing * fix spmd device casting issue * remove hacks in test_xla_virtual_device.py * add test for new virtual device usage * fix review comments * fix IsTpuDevice * linter
1 parent 44033ed commit e42fffa

14 files changed

+150
-90
lines changed

test/spmd/test_xla_sharding.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -314,7 +314,6 @@ def test_execute_replicated_metrics(self):
314314
xt = torch.ones(2, 2).to(xm.xla_device())
315315
xs.mark_sharding(xt, self._get_mesh((1, self.n_devices)), (0, 1))
316316
xt += 2
317-
sharding_spec = torch_xla._XLAC._get_xla_sharding_spec(xt)
318317
xm.mark_step()
319318
xm.wait_device_ops()
320319
self.assertEqual(met.metric_data('ExecuteReplicatedTime')[0], 1)

test/spmd/test_xla_virtual_device.py

Lines changed: 57 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -79,9 +79,6 @@ def test_outbound_data_metrics(self):
7979

8080
def test_non_tensor_scalar(self):
8181
sharding_spec = xs.ShardingSpec(self._get_mesh((1, self.n_devices)), (0, 1))
82-
# TODO(JackCaoG)currently, execution will only happen if there is at least one
83-
# tensor on non-spmd:0 device.
84-
t1 = torch.randn(3, 3, device=xm.xla_device())
8582
# tensor will have device as `SPMD:0` in c++
8683
xt1 = xm.send_cpu_data_to_device([torch.randn(3, 3)],
8784
xm.xla_device(),
@@ -95,9 +92,6 @@ def test_non_tensor_scalar(self):
9592
def test_mark_step_on_virtual_device(self):
9693
xm.mark_step()
9794
sharding_spec = xs.ShardingSpec(self._get_mesh((1, self.n_devices)), (0, 1))
98-
# TODO(JackCaoG)currently, execution will only happen if there is at least one
99-
# tensor on non-spmd:0 device.
100-
t1 = torch.randn(3, 3, device=xm.xla_device())
10195
# tensor will have device as `SPMD:0` in c++
10296
xt1 = xm.send_cpu_data_to_device([torch.randn(3, 3)],
10397
xm.xla_device(),
@@ -108,6 +102,63 @@ def test_mark_step_on_virtual_device(self):
108102
self.assertNotIn('aten::div',
109103
torch_xla._XLAC._get_xla_tensor_debug_info(xt2))
110104

105+
def test_virtual_device_no_upload(self):
106+
met.clear_all()
107+
device = xm.xla_device()
108+
t1 = torch.randn(5, 5).to(device)
109+
t1_debug_info = torch_xla._XLAC._get_xla_tensor_debug_info(t1)
110+
# t1's upload to device should be deferred
111+
self.assertIn("Tensor on host: with size [5, 5]", t1_debug_info)
112+
self.assertNotIn("TransferToServerTime", met.metric_names())
113+
# t1 should be on SPMD device under spmd context
114+
self.assertIn("Device: SPMD:0", t1_debug_info)
115+
self.assertIn("IR: None", t1_debug_info)
116+
self.assertIn("XLAData: None", t1_debug_info)
117+
118+
def test_virtual_device_upload_after_mark_sharding(self):
119+
met.clear_all()
120+
partition_spec = (0, 1)
121+
device = xm.xla_device()
122+
t1 = torch.randn(8, 8).to(device)
123+
t1_debug_info = torch_xla._XLAC._get_xla_tensor_debug_info(t1)
124+
self.assertIn("Tensor on host: with size [8, 8]", t1_debug_info)
125+
xs.mark_sharding(t1, self._get_mesh((1, self.n_devices)), partition_spec)
126+
t1_debug_info_new = torch_xla._XLAC._get_xla_tensor_debug_info(t1)
127+
# tensor should be uploaded to device after mark_sharding
128+
self.assertIn("Tensor on host: None", t1_debug_info_new)
129+
self.assertIn("xla::device_data", t1_debug_info_new)
130+
self.assertIn("XLAShardedData", t1_debug_info_new)
131+
self.assertIn("TransferToServerTime", met.metric_names())
132+
133+
def test_virtual_device_upload_after_tracing(self):
134+
met.clear_all()
135+
device = xm.xla_device()
136+
t1 = torch.randn(8, 8).to(device)
137+
t1_debug_info = torch_xla._XLAC._get_xla_tensor_debug_info(t1)
138+
self.assertIn("Tensor on host: with size [8, 8]", t1_debug_info)
139+
t2 = t1 + t1
140+
t1_debug_info_new = torch_xla._XLAC._get_xla_tensor_debug_info(t1)
141+
# tensor should be uploaded to device after being used as input to other op.
142+
self.assertIn("Tensor on host: None", t1_debug_info_new)
143+
self.assertIn("xla::device_data", t1_debug_info_new)
144+
self.assertIn("TransferToServerTime", met.metric_names())
145+
146+
def test_virtual_device_upload_for_sharded_dataloader(self):
147+
met.clear_counters()
148+
device = xm.xla_device()
149+
sharding_spec = xs.ShardingSpec(self._get_mesh((1, self.n_devices)), (0, 1))
150+
# tensor will have device as `SPMD:0` in c++
151+
t1 = xm.send_cpu_data_to_device([torch.randn(8, 8)],
152+
device,
153+
input_sharding=sharding_spec)[0]
154+
t1_debug_info = torch_xla._XLAC._get_xla_tensor_debug_info(t1)
155+
self.assertIn("Device: SPMD:0", t1_debug_info)
156+
# tensor should be uploaded to device after send_cpu_data_to_device + sharding_spec
157+
self.assertIn("Tensor on host: None", t1_debug_info)
158+
self.assertIn("xla::device_data", t1_debug_info)
159+
self.assertIn("XLAShardedData", t1_debug_info)
160+
self.assertIn("TransferToServerTime", met.metric_names())
161+
111162

112163
if __name__ == '__main__':
113164
test = unittest.main()

torch_xla/csrc/aten_xla_bridge.cpp

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,15 @@ class AtenXlaDeviceMapper {
3939

4040
private:
4141
AtenXlaDeviceMapper() {
42-
for (auto& device_str :
43-
torch_xla::runtime::GetComputationClient()->GetLocalDevices()) {
44-
devices_.emplace_back(ParseDeviceString(device_str));
45-
devices_ordinals_[devices_.back()] = devices_.size() - 1;
42+
if (UseVirtualDevice()) {
43+
devices_.emplace_back(ParseDeviceString("SPMD:0"));
44+
devices_ordinals_[devices_.back()] = 0;
45+
} else {
46+
for (auto& device_str :
47+
torch_xla::runtime::GetComputationClient()->GetLocalDevices()) {
48+
devices_.emplace_back(ParseDeviceString(device_str));
49+
devices_ordinals_[devices_.back()] = devices_.size() - 1;
50+
}
4651
}
4752
}
4853

torch_xla/csrc/aten_xla_type.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -467,7 +467,7 @@ at::Tensor XLANativeFunctions::_copy_from(const at::Tensor& self,
467467
if (!self_tensor) {
468468
static bool sync_update =
469469
runtime::sys_util::GetEnvBool("XLA_TENSOR_UPDATE_SYNC", true) &&
470-
!ShardingUtil::UseVirtualDevice();
470+
!UseVirtualDevice();
471471
XLA_CHECK(dst_tensor);
472472
dst_tensor->UpdateFromTensor(self, /*sync=*/sync_update);
473473
} else if (!dst_tensor) {

torch_xla/csrc/device.cpp

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,9 @@ std::string DeviceType::toString() const {
3737
torch::lazy::BackendDevice ParseDeviceString(const std::string& device_spec) {
3838
if (device_spec.empty()) {
3939
std::string default_device_spec =
40-
runtime::GetComputationClient()->GetDefaultDevice();
40+
UseVirtualDevice()
41+
? "SPMD:0"
42+
: runtime::GetComputationClient()->GetDefaultDevice();
4143
XLA_CHECK(!default_device_spec.empty());
4244
return ParseDeviceString(default_device_spec);
4345
}
@@ -101,4 +103,18 @@ torch::lazy::BackendDevice SetCurrentDevice(
101103
return current;
102104
}
103105

106+
bool ShouldUseVirtualDevice() {
107+
bool use_virtual_device =
108+
runtime::sys_util::GetEnvBool("XLA_USE_SPMD", false);
109+
if (use_virtual_device) {
110+
TF_LOG(INFO) << "Using SPMD virtual device optimization";
111+
}
112+
return use_virtual_device;
113+
}
114+
115+
bool UseVirtualDevice() {
116+
static bool use_virtual_device = ShouldUseVirtualDevice();
117+
return use_virtual_device;
118+
}
119+
104120
} // namespace torch_xla

torch_xla/csrc/device.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,10 @@ static inline torch::lazy::BackendDevice GetDeviceOrCurrent(
4242
return device != nullptr ? *device : GetCurrentDevice();
4343
}
4444

45+
// Test whether the XLA_USE_SPMD environment variable is set to enable the
46+
// virtual device optimization.
47+
bool UseVirtualDevice();
48+
4549
} // namespace torch_xla
4650

4751
#endif // XLA_TORCH_XLA_CSRC_DEVICE_H_

torch_xla/csrc/init_python_bindings.cpp

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -384,7 +384,7 @@ std::string GetXLATensorDebugInfo(const at::Tensor& tensor) {
384384
auto at_tensor = xtensor->CurrentTensorData();
385385
ss << "Tensor on host: ";
386386
if (at_tensor) {
387-
ss << " with size " << at_tensor->sizes() << "\n";
387+
ss << "with size " << at_tensor->sizes() << "\n";
388388
} else {
389389
ss << "None\n";
390390
}
@@ -1126,7 +1126,7 @@ void InitXlaModuleBindings(py::module m) {
11261126
[](const std::vector<std::string>& devices) {
11271127
NoGilSection nogil;
11281128
XLAGraphExecutor::Get()->WaitDeviceOps(devices);
1129-
if (ShardingUtil::UseVirtualDevice()) {
1129+
if (UseVirtualDevice()) {
11301130
std::vector<std::string> spmd_device = {"SPMD:0"};
11311131
runtime::GetComputationClient()->WaitDeviceOps(spmd_device);
11321132
} else {
@@ -1313,8 +1313,7 @@ void InitXlaModuleBindings(py::module m) {
13131313
const py::list& group_assignment, const py::list& replication_groups,
13141314
int sharding_type) {
13151315
TORCH_LAZY_COUNTER("XlaMarkSharding", 1);
1316-
XLA_CHECK(ShardingUtil::UseVirtualDevice())
1317-
<< "Please set `XLA_USE_SPMD=1`";
1316+
XLA_CHECK(UseVirtualDevice()) << "Please set `XLA_USE_SPMD=1`";
13181317
XLATensorPtr xtensor = bridge::GetXlaTensor(input);
13191318
xla::OpSharding sharding = ShardingUtil::CreateOpSharding(
13201319
tile_assignment, group_assignment, replication_groups,
@@ -1393,23 +1392,33 @@ void InitXlaModuleBindings(py::module m) {
13931392
// shape. Note that this padding is _not_ included in the global indices
13941393
// returned by `_get_local_shard_indices`.
13951394
m.def("_get_local_shards",
1396-
[](const at::Tensor& input) -> std::vector<at::Tensor> {
1395+
[](const at::Tensor& input)
1396+
-> std::tuple<std::vector<at::Tensor>, std::vector<std::string>> {
13971397
XLATensorPtr xtensor = bridge::GetXlaTensor(input);
13981398
XLA_CHECK(xtensor->GetXlaData() != nullptr)
13991399
<< "Shard data is not available";
14001400
XLA_CHECK(xtensor->sharding_spec() != nullptr)
14011401
<< "Tensor is not sharded";
1402-
XLA_CHECK(ShardingUtil::UseVirtualDevice())
1402+
XLA_CHECK(UseVirtualDevice())
14031403
<< "Virtual device must be enabled to use _get_local_shards";
14041404
auto handle = UnwrapXlaData(xtensor->GetXlaData());
1405-
auto shard_handles =
1405+
std::vector<runtime::ComputationClient::DataPtr> shard_handles =
14061406
runtime::GetComputationClient()->GetDataShards(handle);
14071407
std::vector<at::Tensor> shards;
1408-
for (auto& shard_handle : shard_handles) {
1409-
auto xshard = XLATensor::Create(WrapXlaData(shard_handle));
1410-
shards.push_back(bridge::AtenFromXlaTensor(std::move(xshard)));
1408+
std::vector<std::string> str_devices;
1409+
shards.reserve(shard_handles.size());
1410+
str_devices.reserve(shard_handles.size());
1411+
// Transfer shards from the device and create cpu tensors.
1412+
for (const runtime::ComputationClient::DataPtr shard_handle :
1413+
shard_handles) {
1414+
shards.push_back(
1415+
XlaDataToTensors(
1416+
{WrapXlaData(shard_handle)},
1417+
TensorTypeFromXlaType(shard_handle->shape().element_type()))
1418+
.front());
1419+
str_devices.push_back(shard_handle->device());
14111420
}
1412-
return shards;
1421+
return std::make_tuple(shards, str_devices);
14131422
});
14141423
// Returns the indices of the shards into the global tensor as either
14151424
// a Python list of slices for each dimension or a Python Ellipsis object
@@ -1478,8 +1487,7 @@ void InitXlaModuleBindings(py::module m) {
14781487
<< "Input shard shape must include padding: " << shard.sizes()
14791488
<< " vs " << shard_shape;
14801489
}
1481-
auto xla_devices = GetXlaDevices(devices);
1482-
auto xla_data = ShardingUtil::CreateShardedData(shards, xla_devices,
1490+
auto xla_data = ShardingUtil::CreateShardedData(shards, devices,
14831491
xtensor->shape(), sharding);
14841492
xtensor->SetXlaData(WrapXlaData(xla_data));
14851493
});
@@ -1677,8 +1685,8 @@ void InitXlaModuleBindings(py::module m) {
16771685
torch::lazy::hash_t hash = *(torch::lazy::hash_t*)(hash_str.c_str());
16781686
// Device will be Virtual device if SPMD is enabled.
16791687
torch::lazy::BackendDevice device =
1680-
ShardingUtil::UseVirtualDevice() ? ParseDeviceString("SPMD:0")
1681-
: torch_xla::GetCurrentDevice();
1688+
UseVirtualDevice() ? ParseDeviceString("SPMD:0")
1689+
: torch_xla::GetCurrentDevice();
16821690
auto results = XLAGraphExecutor::Get()->ExecuteComputationWithBarrier(
16831691
hash, graph_inputs, device);
16841692
std::vector<at::Tensor> retlist;

torch_xla/csrc/runtime/pjrt_computation_client.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -533,6 +533,10 @@ PjRtComputationClient::ExecuteComputation(
533533
<< device;
534534
// Grab the shared lock and block the `WaitDeviceOps` until buffer is
535535
// ready.
536+
// TODO(JackCaoG): This lock should be acquired outside of the lockfn and
537+
// passed in. It is possible that lockfn started after ExecuteComputation
538+
// released the xla_graph_executor lock, which will create a short window
539+
// where the device is unlocked while execution is still running.
536540
auto lock = lock_device_shared(device);
537541
TF_VLOG(5) << "ExecuteComputation acquiring PJRT device lock for " << device
538542
<< " Done";

torch_xla/csrc/tensor.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,7 @@ torch::lazy::Value XLATensor::GetIrValue() const {
338338
c10::optional<at::Tensor> tensor_data = CurrentTensorData();
339339
XLA_CHECK(tensor_data);
340340
AssignIrValue(GetIrValueForTensor(*tensor_data, GetDevice()));
341+
data()->tensor_data = c10::nullopt;
341342
return data()->ir_value;
342343
}
343344

@@ -492,9 +493,8 @@ void XLATensor::SetTensor(at::Tensor tensor) {
492493
}
493494

494495
void XLATensor::UpdateFromTensor(at::Tensor tensor, bool sync) {
495-
torch::lazy::BackendDevice device = ShardingUtil::UseVirtualDevice()
496-
? ParseDeviceString("SPMD:0")
497-
: GetDevice();
496+
torch::lazy::BackendDevice device =
497+
UseVirtualDevice() ? ParseDeviceString("SPMD:0") : GetDevice();
498498
if (sync) {
499499
at::Tensor typed_tensor =
500500
torch::lazy::CopyTensor(tensor, dtype(), /*copy=*/false);

torch_xla/csrc/tensor_util.cpp

Lines changed: 20 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -106,13 +106,20 @@ bool Use32BitLong() {
106106
return use_32bit_long;
107107
}
108108

109+
bool IsTpuDevice(XlaDeviceType hw_type) {
110+
static bool spmd_device_is_tpu =
111+
(hw_type == XlaDeviceType::SPMD) &&
112+
runtime::GetComputationClient()->GetDefaultDevice().find("TPU") == 0;
113+
return (hw_type == XlaDeviceType::TPU) || spmd_device_is_tpu;
114+
}
115+
109116
xla::PrimitiveType XlaTypeFromTensorType(
110117
at::ScalarType scalar_type, const torch::lazy::BackendDevice& device) {
111118
XlaDeviceType hw_type = static_cast<XlaDeviceType>(device.type());
112119
switch (scalar_type) {
113120
case at::ScalarType::Double:
114-
return hw_type != XlaDeviceType::TPU ? xla::PrimitiveType::F64
115-
: xla::PrimitiveType::F32;
121+
return !IsTpuDevice(hw_type) ? xla::PrimitiveType::F64
122+
: xla::PrimitiveType::F32;
116123
case at::ScalarType::Float:
117124
return xla::PrimitiveType::F32;
118125
case at::ScalarType::BFloat16:
@@ -600,19 +607,7 @@ torch::lazy::BackendDataPtr TensorToXlaData(
600607
const at::Tensor& tensor, const xla::Shape& shape,
601608
const torch::lazy::BackendDevice& device) {
602609
TORCH_LAZY_TIMED("TensorToData");
603-
if (ShardingUtil::UseVirtualDevice()) {
604-
// Scalar value will be replicated, no need to delay the transfer here.
605-
// TODO(JackCaoG): fix this for more general cases.
606-
if (device.type() == (int8_t)XlaDeviceType::SPMD && shape.rank() > 0) {
607-
// When SPMD is enabled, we want to delay the data transfer for XLA
608-
// tensors until the data is sharded. So, we skip the data transfer
609-
// here and simply return a placeholder for the backend data ptr.
610-
// Data will only be transferred via CreateTensorsData, when users
611-
// call the mark_sharding API.
612-
return WrapXlaData(runtime::GetComputationClient()->CreateDataPlaceholder(
613-
"SPMD:0", shape));
614-
}
615-
610+
if (UseVirtualDevice()) {
616611
// The tensor is bypassing the virtual device, so it should be replicated
617612
// to all devices.
618613
std::vector<std::string> local_devices =
@@ -856,7 +851,7 @@ std::vector<torch::lazy::BackendDataPtr> CreateTensorsData(
856851
TORCH_LAZY_TIMED("TensorToData");
857852
XLA_CHECK_EQ(tensors.size(), devices.size());
858853

859-
if (ShardingUtil::UseVirtualDevice()) {
854+
if (UseVirtualDevice()) {
860855
// When running in SPMD mode, tensors here in the unsharded
861856
// CreateTensorsData should be implicitly replicated to all devices.
862857
// This case should always apply when using SPMD regardless
@@ -936,7 +931,7 @@ std::vector<torch::lazy::BackendDataPtr> CreateTensorsData(
936931

937932
std::vector<runtime::ComputationClient::TensorSource> source_tensors; // in
938933
std::vector<runtime::ComputationClient::DataPtr> new_handles; // out
939-
if (ShardingUtil::UseVirtualDevice()) {
934+
if (UseVirtualDevice()) {
940935
// GetLocalDevices returns the list of local devices specified by their
941936
// global ordinals (e.g. ["TPU:4", "TPU:5", "TPU:6", "TPU:7"]).
942937
std::vector<std::string> local_devices =
@@ -1160,27 +1155,27 @@ xla::PrimitiveType GetDevicePrimitiveType(
11601155
if (DowncastBF16() || DowncastF16()) {
11611156
return xla::PrimitiveType::F32;
11621157
}
1163-
return hw_type != XlaDeviceType::TPU ? xla::PrimitiveType::F64
1164-
: xla::PrimitiveType::F32;
1158+
return !IsTpuDevice(hw_type) ? xla::PrimitiveType::F64
1159+
: xla::PrimitiveType::F32;
11651160
case xla::PrimitiveType::F32:
11661161
if (UseF16() || DowncastF16()) {
11671162
return xla::PrimitiveType::F16;
11681163
}
11691164
return UseBF16() || DowncastBF16() ? xla::PrimitiveType::BF16
11701165
: xla::PrimitiveType::F32;
11711166
case xla::PrimitiveType::U16:
1172-
return hw_type != XlaDeviceType::TPU ? xla::PrimitiveType::U16
1173-
: xla::PrimitiveType::U32;
1167+
return !IsTpuDevice(hw_type) ? xla::PrimitiveType::U16
1168+
: xla::PrimitiveType::U32;
11741169
case xla::PrimitiveType::S16:
1175-
return hw_type != XlaDeviceType::TPU ? xla::PrimitiveType::S16
1176-
: xla::PrimitiveType::S32;
1170+
return !IsTpuDevice(hw_type) ? xla::PrimitiveType::S16
1171+
: xla::PrimitiveType::S32;
11771172
case xla::PrimitiveType::S64:
11781173
return Use32BitLong() ? xla::PrimitiveType::S32 : xla::PrimitiveType::S64;
11791174
case xla::PrimitiveType::U64:
11801175
return Use32BitLong() ? xla::PrimitiveType::U32 : xla::PrimitiveType::U64;
11811176
case xla::PrimitiveType::C128:
1182-
return hw_type != XlaDeviceType::TPU ? xla::PrimitiveType::C128
1183-
: xla::PrimitiveType::C64;
1177+
return !IsTpuDevice(hw_type) ? xla::PrimitiveType::C128
1178+
: xla::PrimitiveType::C64;
11841179
default:
11851180
return type;
11861181
}

0 commit comments

Comments
 (0)