
Commit 36e2aca

model should be on xla device for Dynamo torchxla_trace_once backend (#4205)
* model should be on xla device for Dynamo torchxla_trace_once backend
* torch pin
* Add _clear_pending_irs API and tests
* Delete .torch_pin
1 parent 6a792dd commit 36e2aca
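
The gist of the change: with the torchxla_trace_once Dynamo backend, both the model and its inputs now have to live on the XLA device before the optimized call is made. Below is a minimal sketch of that pattern, assuming the import aliases used in the tests in this commit (torch._dynamo as dynamo, torch_xla.core.xla_model as xm); the exact dynamo import path is an assumption tied to the torch version pinned by this commit.

import torch
import torch._dynamo as dynamo        # import path assumed for the pinned torch build
import torch_xla.core.xla_model as xm
import torchvision

device = xm.xla_device()

# Model and data are both placed on the XLA device before tracing.
xla_resnet18 = torchvision.models.resnet18().to(device)
xla_resnet18.eval()
data = torch.randn(4, 3, 224, 224, device=device)

@dynamo.optimize('torchxla_trace_once')
def run_model(model, data):
  return model(data)

output = run_model(xla_resnet18, data)  # traced once, replayed from the cached graph afterwards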

File tree

5 files changed: +77 −12 lines

* test/dynamo/test_dynamo.py
* test/dynamo/test_dynamo_integrations_util.py
* torch_xla/csrc/init_python_bindings.cpp
* torch_xla/csrc/tensor.cpp
* torch_xla/csrc/tensor.h

test/dynamo/test_dynamo.py

Lines changed: 20 additions & 12 deletions
@@ -23,40 +23,48 @@ def fn_simple_dynamo(self, x, y):
     return self.fn_simple(x, y)
 
   @dynamo.optimize('torchxla_trace_once')
-  def resetnet_18_dynamo(self, model, data):
+  def run_model_with_dynamo(self, model, data):
     return model(data)
 
   def test_simple_model(self):
+    device = xm.xla_device()
     x = torch.tensor(100.0)
     y = torch.tensor(200.0)
+    xla_x = x.to(device)
+    xla_y = y.to(device)
     res_cpu = self.fn_simple(x, y)
-    res_xla_dynamo = self.fn_simple_dynamo(x, y)
+    res_xla_dynamo = self.fn_simple_dynamo(xla_x, xla_y)
     self.assertIn('xla::add', met.counter_names())
     torch.allclose(res_cpu, res_xla_dynamo.cpu())
     # verify that tracing is skipped in following runs
     met.clear_counters()
-    res_xla_dynamo_2 = self.fn_simple_dynamo(x, y)
+    res_xla_dynamo_2 = self.fn_simple_dynamo(xla_x, xla_y)
     self.assertNotIn('xla::add', met.counter_names())
     torch.allclose(res_cpu, res_xla_dynamo_2.cpu())
     # verify that dynamo can handle different inputs
-    res_xla_dynamo_3 = self.fn_simple_dynamo(x + y, y * 3)
+    res_xla_dynamo_3 = self.fn_simple_dynamo(xla_x + xla_y, xla_y * 3)
     res_cpu_3 = self.fn_simple(x + y, y * 3)
     torch.allclose(res_cpu, res_xla_dynamo_3.cpu())
 
   def test_resnet18(self):
+    device = xm.xla_device()
     batch_size = xu.getenv_as('BATCH_SIZE', int, defval=4)
     sample_count = xu.getenv_as('SAMPLE_COUNT', int, defval=10)
     loader = xu.SampleGenerator(
-        data=(torch.randn(batch_size, 3, 224,
-                          224), torch.zeros(batch_size, dtype=torch.int64)),
+        data=(torch.randn(batch_size, 3, 224, 224, device=device),
+              torch.zeros(batch_size, dtype=torch.int64, device=device)),
         sample_count=sample_count)
-    model = torchvision.models.resnet18()
-    model.eval()
+    resnet18 = torchvision.models.resnet18()
+    resnet18.eval()
+    xla_resnet18 = torchvision.models.resnet18().to(device)
+    xla_resnet18.eval()
     for data, _ in loader:
-      output = self.resetnet_18_dynamo(model, data)
-      torch.allclose(model(data), output.cpu())
-    self.assertEqual(met.metric_data('CompileTime')[0], 1)
-    self.assertEqual(met.metric_data('ExecuteTime')[0], sample_count + 1)
+      output = self.run_model_with_dynamo(xla_resnet18, data)
+      torch.allclose(resnet18(data.cpu()), output.cpu())
+    # One graph for the initial input data materialization. Another graph for
+    # the real model code.
+    self.assertEqual(met.metric_data('CompileTime')[0], 2)
+    self.assertEqual(met.metric_data('ExecuteTime')[0], sample_count + 2)
     self.assertEqual(
         met.metric_data('RunCachedGraphInputData')[0], sample_count)
     self.assertEqual(
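
The assertions above lean on the torch_xla.debug.metrics helpers. A small illustrative sketch of how those counters and metrics can be inspected around a run; the met alias matches the tests, and the printed values only show where the numbers in the assertions come from.

import torch_xla.debug.metrics as met

met.clear_counters()   # reset op counters such as 'xla::add'
met.clear_metrics()    # reset metrics such as CompileTime / ExecuteTime

# ... run the model under test here ...

print(met.counter_names())                 # contains 'xla::add' only if ops were retraced
print(met.metric_data('CompileTime')[0])   # number of graph compilations recorded
print(met.metric_data('ExecuteTime')[0])   # number of graph executions recorded
print(met.metrics_report())                # full human-readable report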

test/dynamo/test_dynamo_integrations_util.py

Lines changed: 19 additions & 0 deletions
@@ -89,6 +89,25 @@ def test_get_graph_hash(self):
     xla_out_2 = xla_dummy_model(xla_input)
     assert (hash == torch_xla._XLAC._get_graph_hash([xla_out_2]))
 
+  def test_clear_pending_irs(self):
+    xla_device = xm.xla_device()
+    xm.mark_step()
+    t1 = torch.randn(20, 5).to(xla_device)
+    t2 = torch.randn(20, 5).to(xla_device)
+    t3 = t2 + t1
+    t4 = t3 * t2
+    met.clear_metrics()
+    torch_xla._XLAC._xla_sync_multi([t4], devices=[], wait=True)
+    # only t4 is materialized
+    self.assertIn("aten::add", torch_xla._XLAC._get_xla_tensors_text([t3]))
+    self.assertEqual(met.metric_data('ExecuteTime')[0], 1)
+    torch_xla._XLAC._clear_pending_irs(str(xla_device))
+    self.assertNotIn("aten::add", torch_xla._XLAC._get_xla_tensors_text([t3]))
+    self.assertEqual(met.metric_data('ExecuteTime')[0], 1)
+    xm.mark_step()
+    # mark_step should not incur new execution
+    self.assertEqual(met.metric_data('ExecuteTime')[0], 1)
+
   def test_run_cached_graph(self):
     xla_device = xm.xla_device()
     xla_input = torch.randn(64, 256, 14, 14).to(xla_device)
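
Condensing the new test into a standalone flow: a lazy tensor keeps its pending IR until something forces execution, and _clear_pending_irs drops whatever is still pending on the device. A sketch of that behaviour through the private torch_xla._XLAC module, assuming an XLA device is available (the shapes are arbitrary).

import torch
import torch_xla
import torch_xla.core.xla_model as xm

device = xm.xla_device()
xm.mark_step()                        # start from a clean graph
a = torch.randn(20, 5).to(device)
b = torch.randn(20, 5).to(device)
c = a + b                             # pending IR: the add has not been executed yet

# The IR text for `c` still mentions the pending add.
assert 'aten::add' in torch_xla._XLAC._get_xla_tensors_text([c])

# Drop every pending IR on the device; `c` is now backed by a placeholder
# and must not be read afterwards.
torch_xla._XLAC._clear_pending_irs(str(device))
assert 'aten::add' not in torch_xla._XLAC._get_xla_tensors_text([c])

xm.mark_step()                        # nothing pending any more, so no new graph is executed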

torch_xla/csrc/init_python_bindings.cpp

Lines changed: 13 additions & 0 deletions
@@ -400,6 +400,13 @@ std::string GetLiveTensorsReport(size_t nodes_threshold,
   return ss.str();
 }
 
+void ClearPendingIrs(const std::string& device_str) {
+  auto opt_device = GetOptionalDevice(device_str);
+  XLA_CHECK(opt_device);
+  auto tensors = XLATensor::GetLiveTensors(&opt_device.value());
+  XLATensor::ClearPendingIrs(tensors, opt_device.value());
+}
+
 std::ptrdiff_t GetTensorViewAliasId(const at::Tensor& tensor) {
   XLATensorPtr xtensor = bridge::GetXlaTensor(tensor);
   return xtensor->GetViewAliasId();
@@ -1621,6 +1628,12 @@ void InitXlaModuleBindings(py::module m) {
           return py::bytes(bin);
         });
 
+  m.def("_clear_pending_irs", [](const std::string& device) {
+    // Use with caution. Tensors whose IR was cleared will be replaced with a
+    // placeholder XLAData and SHOULD NOT be accessed afterwards.
+    ClearPendingIrs(device);
+  });
+
   m.def("_run_cached_graph",
         [](const std::string& hash_str,
           const std::vector<at::IValue>& graph_inputs)
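
One detail worth noting from the helper above: the binding does not take tensors as arguments; it resolves the device string and collects every live tensor on that device via XLATensor::GetLiveTensors before clearing. A hedged Python-side illustration of that scope (variable names are only for the example).

import torch
import torch_xla
import torch_xla.core.xla_model as xm

device = xm.xla_device()
a = torch.randn(4, 4).to(device) + 1   # leaves a pending add on `a`
b = torch.randn(4, 4).to(device) * 2   # leaves a pending mul on `b`

# The clear applies to all live tensors on the device, not to a chosen subset,
# so both pending IRs above are dropped.
torch_xla._XLAC._clear_pending_irs(str(device))
# `a` and `b` are now backed by placeholder XLAData and should not be accessed.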

torch_xla/csrc/tensor.cpp

Lines changed: 22 additions & 0 deletions
@@ -1083,6 +1083,28 @@ XLATensor::ExecuteComputationWithBarrier(
       device);
 }
 
+void XLATensor::ClearPendingIrs(std::vector<XLATensorPtr> tensors,
+                                const torch::lazy::BackendDevice& device) {
+  std::unordered_set<int64_t> tensor_ids;
+  for (size_t i = 0; i < tensors.size(); ++i) {
+    if (tensor_ids.insert(tensors[i]->GetUniqueId()).second &&
+        tensors[i]->CurrentXlaData() == nullptr) {
+      torch::lazy::Value ir_value = tensors[i]->CurrentIrValue();
+      if (ir_value) {
+        xla::Shape shape = MakeShapeWithDeviceLayout(
+            tensors[i]->shape(), static_cast<XlaDeviceType>(device.type()));
+        torch::lazy::BackendDataPtr xla_data =
+            WrapXlaData(xla::ComputationClient::Get()->CreateDataPlaceholder(
+                device.toString(), std::move(shape)));
+        tensors[i]->AssignIrValue(torch::lazy::Value());
+        tensors[i]->data()->xla_data = xla_data;
+        tensors[i]->data()->view = nullptr;
+        tensors[i]->data()->tensor_data = c10::nullopt;
+      }
+    }
+  }
+}
+
 std::vector<at::Tensor> XLATensor::GetTensorsOpByOp(
     std::vector<XLATensorPtr>* tensors) {
   SyncTensorsConfig config;

torch_xla/csrc/tensor.h

Lines changed: 3 additions & 0 deletions
@@ -1250,6 +1250,9 @@ class XLATensor : public c10::intrusive_ptr_target {
       c10::ArrayRef<torch::lazy::BackendDataPtr> arguments,
       const torch::lazy::BackendDevice& device);
 
+  static void ClearPendingIrs(std::vector<XLATensorPtr> tensors,
+                              const torch::lazy::BackendDevice& device);
+
  private:
   struct SyncTensorsConfig {
     // Whether we want to force XLA data on the target tensors (hence trimming

0 commit comments