pytorch · taylanbil · Jul 29, 2019 · Jul 26, 2019
diff --git a/torch_xla/csrc/tensor.cpp b/torch_xla/csrc/tensor.cpp
@@ -30,6 +30,7 @@
 #include "torch_xla/csrc/ops/expand.h"
 #include "torch_xla/csrc/ops/ops.h"
 #include "torch_xla/csrc/ops/view.h"
+#include "torch_xla/csrc/ops/xla_ops.h"
 #include "torch_xla/csrc/tensor_util.h"
 #include "torch_xla/csrc/torch_util.h"
 
@@ -223,6 +224,12 @@ bool IsSpecialScalar(at::Scalar value) {
   return false;
 }
 
+// Returns whether an IR value should be added to the set of tensors to sync.
+// Values whose op is ir::ops::xla_not_supported are left out.
+bool ShouldSyncIrValue(const ir::Value& ir_value) {
+  return ir_value->op() != ir::ops::xla_not_supported;
+}
+
 }  // namespace
 
 // The DeviceContextArena holds per device live information and statistics,
@@ -983,9 +990,11 @@ XLATensor::SyncTensorCollection XLATensor::CollectSyncTensors(
     if (tensors[i].CurrentXlaData() == nullptr) {
       ir::Value ir_value = tensors[i].CurrentIrValue();
       if (ir_value) {
-        // Add only tensors which need to be synced.
-        coll.hash = xla::util::HashCombine(coll.hash, ir_value.hash());
-        coll.indices.push_back(i);
+        if (ShouldSyncIrValue(ir_value)) {
+          // Add only tensors which need to be synced.
+          coll.hash = xla::util::HashCombine(coll.hash, ir_value.hash());
+          coll.indices.push_back(i);
+        }
       } else if (config.force_xla_data) {
         // The tensor only has at::Tensor data. We need to queue it for a
         // device upload.

diff --git a/torch_xla_py/data_parallel.py b/torch_xla_py/data_parallel.py
@@ -56,6 +56,7 @@ def __next__(self):
     return self.next()
 
   def next(self):
+    xm.mark_step()
     item = self._loader.next_item(self._device)
     if item is None:
       raise StopIteration

diff --git a/torch_xla_py/xla_model.py b/torch_xla_py/xla_model.py
@@ -340,12 +340,33 @@ def _mark_step(replication):
     ms.save_metrics()
 
 
-def optimizer_step(optimizer, closure=None):
+def mark_step():
+  """Calls `_mark_step` with `_TLS.replication`, or `None` if it is unset."""
+  _mark_step(getattr(_TLS, 'replication', None))
+
+
+def optimizer_step(optimizer, closure=None, barrier=False):
+  """Runs `optimizer.step()`, optionally preceded by a cross-replica sum.
+
+  The replication object is read from `_TLS`. When it reports more than one
+  replication device, the gradients fetched from `optimizer` are passed to
+  `_xla_cross_replica_sum` with a scale of `1.0 / count` before the step.
+
+  Args:
+    optimizer: Object whose `step()` is called; its gradients are obtained
+      through `_fetch_gradients`.
+    closure: Forwarded to `optimizer.step()` as its `closure` argument.
+    barrier: When true, `_mark_step` is called after `optimizer.step()`.
+
+  Returns:
+    Whatever `optimizer.step()` returns.
+  """
   replication = getattr(_TLS, 'replication', None)
   gradients = _fetch_gradients(optimizer)
   count = len(replication.replication_devices()) if replication else 1
   if count > 1:
     torch_xla._XLAC._xla_cross_replica_sum(gradients, 1.0 / count, [])
   loss = optimizer.step(closure=closure)
-  _mark_step(replication)
+  if barrier:
+    _mark_step(replication)
   return loss