pytorch · JackCaoG · Nov 3, 2022 · Nov 2, 2022 · Nov 2, 2022 · Nov 2, 2022
diff --git a/test/test_metrics.py b/test/test_metrics.py
@@ -13,29 +13,80 @@ def test_clear_counters(self):
     xla_device = xm.xla_device()
     t1 = torch.tensor(100, device=xla_device)
     t1 += 2
-    assert ("xla::add" in met.metrics_report())
+    self.assertIn("xla::add", met.metrics_report())
     assert (len(met.counter_names()) > 0)
     met.clear_counters()
-    assert ("xla::add" not in met.metrics_report())
+    self.assertNotIn("xla::add", met.metrics_report())
     assert (len(met.counter_names()) == 0)
     # perform the same computation and check if counter increases again
     t1 += 2
-    assert ("xla::add" in met.metrics_report())
+    self.assertIn("xla::add", met.metrics_report())
     assert (len(met.counter_names()) > 0)
 
   def test_clear_metrics(self):
     xla_device = xm.xla_device()
-    t1 = torch.tensor(100, device=xla_device)
-    assert ("TensorToData" in met.metrics_report())
+    t1 = torch.tensor(156, device=xla_device)
+    self.assertIn("TensorToData", met.metrics_report())
     assert (len(met.metric_names()) > 0)
     met.clear_metrics()
-    assert ("TensorToData" not in met.metrics_report())
+    self.assertNotIn("TensorToData", met.metrics_report())
     assert (len(met.metric_names()) == 0)
     # perform the same computation and check if metrics increases again
     t2 = torch.tensor(200, device=xla_device)
-    assert ("TensorToData" in met.metrics_report())
+    self.assertIn("TensorToData", met.metrics_report())
     assert (len(met.metric_names()) > 0)
 
+  def test_short_metrics_report_default_list(self):
+    xla_device = xm.xla_device()
+    t1 = torch.tensor(1456, device=xla_device)
+    t2 = t1 * 2
+    xm.mark_step()
+    t2_cpu = t2.cpu()
+    short_report = met.short_metrics_report()  # default counter/metric lists
+    self.assertNotIn("TensorToData", short_report)
+    self.assertIn("CompileTime", short_report)
+    self.assertIn("ExecuteTime", short_report)
+    self.assertIn("TransferToServerTime", short_report)
+    self.assertIn("TransferFromServerTime", short_report)
+    self.assertIn("MarkStep", short_report)
+    # repeat the computation; a report taken afterwards must show CachedCompile
+    t3 = t1 * 2
+    xm.mark_step()
+    t4 = t1 * 2
+    xm.mark_step()
+    self.assertIn("CachedCompile", met.short_metrics_report())
+
+  def test_short_metrics_report_custom_list(self):
+    device = xm.xla_device()
+    base = torch.tensor(100, device=device)
+    doubled = base * 2
+    xm.mark_step()
+    doubled_cpu = doubled.cpu()
+    counters_only = met.short_metrics_report(
+        counter_names=['CreateCompileHandles'])
+    self.assertIn('CreateCompileHandles', counters_only)
+    self.assertNotIn('MarkStep', counters_only)
+    # metric_names was omitted, so the default metric list still applies
+    self.assertIn('CompileTime', counters_only)
+    both_custom = met.short_metrics_report(
+        counter_names=['CreateCompileHandles'], metric_names=['InboundData'])
+    self.assertNotIn('CompileTime', both_custom)
+    self.assertIn('InboundData', both_custom)
+
+  def test_short_metrics_fallback_counter(self):
+    device = xm.xla_device()
+    base = torch.tensor(100, device=device)
+    doubled = base * 2
+    # `if doubled` triggers aten::_local_scalar_dense, counted as a fallback
+    if doubled:
+      doubled += 1
+    fallback_counter = 'aten::_local_scalar_dense'
+    # the fallback counter must be reported whichever name lists are requested
+    self.assertIn(fallback_counter, met.short_metrics_report())
+    custom_report = met.short_metrics_report(
+        counter_names=['CreateCompileHandles'], metric_names=['InboundData'])
+    self.assertIn(fallback_counter, custom_report)
+
 
 if __name__ == '__main__':
   test = unittest.main()

diff --git a/third_party/xla_client/metrics.cc b/third_party/xla_client/metrics.cc
@@ -345,6 +345,33 @@ std::string CreateMetricReport() {
   return ss.str();
 }
 
+std::string CreateMetricReport(const std::vector<std::string>& counter_names,
+                               const std::vector<std::string>& metric_names) {
+  static std::string fall_back_counter_prefix = "aten::";
+  MetricsArena* arena = MetricsArena::Get();
+  std::stringstream out;
+  for (const std::string& name : metric_names) {
+    MetricData* metric = arena->GetMetric(name);
+    if (metric != nullptr && metric->TotalSamples() > 0) {
+      EmitMetricInfo(name, metric, &out);
+    }
+  }
+  for (const std::string& name : counter_names) {
+    CounterData* counter = arena->GetCounter(name);
+    if (counter != nullptr && counter->Value() > 0) {
+      EmitCounterInfo(name, counter, &out);
+    }
+  }
+  // Non-zero aten:: (fallback) counters are always reported, so one that is
+  // also named in `counter_names` is emitted twice; this should be very rare.
+  arena->ForEachCounter([&out](const std::string& key, CounterData* counter) {
+    if (key.rfind(fall_back_counter_prefix, 0) == 0 && counter->Value() > 0) {
+      EmitCounterInfo(key, counter, &out);
+    }
+  });
+  return out.str();
+}
+
 std::vector<std::string> GetMetricNames() {
   return MetricsArena::Get()->GetMetricNames();
 }

diff --git a/third_party/xla_client/metrics.h b/third_party/xla_client/metrics.h
@@ -196,6 +196,10 @@ class Counter {
 // Creates a report with the current metrics statistics.
 std::string CreateMetricReport();
 
+// Creates a report with only the named counters/metrics, plus aten:: counters.
+std::string CreateMetricReport(const std::vector<std::string>& counter_names,
+                               const std::vector<std::string>& metric_names);
+
 // Returns the currently registered metric names. Note that the list can grow
 // since metrics are usualy function intialized (they are static function
 // variables).

diff --git a/third_party/xla_client/metrics_reader.cc b/third_party/xla_client/metrics_reader.cc
@@ -74,5 +74,10 @@ std::string CreateMetricReport() {
   return metrics::CreateMetricReport() + CreateXrtMetricReport();
 }
 
+std::string CreateMetricReport(const std::vector<std::string>& counter_names,
+                               const std::vector<std::string>& metric_names) {
+  return metrics::CreateMetricReport(counter_names, metric_names);  // no XRT
+}
+
 }  // namespace metrics_reader
 }  // namespace xla
diff --git a/third_party/xla_client/metrics_reader.h b/third_party/xla_client/metrics_reader.h
@@ -2,13 +2,18 @@
 #define XLA_CLIENT_METRICS_READER_H_
 
 #include <string>
+#include <vector>
 
 namespace xla {
 namespace metrics_reader {
 
 // Creates a report with the current metrics statistics.
 std::string CreateMetricReport();
 
+// Creates a report with the selected metrics statistics; no XRT report added.
+std::string CreateMetricReport(const std::vector<std::string>& counter_names,
+                               const std::vector<std::string>& metric_names);
+
 }  // namespace metrics_reader
 }  // namespace xla
 

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
@@ -1190,6 +1190,19 @@ void InitXlaModuleBindings(py::module m) {
   });
   m.def("_xla_metrics_report",
         []() { return xla::metrics_reader::CreateMetricReport(); });
+  m.def("_short_xla_metrics_report",
+        [](const py::list& counter_names, const py::list& metric_names) {
+          std::vector<std::string> counter_name_vec;
+          std::vector<std::string> metric_name_vec;
+          for (auto& counter : counter_names) {
+            counter_name_vec.push_back(counter.cast<std::string>());
+          }
+          for (auto& metric : metric_names) {
+            metric_name_vec.push_back(metric.cast<std::string>());
+          }
+          return xla::metrics_reader::CreateMetricReport(counter_name_vec,
+                                                         metric_name_vec);
+        });
   m.def("_clear_xla_counters", []() { xla::metrics::ClearCounters(); });
   m.def("_clear_xla_metrics", []() { xla::metrics::ClearMetrics(); });
   m.def("_xla_tensors_report",

diff --git a/torch_xla/debug/metrics.py b/torch_xla/debug/metrics.py
@@ -57,3 +57,20 @@ def clear_metrics():
 def metrics_report():
   """Retrieves a string containing the full metrics and counters report."""
   return torch_xla._XLAC._xla_metrics_report()
+
+
+def short_metrics_report(counter_names: list = None, metric_names: list = None):
+  """Retrieves a string containing a short report of selected metrics/counters.
+
+  Args:
+    counter_names (list): Counter names to print; defaults if None/empty.
+    metric_names (list): Metric names to print; defaults if None/empty.
+  """
+  default_counters = ['CachedCompile', 'MarkStep']
+  default_metrics = [
+      'CompileTime', 'ExecuteTime', 'TransferToServerTime',
+      'TransferFromServerTime'
+  ]
+  # `or` keeps the original rule: None and empty lists both use the defaults.
+  return torch_xla._XLAC._short_xla_metrics_report(
+      counter_names or default_counters, metric_names or default_metrics)