
Commit 6423912

Gather more summary data when running result_analyzer.py. (#6067)
1 parent a80c1e7 commit 6423912

File tree: 5 files changed, +253 -35 lines changed

  .circleci/common.sh
  benchmarks/result_analyzer.py
  benchmarks/test/example.json
  benchmarks/test/run_tests.sh
  benchmarks/test/test_result_analyzer.py

.circleci/common.sh

Lines changed: 5 additions & 0 deletions

@@ -150,6 +150,11 @@ function run_torch_xla_python_tests() {
       # echo "Running MNIST Test"
       # python test/test_train_mp_mnist_amp.py --fake_data --num_epochs=1
     fi
+  elif [[ "$RUN_XLA_OP_TESTS1" == "xla_op1" ]]; then
+    # Benchmark tests.
+    # Only run on CPU, for xla_op1.
+    echo "Running Benchmark tests."
+    ./benchmarks/test/run_tests.sh
   fi
   fi
   popd

benchmarks/result_analyzer.py

Lines changed: 26 additions & 35 deletions

@@ -74,41 +74,32 @@ def run_csv(self):
     self.export_metric_report(metric_df)

   def get_calculated_metrics(self, d, dataline):
-    total_time = np.asarray(dataline["metrics"]["total_time"], dtype="float")
-    d["median_total_time"] = np.median(total_time)
-    per_iter_time = np.asarray(
-        dataline["metrics"]["per_iter_time"], dtype="float")
-    d["median_per_iter_time"] = np.median(per_iter_time)
-    if dataline["experiment"]["xla"]:
-      trace_per_iter_time = np.asarray(
-          dataline["metrics"]["trace_per_iter_time"], dtype="float")
-      d["xla_median_trace_per_iter_time"] = np.median(trace_per_iter_time)
-      d["xla_compile_time"] = np.max(total_time) - np.median(total_time)
-    else:
-      d["xla_median_trace_per_iter_time"] = -1
-      d["xla_compile_time"] = -1
-
-    if "total_cpu_time_s" in dataline["metrics"]:
-      total_cpu_time = np.asarray(
-          dataline["metrics"]["total_cpu_time_s"], dtype="float")
-      d["median_total_cpu_time_s"] = np.median(total_cpu_time)
-    if "per_iter_cpu_time_s" in dataline["metrics"]:
-      per_iter_cpu_time = np.asarray(
-          dataline["metrics"]["per_iter_cpu_time_s"], dtype="float")
-      d["median_per_iter_cpu_time_s"] = np.median(per_iter_cpu_time)
-    if "total_cuda_time_s" in dataline["metrics"]:
-      total_cuda_time = np.asarray(
-          dataline["metrics"]["total_cuda_time_s"], dtype="float")
-      d["median_total_cuda_time_s"] = np.median(total_cuda_time)
-    if "per_iter_cuda_time_s" in dataline["metrics"]:
-      per_iter_cuda_time = np.asarray(
-          dataline["metrics"]["per_iter_cuda_time_s"], dtype="float")
-      d["median_per_iter_cuda_time_s"] = np.median(per_iter_cuda_time)
-
-    if dataline["experiment"]["dynamo"]:
-      d["dynamo_compile_time"] = np.max(total_time) - np.median(total_time)
-    else:
-      d["dynamo_compile_time"] = -1
+    MAX_TOTAL_TIME = f"{np.max.__name__}_total_time"
+    MEDIAN_TOTAL_TIME = f"{np.median.__name__}_total_time"
+
+    for metric, raw_values in dataline["metrics"].items():
+      values = np.asarray(raw_values, dtype="float")
+
+      is_valid = (
+          dataline["experiment"]["xla"] or metric != "trace_per_iter_time")
+
+      for fn in (np.min, np.median, np.max):
+        d[f"{fn.__name__}_{metric}"] = fn(values) if is_valid else -1
+
+      # Remove first measurement.
+      # Assumption: the first measurement has tracing + compilation times
+      # embedded into it. Therefore, we remove it from our data for computing
+      # the average and standard deviation.
+      skip_head = values[1:]
+
+      if len(skip_head) > 0:
+        for fn in (np.mean, np.std):
+          d[f"{fn.__name__}_{metric}"] = fn(skip_head) if is_valid else -1
+
+    compile_time = d[MAX_TOTAL_TIME] - d[MEDIAN_TOTAL_TIME]
+    d["dynamo_compile_time"] = compile_time if dataline["experiment"][
+        "dynamo"] else -1
+    d["xla_compile_time"] = compile_time if dataline["experiment"]["xla"] else -1
     return d

   # TODO: handle error message properly (database length restriction)
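
For reference, the summary scheme introduced above, as a minimal standalone sketch (not part of the commit): every metric now gets min/median/max over the whole series, plus mean/std with the first, compile-heavy measurement dropped, and the keys are derived from the NumPy function names. The values below are trimmed from the example data; only NumPy is assumed.

import numpy as np

metrics = {"total_time": [120.46, 0.083, 0.075, 0.073]}  # trimmed example values
d = {}
for metric, raw_values in metrics.items():
  values = np.asarray(raw_values, dtype="float")
  # Whole-series summaries, keyed by NumPy function name, e.g. "median_total_time".
  for fn in (np.min, np.median, np.max):
    d[f"{fn.__name__}_{metric}"] = fn(values)
  # Mean/std skip the first measurement, which embeds tracing and compilation time.
  skip_head = values[1:]
  if len(skip_head) > 0:
    for fn in (np.mean, np.std):
      d[f"{fn.__name__}_{metric}"] = fn(skip_head)
print(d)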

benchmarks/test/example.json

Lines changed: 106 additions & 0 deletions

@@ -0,0 +1,106 @@
+{
+  "model": {
+    "suite_name": "torchbench",
+    "model_name": "DALLE2_pytorch"
+  },
+  "experiment": {
+    "experiment_name": "run_all",
+    "accelerator": "cuda",
+    "accelerator_model": "One of NVIDIA GeForce RTX 2060, NVIDIA GeForce RTX 2060",
+    "xla": "PJRT",
+    "xla_flags": null,
+    "dynamo": "openxla",
+    "test": "eval",
+    "batch_size": 1
+  },
+  "repeat": 10,
+  "iterations_per_run": 1,
+  "metrics": {
+    "total_cpu_time_s": [
+      81.853362,
+      0.065951,
+      0.056186,
+      0.055567,
+      0.055391,
+      0.055835,
+      0.055767,
+      0.058623,
+      0.055612,
+      0.058594
+    ],
+    "total_cuda_time_s": [
+      81.852574,
+      0.065956,
+      0.056192,
+      0.055573,
+      0.055396,
+      0.055841,
+      0.055773,
+      0.058629,
+      0.055617,
+      0.0586
+    ],
+    "per_iter_cpu_time_s": [
+      81.853362,
+      0.065951,
+      0.056186,
+      0.055567,
+      0.055391,
+      0.055835,
+      0.055767,
+      0.058623,
+      0.055612,
+      0.058594
+    ],
+    "per_iter_cuda_time_s": [
+      81.852574,
+      0.065956,
+      0.056192,
+      0.055573,
+      0.055396,
+      0.055841,
+      0.055773,
+      0.058629,
+      0.055617,
+      0.0586
+    ],
+    "total_time": [
+      120.4606251809746,
+      0.08297968655824661,
+      0.0747979823499918,
+      0.07257041148841381,
+      0.0746086947619915,
+      0.07293416373431683,
+      0.07472928613424301,
+      0.07585464790463448,
+      0.07447021268308163,
+      0.07592942006886005
+    ],
+    "per_iter_time": [
+      120.4606251809746,
+      0.08297968655824661,
+      0.0747979823499918,
+      0.07257041148841381,
+      0.0746086947619915,
+      0.07293416373431683,
+      0.07472928613424301,
+      0.07585464790463448,
+      0.07447021268308163,
+      0.07592942006886005
+    ],
+    "trace_per_iter_time": [
+      81.8553378880024,
+      0.06609796732664108,
+      0.05630519799888134,
+      0.05567726120352745,
+      0.055552588775753975,
+      0.05595448426902294,
+      0.05587966553866863,
+      0.058729616925120354,
+      0.05571739934384823,
+      0.058703068643808365
+    ],
+    "single_value": [1]
+  },
+  "outputs_file": null
+}
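
As a rough worked example (not part of the commit): the analyzer estimates compile time as max(total_time) minus median(total_time), and in the data above only the first measurement includes compilation, so the estimate is about 120.4 seconds. A quick check, assuming it is run from the repository root:

import json
import numpy as np

# Mirror the compile_time = max - median rule from get_calculated_metrics
# against the example data above.
with open("benchmarks/test/example.json") as f:
  total_time = np.asarray(json.load(f)["metrics"]["total_time"], dtype="float")
print(np.max(total_time) - np.median(total_time))  # roughly 120.39 seconds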

benchmarks/test/run_tests.sh

Lines changed: 12 additions & 0 deletions

@@ -0,0 +1,12 @@
+BASEDIR="$(dirname $(dirname $(realpath $0)))"
+export PYTHONPATH="$BASEDIR"
+
+function run_test {
+  pushd "$BASEDIR"
+  python3 "$@"
+  popd
+}
+
+if [[ "$RUN_XLA_OP_TESTS1" == "xla_op1" ]]; then
+  run_test test/test_result_analyzer.py
+fi
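
For context, the hook added to .circleci/common.sh above runs this script under the xla_op1 shard. A roughly equivalent local invocation (an assumption, with the repository root as the working directory) would be:

RUN_XLA_OP_TESTS1=xla_op1 ./benchmarks/test/run_tests.sh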
benchmarks/test/test_result_analyzer.py

Lines changed: 104 additions & 0 deletions

@@ -0,0 +1,104 @@
+import argparse
+import functools
+import numpy
+import unittest
+import os
+
+from result_analyzer import ResultAnalyzer, parse_args
+
+fns_whole = (numpy.min, numpy.median, numpy.max)
+fns_skip_head = (numpy.mean, numpy.std)
+fns_all = fns_whole + fns_skip_head
+
+
+def apply(fn, data):
+  return fn(data)
+
+
+def apply_skip_head(fn, data):
+  return fn(data[1:])
+
+
+@functools.cache
+def get_dirname():
+  return os.path.dirname(__file__)
+
+
+@functools.cache
+def get_dataline():
+  import json
+  example_json = os.path.join(get_dirname(), "example.json")
+  with open(example_json, "r") as f:
+    return json.load(f)
+
+
+class TestResultAnalyzer(unittest.TestCase):
+
+  def _key(self, fn, metric):
+    return f"{fn.__name__}_{metric}"
+
+  def _check(self, dataline, output, fns, metric, output_value_fn):
+    for fn in fns:
+      key = self._key(fn, metric)
+      self.assertIn(key, output)
+      self.assertEqual(output[key],
+                       output_value_fn(fn, dataline["metrics"][metric]))
+
+  def _test_calculate_metrics(self, xla, dynamo):
+    dataline = get_dataline()
+    dataline["experiment"]["xla"] = xla
+    dataline["experiment"]["dynamo"] = dynamo
+
+    r = ResultAnalyzer(parse_args(["--output-dirname", get_dirname()]))
+    output = r.get_calculated_metrics({}, dataline)
+
+    # Check that output has data for each metric, summarized by
+    # each of its corresponding summary functions.
+
+    # - metrics with more than one measurement
+    for metric in ("total_cpu_time_s", "total_cuda_time_s",
+                   "per_iter_cpu_time_s", "per_iter_cuda_time_s", "total_time",
+                   "per_iter_time"):
+      self._check(dataline, output, fns_whole, metric, apply)
+      self._check(dataline, output, fns_skip_head, metric, apply_skip_head)
+
+    # - single_value: since it has only one value, we only check it for
+    #   the fns_whole set of statistical functions
+    self._check(dataline, output, fns_whole, "single_value", apply)
+
+    # Check that there is no mean and std for single-valued timings.
+    for fn in fns_skip_head:
+      self.assertNotIn(self._key(fn, "single_value"), output)
+
+    return output, dataline
+
+  def test_calculate_metrics_inductor(self):
+    output, _ = self._test_calculate_metrics(xla=None, dynamo="inductor")
+
+    # There should be a dynamo_compile_time key, even if it's not an XLA run.
+    self.assertIn("dynamo_compile_time", output)
+
+    # All trace_per_iter_time summary data inside the output
+    # should be -1.
+    for fn in fns_all:
+      k = self._key(fn, "trace_per_iter_time")
+
+      # It's ok not to have it in the output, since it's not XLA data anyway.
+      if k in output:
+        self.assertEqual(output[k], -1)
+
+  def test_calculate_metrics_xla(self):
+    output, dataline = self._test_calculate_metrics(
+        xla="PJRT", dynamo="openxla")
+
+    # There should be an xla_compile_time key.
+    self.assertIn("xla_compile_time", output)
+
+    # The trace_per_iter_time summary data should be populated.
+    self._check(dataline, output, fns_whole, "trace_per_iter_time", apply)
+    self._check(dataline, output, fns_skip_head, "trace_per_iter_time",
+                apply_skip_head)
+
+
+if __name__ == "__main__":
+  unittest.main()
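
The test can also be run on its own; a manual invocation roughly equivalent to what run_tests.sh does (an assumption, starting from the repository root) would be:

cd benchmarks
PYTHONPATH="$PWD" python3 test/test_result_analyzer.py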
