Support reporting geometric mean by benchmark tags
Addresses python/pyperformance#208

This reports the geometric mean grouped by the tag(s) assigned to each benchmark.
This will allow us to include benchmarks in the pyperformance suite that we
don't necessarily want to fold into "one big overall number" representing progress.
mdboom committed May 25, 2022
1 parent 968f247 commit efb04a7
Showing 3 changed files with 124 additions and 18 deletions.
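
For context, a benchmark opts into the per-tag report by carrying a 'tags' entry in its metadata. Below is a minimal sketch of how a suite author might attach tags, assuming the runner accepts a metadata mapping for the benchmark; the benchmark name and tag values are illustrative, not part of this commit.

import pyperf

# Sketch only: tag a benchmark so it is listed under the per-tag
# geometric mean sections added by this commit.
runner = pyperf.Runner()
runner.bench_func(
    '[1,2]*1000',                   # illustrative benchmark name
    lambda: [1, 2] * 1000,          # code under measurement
    metadata={'tags': ['bar']},     # validated by is_tags() in pyperf/_metadata.py
)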
57 changes: 39 additions & 18 deletions pyperf/_compare.py
@@ -49,6 +49,10 @@ def format_geometric_mean(norm_means):
return format_normalized_mean(geo_mean)


def get_tags_for_result(result):
return result.ref.benchmark.get_metadata().get("tags", [])


class CompareResult(object):
def __init__(self, ref, changed, min_speed=None):
# CompareData object
@@ -242,6 +246,12 @@ def __init__(self, benchmarks, args):

self.show_name = (len(grouped_by_name) > 1)

self.tags = set()
for results in self.all_results:
for result in results:
self.tags.update(get_tags_for_result(result))
self.tags = sorted(list(self.tags))

def compare_benchmarks(self, name, benchmarks):
min_speed = self.min_speed

@@ -258,11 +268,11 @@ def compare_benchmarks(self, name, benchmarks):
return results

@staticmethod
def display_not_signiticant(not_significant):
def display_not_significant(not_significant):
print("Benchmark hidden because not significant (%s): %s"
% (len(not_significant), ', '.join(not_significant)))

def compare_suites_table(self):
def compare_suites_table(self, all_results):
if self.group_by_speed:
def sort_key(results):
result = results[0]
@@ -280,7 +290,7 @@ def sort_key(results):

rows = []
not_significant = []
for results in self.all_results:
for results in all_results:
row = [results.name]

ref_bench = results[0].ref.benchmark
@@ -324,14 +334,14 @@ def sort_key(results):
if not_significant:
if rows:
print()
self.display_not_signiticant(not_significant)
self.display_not_significant(not_significant)

def compare_suites_by_speed(self):
def compare_suites_by_speed(self, all_results):
not_significant = []
slower = []
faster = []
same = []
for results in self.all_results:
for results in all_results:
result = results[0]
if not result.significant:
not_significant.append(results.name)
Expand Down Expand Up @@ -372,14 +382,14 @@ def sort_key(item):
if not self.quiet and not_significant:
if empty_line:
print()
self.display_not_signiticant(not_significant)
self.display_not_significant(not_significant)

def compare_suites_list(self):
def compare_suites_list(self, all_results):
not_significant = []
empty_line = False
last_index = (len(self.all_results) - 1)

for index, results in enumerate(self.all_results):
for index, results in enumerate(all_results):
significant = any(result.significant for result in results)
lines = []
for result in results:
@@ -406,7 +416,7 @@ def compare_suites_list(self):
if not self.quiet and not_significant:
if empty_line:
print()
self.display_not_signiticant(not_significant)
self.display_not_significant(not_significant)

def list_ignored(self):
for suite, hidden in self.benchmarks.group_by_name_ignored():
@@ -416,9 +426,7 @@ def list_ignored(self):
print("Ignored benchmarks (%s) of %s: %s"
% (len(hidden), suite.filename, ', '.join(sorted(hidden_names))))

def compare_geometric_mean(self):
all_results = self.all_results

def compare_geometric_mean(self, all_results):
# use a list since two filenames can be identical,
# even if results are different
all_norm_means = []
@@ -443,16 +451,29 @@ def compare_geometric_mean(self):
geo_mean = format_geometric_mean(all_norm_means[0][1])
print(f'Geometric mean: {geo_mean}')

def compare(self):
def compare_suites(self, results):
if self.table:
self.compare_suites_table()
self.compare_suites_table(results)
else:
if self.group_by_speed:
self.compare_suites_by_speed()
self.compare_suites_by_speed(results)
else:
self.compare_suites_list()
self.compare_suites_list(results)

self.compare_geometric_mean()
self.compare_geometric_mean(results)

def compare(self):
if len(self.tags):
for tag in self.tags:
display_title(f"Benchmarks with tag '{tag}':")
all_results = [
results for results in self.all_results
if tag is None or tag in get_tags_for_result(results[0])
]
self.compare_suites(all_results)
print()
display_title(f"All benchmarks:")
self.compare_suites(self.all_results)

if not self.quiet:
self.list_ignored()
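
To make the new compare() flow concrete, here is a standalone sketch (not pyperf's actual implementation) of the report it produces: normalized means (changed mean divided by reference mean) are grouped by tag and combined with a geometric mean. The benchmark names, tags, and ratios below are illustrative.

import math

def geometric_mean(norm_means):
    # nth root of the product of the normalized means
    return math.exp(math.fsum(math.log(x) for x in norm_means) / len(norm_means))

# Illustrative data: benchmark name -> (tags, changed_mean / ref_mean)
results = {
    '[1]*1000':     (['foo'],        2.09 / 2.13),
    '[1,2]*1000':   (['bar', 'foo'], 5.28 / 3.70),
    '[1,2,3]*1000': (['bar'],        6.05 / 4.61),
}

all_tags = sorted({tag for tags, _ in results.values() for tag in tags})
for tag in all_tags:
    norm = [ratio for tags, ratio in results.values() if tag in tags]
    print(f"Benchmarks with tag {tag!r}: geometric mean {geometric_mean(norm):.2f}x")
print(f"All benchmarks: geometric mean "
      f"{geometric_mean([ratio for _, ratio in results.values()]):.2f}x")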
8 changes: 8 additions & 0 deletions pyperf/_metadata.py
@@ -42,6 +42,12 @@ def is_positive(value):
return (value >= 0)


def is_tags(value):
if not isinstance(value, list):
return False
return all(isinstance(x, str) and x not in ('all', '') for x in value)


def parse_load_avg(value):
if isinstance(value, NUMBER_TYPES):
return value
@@ -62,6 +68,7 @@ def format_noop(value):
LOOPS = _MetadataInfo(format_number, (int,), is_strictly_positive, 'integer')
WARMUPS = _MetadataInfo(format_number, (int,), is_positive, 'integer')
SECONDS = _MetadataInfo(format_seconds, NUMBER_TYPES, is_positive, 'second')
TAGS = _MetadataInfo(format_generic, (list,), is_tags, 'tag')

# Registry of metadata keys
METADATA = {
@@ -84,6 +91,7 @@ def format_noop(value):
'recalibrate_loops': LOOPS,
'calibrate_warmups': WARMUPS,
'recalibrate_warmups': WARMUPS,
'tags': TAGS,
}

DEFAULT_METADATA_INFO = _MetadataInfo(format_generic, METADATA_VALUE_TYPES, None, None)
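
As a quick illustration of what the new validator accepts: a list of non-empty tag strings, with 'all' and the empty string rejected as tag names. The function body below copies is_tags() from the diff above; the assertions are illustrative.

def is_tags(value):
    if not isinstance(value, list):
        return False
    return all(isinstance(x, str) and x not in ('all', '') for x in value)

assert is_tags(['foo', 'bar'])     # a list of plain tag names is accepted
assert not is_tags('foo')          # a bare string is rejected: it must be a list
assert not is_tags(['all'])        # 'all' is not allowed as a tag name
assert not is_tags(['foo', ''])    # empty tag names are rejected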
77 changes: 77 additions & 0 deletions pyperf/tests/test_perf_cli.py
@@ -330,6 +330,83 @@ def test_compare_to_cli(self):
"""
self.check_command(expected, 'compare_to', '--table', "--group-by-speed", py36, py37)

def test_compare_to_cli_tags(self):
py36 = os.path.join(TESTDIR, 'mult_list_py36_tags.json')
py37 = os.path.join(TESTDIR, 'mult_list_py37_tags.json')

# 2 files
expected = """
Benchmarks with tag 'bar':
==========================
[1,2]*1000: Mean +- std dev: [mult_list_py36_tags] 3.70 us +- 0.05 us -> [mult_list_py37_tags] 5.28 us +- 0.09 us: 1.42x slower
[1,2,3]*1000: Mean +- std dev: [mult_list_py36_tags] 4.61 us +- 0.13 us -> [mult_list_py37_tags] 6.05 us +- 0.11 us: 1.31x slower
Geometric mean: 1.37x slower
Benchmarks with tag 'foo':
==========================
[1]*1000: Mean +- std dev: [mult_list_py36_tags] 2.13 us +- 0.06 us -> [mult_list_py37_tags] 2.09 us +- 0.04 us: 1.02x faster
[1,2]*1000: Mean +- std dev: [mult_list_py36_tags] 3.70 us +- 0.05 us -> [mult_list_py37_tags] 5.28 us +- 0.09 us: 1.42x slower
Geometric mean: 1.18x slower
All benchmarks:
===============
[1]*1000: Mean +- std dev: [mult_list_py36_tags] 2.13 us +- 0.06 us -> [mult_list_py37_tags] 2.09 us +- 0.04 us: 1.02x faster
[1,2]*1000: Mean +- std dev: [mult_list_py36_tags] 3.70 us +- 0.05 us -> [mult_list_py37_tags] 5.28 us +- 0.09 us: 1.42x slower
[1,2,3]*1000: Mean +- std dev: [mult_list_py36_tags] 4.61 us +- 0.13 us -> [mult_list_py37_tags] 6.05 us +- 0.11 us: 1.31x slower
Geometric mean: 1.22x slower
"""
self.check_command(expected, 'compare_to', py36, py37)

expected = """
Benchmarks with tag 'bar':
==========================
+----------------+---------------------+-----------------------+
| Benchmark | mult_list_py36_tags | mult_list_py37_tags |
+================+=====================+=======================+
| [1,2]*1000 | 3.70 us | 5.28 us: 1.42x slower |
+----------------+---------------------+-----------------------+
| [1,2,3]*1000 | 4.61 us | 6.05 us: 1.31x slower |
+----------------+---------------------+-----------------------+
| Geometric mean | (ref) | 1.37x slower |
+----------------+---------------------+-----------------------+
Benchmarks with tag 'foo':
==========================
+----------------+---------------------+-----------------------+
| Benchmark | mult_list_py36_tags | mult_list_py37_tags |
+================+=====================+=======================+
| [1]*1000 | 2.13 us | 2.09 us: 1.02x faster |
+----------------+---------------------+-----------------------+
| [1,2]*1000 | 3.70 us | 5.28 us: 1.42x slower |
+----------------+---------------------+-----------------------+
| Geometric mean | (ref) | 1.18x slower |
+----------------+---------------------+-----------------------+
All benchmarks:
===============
+----------------+---------------------+-----------------------+
| Benchmark | mult_list_py36_tags | mult_list_py37_tags |
+================+=====================+=======================+
| [1]*1000 | 2.13 us | 2.09 us: 1.02x faster |
+----------------+---------------------+-----------------------+
| [1,2]*1000 | 3.70 us | 5.28 us: 1.42x slower |
+----------------+---------------------+-----------------------+
| [1,2,3]*1000 | 4.61 us | 6.05 us: 1.31x slower |
+----------------+---------------------+-----------------------+
| Geometric mean | (ref) | 1.22x slower |
+----------------+---------------------+-----------------------+
"""
self.check_command(expected, 'compare_to', '--table', py36, py37)

def test_compare_to_cli_min_speed(self):
py36 = os.path.join(TESTDIR, 'mult_list_py36.json')
py37 = os.path.join(TESTDIR, 'mult_list_py37.json')
