Support reporting geometric mean by benchmark tags
Addresses python/pyperformance#208

This reports the geometric mean grouped by the tag(s) assigned to each benchmark.
This will allow us to include benchmarks in the pyperformance suite that we
don't necessarily want to fold into "one big overall number" representing progress.
mdboom committed May 25, 2022
1 parent 968f247 commit efb04a7
Showing 3 changed files with 124 additions and 18 deletions.
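
For context, a benchmark opts into the per-tag report by carrying a 'tags' entry in its metadata. Below is a minimal sketch of how a suite author might attach tags, assuming the runner accepts a metadata mapping for the benchmark; the benchmark name and tag values are illustrative, not part of this commit.

import pyperf

# Sketch only: tag a benchmark so it is listed under the per-tag
# geometric mean sections added by this commit.
runner = pyperf.Runner()
runner.bench_func(
    '[1,2]*1000',                   # illustrative benchmark name
    lambda: [1, 2] * 1000,          # code under measurement
    metadata={'tags': ['bar']},     # validated by is_tags() in pyperf/_metadata.py
)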
57 changes: 39 additions & 18 deletions pyperf/_compare.py
@@ -49,6 +49,10 @@ def format_geometric_mean(norm_means):
return format_normalized_mean(geo_mean)


def get_tags_for_result(result):
return result.ref.benchmark.get_metadata().get("tags", [])


class CompareResult(object):
def __init__(self, ref, changed, min_speed=None):
# CompareData object
@@ -242,6 +246,12 @@ def __init__(self, benchmarks, args):

self.show_name = (len(grouped_by_name) > 1)

self.tags = set()
for results in self.all_results:
for result in results:
self.tags.update(get_tags_for_result(result))
self.tags = sorted(list(self.tags))

def compare_benchmarks(self, name, benchmarks):
min_speed = self.min_speed

@@ -258,11 +268,11 @@ def compare_benchmarks(self, name, benchmarks):
return results

@staticmethod
def display_not_signiticant(not_significant):
def display_not_significant(not_significant):
print("Benchmark hidden because not significant (%s): %s"
% (len(not_significant), ', '.join(not_significant)))

def compare_suites_table(self):
def compare_suites_table(self, all_results):
if self.group_by_speed:
def sort_key(results):
result = results[0]
@@ -280,7 +290,7 @@ def sort_key(results):

rows = []
not_significant = []
for results in self.all_results:
for results in all_results:
row = [results.name]

ref_bench = results[0].ref.benchmark
@@ -324,14 +334,14 @@ def sort_key(results):
if not_significant:
if rows:
print()
self.display_not_signiticant(not_significant)
self.display_not_significant(not_significant)

def compare_suites_by_speed(self):
def compare_suites_by_speed(self, all_results):
not_significant = []
slower = []
faster = []
same = []
for results in self.all_results:
for results in all_results:
result = results[0]
if not result.significant:
not_significant.append(results.name)
Expand Down Expand Up @@ -372,14 +382,14 @@ def sort_key(item):
if not self.quiet and not_significant:
if empty_line:
print()
self.display_not_signiticant(not_significant)
self.display_not_significant(not_significant)

def compare_suites_list(self):
def compare_suites_list(self, all_results):
not_significant = []
empty_line = False
last_index = (len(self.all_results) - 1)

for index, results in enumerate(self.all_results):
for index, results in enumerate(all_results):
significant = any(result.significant for result in results)
lines = []
for result in results:
@@ -406,7 +416,7 @@ def compare_suites_list(self):
if not self.quiet and not_significant:
if empty_line:
print()
self.display_not_signiticant(not_significant)
self.display_not_significant(not_significant)

def list_ignored(self):
for suite, hidden in self.benchmarks.group_by_name_ignored():
@@ -416,9 +426,7 @@ def list_ignored(self):
print("Ignored benchmarks (%s) of %s: %s"
% (len(hidden), suite.filename, ', '.join(sorted(hidden_names))))

def compare_geometric_mean(self):
all_results = self.all_results

def compare_geometric_mean(self, all_results):
# use a list since two filenames can be identical,
# even if results are different
all_norm_means = []
@@ -443,16 +451,29 @@ def compare_geometric_mean(self):
geo_mean = format_geometric_mean(all_norm_means[0][1])
print(f'Geometric mean: {geo_mean}')

def compare(self):
def compare_suites(self, results):
if self.table:
self.compare_suites_table()
self.compare_suites_table(results)
else:
if self.group_by_speed:
self.compare_suites_by_speed()
self.compare_suites_by_speed(results)
else:
self.compare_suites_list()
self.compare_suites_list(results)

self.compare_geometric_mean()
self.compare_geometric_mean(results)

def compare(self):
if len(self.tags):
for tag in self.tags:
display_title(f"Benchmarks with tag '{tag}':")
all_results = [
results for results in self.all_results
if tag is None or tag in get_tags_for_result(results[0])
]
self.compare_suites(all_results)
print()
display_title(f"All benchmarks:")
self.compare_suites(self.all_results)

if not self.quiet:
self.list_ignored()
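
To make the new compare() flow concrete, here is a standalone sketch (not pyperf's actual implementation) of the report it produces: normalized means (changed mean divided by reference mean) are grouped by tag and combined with a geometric mean. The benchmark names, tags, and ratios below are illustrative.

import math

def geometric_mean(norm_means):
    # nth root of the product of the normalized means
    return math.exp(math.fsum(math.log(x) for x in norm_means) / len(norm_means))

# Illustrative data: benchmark name -> (tags, changed_mean / ref_mean)
results = {
    '[1]*1000':     (['foo'],        2.09 / 2.13),
    '[1,2]*1000':   (['bar', 'foo'], 5.28 / 3.70),
    '[1,2,3]*1000': (['bar'],        6.05 / 4.61),
}

all_tags = sorted({tag for tags, _ in results.values() for tag in tags})
for tag in all_tags:
    norm = [ratio for tags, ratio in results.values() if tag in tags]
    print(f"Benchmarks with tag {tag!r}: geometric mean {geometric_mean(norm):.2f}x")
print(f"All benchmarks: geometric mean "
      f"{geometric_mean([ratio for _, ratio in results.values()]):.2f}x")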
8 changes: 8 additions & 0 deletions pyperf/_metadata.py
@@ -42,6 +42,12 @@ def is_positive(value):
return (value >= 0)


def is_tags(value):
if not isinstance(value, list):
return False
return all(isinstance(x, str) and x not in ('all', '') for x in value)


def parse_load_avg(value):
if isinstance(value, NUMBER_TYPES):
return value
@@ -62,6 +68,7 @@ def format_noop(value):
LOOPS = _MetadataInfo(format_number, (int,), is_strictly_positive, 'integer')
WARMUPS = _MetadataInfo(format_number, (int,), is_positive, 'integer')
SECONDS = _MetadataInfo(format_seconds, NUMBER_TYPES, is_positive, 'second')
TAGS = _MetadataInfo(format_generic, (list,), is_tags, 'tag')

# Registry of metadata keys
METADATA = {
@@ -84,6 +91,7 @@ def format_noop(value):
'recalibrate_loops': LOOPS,
'calibrate_warmups': WARMUPS,
'recalibrate_warmups': WARMUPS,
'tags': TAGS,
}

DEFAULT_METADATA_INFO = _MetadataInfo(format_generic, METADATA_VALUE_TYPES, None, None)
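
As a quick illustration of what the new validator accepts: a list of non-empty tag strings, with 'all' and the empty string rejected as tag names. The function body below copies is_tags() from the diff above; the assertions are illustrative.

def is_tags(value):
    if not isinstance(value, list):
        return False
    return all(isinstance(x, str) and x not in ('all', '') for x in value)

assert is_tags(['foo', 'bar'])     # a list of plain tag names is accepted
assert not is_tags('foo')          # a bare string is rejected: it must be a list
assert not is_tags(['all'])        # 'all' is not allowed as a tag name
assert not is_tags(['foo', ''])    # empty tag names are rejected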
77 changes: 77 additions & 0 deletions pyperf/tests/test_perf_cli.py
@@ -330,6 +330,83 @@ def test_compare_to_cli(self):
"""
self.check_command(expected, 'compare_to', '--table', "--group-by-speed", py36, py37)

def test_compare_to_cli_tags(self):
py36 = os.path.join(TESTDIR, 'mult_list_py36_tags.json')
py37 = os.path.join(TESTDIR, 'mult_list_py37_tags.json')

# 2 files
expected = """
Benchmarks with tag 'bar':
==========================
[1,2]*1000: Mean +- std dev: [mult_list_py36_tags] 3.70 us +- 0.05 us -> [mult_list_py37_tags] 5.28 us +- 0.09 us: 1.42x slower
[1,2,3]*1000: Mean +- std dev: [mult_list_py36_tags] 4.61 us +- 0.13 us -> [mult_list_py37_tags] 6.05 us +- 0.11 us: 1.31x slower
Geometric mean: 1.37x slower
Benchmarks with tag 'foo':
==========================
[1]*1000: Mean +- std dev: [mult_list_py36_tags] 2.13 us +- 0.06 us -> [mult_list_py37_tags] 2.09 us +- 0.04 us: 1.02x faster
[1,2]*1000: Mean +- std dev: [mult_list_py36_tags] 3.70 us +- 0.05 us -> [mult_list_py37_tags] 5.28 us +- 0.09 us: 1.42x slower
Geometric mean: 1.18x slower
All benchmarks:
===============
[1]*1000: Mean +- std dev: [mult_list_py36_tags] 2.13 us +- 0.06 us -> [mult_list_py37_tags] 2.09 us +- 0.04 us: 1.02x faster
[1,2]*1000: Mean +- std dev: [mult_list_py36_tags] 3.70 us +- 0.05 us -> [mult_list_py37_tags] 5.28 us +- 0.09 us: 1.42x slower
[1,2,3]*1000: Mean +- std dev: [mult_list_py36_tags] 4.61 us +- 0.13 us -> [mult_list_py37_tags] 6.05 us +- 0.11 us: 1.31x slower
Geometric mean: 1.22x slower
"""
self.check_command(expected, 'compare_to', py36, py37)

expected = """
Benchmarks with tag 'bar':
==========================
+----------------+---------------------+-----------------------+
| Benchmark | mult_list_py36_tags | mult_list_py37_tags |
+================+=====================+=======================+
| [1,2]*1000 | 3.70 us | 5.28 us: 1.42x slower |
+----------------+---------------------+-----------------------+
| [1,2,3]*1000 | 4.61 us | 6.05 us: 1.31x slower |
+----------------+---------------------+-----------------------+
| Geometric mean | (ref) | 1.37x slower |
+----------------+---------------------+-----------------------+
Benchmarks with tag 'foo':
==========================
+----------------+---------------------+-----------------------+
| Benchmark | mult_list_py36_tags | mult_list_py37_tags |
+================+=====================+=======================+
| [1]*1000 | 2.13 us | 2.09 us: 1.02x faster |
+----------------+---------------------+-----------------------+
| [1,2]*1000 | 3.70 us | 5.28 us: 1.42x slower |
+----------------+---------------------+-----------------------+
| Geometric mean | (ref) | 1.18x slower |
+----------------+---------------------+-----------------------+
All benchmarks:
===============
+----------------+---------------------+-----------------------+
| Benchmark | mult_list_py36_tags | mult_list_py37_tags |
+================+=====================+=======================+
| [1]*1000 | 2.13 us | 2.09 us: 1.02x faster |
+----------------+---------------------+-----------------------+
| [1,2]*1000 | 3.70 us | 5.28 us: 1.42x slower |
+----------------+---------------------+-----------------------+
| [1,2,3]*1000 | 4.61 us | 6.05 us: 1.31x slower |
+----------------+---------------------+-----------------------+
| Geometric mean | (ref) | 1.22x slower |
+----------------+---------------------+-----------------------+
"""
self.check_command(expected, 'compare_to', '--table', py36, py37)

def test_compare_to_cli_min_speed(self):
py36 = os.path.join(TESTDIR, 'mult_list_py36.json')
py37 = os.path.join(TESTDIR, 'mult_list_py37.json')
