Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 53 additions & 22 deletions benchmarks/aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,26 +175,43 @@ def compute_speedups(acc_map: Dict[str, Any], baseline: Dict[str, Any],
summarize_speedups(acc_map, out_label)


# A benchmark's baseline is the oldest Inductor perf number we have for it.
# This way we can track both PyTorch/XLA and Inductor perf improvements over
# time.
def compute_baseline(results_map: Dict[str, Any]) -> Dict[str, Any]:
def populate_baseline(baseline: Dict[str, Any],
                      inductor_results: Dict[str, Any]) -> None:
    """Fill in missing entries of `baseline` from `inductor_results`, in place.

    For every model name in `inductor_results`, an entry is created in
    `baseline` if there is none yet. Then, for every batch size listed under
    that model, the corresponding value is copied into `baseline` only if that
    (model name, batch size) pair is not already present. Existing entries are
    never overwritten, so whichever results are passed in first win.

    Args:
      baseline: Mapping of model name -> {batch size -> value}; mutated in
        place.
      inductor_results: Mapping with the same two-level layout to copy from.
        The stored values are copied by reference, not deep-copied.
    """
    for model_name in inductor_results:
        per_model = inductor_results[model_name]
        # setdefault() creates the per-model dict on first sight and otherwise
        # returns the existing one, leaving earlier entries untouched.
        model_baseline = baseline.setdefault(model_name, {})
        for batch_size in per_model:
            model_baseline.setdefault(batch_size, per_model[batch_size])


def compute_baseline(args, results_map: Dict[str, Any]) -> Dict[str, Any]:
baseline = {}
for ts in sorted(list(results_map.keys())):
timestamps = list(results_map.keys())
if not timestamps:
return baseline
timestamps.sort()
if args.baseline == 'oldest':
# A benchmark's baseline is the oldest Inductor perf number we have for it.
# This way we can track both PyTorch/XLA and Inductor perf improvements over
# time.
for ts in timestamps:
if 'inductor' not in results_map[ts]:
continue
populate_baseline(baseline, results_map[ts]['inductor'])

elif args.baseline == 'latest':
# Pick only results from the latest timestamp.
ts = timestamps[-1]
if 'inductor' not in results_map[ts]:
continue
for model_name in results_map[ts]['inductor']:
if model_name not in baseline:
baseline[model_name] = {}
for batch_size in results_map[ts]['inductor'][model_name]:
if batch_size not in baseline[model_name]:
baseline[model_name][batch_size] = results_map[ts]['inductor'][
model_name][batch_size]
sys.exit(f'No Inductor results in the latest timestamp {ts}')
populate_baseline(baseline, results_map[ts]['inductor'])
return baseline


def process_results(args, results_map: Dict[str, Any]):
baseline = compute_baseline(results_map)
baseline = compute_baseline(args, results_map)
for timestamp in results_map:
acc_map = results_map[timestamp]

Expand Down Expand Up @@ -239,7 +256,7 @@ def pr_latest(results_map: Dict[str, Any], args, timestamps: List[str]):

if args.format == 'csv':
print(','.join(['# WorkloadNumber'] + [
f'Speedup({title}/Oldest Inductor),StdDev,ModelName({title})'
f'Speedup({title}/{args.baseline.capitalize()} Inductor),StdDev,ModelName({title})'
for title in titles
]))
# Note: the latest timestamp might not have results for all benchmarks.
Expand Down Expand Up @@ -289,7 +306,10 @@ def pad_array(arr, desired_len, val):
# Make overlapping text more legible by making it transparent.
annotation.set_alpha(0.5)
plt.legend()
plt.title(maketitle(args, f'Speedup over Oldest Benchmarked Inductor'))
plt.title(
maketitle(
args,
f'Speedup over {args.baseline.capitalize()} Benchmarked Inductor'))
plt.xlabel('Workload Number')
plt.ylabel(f'Speedup')
plt.savefig(sys.stdout.buffer, format=args.format)
Expand Down Expand Up @@ -334,8 +354,10 @@ def pr_histogram(results_map: Dict[str, Any], args, timestamps: List[str]):
plt.xlabel("Date")
plt.ylabel("Geomean Speedup")
plt.title(
maketitle(args,
'Histogram of Speedup over Oldest Benchmarked Inductor'))
maketitle(
args,
f'Histogram of Speedup over {args.baseline.capitalize()} Benchmarked Inductor'
))
plt.savefig(sys.stdout.buffer, format=args.format)


Expand All @@ -354,9 +376,10 @@ def pr_gmean(results_map: Dict[str, Any], args, timestamps: List[str]):
pr_round(results_map[timestamp][label]) if label in
results_map[timestamp] else Datapoint('', ''))
if args.format == 'csv':
print(','.join(
['# Datetime(UTC)'] +
[f"Speedup({title}/Oldest Inductor),StdDev" for title in titles]))
print(','.join(['# Datetime(UTC)'] + [
f"Speedup({title}/{args.baseline.capitalize()} Inductor),StdDev"
for title in titles
]))
for j, x in enumerate(x):
print(','.join(
map(str, [x] + [
Expand All @@ -377,7 +400,10 @@ def pr_gmean(results_map: Dict[str, Any], args, timestamps: List[str]):
plt.legend()
plt.xlabel("Date")
plt.ylabel("Geomean Speedup")
plt.title(maketitle(args, 'Speedup over Oldest Benchmarked Inductor'))
plt.title(
maketitle(
args,
f'Speedup over {args.baseline.capitalize()} Benchmarked Inductor'))
plt.savefig(sys.stdout.buffer, format=args.format)


Expand Down Expand Up @@ -405,6 +431,11 @@ def parse_args(args=None):
default='v100',
choices=['a100', 'v100', 'a6000'],
help='Accelerator.')
parser.add_argument(
'--baseline',
default='oldest',
choices=['oldest', 'latest'],
help='Inductor baseline to be used for computing speedups.')
parser.add_argument(
"--exclude",
"-x",
Expand Down
13 changes: 8 additions & 5 deletions test/benchmarks/Makefile
Original file line number Diff line number Diff line change
@@ -1,19 +1,22 @@
TEST_ARGS = $(shell echo $@ | perl -pe 's/([^.]*)\.([^.]*)\.([^.]*)(?:\.tier([0-9]+))?\.test/--accelerator=$$1 --test=$$2 --report=$$3/; if (defined($$4)) { print "--filter-by-tier=$$4 " }')
TEST_ARGS = $(shell echo $@ | perl -pe 's/([^.]*)\.([^.]*)\.([^.]*).*\.test/--accelerator=$$1 --test=$$2 --report=$$3/')
EMBEDDED_TEST_ARGS = $(shell cat $@ | grep '^# ARGS: ' | perl -pe 's/^# ARGS: (.*)/$$1/')

TESTS := $(wildcard *.test)
all: $(TESTS)
.PHONY: $(TESTS) all

ifndef V
QUIET_AGGREGATE = @echo ' ' AGGREGATE $(TEST_ARGS);
QUIET_AGGREGATE = @echo ' ' AGGREGATE $(TEST_ARGS) $(EMBEDDED_TEST_ARGS);
QUIET_DIFF = @echo ' ' DIFF $@;
QUIET_RM = @echo ' ' RM $@.tmp;
endif

$(TESTS):
$(QUIET_AGGREGATE)python3 ../../benchmarks/aggregate.py $(TEST_ARGS) \
--format=csv $(wildcard *.jsonl) > $@.tmp
$(QUIET_DIFF)git diff --no-index $@ $@.tmp
$(QUIET_AGGREGATE)python3 ../../benchmarks/aggregate.py \
--format=csv \
$(TEST_ARGS) $(EMBEDDED_TEST_ARGS) \
$(wildcard *.jsonl) > $@.tmp
$(QUIET_DIFF)git diff -I'^# ARGS: ' --no-index $@ $@.tmp
$(QUIET_RM)$(RM) $@.tmp

clean:
Expand Down
1 change: 1 addition & 0 deletions test/benchmarks/v100.inference.latest.tier1.test
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
# ARGS: --filter-by-tier=1
# WorkloadNumber,Speedup(Inductor/Oldest Inductor),StdDev,ModelName(Inductor),Speedup(PytorchXLA/Oldest Inductor),StdDev,ModelName(PytorchXLA),Speedup(PytorchXLA_Eval/Oldest Inductor),StdDev,ModelName(PytorchXLA_Eval)
0,1.51952596,6.914e-05,BERT_pytorch,1.56880282,7.138e-05,BERT_pytorch,1.36859903,6.227e-05,BERT_pytorch
4 changes: 4 additions & 0 deletions test/benchmarks/v100.inference.speedup.baseline_latest.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# ARGS: --baseline=latest
# Datetime(UTC),Speedup(Inductor/Latest Inductor),StdDev,Speedup(PytorchXLA/Latest Inductor),StdDev,Speedup(PytorchXLA_Eval/Latest Inductor),StdDev
2023-11-11 05:32:18.723407,0.71267792,1.621e-05,0.60245072,0.0,0.55375084,0.0
2023-11-12 05:32:18,1.0,0.0,0.78480315,0.0,0.71435904,0.0