diff --git a/benchmarks/aggregate.py b/benchmarks/aggregate.py
index 0b41ec5ee809..d39377e82699 100644
--- a/benchmarks/aggregate.py
+++ b/benchmarks/aggregate.py
@@ -175,26 +175,43 @@ def compute_speedups(acc_map: Dict[str, Any], baseline: Dict[str, Any],
   summarize_speedups(acc_map, out_label)
 
 
-# A benchmark's baseline is the oldest Inductor perf number we have for it.
-# This way we can track both Pytorch/XLA and Inductor perf improvements over
-# time.
-def compute_baseline(results_map: Dict[str, Any]) -> Dict[str, Any]:
+def populate_baseline(baseline: Dict[str, Any], inductor_results: Dict[str,
+                                                                        Any]):
+  for model_name in inductor_results:
+    if model_name not in baseline:
+      baseline[model_name] = {}
+    for batch_size in inductor_results[model_name]:
+      if batch_size not in baseline[model_name]:
+        baseline[model_name][batch_size] = inductor_results[model_name][
+            batch_size]
+
+
+def compute_baseline(args, results_map: Dict[str, Any]) -> Dict[str, Any]:
   baseline = {}
-  for ts in sorted(list(results_map.keys())):
+  timestamps = list(results_map.keys())
+  if not timestamps:
+    return baseline
+  timestamps.sort()
+  if args.baseline == 'oldest':
+    # A benchmark's baseline is the oldest Inductor perf number we have for it.
+    # This way we can track both Pytorch/XLA and Inductor perf improvements over
+    # time.
+    for ts in timestamps:
+      if 'inductor' not in results_map[ts]:
+        continue
+      populate_baseline(baseline, results_map[ts]['inductor'])
+
+  elif args.baseline == 'latest':
+    # Pick only results from the latest timestamp.
+    ts = timestamps[-1]
     if 'inductor' not in results_map[ts]:
-      continue
-    for model_name in results_map[ts]['inductor']:
-      if model_name not in baseline:
-        baseline[model_name] = {}
-      for batch_size in results_map[ts]['inductor'][model_name]:
-        if batch_size not in baseline[model_name]:
-          baseline[model_name][batch_size] = results_map[ts]['inductor'][
-              model_name][batch_size]
+      sys.exit(f'No Inductor results in the latest timestamp {ts}')
+    populate_baseline(baseline, results_map[ts]['inductor'])
   return baseline
 
 
 def process_results(args, results_map: Dict[str, Any]):
-  baseline = compute_baseline(results_map)
+  baseline = compute_baseline(args, results_map)
   for timestamp in results_map:
     acc_map = results_map[timestamp]
@@ -239,7 +256,7 @@ def pr_latest(results_map: Dict[str, Any], args, timestamps: List[str]):
   if args.format == 'csv':
     print(','.join(['# WorkloadNumber'] + [
-        f'Speedup({title}/Oldest Inductor),StdDev,ModelName({title})'
+        f'Speedup({title}/{args.baseline.capitalize()} Inductor),StdDev,ModelName({title})'
         for title in titles
     ]))
   # Note: the latest timestamp might not have results for all benchmarks.
@@ -289,7 +306,10 @@ def pad_array(arr, desired_len, val):
       # Make overlapping text more legible by making it transparent.
       annotation.set_alpha(0.5)
   plt.legend()
-  plt.title(maketitle(args, f'Speedup over Oldest Benchmarked Inductor'))
+  plt.title(
+      maketitle(
+          args,
+          f'Speedup over {args.baseline.capitalize()} Benchmarked Inductor'))
   plt.xlabel('Workload Number')
   plt.ylabel(f'Speedup')
   plt.savefig(sys.stdout.buffer, format=args.format)
@@ -334,8 +354,10 @@ def pr_histogram(results_map: Dict[str, Any], args, timestamps: List[str]):
   plt.xlabel("Date")
   plt.ylabel("Geomean Speedup")
   plt.title(
-      maketitle(args,
-                'Histogram of Speedup over Oldest Benchmarked Inductor'))
+      maketitle(
+          args,
+          f'Histogram of Speedup over {args.baseline.capitalize()} Benchmarked Inductor'
+      ))
   plt.savefig(sys.stdout.buffer, format=args.format)
@@ -354,9 +376,10 @@ def pr_gmean(results_map: Dict[str, Any], args, timestamps: List[str]):
         pr_round(results_map[timestamp][label]) if label in
         results_map[timestamp] else Datapoint('', ''))
   if args.format == 'csv':
-    print(','.join(
-        ['# Datetime(UTC)'] +
-        [f"Speedup({title}/Oldest Inductor),StdDev" for title in titles]))
+    print(','.join(['# Datetime(UTC)'] + [
+        f"Speedup({title}/{args.baseline.capitalize()} Inductor),StdDev"
+        for title in titles
+    ]))
   for j, x in enumerate(x):
     print(','.join(
         map(str, [x] + [
@@ -377,7 +400,10 @@ def pr_gmean(results_map: Dict[str, Any], args, timestamps: List[str]):
   plt.legend()
   plt.xlabel("Date")
   plt.ylabel("Geomean Speedup")
-  plt.title(maketitle(args, 'Speedup over Oldest Benchmarked Inductor'))
+  plt.title(
+      maketitle(
+          args,
+          f'Speedup over {args.baseline.capitalize()} Benchmarked Inductor'))
   plt.savefig(sys.stdout.buffer, format=args.format)
@@ -405,6 +431,11 @@ def parse_args(args=None):
       default='v100',
       choices=['a100', 'v100', 'a6000'],
       help='Accelerator.')
+  parser.add_argument(
+      '--baseline',
+      default='oldest',
+      choices=['oldest', 'latest'],
+      help='Inductor baseline to be used for computing speedups.')
   parser.add_argument(
       "--exclude",
       "-x",
diff --git a/test/benchmarks/Makefile b/test/benchmarks/Makefile
index 75480502eb8e..ddfa83644930 100644
--- a/test/benchmarks/Makefile
+++ b/test/benchmarks/Makefile
@@ -1,19 +1,22 @@
-TEST_ARGS = $(shell echo $@ | perl -pe 's/([^.]*)\.([^.]*)\.([^.]*)(?:\.tier([0-9]+))?\.test/--accelerator=$$1 --test=$$2 --report=$$3/; if (defined($$4)) { print "--filter-by-tier=$$4 " }')
+TEST_ARGS = $(shell echo $@ | perl -pe 's/([^.]*)\.([^.]*)\.([^.]*).*\.test/--accelerator=$$1 --test=$$2 --report=$$3/')
+EMBEDDED_TEST_ARGS = $(shell cat $@ | grep '^# ARGS: ' | perl -pe 's/^# ARGS: (.*)/$$1/')
 
 TESTS := $(wildcard *.test)
 
 all: $(TESTS)
 
 .PHONY: $(TESTS) all
 
 ifndef V
-  QUIET_AGGREGATE = @echo '   ' AGGREGATE $(TEST_ARGS);
+  QUIET_AGGREGATE = @echo '   ' AGGREGATE $(TEST_ARGS) $(EMBEDDED_TEST_ARGS);
   QUIET_DIFF = @echo '   ' DIFF $@;
   QUIET_RM = @echo '   ' RM $@.tmp;
 endif
 
 $(TESTS):
-	$(QUIET_AGGREGATE)python3 ../../benchmarks/aggregate.py $(TEST_ARGS) \
-	  --format=csv $(wildcard *.jsonl) > $@.tmp
-	$(QUIET_DIFF)git diff --no-index $@ $@.tmp
+	$(QUIET_AGGREGATE)python3 ../../benchmarks/aggregate.py \
+	  --format=csv \
+	  $(TEST_ARGS) $(EMBEDDED_TEST_ARGS) \
+	  $(wildcard *.jsonl) > $@.tmp
+	$(QUIET_DIFF)git diff -I'^# ARGS: ' --no-index $@ $@.tmp
 	$(QUIET_RM)$(RM) $@.tmp
 
 clean:
diff --git a/test/benchmarks/v100.inference.latest.tier1.test b/test/benchmarks/v100.inference.latest.tier1.test
index 902b31b8bf7c..d688ae2396be 100644
--- a/test/benchmarks/v100.inference.latest.tier1.test
+++ b/test/benchmarks/v100.inference.latest.tier1.test
@@ -1,2 +1,3 @@
+# ARGS: --filter-by-tier=1
 # WorkloadNumber,Speedup(Inductor/Oldest Inductor),StdDev,ModelName(Inductor),Speedup(PytorchXLA/Oldest Inductor),StdDev,ModelName(PytorchXLA),Speedup(PytorchXLA_Eval/Oldest Inductor),StdDev,ModelName(PytorchXLA_Eval)
 0,1.51952596,6.914e-05,BERT_pytorch,1.56880282,7.138e-05,BERT_pytorch,1.36859903,6.227e-05,BERT_pytorch
diff --git a/test/benchmarks/v100.inference.speedup.baseline_latest.test b/test/benchmarks/v100.inference.speedup.baseline_latest.test
new file mode 100644
index 000000000000..ebca413ed541
--- /dev/null
+++ b/test/benchmarks/v100.inference.speedup.baseline_latest.test
@@ -0,0 +1,4 @@
+# ARGS: --baseline=latest
+# Datetime(UTC),Speedup(Inductor/Latest Inductor),StdDev,Speedup(PytorchXLA/Latest Inductor),StdDev,Speedup(PytorchXLA_Eval/Latest Inductor),StdDev
+2023-11-11 05:32:18.723407,0.71267792,1.621e-05,0.60245072,0.0,0.55375084,0.0
+2023-11-12 05:32:18,1.0,0.0,0.78480315,0.0,0.71435904,0.0
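
Usage sketch for the new flag (not part of the patch; the results/*.jsonl path is
hypothetical, the flags are the ones defined by the patched aggregate.py):

    # Normalize speedups against the newest Inductor run instead of the oldest one:
    python3 benchmarks/aggregate.py \
        --accelerator=v100 --test=inference --report=speedup \
        --baseline=latest --format=csv results/*.jsonl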
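How the updated Makefile assembles that command for the new test file (a sketch of
the expansion, assuming the rule runs from test/benchmarks/):

    # TEST_ARGS is parsed out of the file name v100.inference.speedup.baseline_latest.test,
    # and EMBEDDED_TEST_ARGS out of its '# ARGS: --baseline=latest' header line, so
    #     make v100.inference.speedup.baseline_latest.test
    # effectively runs:
    python3 ../../benchmarks/aggregate.py \
        --format=csv \
        --accelerator=v100 --test=inference --report=speedup \
        --baseline=latest \
        *.jsonl > v100.inference.speedup.baseline_latest.test.tmp
    # The git diff -I'^# ARGS: ' --no-index comparison then ignores the
    # '# ARGS: ' header, which exists only in the checked-in expectation file.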