107 changes: 76 additions & 31 deletions benchmarks/aggregate.py
@@ -26,6 +26,14 @@

Datapoint = namedtuple('Datapoint', 'avg, std')

_title_map = {
    'eager': 'Eager',
    'inductor': 'Inductor',
    'openxla_eval+dynamo': 'XLA_Eval+Dynamo',
    'openxla+dynamo': 'XLA+Dynamo',
    'openxla+lazytensor': 'XLA+LazyTensor',
}

_test_to_field_name = {
    'inference': 'eval',
    'training': 'train',
@@ -86,9 +94,26 @@ def skip_model(args, model_name: str):
re.search("|".join(args.exclude), model_name, re.I))


def get_backend_name(dynamo: str, xla: str) -> str:
  if dynamo == 'inductor':
    return 'inductor'
  if xla == 'PJRT':
    assert dynamo in ('openxla', 'openxla_eval', None)
    xla_name = dynamo
    tracer = 'dynamo'
    if not dynamo:
      xla_name = 'openxla'
      tracer = 'lazytensor'
    return f'{xla_name}+{tracer}'
  assert dynamo is None and xla is None
  return 'eager'
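
As a sanity check on the naming scheme, the helper above maps the raw experiment fields onto the keys of _title_map. A minimal sketch of the expected mapping (illustrative only, assuming the dynamo and xla fields carry the values produced by the benchmark runner; not part of the diff itself):

  # Expected backend names for representative (dynamo, xla) pairs; illustrative only.
  assert get_backend_name('inductor', None) == 'inductor'
  assert get_backend_name('openxla', 'PJRT') == 'openxla+dynamo'
  assert get_backend_name('openxla_eval', 'PJRT') == 'openxla_eval+dynamo'
  assert get_backend_name(None, 'PJRT') == 'openxla+lazytensor'
  assert get_backend_name(None, None) == 'eager'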


def process_file(args, results_map: Dict[str, Any], filename: str):
  fields = {
      'experiment': ['accelerator_model', 'batch_size', 'dynamo', 'test'],
      'experiment': [
          'accelerator_model', 'batch_size', 'dynamo', 'test', 'xla'
      ],
      'metrics': [],
      'model': ['model_name'],
      'timestamp': [],
Expand Down Expand Up @@ -116,7 +141,9 @@ def process_file(args, results_map: Dict[str, Any], filename: str):
model_name = r['model']['model_name']
if skip_model(args, model_name):
continue
xla = r['experiment']['xla']
dynamo = r['experiment']['dynamo']
backend = get_backend_name(dynamo, xla)
test = r['experiment']['test']
if test != _test_to_field_name[args.test]:
continue
@@ -129,13 +156,13 @@ def process_file(args, results_map: Dict[str, Any], filename: str):

if timestamp not in results_map:
results_map[timestamp] = {}
if dynamo not in results_map[timestamp]:
results_map[timestamp][dynamo] = {}
if (model_name not in results_map[timestamp][dynamo]):
results_map[timestamp][dynamo][model_name] = {}
if (batch_size not in results_map[timestamp][dynamo][model_name]):
results_map[timestamp][dynamo][model_name][batch_size] = {}
results_map[timestamp][dynamo][model_name][batch_size] = dp
if backend not in results_map[timestamp]:
results_map[timestamp][backend] = {}
if (model_name not in results_map[timestamp][backend]):
results_map[timestamp][backend][model_name] = {}
if (batch_size not in results_map[timestamp][backend][model_name]):
results_map[timestamp][backend][model_name][batch_size] = {}
results_map[timestamp][backend][model_name][batch_size] = dp


# Speedup of a over baseline ("b"), with errors.
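The function this comment introduces is collapsed in this diff view. Purely as an illustration of the usual convention, here is a sketch that assumes the stored Datapoint values are runtimes and that the errors of a and b are independent; the actual helper in aggregate.py may differ:

  # Hypothetical sketch, not taken from the PR: speedup of `a` over baseline `b`
  # with first-order error propagation (relative errors added in quadrature).
  import math

  def speedup_with_error(a: Datapoint, b: Datapoint) -> Datapoint:
    speedup = b.avg / a.avg  # lower runtime for `a` means speedup > 1
    rel_err = math.sqrt((a.std / a.avg)**2 + (b.std / b.avg)**2)
    return Datapoint(speedup, speedup * rel_err)
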
@@ -211,21 +238,21 @@ def compute_baseline(args, results_map: Dict[str, Any]) -> Dict[str, Any]:
  if not timestamps:
    return baseline
  timestamps.sort()
  base_backend = args.backends[0]
  if args.baseline == 'oldest':
    # A benchmark's baseline is the oldest Inductor perf number we have for it.
    # This way we can track both Pytorch/XLA and Inductor perf improvements over
    # time.
    # A benchmark's baseline is the oldest `base_backend` perf number we have
    # for it. This way we can track perf improvements over time.
    for ts in timestamps:
      if 'inductor' not in results_map[ts]:
      if base_backend not in results_map[ts]:
        continue
      populate_baseline(baseline, results_map[ts]['inductor'])
      populate_baseline(baseline, results_map[ts][base_backend])

  elif args.baseline == 'latest':
    # Pick only results from the latest timestamp.
    ts = timestamps[-1]
    if 'inductor' not in results_map[ts]:
      sys.exit(f'No Inductor results in the latest timestamp {ts}')
    populate_baseline(baseline, results_map[ts]['inductor'])
    if base_backend not in results_map[ts]:
      sys.exit(f'No {base_backend} results in the latest timestamp {ts}')
    populate_baseline(baseline, results_map[ts][base_backend])
  return baseline


@@ -234,9 +261,8 @@ def process_results(args, results_map: Dict[str, Any]):
  for timestamp in results_map:
    acc_map = results_map[timestamp]

    compute_speedups(acc_map, baseline, 'xla:speedups', 'openxla')
    compute_speedups(acc_map, baseline, 'xla_eval:speedups', 'openxla_eval')
    compute_speedups(acc_map, baseline, 'inductor:speedups', 'inductor')
    for backend in sorted(_title_map.keys()):
      compute_speedups(acc_map, baseline, f'{backend}:speedups', backend)


def maketitle(args, title: str):
@@ -246,18 +272,16 @@ def maketitle(args, title: str):


def get_pr_titles(args):
  titles = ['Inductor', 'PytorchXLA']
  data_labels = ['inductor', 'xla']
  if args.test == "inference":
    titles.append('PytorchXLA_Eval')
    data_labels.append('xla_eval')
  titles = [_title_map[t] for t in args.backends]
  data_labels = args.backends
  return [titles, data_labels]


def pr_latest(results_map: Dict[str, Any], args, timestamps: List[str]):
  titles, data_labels = get_pr_titles(args)
  speedups = [[] for _ in titles]
  model_names = [[] for _ in titles]
  base_backend_name = _title_map[args.backends[0]]

  for i, pfx in enumerate(data_labels):
    label = f'{pfx}:speedups'
@@ -274,8 +298,8 @@ def pr_latest(results_map: Dict[str, Any], args, timestamps: List[str]):
return

if args.format == 'csv':
print(','.join(['# WorkloadNumber'] + [
f'Speedup({title}/{args.baseline.capitalize()} Inductor),StdDev,ModelName({title})'
print(','.join(['# Workload'] + [
f'Speedup({title}/{args.baseline.capitalize()} {base_backend_name}),StdDev,ModelName({title})'
for title in titles
]))
# Note: the latest timestamp might not have results for all benchmarks.
@@ -328,7 +352,8 @@ def pad_array(arr, desired_len, val):
plt.title(
maketitle(
args,
f'Speedup over {args.baseline.capitalize()} Benchmarked Inductor'))
f'Speedup over {args.baseline.capitalize()} Benchmarked {base_backend_name}'
))
plt.xlabel('Workload Number')
plt.ylabel(f'Speedup')
plt.savefig(sys.stdout.buffer, format=args.format)
@@ -339,6 +364,7 @@ def pr_histogram(results_map: Dict[str, Any], args, timestamps: List[str]):
  percentiles = [f'p{p}' for p in (95, 50, 5)]
  labels = [f'{pfx}:speedups:{p}' for pfx in data_labels for p in percentiles]
  full_titles = [f'{title} {p}' for title in titles for p in percentiles]
  base_backend_name = _title_map[args.backends[0]]
  x = []
  y = [[] for i in range(len(labels))]
  for timestamp in timestamps:
@@ -375,7 +401,7 @@ def pr_histogram(results_map: Dict[str, Any], args, timestamps: List[str]):
plt.title(
maketitle(
args,
f'Histogram of Speedup over {args.baseline.capitalize()} Benchmarked Inductor'
f'Histogram of Speedup over {args.baseline.capitalize()} Benchmarked {base_backend_name}'
))
plt.savefig(sys.stdout.buffer, format=args.format)

@@ -385,6 +411,7 @@ def pr_gmean(results_map: Dict[str, Any], args, timestamps: List[str]):
  x = []
  titles, data_labels = get_pr_titles(args)
  labels = [f"{x}:speedups:gmean" for x in data_labels]
  base_backend_name = _title_map[args.backends[0]]
  y = [[] for _ in labels]
  for timestamp in timestamps:
    if all(label not in results_map[timestamp] for label in labels):
@@ -396,7 +423,7 @@ def pr_gmean(results_map: Dict[str, Any], args, timestamps: List[str]):
results_map[timestamp] else Datapoint('', ''))
if args.format == 'csv':
print(','.join(['# Datetime(UTC)'] + [
f"Speedup({title}/{args.baseline.capitalize()} Inductor),StdDev"
f"Speedup({title}/{args.baseline.capitalize()} {base_backend_name}),StdDev"
for title in titles
]))
for j, x in enumerate(x):
@@ -422,7 +449,8 @@ def pr_gmean(results_map: Dict[str, Any], args, timestamps: List[str]):
plt.title(
maketitle(
args,
f'Speedup over {args.baseline.capitalize()} Benchmarked Inductor'))
f'Speedup over {args.baseline.capitalize()} Benchmarked {base_backend_name}'
))
plt.savefig(sys.stdout.buffer, format=args.format)


@@ -450,11 +478,19 @@ def parse_args(args=None):
      default='v100',
      choices=['a100', 'v100', 'a6000'],
      help='Accelerator.')
  parser.add_argument(
      '--backends',
      type=str,
      action='extend',
      nargs='+',
      help=f'''List of backends to report on.
Valid: {sorted(_title_map.keys())}.
Note: the first element is used as the baseline backend.''')
  parser.add_argument(
      '--baseline',
      default='oldest',
      choices=['oldest', 'latest'],
      help='Inductor baseline to be used for computing speedups.')
      help='Baseline point in time to be used for computing speedups.')
  parser.add_argument(
      "--exclude",
      "-x",
@@ -522,6 +558,15 @@ def parse_args(args=None):
  tiers.append_filter_by_tier(args.exclude, args.exclude_by_tier)
  args.filter = args.filter or [r"."]
  args.exclude = args.exclude or [r"^$"]
  if not args.backends:
    if args.test == 'inference':
      args.backends = ['inductor', 'openxla+dynamo', 'openxla_eval+dynamo']
    else:
      args.backends = ['inductor', 'openxla+dynamo']
  for backend in args.backends:
    if backend not in _title_map:
      sys.exit(f"error: argument --backends: invalid choice: '{backend}' "
               f"(choose from {sorted(_title_map.keys())})")

  return args

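The new --backends flag composes with the existing options, and its first entry becomes the baseline backend, as exercised by the new openxla_baseline test below. A hedged usage sketch (assuming argument parsing alone needs no other flags):

  # Illustrative only: make XLA+Dynamo the baseline and compare Inductor against it.
  args = parse_args(['--backends', 'openxla+dynamo', 'inductor', '--baseline', 'latest'])
  assert args.backends[0] == 'openxla+dynamo'  # first entry is the baseline backend
  assert _title_map[args.backends[0]] == 'XLA+Dynamo'  # title used in report headers
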
2 changes: 1 addition & 1 deletion test/benchmarks/a6000.inference.speedup.test
@@ -1,2 +1,2 @@
# Datetime(UTC),Speedup(Inductor/Oldest Inductor),StdDev,Speedup(PytorchXLA/Oldest Inductor),StdDev,Speedup(PytorchXLA_Eval/Oldest Inductor),StdDev
# Datetime(UTC),Speedup(Inductor/Oldest Inductor),StdDev,Speedup(XLA+Dynamo/Oldest Inductor),StdDev,Speedup(XLA_Eval+Dynamo/Oldest Inductor),StdDev
2023-11-11 04:43:56.070348,1.0,0.0,,,0.76855822,0.0
2 changes: 1 addition & 1 deletion test/benchmarks/a6000.training.latest.test
@@ -1,3 +1,3 @@
# WorkloadNumber,Speedup(Inductor/Oldest Inductor),StdDev,ModelName(Inductor),Speedup(PytorchXLA/Oldest Inductor),StdDev,ModelName(PytorchXLA)
# Workload,Speedup(Inductor/Oldest Inductor),StdDev,ModelName(Inductor),Speedup(XLA+Dynamo/Oldest Inductor),StdDev,ModelName(XLA+Dynamo)
0,1.0,0.0,BERT_pytorch,2.84589073,0.0,BERT_pytorch
1,1.0,0.0,Background_Matting,,,
2 changes: 1 addition & 1 deletion test/benchmarks/v100.inference.histogram.test
@@ -1,3 +1,3 @@
# Datetime(UTC),Inductor p95,Inductor p50,Inductor p5,PytorchXLA p95,PytorchXLA p50,PytorchXLA p5,PytorchXLA_Eval p95,PytorchXLA_Eval p50,PytorchXLA_Eval p5
# Datetime(UTC),Inductor p95,Inductor p50,Inductor p5,XLA+Dynamo p95,XLA+Dynamo p50,XLA+Dynamo p5,XLA_Eval+Dynamo p95,XLA_Eval+Dynamo p50,XLA_Eval+Dynamo p5
2023-11-11 05:32:18.723407,1.0,1.0,1.0,0.97631327,0.85586259,0.7354119,0.94359157,0.79447,0.64534844
2023-11-12 05:32:18,1.50833479,1.40761418,1.30689358,1.52901152,1.17088985,0.81276817,1.33687535,1.05136221,0.76584908
3 changes: 3 additions & 0 deletions test/benchmarks/v100.inference.latest.openxla_baseline.test
@@ -0,0 +1,3 @@
# ARGS: --backends openxla+dynamo inductor --baseline=latest --filter-by-tier=1
# Workload,Speedup(XLA+Dynamo/Latest XLA+Dynamo),StdDev,ModelName(XLA+Dynamo),Speedup(Inductor/Latest XLA+Dynamo),StdDev,ModelName(Inductor)
0,1.0,0.0,BERT_pytorch,0.96858952,0.0,BERT_pytorch
2 changes: 1 addition & 1 deletion test/benchmarks/v100.inference.latest.test
@@ -1,3 +1,3 @@
# WorkloadNumber,Speedup(Inductor/Oldest Inductor),StdDev,ModelName(Inductor),Speedup(PytorchXLA/Oldest Inductor),StdDev,ModelName(PytorchXLA),Speedup(PytorchXLA_Eval/Oldest Inductor),StdDev,ModelName(PytorchXLA_Eval)
# Workload,Speedup(Inductor/Oldest Inductor),StdDev,ModelName(Inductor),Speedup(XLA+Dynamo/Oldest Inductor),StdDev,ModelName(XLA+Dynamo),Speedup(XLA_Eval+Dynamo/Oldest Inductor),StdDev,ModelName(XLA_Eval+Dynamo)
0,1.2957024,0.0,Background_Matting,0.77297688,0.0,Background_Matting,0.7341254,0.0,Background_Matting
1,1.51952596,6.914e-05,BERT_pytorch,1.56880282,7.138e-05,BERT_pytorch,1.36859903,6.227e-05,BERT_pytorch
2 changes: 1 addition & 1 deletion test/benchmarks/v100.inference.latest.tier1.test
@@ -1,3 +1,3 @@
# ARGS: --filter-by-tier=1
# WorkloadNumber,Speedup(Inductor/Oldest Inductor),StdDev,ModelName(Inductor),Speedup(PytorchXLA/Oldest Inductor),StdDev,ModelName(PytorchXLA),Speedup(PytorchXLA_Eval/Oldest Inductor),StdDev,ModelName(PytorchXLA_Eval)
# Workload,Speedup(Inductor/Oldest Inductor),StdDev,ModelName(Inductor),Speedup(XLA+Dynamo/Oldest Inductor),StdDev,ModelName(XLA+Dynamo),Speedup(XLA_Eval+Dynamo/Oldest Inductor),StdDev,ModelName(XLA_Eval+Dynamo)
0,1.51952596,6.914e-05,BERT_pytorch,1.56880282,7.138e-05,BERT_pytorch,1.36859903,6.227e-05,BERT_pytorch
@@ -1,4 +1,4 @@
# ARGS: --baseline=latest
# Datetime(UTC),Speedup(Inductor/Latest Inductor),StdDev,Speedup(PytorchXLA/Latest Inductor),StdDev,Speedup(PytorchXLA_Eval/Latest Inductor),StdDev
# Datetime(UTC),Speedup(Inductor/Latest Inductor),StdDev,Speedup(XLA+Dynamo/Latest Inductor),StdDev,Speedup(XLA_Eval+Dynamo/Latest Inductor),StdDev
2023-11-11 05:32:18.723407,0.71267792,1.621e-05,0.60245072,0.0,0.55375084,0.0
2023-11-12 05:32:18,1.0,0.0,0.78480315,0.0,0.71435904,0.0
2 changes: 1 addition & 1 deletion test/benchmarks/v100.inference.speedup.test
@@ -1,3 +1,3 @@
# Datetime(UTC),Speedup(Inductor/Oldest Inductor),StdDev,Speedup(PytorchXLA/Oldest Inductor),StdDev,Speedup(PytorchXLA_Eval/Oldest Inductor),StdDev
# Datetime(UTC),Speedup(Inductor/Oldest Inductor),StdDev,Speedup(XLA+Dynamo/Oldest Inductor),StdDev,Speedup(XLA_Eval+Dynamo/Oldest Inductor),StdDev
2023-11-11 05:32:18.723407,1.0,3.217e-05,0.84533378,1.923e-05,0.77700013,1.768e-05
2023-11-12 05:32:18,1.40315838,3.192e-05,1.10120312,2.505e-05,1.00235887,2.28e-05