diff --git a/benchmarks/README.md b/benchmarks/README.md
index 35aace07781e..9f9db8cad505 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -5,6 +5,19 @@ The two main benchmarking scripts are
 
 - `result_analyzer.py` to aggregate the benchmark result in CSV form.
 
+## Patching mismatched batch sizes
+
+Sometimes inference batch sizes differ between Inductor and XLA. This stems
+from the fact that we pass an XLA device string to the TorchBench modeling
+code instead of a raw CUDA string, so the code path that detects the
+underlying accelerator is never exercised. To fix this, apply the patch:
+
+```
+git apply benchmarks/patches/mismatched_batch_size.patch
+```
+
+Then replace `current_device_name` with your actual accelerator name.
+
 ## Reducing benchmark noise
 
 It is important to keep the benchmark runs safe from external effects
diff --git a/benchmarks/patches/mismatched_batch_size.patch b/benchmarks/patches/mismatched_batch_size.patch
new file mode 100644
index 000000000000..8185150dfcdc
--- /dev/null
+++ b/benchmarks/patches/mismatched_batch_size.patch
@@ -0,0 +1,13 @@
+diff --git a/torchbenchmark/util/model.py b/torchbenchmark/util/model.py
+index 8593ba4c..57fef507 100644
+--- a/torchbenchmark/util/model.py
++++ b/torchbenchmark/util/model.py
+@@ -182,6 +182,8 @@ class BenchmarkModel(metaclass=PostInitProcessor):
+
+         # use the device suggestion on CUDA inference tests, key should be either eval_batch_size or train_batch_size
+         device_batch_size_key = f"{self.test}_batch_size"
++        # A patch to make sure batch sizes are comparable. It's needed because the XLA device string is unrecognized.
++        current_device_name = 'NVIDIA A100-SXM4-40GB'
+         if self.metadata and "devices" in self.metadata and current_device_name in self.metadata["devices"] \
+             and device_batch_size_key in self.metadata["devices"][current_device_name]:
+             batch_size = self.metadata["devices"][current_device_name][device_batch_size_key]
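
For reference, the device keys under `metadata["devices"]` in TorchBench appear to follow the CUDA device name format (e.g. `NVIDIA A100-SXM4-40GB`). A minimal sketch for looking up the value to substitute for `current_device_name`, assuming the benchmark machine has a CUDA-capable GPU visible to PyTorch:

```python
# Print the accelerator name in the format TorchBench stores under
# metadata["devices"], e.g. 'NVIDIA A100-SXM4-40GB'.
# Assumes this runs on the benchmark machine with a visible CUDA GPU.
import torch

if torch.cuda.is_available():
    print(torch.cuda.get_device_name())
else:
    print("No CUDA device visible to PyTorch.")
```

The printed string is what should replace the hard-coded `'NVIDIA A100-SXM4-40GB'` in the patch above.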