diff --git a/benchmarks/README.md b/benchmarks/README.md
index 35aace07781e..9f9db8cad505 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -5,6 +5,19 @@ The two main benchmarking scripts are
 
 - `result_analyzer.py` to aggregate the benchmark result in CSV form.
 
+## Patching mismatched batch sizes
+
+Sometimes inference batch sizes differ between Inductor and XLA. This stems
+from the fact that we pass an XLA device string to the TorchBench modeling
+code instead of a raw CUDA string, so the code path that detects the
+underlying accelerator is never exercised. To fix this, apply the patch:
+
+```
+git apply benchmarks/patches/mismatched_batch_size.patch
+```
+
+Then replace `current_device_name` with your actual accelerator name.
+
 ## Reducing benchmark noise
 
 It is important to keep the benchmark runs safe from external effects
diff --git a/benchmarks/patches/mismatched_batch_size.patch b/benchmarks/patches/mismatched_batch_size.patch
new file mode 100644
index 000000000000..8185150dfcdc
--- /dev/null
+++ b/benchmarks/patches/mismatched_batch_size.patch
@@ -0,0 +1,13 @@
+diff --git a/torchbenchmark/util/model.py b/torchbenchmark/util/model.py
+index 8593ba4c..57fef507 100644
+--- a/torchbenchmark/util/model.py
++++ b/torchbenchmark/util/model.py
+@@ -182,6 +182,8 @@ class BenchmarkModel(metaclass=PostInitProcessor):
+
+         # use the device suggestion on CUDA inference tests, key should be either eval_batch_size or train_batch_size
+         device_batch_size_key = f"{self.test}_batch_size"
++        # A patch to make sure batch sizes are comparable. It's needed because the XLA device string is unrecognized.
++        current_device_name = 'NVIDIA A100-SXM4-40GB'
+         if self.metadata and "devices" in self.metadata and current_device_name in self.metadata["devices"] \
+             and device_batch_size_key in self.metadata["devices"][current_device_name]:
+             batch_size = self.metadata["devices"][current_device_name][device_batch_size_key]
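
For reference, the device keys under `metadata["devices"]` in TorchBench appear to follow the CUDA device name format (e.g. `NVIDIA A100-SXM4-40GB`). A minimal sketch for looking up the value to substitute for `current_device_name`, assuming the benchmark machine has a CUDA-capable GPU visible to PyTorch:

```python
# Print the accelerator name in the format TorchBench stores under
# metadata["devices"], e.g. 'NVIDIA A100-SXM4-40GB'.
# Assumes this runs on the benchmark machine with a visible CUDA GPU.
import torch

if torch.cuda.is_available():
    print(torch.cuda.get_device_name())
else:
    print("No CUDA device visible to PyTorch.")
```

The printed string is what should replace the hard-coded `'NVIDIA A100-SXM4-40GB'` in the patch above.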