[inductor] refactor: device dispatch inside do_bench #125736

Closed · wants to merge 4 commits
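The changes below rename the GPU-only timing helper in torch/_inductor/runtime/runtime_utils.py from do_bench to do_bench_gpu and introduce a new do_bench entry point that takes the benchmarked function and its arguments separately, inspects the argument devices itself, and dispatches to do_bench_cpu or do_bench_gpu. Call sites that previously branched on is_cpu_device now make a single call. A rough caller-side sketch of the change (illustrative only; algo, args, and out stand in for whatever a call site actually benchmarks):

# Before: each call site picked the helper itself.
if is_cpu_device(args):
    ms = do_bench_cpu(lambda: algo(*args, out=out))
else:
    ms = do_bench(lambda: algo(*args, out=out))  # old do_bench was GPU-only

# After: the device check lives inside do_bench.
ms = do_bench(algo, args, {"out": out})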
4 changes: 2 additions & 2 deletions torch/_inductor/autotune_process.py
@@ -44,7 +44,7 @@
from torch._inductor.select_algorithm import TritonTemplateCaller

from . import config
-from .runtime.runtime_utils import do_bench, do_bench_cpu
+from .runtime.runtime_utils import do_bench_cpu, do_bench_gpu
from .virtualized import V

CUDA_VISIBLE_DEVICES = "CUDA_VISIBLE_DEVICES"
@@ -592,7 +592,7 @@ def do_bench(
device_idx = torch.cuda.current_device()

with torch.cuda.device(device_idx):
-out = do_bench(fn)
+out = do_bench_gpu(fn)
torch.cuda.synchronize() # shake out any CUDA errors

return out
4 changes: 2 additions & 2 deletions torch/_inductor/codegen/multi_kernel.py
@@ -6,7 +6,7 @@

from .. import config
from ..codecache import PyCodeCache, TritonFuture
-from ..runtime.runtime_utils import do_bench
+from ..runtime.runtime_utils import do_bench_gpu
from ..utils import cache_on_self
from ..virtualized import V
from .common import TensorArg
@@ -339,7 +339,7 @@ def benchmark_sub_kernels(kernel_calls):
be picked.
"""
return [
-do_bench(lambda: kernel_call(True), rep=40, fast_flush=True)
+do_bench_gpu(lambda: kernel_call(True), rep=40, fast_flush=True)
for kernel_call in kernel_calls
]

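For context, the timings returned by benchmark_sub_kernels feed the selection of which sub-kernel the multi-kernel wrapper dispatches to; a minimal sketch of that selection (illustrative only, the real MultiKernelCall bookkeeping is more involved):

timings = benchmark_sub_kernels(kernel_calls)
best = timings.index(min(timings))  # index of the fastest sub-kernel
kernel_calls[best](True)            # dispatch to the winner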
8 changes: 4 additions & 4 deletions torch/_inductor/codegen/triton.py
@@ -49,7 +49,7 @@
from ..optimize_indexing import indexing_dtype_strength_reduction
from ..runtime.hints import ReductionHint, TRITON_MAX_BLOCK
from ..runtime.runtime_utils import (
-do_bench,
+do_bench_gpu,
get_max_y_grid,
green_text,
next_power_of_2,
@@ -2653,7 +2653,7 @@ def codegen_kernel_benchmark(self, num_gb, grid=None):

result.writeline("args = get_args()")
result.writeline(
"ms = do_bench(lambda: call(args), rep=40, fast_flush=True)"
"ms = do_bench_gpu(lambda: call(args), rep=40, fast_flush=True)"
)
result.writeline(f"num_gb = {num_gb}")
result.writeline("gb_per_s = num_gb / (ms / 1e3)")
@@ -4036,13 +4036,13 @@ def store_cache():
else:
# We have to clone the inplace updated arguments to avoid earlier calls
# generating out of range indices for later calls.
-ms = do_bench(lambda: call(wrapped_jit_function.clone_args(*args)[0]))
+ms = do_bench_gpu(lambda: call(wrapped_jit_function.clone_args(*args)[0]))

# overhead of cloning args gives bias for fusing the kernel
# in the case of mutating/in-placeable second fusion
# TODO - would be better as a hook in triton do_bench that reset
# the input values between benchmarking
-ms = ms - do_bench(lambda: wrapped_jit_function.clone_args(*args))
+ms = ms - do_bench_gpu(lambda: wrapped_jit_function.clone_args(*args))

log.debug(
"The fused kernel for %s took %.3f ms to run",
2 changes: 1 addition & 1 deletion torch/_inductor/fx_passes/pad_mm.py
@@ -251,7 +251,7 @@ def should_pad_bench(
return False

do_bench = functools.partial(
-torch._inductor.runtime.runtime_utils.do_bench,
+torch._inductor.runtime.runtime_utils.do_bench_gpu,
warmup=5,
)

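After the rename, the functools.partial above binds the GPU helper directly and forwards warmup=5 on every call. A hedged usage sketch (mat1 and mat2 are hypothetical CUDA tensors; the timing is returned in milliseconds):

import functools
import torch
from torch._inductor.runtime.runtime_utils import do_bench_gpu

bench = functools.partial(do_bench_gpu, warmup=5)
mat1 = torch.randn(1024, 1024, device="cuda")
mat2 = torch.randn(1024, 1024, device="cuda")
ms = bench(lambda: torch.mm(mat1, mat2))  # time the unpadded matmul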
8 changes: 2 additions & 6 deletions torch/_inductor/ir.py
@@ -70,7 +70,7 @@
)
from .ops_handler import OpCounterCSE
from .runtime.hints import ReductionHint
-from .runtime.runtime_utils import do_bench, do_bench_cpu
+from .runtime.runtime_utils import do_bench
from .utils import (
argsort,
cache_on_self,
@@ -79,7 +79,6 @@
convert_shape_to_symint,
developer_warning,
get_kernel_metadata,
-is_cpu_device,
is_dynamic,
is_gpu,
pad_listlike,
@@ -3628,10 +3627,7 @@ def __init__(self, name, input_nodes, layout):

def benchmark(self, *args, out) -> float:
algo = self.to_callable()
-if is_cpu_device(args):
-    return do_bench_cpu(lambda: algo(*args, out=out))
-else:
-    return do_bench(lambda: algo(*args, out=out))
+return do_bench(algo, args, {"out": out})

def call_name(self) -> str:
raise NotImplementedError
13 changes: 12 additions & 1 deletion torch/_inductor/runtime/runtime_utils.py
@@ -70,7 +70,18 @@ def get_max_y_grid():
return 65535


-def do_bench(*args, **kwargs):
+def do_bench(fn, fn_args, fn_kwargs, **kwargs):
+    from torch._inductor.utils import is_cpu_device
+
+    args = list(fn_args)
+    args.extend(fn_kwargs.values())
+    if is_cpu_device(args):
+        return do_bench_cpu(lambda: fn(*fn_args, **fn_kwargs), **kwargs)
+    else:
+        return do_bench_gpu(lambda: fn(*fn_args, **fn_kwargs), **kwargs)
+
+
+def do_bench_gpu(*args, **kwargs):
@functools.lru_cache(None)
def load_triton():
try:
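With this patch, device-agnostic call sites pass the function and its arguments separately so do_bench can inspect the devices itself; a minimal usage sketch (assuming is_cpu_device returns True only when no argument lives on a GPU):

import torch
from torch._inductor.runtime.runtime_utils import do_bench

a = torch.randn(256, 256)  # CPU tensors -> routed to do_bench_cpu
b = torch.randn(256, 256)
cpu_ms = do_bench(torch.mm, (a, b), {})

if torch.cuda.is_available():  # CUDA tensors -> routed to do_bench_gpu
    cuda_ms = do_bench(torch.mm, (a.cuda(), b.cuda()), {})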
4 changes: 2 additions & 2 deletions torch/_inductor/runtime/triton_heuristics.py
@@ -32,7 +32,7 @@
ceildiv,
conditional_product,
create_bandwidth_info_str,
-do_bench,
+do_bench_gpu,
dynamo_timed,
get_first_attr,
get_max_y_grid,
@@ -628,7 +628,7 @@ def kernel_call():
stream=stream,
)

-return do_bench(kernel_call, rep=40, fast_flush=True)
+return do_bench_gpu(kernel_call, rep=40, fast_flush=True)

def clone_args(self, *args, **kwargs) -> Tuple[List[Any], Dict[str, Any]]:
from ..compile_fx import clone_preserve_strides
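The value returned by this benchmark path feeds config selection: the autotuner times each candidate launch config and keeps the fastest. Conceptually (a simplified sketch, not the actual autotuner implementation; configs and bench_config are stand-ins):

timings = {cfg: bench_config(cfg) for cfg in configs}  # milliseconds per candidate
best_cfg = min(timings, key=timings.get)               # smallest runtime wins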
8 changes: 2 additions & 6 deletions torch/_inductor/select_algorithm.py
@@ -38,10 +38,9 @@
from .exc import CUDACompileError
from .ir import ChoiceCaller, PrimitiveInfoType
from .runtime.hints import DeviceProperties
-from .runtime.runtime_utils import do_bench, do_bench_cpu
+from .runtime.runtime_utils import do_bench
from .utils import (
get_dtype_size,
-is_cpu_device,
Placeholder,
restore_stdout_stderr,
sympy_dot,
@@ -845,10 +844,7 @@ def benchmark(self, *args, out):
out_new, tuple(out.size()), tuple(out.stride())
)
out.copy_(out_new) # for correctness checking
-if is_cpu_device(args):
-    return do_bench_cpu(lambda: algo(*args))
-else:
-    return do_bench(lambda: algo(*args))
+return do_bench(algo, args, {})

def to_callable(self):
fn = self.choice.to_callable()
8 changes: 6 additions & 2 deletions torch/_inductor/wrapper_benchmark.py
@@ -4,7 +4,11 @@

import torch
from torch.autograd import DeviceType
-from .runtime.runtime_utils import create_bandwidth_info_str, do_bench, get_num_bytes
+from .runtime.runtime_utils import (
+    create_bandwidth_info_str,
+    do_bench_gpu,
+    get_num_bytes,
+)

_kernel_category_choices = [
"foreach",
@@ -116,7 +120,7 @@ def get_info_str(ms, n_regs, n_spills, shared, prefix=""):
f" {get_info_str(ms, launcher.n_regs, launcher.n_spills, launcher.shared)} @ {launcher.config}"
)
else:
-ms = do_bench(lambda: kernel_mod.call(args), rep=40, fast_flush=True)
+ms = do_bench_gpu(lambda: kernel_mod.call(args), rep=40, fast_flush=True)
assert (
len(triton_kernel.launchers) == 1
), "Autotuner should have selected the best config"