[inductor][cpp] GEMM template (infra and fp32) #124021

Closed. jgong5 wants to merge 63 commits.

Commits (changes from all commits):
00eb31a  Update (jgong5, Apr 14, 2024)
b6ff5fe  Update (jgong5, Apr 16, 2024)
0355c46  Update (jgong5, Apr 16, 2024)
ba94cdf  Update (jgong5, Apr 17, 2024)
5ad7899  Update (jgong5, Apr 17, 2024)
1c4edcd  Update (jgong5, Apr 17, 2024)
a56957d  Update (jgong5, Apr 17, 2024)
5bf33c4  Update (jgong5, Apr 17, 2024)
f780f9c  Update (jgong5, Apr 18, 2024)
0580a46  Update (jgong5, Apr 18, 2024)
d795f31  Update (jgong5, Apr 26, 2024)
002bedb  Update (jgong5, Apr 27, 2024)
2bfc603  Update (jgong5, Apr 28, 2024)
a416d41  Update (jgong5, Apr 28, 2024)
8d3f8aa  Update (jgong5, Apr 28, 2024)
701a0cd  Update (jgong5, Apr 28, 2024)
85ce15a  Update (jgong5, Apr 28, 2024)
5f0133e  Update (jgong5, Apr 28, 2024)
b1f731b  Update (jgong5, Apr 28, 2024)
c0d77bc  Update (jgong5, Apr 28, 2024)
ab8e6a9  Update (jgong5, Apr 29, 2024)
c2c5d2d  Update (jgong5, Apr 29, 2024)
b079a2c  Update (jgong5, Apr 29, 2024)
fac3997  Update (jgong5, Apr 29, 2024)
b0e451c  Update (jgong5, Apr 29, 2024)
ff91a01  Update (jgong5, Apr 29, 2024)
59086de  Update (jgong5, Apr 29, 2024)
bfce7d8  Update (jgong5, Apr 29, 2024)
7e6490a  Update (jgong5, Apr 29, 2024)
7a4dc85  Update (jgong5, Apr 29, 2024)
0cec870  Update (jgong5, Apr 30, 2024)
fc8a9c8  Update (jgong5, Apr 30, 2024)
b337242  Update (jgong5, Apr 30, 2024)
1c5a149  Update (jgong5, Apr 30, 2024)
7ae7be0  Update (jgong5, May 6, 2024)
614a739  Update (jgong5, May 6, 2024)
6b682e2  Update (jgong5, May 6, 2024)
66f5e31  Update (jgong5, May 7, 2024)
d56bebf  Update on "[inductor][cpp] GEMM template (infra and fp32)" (jgong5, May 7, 2024)
acb4a95  Update on "[inductor][cpp] GEMM template (infra and fp32)" (jgong5, May 7, 2024)
70a6d7d  Update on "[inductor][cpp] GEMM template (infra and fp32)" (jgong5, May 7, 2024)
0162cf6  Update on "[inductor][cpp] GEMM template (infra and fp32)" (jgong5, May 8, 2024)
92f4ac4  Update on "[inductor][cpp] GEMM template (infra and fp32)" (jgong5, May 8, 2024)
8cfdb7d  Update on "[inductor][cpp] GEMM template (infra and fp32)" (jgong5, May 8, 2024)
55d98b0  Update on "[inductor][cpp] GEMM template (infra and fp32)" (jgong5, May 9, 2024)
ca09328  Update on "[inductor][cpp] GEMM template (infra and fp32)" (jgong5, May 9, 2024)
e96352e  Update on "[inductor][cpp] GEMM template (infra and fp32)" (jgong5, May 9, 2024)
f427d85  Update on "[inductor][cpp] GEMM template (infra and fp32)" (jgong5, May 12, 2024)
f0e2203  Update (jgong5, May 15, 2024)
0cacd09  Update (jgong5, May 15, 2024)
67877a6  Update on "[inductor][cpp] GEMM template (infra and fp32)" (jgong5, May 19, 2024)
6687ccf  Update on "[inductor][cpp] GEMM template (infra and fp32)" (jgong5, May 19, 2024)
70b35d3  Update on "[inductor][cpp] GEMM template (infra and fp32)" (jgong5, May 19, 2024)
3bcbae9  Update on "[inductor][cpp] GEMM template (infra and fp32)" (jgong5, May 19, 2024)
3a8012d  Update on "[inductor][cpp] GEMM template (infra and fp32)" (jgong5, May 19, 2024)
fbb0064  Update on "[inductor][cpp] GEMM template (infra and fp32)" (jgong5, May 19, 2024)
2993c2e  Update on "[inductor][cpp] GEMM template (infra and fp32)" (jgong5, May 20, 2024)
ac36018  Update on "[inductor][cpp] GEMM template (infra and fp32)" (jgong5, May 21, 2024)
ad5e500  Update on "[inductor][cpp] GEMM template (infra and fp32)" (jgong5, May 23, 2024)
5f07582  Update on "[inductor][cpp] GEMM template (infra and fp32)" (jgong5, May 24, 2024)
c872b7c  Update (jgong5, May 28, 2024)
bdb239e  Update (jgong5, May 29, 2024)
6ad24d3  Update (jgong5, May 29, 2024)
135 changes: 135 additions & 0 deletions test/inductor/test_cpu_select_algorithm.py
@@ -0,0 +1,135 @@
# Owner(s): ["oncall: cpu inductor"]
import functools
import unittest
from unittest.mock import patch

import torch
import torch._dynamo.config
import torch._dynamo.config as dynamo_config
import torch._inductor.config as inductor_config
import torch._inductor.select_algorithm as select_algorithm
from torch._dynamo.utils import counters
from torch._inductor.test_case import run_tests, TestCase
from torch.testing._internal.common_device_type import (
    dtypes,
    instantiate_device_type_tests,
)

from torch.testing._internal.common_utils import IS_MACOS, parametrize, TEST_MKL

aten = torch.ops.aten


def patches(fn):
    def skip_cache(self, choices, name, key, benchmark):
        if benchmark is None:
            return {}
        return benchmark(choices)

    for patcher in [
        dynamo_config.patch(verbose=True),
        inductor_config.patch(
            debug=True,
            max_autotune=True,
            epilogue_fusion=True,
            max_autotune_gemm_backends="CPP,ATEN",
        ),
        patch.object(select_algorithm, "VERIFY", dict(atol=1e-4, rtol=1e-4)),
        patch.object(select_algorithm.AlgorithmSelectorCache, "lookup", skip_cache),
    ]:
        fn = patcher(fn)

    @functools.wraps(fn)
    def wrapped(*args, **kwargs):
        counters.clear()
        torch.manual_seed(12345)
        return fn(*args, **kwargs)

    return wrapped
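
# The `patches` decorator above pins the configuration under test: max-autotune
# with both the "CPP" and "ATEN" GEMM backends, epilogue fusion enabled, a 1e-4
# verification tolerance, and a bypassed autotune cache (`skip_cache`) so every
# run re-benchmarks all candidate kernels.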


class TestSelectAlgorithm(TestCase):
@inductor_config.patch({"freezing": True})
@patches
@torch.no_grad
@unittest.skipIf(not TEST_MKL, "Test requires MKL")
@parametrize("batch_size", (1, 2, 1000))
@parametrize("in_features", (1, 2, 1000))
@parametrize("out_features", (1, 32, 1024))
@parametrize("bias", (True, False))
@parametrize("input_3d", (True, False))
@dtypes(torch.float)
def test_linear_static_shapes(
self, batch_size, in_features, out_features, bias, input_3d, dtype
):
class M(torch.nn.Module):
def __init__(self, bias):
super().__init__()
self.linear = torch.nn.Linear(in_features, out_features, bias)

@torch.compile
def forward(self, x):
return self.linear(x)

counters.clear()
mod = M(bias=bias).to(dtype=dtype).eval()
B = (2, batch_size) if input_3d else (batch_size,)
v = torch.randn(*B, in_features).to(dtype=dtype)
mod(v)
if (
counters["inductor"]["decompose_mm"] > 0
or counters["inductor"]["decompose_addmm"] > 0
):
# This is a special case where we go directly with vectorized codegen
self.assertEqual(counters["inductor"]["select_algorithm_autotune"], 0)
else:
self.assertEqual(counters["inductor"]["select_algorithm_autotune"], 1)

@inductor_config.patch({"freezing": True})
@patches
@torch.no_grad
@unittest.skipIf(not TEST_MKL, "Test requires MKL")
@parametrize("bias", (True, False))
@dtypes(torch.float)
def test_linear_input_transpose(self, bias, dtype):
batch_size = 384
in_features = 196
out_features = 384

class M(torch.nn.Module):
def __init__(self, bias):
super().__init__()
self.linear = torch.nn.Linear(in_features, out_features, bias)

@torch.compile
def forward(self, x):
return self.linear(x)

counters.clear()
mod = M(bias=bias).to(dtype=dtype).eval()
v = torch.randn(in_features, batch_size).to(dtype=dtype)
mod(v.transpose(0, 1))
# TODO(jgong5): support transposed input
self.assertEqual(counters["inductor"]["select_algorithm_autotune"], 0)


@dynamo_config.patch({"dynamic_shapes": True, "assume_static_by_default": False})
class _DynamicShapesTestBase(TestCase):
    pass


class TestSelectAlgorithmDynamicShapes(_DynamicShapesTestBase):
    test_linear_dynamic_shapes = TestSelectAlgorithm.test_linear_static_shapes


instantiate_device_type_tests(TestSelectAlgorithm, globals(), only_for="cpu")
instantiate_device_type_tests(
    TestSelectAlgorithmDynamicShapes, globals(), only_for="cpu"
)


if __name__ == "__main__":
    from torch.testing._internal.inductor_utils import HAS_CPU

    if HAS_CPU and not IS_MACOS:
        run_tests()
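
For context, a minimal sketch of the scenario these tests exercise, assuming a PyTorch source build with inductor's CPP GEMM backend available (the Linear sizes are taken from the parametrization above, not mandated by the backend):

import torch
import torch._inductor.config as inductor_config

# Mirror the `patches` decorator: autotune between the ATen kernel and the
# new CPP GEMM template, with freezing and epilogue fusion enabled.
with inductor_config.patch(
    freezing=True,
    max_autotune=True,
    epilogue_fusion=True,
    max_autotune_gemm_backends="CPP,ATEN",
):
    mod = torch.nn.Linear(1000, 1024).eval()
    x = torch.randn(2, 1000)
    with torch.no_grad():
        out = torch.compile(mod)(x)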
51 changes: 48 additions & 3 deletions torch/_inductor/codegen/cpp.py
@@ -8,7 +8,7 @@
 import sys
 from copy import copy, deepcopy
 from enum import Enum
-from typing import Any, Dict, List, Optional, Set, Tuple, Union
+from typing import Any, cast, Dict, List, Optional, Sequence, Set, Tuple, Union

 import sympy

@@ -20,6 +20,7 @@
 from torch.utils._sympy.functions import CeilDiv, FloorDiv, ModularIndexing
 from torch.utils._sympy.symbol import free_symbol_is_type, symbol_is_type, SymT
 from torch.utils._sympy.value_ranges import bound_sympy, ValueRanges
+from ..._dynamo.utils import counters

 from .. import codecache, config, ir, metrics
 from ..codegen.wrapper import WrapperCodeGen
@@ -3584,6 +3585,8 @@ def _can_fuse_horizontal_impl(self, node1, node2):
         return self._why_fuse_nodes(node1, node2) is not None

     def can_fuse_horizontal(self, node1, node2):
+        if node1.is_template() or node2.is_template():
+            return False
         if (
             len(node1.get_nodes()) + len(node2.get_nodes())
             > config.cpp.max_horizontal_fusion_size
@@ -3664,6 +3667,9 @@ def get_fusion_pair_priority(self, node1, node2):
         return 0

     def can_fuse_vertical(self, node1, node2):
+        # TODO(jgong5): support vertical fusion for template nodes
+        if node1.is_template() or node2.is_template():
+            return False
         return (
             self._can_fuse_horizontal_impl(node1, node2) and not node1.is_reduction()
         ) or self.can_fuse_vertical_outer_loop(node1, node2)
@@ -3720,6 +3726,44 @@ def codegen_node(
         if args_num > CppScheduling.MAX_FUSED_KERNEL_ARGS_NUM:
             self._set_flush_status(True)

+    def is_cpp_template(self, node: BaseSchedulerNode) -> bool:
+        return isinstance(node, SchedulerNode) and isinstance(
+            node.node, ir.CppTemplateBuffer
+        )
+
+    def codegen_template(
+        self,
+        template_node: BaseSchedulerNode,
+        epilogue_nodes: Sequence[BaseSchedulerNode],
+    ):
+        """
+        Codegen a CPP template, possibly with fused epilogues
+        """
+        counters["inductor"]["cpp_epilogue_fusion_counter"] += len(epilogue_nodes)
+        assert self.is_cpp_template(
+            template_node
+        ), "Template node passed to CppScheduler.codegen_template must be a SchedulerNode that wraps a CppTemplateBuffer"
+        template_node = cast(SchedulerNode, template_node)
+        _, (_, rnumel) = template_node.group
+        assert rnumel == ()
+        ctb: ir.CppTemplateBuffer = cast(ir.CppTemplateBuffer, template_node.node)
+        epilogue_ir_nodes: List[Optional[ir.Buffer]] = [n.node for n in epilogue_nodes]
+        assert all(
+            isinstance(n, ir.ComputedBuffer) for n in epilogue_ir_nodes
+        ), "Epilogue nodes must all be instances of ir.ComputedBuffer"
+        kernel, render = ctb.make_kernel_render(ctb, epilogue_nodes=epilogue_ir_nodes)
+        with kernel:
+            for node in [template_node, *epilogue_nodes]:
+                node.mark_run()  # type: ignore[attr-defined]
+            src_code = render()
+
+        with V.set_kernel_handler(kernel):
+            node_schedule = [template_node, *epilogue_nodes]
+            kernel_name = self.define_kernel(src_code, node_schedule, kernel.args)
+        kernel.call_kernel(kernel_name, ctb)
+        V.graph.removed_buffers |= kernel.removed_buffers
+        self.scheduler.free_buffers()
+
     def _get_scheduled_num_args(self):
         return self.kernel_group.get_num_args()

@@ -3729,7 +3773,7 @@ def ready_to_flush(self):
     def codegen_sync(self):
         pass

-    def define_kernel(self, src_code, nodes):
+    def define_kernel(self, src_code, nodes, kernel_args=None):
         wrapper = V.graph.wrapper_code
         fused_name = (
             get_fused_kernel_name(nodes, config.cpp.descriptive_names)
@@ -3745,7 +3789,8 @@ def define_kernel(self, src_code, nodes):
         src_code = src_code.replace("#pragma CMT", "//")

         compile_wrapper = IndentedBuffer()
-        _, _, arg_types = self.kernel_group.args.cpp_argdefs()
+        args = self.kernel_group.args if kernel_args is None else kernel_args
+        _, _, arg_types = args.cpp_argdefs()
         if not V.graph.cpp_wrapper:
             compile_wrapper.writeline(f"async_compile.cpp_pybinding({arg_types!r}, '''")
         compile_wrapper.splice(src_code, strip=True)
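
Taken together, the scheduling changes route a CppTemplateBuffer through its own codegen path: template nodes are excluded from regular horizontal and vertical fusion, and codegen_template renders the template together with any fused epilogues before registering the result via define_kernel, which now accepts the template kernel's own kernel_args in place of the shared kernel group's. A rough sketch of the control flow, where flush_nodes and fused_node are hypothetical stand-ins for the scheduler's actual driver loop:

# Hypothetical driver illustrating how the new hooks compose; the real
# dispatch lives in torch/_inductor/scheduler.py and differs in detail.
def flush_nodes(scheduling, fused_node):
    nodes = fused_node.get_nodes()
    if scheduling.is_cpp_template(nodes[0]):
        # A template node plus whatever pointwise epilogues were fused after it.
        scheduling.codegen_template(nodes[0], nodes[1:])
    else:
        scheduling.codegen_node(fused_node)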