
Commit c91ace4

NikhilAPatel authored and etaf committed
[Inductor][Grouped Gemm] Add Blackwell CuTeDSL Kernel (#165036)
Make sure you're on cutlass 4.2.0+.

Test Plan:

Tritonbench (OSS):
`clear; CUDA_VISIBLE_DEVICES=2 TRITON_PRINT_AUTOTUNING=1 TRITON_ALWAYS_COMPILE=1 TORCH_LOGS=+inductor TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 TORCHINDUCTOR_MAX_AUTOTUNE_GEMM=1 python run.py --op grouped_gemm --only aten_grouped_mm,preprocessed_pt2_triton_grouped_mm --precision bf16 --num-inputs 1 --metrics tflops,accuracy`

Unit Tests (OSS):
`clear; python test/inductor/test_cutedsl_grouped_mm.py`

Differential Revision: D82010227
Pull Request resolved: #165036
Approved by: https://github.com/alexsamardzic, https://github.com/drisspg, https://github.com/mlazos
1 parent eeab794 commit c91ace4
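
For orientation, a minimal sketch of what this kernel path looks like from user code. It mirrors the unit tests added in this commit (test/inductor/test_cutedsl_grouped_mm.py below): the backend/config names and the packed-A, batched-B, cumulative-offsets calling convention are taken from that test file; actually selecting the kernel assumes a Blackwell GPU with the CuTeDSL library (cutlass 4.2.0+) available.

import torch
from torch._inductor import config

G, K, N = 8, 64, 128
dtype = torch.bfloat16
A = torch.randn(G * 128, K, device="cuda", dtype=dtype)   # rows of all groups packed along M
B = torch.randn(G, K, N, device="cuda", dtype=dtype)      # one weight matrix per group
# Cumulative row offsets: strictly increasing, no leading zero (as in the tests).
offs = torch.arange(128, G * 128 + 1, 128, device="cuda", dtype=torch.int32)

def grouped_mm(a, b, offs):
    return torch._grouped_mm(a, b, offs=offs)

with config.patch(
    {
        "max_autotune": True,
        "max_autotune_gemm_backends": "CUTEDSL",
        "test_configs.autotune_choice_name_regex": "cutedsl",
    }
):
    out = torch.compile(grouped_mm, dynamic=False)(A, B, offs)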

File tree

10 files changed: +807 -33 lines changed

.ci/pytorch/test.sh

Lines changed: 1 addition & 1 deletion
@@ -337,7 +337,7 @@ test_python() {
 
 test_python_smoke() {
   # Smoke tests for H100/B200
-  time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
+  time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 inductor/test_max_autotune inductor/test_cutedsl_grouped_mm $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
   assert_git_not_dirty
 }

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -127,6 +127,7 @@ torch/test/
 torch/utils/benchmark/utils/valgrind_wrapper/callgrind.h
 torch/utils/benchmark/utils/valgrind_wrapper/valgrind.h
 torch/version.py
+torch/_inductor/kernel/vendored_templates/*
 minifier_launcher.py
 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_fwd_d*
 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd_d*

setup.py

Lines changed: 34 additions & 0 deletions
@@ -630,6 +630,37 @@ def mirror_files_into_torchgen() -> None:
         raise RuntimeError("Check the file paths in `mirror_files_into_torchgen()`")
 
 
+def mirror_inductor_external_kernels() -> None:
+    """
+    Copy external kernels into Inductor so they are importable.
+    """
+    paths = [
+        (
+            CWD / "torch/_inductor/kernel/vendored_templates/cutedsl_grouped_gemm.py",
+            CWD
+            / "third_party/cutlass/examples/python/CuTeDSL/blackwell/grouped_gemm.py",
+        ),
+    ]
+    for new_path, orig_path in paths:
+        # Create the dirs involved in new_path if they don't exist
+        if not new_path.exists():
+            new_path.parent.mkdir(parents=True, exist_ok=True)
+
+        # Copy the files from the orig location to the new location
+        if orig_path.is_file():
+            shutil.copyfile(orig_path, new_path)
+            continue
+        if orig_path.is_dir():
+            if new_path.exists():
+                # copytree fails if the tree exists already, so remove it.
+                shutil.rmtree(new_path)
+            shutil.copytree(orig_path, new_path)
+            continue
+        raise RuntimeError(
+            "Check the file paths in `mirror_inductor_external_kernels()`"
+        )
+
+
 # ATTENTION: THIS IS AI SLOP
 def extract_variant_from_version(version: str) -> str:
     """Extract variant from version string, defaulting to 'cpu'."""
@@ -1616,6 +1647,8 @@ def main() -> None:
     if RUN_BUILD_DEPS:
         build_deps()
 
+    mirror_inductor_external_kernels()
+
     (
         ext_modules,
         cmdclass,
@@ -1649,6 +1682,7 @@ def main() -> None:
         "_inductor/codegen/aoti_runtime/*.cpp",
         "_inductor/script.ld",
         "_inductor/kernel/flex/templates/*.jinja",
+        "_inductor/kernel/templates/*.jinja",
         "_export/serde/*.yaml",
         "_export/serde/*.thrift",
         "share/cmake/ATen/*.cmake",
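
The vendored copy created by mirror_inductor_external_kernels() is generated at build time and, per the .gitignore change above, never checked in. A quick sanity check from a source checkout, as a sketch (the path is simply the destination declared above):

from pathlib import Path

vendored = Path("torch/_inductor/kernel/vendored_templates/cutedsl_grouped_gemm.py")
print("vendored CuTeDSL grouped GEMM template present:", vendored.is_file())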
test/inductor/test_cutedsl_grouped_mm.py

Lines changed: 154 additions & 0 deletions (new file)
# Owner(s): ["module: inductor"]


import unittest

import torch
from torch import Tensor
from torch._inductor import config
from torch._inductor.codegen.cuda.cuda_env import is_datacenter_blackwell_arch
from torch._inductor.test_case import run_tests, TestCase as InductorTestCase
from torch._inductor.utils import ensure_cute_available
from torch.testing._internal.common_utils import (
    instantiate_parametrized_tests,
    parametrize,
)


@unittest.skipIf(
    not (ensure_cute_available() and is_datacenter_blackwell_arch()),
    "CuTeDSL library or Blackwell device not available",
)
@instantiate_parametrized_tests
class TestCuTeDSLGroupedGemm(InductorTestCase):
    def _get_inputs(
        self,
        group_size: int,
        M_hint: int,
        K: int,
        N: int,
        device: str,
        dtype: torch.dtype,
        alignment: int = 16,
    ) -> tuple[Tensor, Tensor, Tensor]:
        # --- Random, tile-aligned M sizes ---
        M_sizes = (
            torch.randint(1, (M_hint // alignment) + 1, (group_size,), dtype=torch.int)
            * alignment
        )

        M_total = torch.sum(M_sizes).item()

        # --- Construct input tensors ---
        A = torch.randn(int(M_total), K, dtype=dtype, device=device) * 0.1
        B = torch.randn((group_size, K, N), dtype=dtype, device=device) * 0.01

        # --- Build offsets (no leading zero, strictly increasing) ---
        offsets = torch.cumsum(M_sizes, dim=0).to(dtype=torch.int32, device=device)

        return (A, B, offsets)

    @parametrize("group_size", (2, 8))
    @parametrize("M_hint", (256, 1024))
    @parametrize("K", (64, 128))
    @parametrize("N", (128, 256))
    def test_grouped_gemm_basic(self, group_size: int, M_hint: int, K: int, N: int):
        device = "cuda"
        dtype = torch.bfloat16

        A, B, offsets = self._get_inputs(group_size, M_hint, K, N, device, dtype)

        def grouped_gemm_fn(A_packed, B_batched, offs):
            return torch._grouped_mm(A_packed, B_batched, offs=offs)

        # Eager execution
        c_eager = grouped_gemm_fn(A, B, offsets)

        # Test with Cute backend
        with config.patch(
            {
                "max_autotune": True,
                "max_autotune_gemm_backends": "CUTEDSL",
                "test_configs.autotune_choice_name_regex": "cutedsl",
                "autotune_fallback_to_aten": False,
            }
        ):
            grouped_gemm_compiled = torch.compile(
                grouped_gemm_fn, backend="inductor", dynamic=False
            )
            c_compiled = grouped_gemm_compiled(A, B, offsets)

        self.assertEqual(c_eager.dtype, dtype)
        self.assertEqual(c_compiled.dtype, dtype)
        torch.testing.assert_close(c_eager, c_compiled)

    @parametrize("layout_A", ("contiguous", "offset", "padded", "view"))
    @parametrize("layout_B", ("contiguous", "broadcasted"))
    def test_grouped_gemm_assorted_layouts(
        self,
        layout_A: str,
        layout_B: str,
    ):
        device = "cuda"
        dtype = torch.bfloat16

        G, K, N = 8, 64, 128
        M_sizes = [128] * G
        sum_M = sum(M_sizes)
        offsets = torch.tensor(
            [sum(M_sizes[: i + 1]) for i in range(G)], dtype=torch.int32, device=device
        )

        A_base = torch.randn(sum_M, K, device=device, dtype=dtype)
        A = A_base

        if layout_A == "offset":
            # allocate bigger buffer than needed, use nonzero storage offset
            storage = torch.randn(sum_M * K + 512, device=device, dtype=dtype)
            offset = 128  # skip first 128 elements
            A = torch.as_strided(storage[offset:], (sum_M, K), (K, 1))
        elif layout_A == "padded":
            # simulate row pitch > K (row_stride = K + pad)
            row_pitch = K + 8
            storage = torch.randn(sum_M * row_pitch, device=device, dtype=dtype)
            A = torch.as_strided(storage, (sum_M, K), (row_pitch, 1))
        elif layout_A == "view":
            A_storage = torch.randn(sum_M * K, device=device, dtype=dtype)
            A = A_storage.view(sum_M, K)
            assert A._base is not None
            assert A.shape == (sum_M, K)

        B = torch.randn((G, K, N), dtype=dtype, device=device) * 0.01

        if layout_B == "broadcasted":
            # Broadcast B across groups (zero stride along G)
            B = B[0].expand(G, K, N)
            assert B.stride(0) == 0

        def grouped_gemm_fn(A_packed, B_batched, offs):
            return torch._grouped_mm(A_packed, B_batched, offs=offs)

        # --- eager ---
        c_eager = grouped_gemm_fn(A, B, offsets)

        # --- compiled (CUTE backend) ---
        with config.patch(
            {
                "max_autotune": True,
                "max_autotune_gemm_backends": "CUTEDSL",
                "test_configs.autotune_choice_name_regex": "cutedsl",
                "autotune_fallback_to_aten": False,
            }
        ):
            grouped_gemm_compiled = torch.compile(
                grouped_gemm_fn, backend="inductor", dynamic=False
            )
            c_compiled = grouped_gemm_compiled(A, B, offsets)

        self.assertEqual(c_eager.dtype, dtype)
        self.assertEqual(c_compiled.dtype, dtype)
        torch.testing.assert_close(c_eager, c_compiled)


if __name__ == "__main__":
    run_tests()

torch/_inductor/config.py

Lines changed: 4 additions & 0 deletions
@@ -546,6 +546,10 @@ def prologue_fusion_enabled() -> bool:
     "TORCHINDUCTOR_MAX_AUTOTUNE_FLEX_SEARCH_SPACE", "DEFAULT"
 ).upper()  # type: ignore[assignment]
 
+cutedsl_enable_autotuning: bool = (
+    os.environ.get("CUTEDSL_ENABLE_AUTOTUNING", "0") == "1"
+)
+
 # DEPRECATED. This setting is ignored.
 autotune_fallback_to_aten = False
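
The new cutedsl_enable_autotuning flag is evaluated once, at import time of torch._inductor.config, so the environment variable has to be set before that module is first imported; afterwards the attribute itself can be toggled. A small sketch (which code paths consult the flag is not visible in this hunk):

import os

# Must happen before torch._inductor.config is first imported.
os.environ["CUTEDSL_ENABLE_AUTOTUNING"] = "1"

import torch._inductor.config as inductor_config
assert inductor_config.cutedsl_enable_autotuning

# Or toggle it at runtime for a scoped region via the config patcher.
with inductor_config.patch({"cutedsl_enable_autotuning": True}):
    pass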

torch/_inductor/kernel/mm_common.py

Lines changed: 7 additions & 0 deletions
@@ -1,6 +1,8 @@
 # mypy: allow-untyped-defs
 import logging
 from collections.abc import Sequence
+from functools import partial
+from pathlib import Path
 from typing import Any
 
 import torch
@@ -12,6 +14,7 @@
 from .. import config
 from ..codegen.wrapper import PythonWrapperCodegen
 from ..ir import _IntLike, Layout, TensorBox
+from ..utils import load_template
 
 
 log = logging.getLogger(__name__)
@@ -254,3 +257,7 @@ def is_batch_stride_largest_or_zero(mat1, mat2, layout) -> bool:
         return False
 
     return True
+
+
+_KERNEL_TEMPLATE_DIR = Path(__file__).parent / "templates"
+load_kernel_template = partial(load_template, template_dir=_KERNEL_TEMPLATE_DIR)
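
load_kernel_template resolves Jinja sources from the new torch/_inductor/kernel/templates/ directory that setup.py now ships as package data. load_template itself lives in torch/_inductor/utils and is not part of this diff, so the following is only a sketch of an equivalent helper with the signature implied by the partial(...) above; the template name at the end is hypothetical.

from functools import partial
from pathlib import Path

def load_template(name: str, template_dir: Path) -> str:
    """Assumed behavior: return the Jinja template source stored under template_dir."""
    return (template_dir / f"{name}.jinja").read_text()

_KERNEL_TEMPLATE_DIR = Path("torch/_inductor/kernel") / "templates"
load_kernel_template = partial(load_template, template_dir=_KERNEL_TEMPLATE_DIR)

# Hypothetical usage (template name not confirmed by this diff):
# grouped_gemm_source = load_kernel_template("cutedsl_mm_grouped")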
