up

yf225 · yf225 · commit 2f765f1675e7 · 2025-10-29T18:26:11.000-07:00
diff --git a/README.md b/README.md
@@ -301,6 +301,11 @@ To view the generated Triton code, set the environment variable `HELION_PRINT_OU
 helpful for debugging and understanding Helion's compilation process.  One can also use
 `foo_kernel.bind(args).to_triton_code(config)` to get the Triton code as a string.
 
+To emit a repro script that includes the Helion kernel definition, the config decorator, and a
+`helion_repro_caller()` helper that recreates the runtime inputs before invoking the Helion kernel, set
+`HELION_PRINT_REPRO=1` or include `print_repro=True` in the `@helion.kernel` decorator. This prints
+the repro script to `stderr`, which is helpful for debugging and for sharing a minimal repro on the GitHub issue tracker.
+
 Within an `hl.tile`/`hl.grid` device loop, if you want to print intermediate results using `print("x", ...)` syntax,
 or pause execution using Python's built-in `breakpoint()`, set either `TRITON_INTERPRET=1` (runs Triton's CPU interpreter)
 or `HELION_INTERPRET=1` (runs the Helion kernel in eager mode).
diff --git a/docs/api/config.md b/docs/api/config.md
@@ -27,7 +27,7 @@ The `Config` class represents kernel optimization parameters that control how He
 |--------|--------|----------|
 | **Purpose** | Control execution performance | Control compilation behavior |
 | **Autotuning** | ✅ Automatically optimized | ❌ Never autotuned |
-| **Examples** | `block_sizes`, `num_warps`, `indexing` | `print_output_code`, `autotune_effort` |
+| **Examples** | `block_sizes`, `num_warps`, `indexing` | `print_output_code`, `print_repro`, `autotune_effort` |
 | **When to use** | Performance optimization | Development, debugging, environment setup |
 
 
diff --git a/docs/api/kernel.md b/docs/api/kernel.md
@@ -161,6 +161,7 @@ Settings control **how the kernel is compiled** and the development environment:
     autotune_effort="none",      # Skip autotuning for development
     autotune_effort="quick",     # Smaller autotuning budget when search is enabled
     print_output_code=True,       # Debug: show generated Triton code
+    print_repro=True,             # Debug: show Helion kernel code, config, and caller code as a standalone repro script
     static_shapes=True,           # Compilation optimization strategy
     autotune_log_level=logging.DEBUG  # Verbose autotuning output
 )
diff --git a/docs/api/settings.md b/docs/api/settings.md
@@ -61,7 +61,8 @@ import helion.language as hl
 
 @helion.kernel(
     autotune_effort="none",           # Skip autotuning
-    print_output_code=True,            # Debug output
+    print_output_code=True,            # Debug: show generated Triton code
+    print_repro=True,                  # Debug: show Helion kernel code, config, and caller code as a standalone repro script
 )
 def my_kernel(x: torch.Tensor) -> torch.Tensor:
     result = torch.zeros_like(x)
@@ -190,6 +191,10 @@ See :class:`helion.autotuner.LocalAutotuneCache` for details on cache keys and b
 
    Print generated Triton code to stderr. Default is ``False``. Controlled by ``HELION_PRINT_OUTPUT_CODE=1``.
 
+.. autoattribute:: Settings.print_repro
+
+   Print Helion kernel code, config, and caller code to stderr as a standalone repro script. Default is ``False``. Controlled by ``HELION_PRINT_REPRO=1``.
+
 .. autoattribute:: Settings.output_origin_lines
 
    Annotate generated Triton code with ``# src[<file>:<line>]`` comments indicating the originating Helion statements.
@@ -259,6 +264,7 @@ Built-in values for ``HELION_AUTOTUNER`` include ``"PatternSearch"``, ``"Differe
 | ``HELION_SKIP_CACHE`` | ``LocalAutotuneCache`` | When set to ``1``, ignore cached autotuning entries and rerun searches. |
 | ``HELION_ASSERT_CACHE_HIT`` | ``AutotuneCacheBase`` | When set to ``1``, require a cache hit; raises ``CacheAssertionError`` on cache miss with detailed diagnostics. |
 | ``HELION_PRINT_OUTPUT_CODE`` | ``print_output_code`` | Print generated Triton code to stderr for inspection. |
+| ``HELION_PRINT_REPRO`` | ``print_repro`` | Print Helion kernel code, config, and caller code to stderr as a standalone repro script. |
 | ``HELION_OUTPUT_ORIGIN_LINES`` | ``output_origin_lines`` | Include ``# src[...]`` comments in generated Triton code; set to ``0`` to disable. |
 | ``HELION_IGNORE_WARNINGS`` | ``ignore_warnings`` | Comma-separated warning names defined in ``helion.exc`` to suppress. |
 | ``HELION_ALLOW_WARP_SPECIALIZE`` | ``allow_warp_specialize`` | Permit warp-specialized code generation for ``tl.range``. |
diff --git a/docs/index.md b/docs/index.md
@@ -241,6 +241,11 @@ To view the generated Triton code, set the environment variable `HELION_PRINT_OU
 helpful for debugging and understanding Helion's compilation process.  One can also use
 `foo_kernel.bind(args).to_triton_code(config)` to get the Triton code as a string.
 
+To emit a repro script that includes the Helion kernel definition, the config decorator, and a
+`helion_repro_caller()` helper that recreates the runtime inputs before invoking the Helion kernel, set
+`HELION_PRINT_REPRO=1` or include `print_repro=True` in the `@helion.kernel` decorator. This prints
+the repro script to `stderr`, which is helpful for debugging and for sharing a minimal repro on the GitHub issue tracker.
+
 To force autotuning, bypassing provided configurations, set `HELION_FORCE_AUTOTUNE=1` or invoke `foo_kernel.autotune(args,
 force=True)`.
 
diff --git a/helion/runtime/kernel.py b/helion/runtime/kernel.py
@@ -9,6 +9,7 @@
 import operator
 import re
 import sys
+import textwrap
 import types
 from typing import TYPE_CHECKING
 from typing import Callable
@@ -641,8 +642,88 @@ def __call__(self, *args: object) -> _R:
             self.format_kernel_decorator(self._config, self.settings)
         ] = 1
 
+        if self.settings.print_repro:
+            self._print_repro(args)
+
         return self._run(*args)
 
+    def _print_repro(
+        self, args: tuple[object, ...], config: Config | None = None
+    ) -> None:
+        effective_config = config or self._config
+        assert effective_config is not None
+
+        # Get kernel source
+        try:
+            raw_source = inspect.getsource(self.kernel.fn)
+            source_lines = textwrap.dedent(raw_source).splitlines()
+            # Skip decorator lines
+            start_idx = 0
+            while start_idx < len(source_lines) and source_lines[
+                start_idx
+            ].lstrip().startswith("@"):
+                start_idx += 1
+            kernel_body = "\n".join(source_lines[start_idx:])
+        except (OSError, TypeError):
+            kernel_body = f"# Source unavailable for {self.kernel.fn.__module__}.{self.kernel.fn.__qualname__}"
+
+        # Format decorator
+        decorator = self.format_kernel_decorator(effective_config, self.settings)
+
+        # Build output
+        output_lines = [
+            "# === HELION KERNEL REPRO ===",
+            "import helion",
+            "import helion.language as hl",
+            "import torch",
+            "from torch._dynamo.testing import rand_strided",
+            "",
+            decorator,
+            kernel_body,
+        ]
+
+        # Generate caller function
+        if args:
+
+            def _render_input_arg_assignment(name: str, value: object) -> list[str]:
+                if isinstance(value, torch.Tensor):
+                    shape = tuple(int(d) for d in value.shape)
+                    stride = tuple(int(s) for s in value.stride())
+                    device = str(value.device)
+                    dtype = str(value.dtype)
+
+                    lines = [
+                        f"{name} = rand_strided({shape!r}, {stride!r}, dtype={dtype}, device={device!r})"
+                    ]
+
+                    if value.requires_grad:
+                        lines.append(f"{name}.requires_grad_(True)")
+                    return lines
+
+                return [f"{name} = {value!r}"]
+
+            sig_param_names = list(self.kernel.signature.parameters.keys())
+            assert len(args) == len(sig_param_names)
+
+            output_lines.extend(["", "def helion_repro_caller():"])
+            output_lines.append("    torch.manual_seed(0)")
+            arg_names = []
+
+            for i, value in enumerate(args):
+                var_name = sig_param_names[i]
+                arg_names.append(var_name)
+
+                # Add assignment lines with indentation
+                for line in _render_input_arg_assignment(var_name, value):
+                    output_lines.append(f"    {line}")
+
+            # Add return statement
+            call_args = ", ".join(arg_names)
+            output_lines.append(f"    return {self.kernel.name}({call_args})")
+
+        output_lines.append("# === END HELION KERNEL REPRO ===")
+        print("\n".join(output_lines), file=sys.stderr)
+
 
 class _KernelDecorator(Protocol):
     def __call__(
diff --git a/helion/runtime/settings.py b/helion/runtime/settings.py
@@ -314,6 +314,9 @@ class _Settings:
             _env_get_bool, "HELION_PRINT_OUTPUT_CODE", False
         )
     )
+    print_repro: bool = dataclasses.field(
+        default_factory=functools.partial(_env_get_bool, "HELION_PRINT_REPRO", False)
+    )
     output_origin_lines: bool = dataclasses.field(
         default_factory=functools.partial(
             _env_get_bool, "HELION_OUTPUT_ORIGIN_LINES", True
@@ -384,6 +387,7 @@ class Settings(_Settings):
             "Set HELION_AUTOTUNE_IGNORE_ERRORS=1 to enable globally."
         ),
         "print_output_code": "If True, print the output code of the kernel to stderr.",
+        "print_repro": "If True, print Helion kernel code, config, and caller code to stderr as a standalone repro script.",
         "output_origin_lines": (
             "If True, annotate generated Triton code with source-origin comments. "
             "Set HELION_OUTPUT_ORIGIN_LINES=0 to disable."
diff --git a/test/test_debug_utils.expected b/test/test_debug_utils.expected
@@ -8,7 +8,7 @@ import torch
 from torch._dynamo.testing import rand_strided
 
 @helion.kernel(config=helion.Config(block_sizes=[2, 2], flatten_loops=[False], indexing=['pointer', 'pointer'], l2_groupings=[1], load_eviction_policies=[''], loop_orders=[[0, 1]], num_stages=1, num_warps=4, pid_type='flat', range_flattens=[None], range_multi_buffers=[None], range_num_stages=[0], range_unroll_factors=[0], range_warp_specializes=[None]), static_shapes=True)
-def repro_kernel(x: torch.Tensor) -> torch.Tensor:
+def kernel1(x: torch.Tensor) -> torch.Tensor:
     out = torch.empty_like(x)
     m, n = x.shape
     for tile_m, tile_n in hl.tile([m, n]):
@@ -18,4 +18,4 @@ def repro_kernel(x: torch.Tensor) -> torch.Tensor:
 def helion_repro_caller():
     torch.manual_seed(0)
     x = rand_strided((2, 2), (2, 1), dtype=torch.float32, device=DEVICE)
-    return repro_kernel(x)
+    return kernel1(x)
diff --git a/test/test_debug_utils.py b/test/test_debug_utils.py
@@ -32,7 +32,7 @@ def test_print_repro_env_var(self):
         try:
 
             @helion.kernel(autotune_effort="none")
-            def repro_kernel(x: torch.Tensor) -> torch.Tensor:
+            def kernel1(x: torch.Tensor) -> torch.Tensor:
                 out = torch.empty_like(x)
                 m, n = x.shape
                 for tile_m, tile_n in hl.tile([m, n]):
@@ -45,7 +45,7 @@ def repro_kernel(x: torch.Tensor) -> torch.Tensor:
             if hasattr(self, "_capfd"):
                 self._capfd.readouterr()
 
-            result = repro_kernel(x)
+            result = kernel1(x)
             torch.testing.assert_close(result, x + 1)
 
             if not hasattr(self, "_capfd"):

Original file line number	Diff line number	Diff line change
`@@ -161,6 +161,7 @@ Settings control how the kernel is compiled and the development environment:`
`161`	`161`	`autotune_effort="none", # Skip autotuning for development`
`162`	`162`	`autotune_effort="quick", # Smaller autotuning budget when search is enabled`
`163`	`163`	`print_output_code=True, # Debug: show generated Triton code`
	`164`	`+ print_repro=True, # Debug: show Helion kernel code, config, and caller code as a standalone repro script`
`164`	`165`	`static_shapes=True, # Compilation optimization strategy`
`165`	`166`	`autotune_log_level=logging.DEBUG # Verbose autotuning output`
`166`	`167`	`)`