From 8719e1895d406d57133bcbd53f90dfaa86bbec5c Mon Sep 17 00:00:00 2001
From: Will Feng <yfeng.us@gmail.com>
Date: Mon, 27 Oct 2025 15:42:08 -0700
Subject: [PATCH 1/2] Default config: reduce block_size further to avoid shared
 mem OOM

---
 helion/autotuner/config_spec.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/helion/autotuner/config_spec.py b/helion/autotuner/config_spec.py
index 5cf2e3c16..bf7cf1270 100644
--- a/helion/autotuner/config_spec.py
+++ b/helion/autotuner/config_spec.py
@@ -399,9 +399,7 @@ def _fragment(self, base: ConfigSpec) -> BlockSizeFragment:
         reduction_numel = _product(
             [next_power_of_2(spec.size_hint) for spec in base.reduction_loops]
         )
-        if total_ndim <= 1 and reduction_numel <= 1:
-            default = 256
-        elif total_ndim <= 2 and reduction_numel <= 128:
+        if total_ndim <= 2 and reduction_numel <= 128:
             default = 32
         elif reduction_numel <= 256:
             default = 16

From f68ac9f6cd4dda837ef5f092d31dafb80279660e Mon Sep 17 00:00:00 2001
From: Will Feng <yfeng.us@gmail.com>
Date: Mon, 27 Oct 2025 16:01:58 -0700
Subject: [PATCH 2/2] update expected files

---
 test/test_closures.expected               | 10 +++++-----
 test/test_examples.expected               |  4 ++--
 test/test_graph_module.expected           |  4 ++--
 test/test_indexing.expected               |  4 ++--
 test/test_inline_asm_elementwise.expected |  6 +++---
 test/test_inline_triton.expected          |  4 ++--
 test/test_misc.expected                   |  2 +-
 test/test_random.expected                 |  2 +-
 test/test_register_tunable.expected       |  2 +-
 test/test_unroll_tuples.expected          |  6 +++---
 10 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/test/test_closures.expected b/test/test_closures.expected
index cbeb28467..0aa3bac15 100644
--- a/test/test_closures.expected
+++ b/test/test_closures.expected
@@ -76,7 +76,7 @@ def sin_func_arg(a, fn, *, _launcher=_default_launcher):
     # src[test_closures.py:N]: out = torch.empty_like(a)
     out = torch.empty_like(a)
     # src[test_closures.py:N]: for tile in hl.tile(a.size()):
-    _BLOCK_SIZE_0 = 256
+    _BLOCK_SIZE_0 = 32
     # src[test_closures.py:N]: for tile in hl.tile(a.size()):
     # src[test_closures.py:N]:     out[tile] = fn(torch.sin(a[tile]), tile)
     _launcher(_helion_sin_func_arg, (triton.cdiv(a.size(0), _BLOCK_SIZE_0),), a, fn.__closure__[0].cell_contents, out, a.size(0), a.stride(0), fn.__closure__[0].cell_contents.stride(0), out.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=1)
@@ -112,7 +112,7 @@ def sin_func_arg(a, fn, *, _launcher=_default_launcher):
     # src[test_closures.py:N]: out = torch.empty_like(a)
     out = torch.empty_like(a)
     # src[test_closures.py:N]: for tile in hl.tile(a.size()):
-    _BLOCK_SIZE_0 = 256
+    _BLOCK_SIZE_0 = 32
     # src[test_closures.py:N]: for tile in hl.tile(a.size()):
     # src[test_closures.py:N]:     out[tile] = fn(torch.sin(a[tile]), tile)
     _launcher(_helion_sin_func_arg, (triton.cdiv(a.size(0), _BLOCK_SIZE_0),), a, _source_module.global_tensor, out, a.size(0), _source_module.global_tensor.stride(0), a.stride(0), out.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=1)
@@ -147,7 +147,7 @@ def sin_func_arg(a, fn, *, _launcher=_default_launcher):
     # src[test_closures.py:N]: out = torch.empty_like(a)
     out = torch.empty_like(a)
     # src[test_closures.py:N]: for tile in hl.tile(a.size()):
-    _BLOCK_SIZE_0 = 256
+    _BLOCK_SIZE_0 = 32
     # src[test_closures.py:N]: for tile in hl.tile(a.size()):
     # src[test_closures.py:N]:     out[tile] = fn(torch.sin(a[tile]), tile)
     _launcher(_helion_sin_func_arg, (triton.cdiv(a.size(0), _BLOCK_SIZE_0),), a, out, a.size(0), a.stride(0), out.stride(0), _global_source0.global_float, _BLOCK_SIZE_0, num_warps=4, num_stages=1)
@@ -183,7 +183,7 @@ def sin_func_arg(a, fn, *, _launcher=_default_launcher):
     # src[test_closures.py:N]: out = torch.empty_like(a)
     out = torch.empty_like(a)
     # src[test_closures.py:N]: for tile in hl.tile(a.size()):
-    _BLOCK_SIZE_0 = 256
+    _BLOCK_SIZE_0 = 32
     # src[test_closures.py:N]: for tile in hl.tile(a.size()):
     # src[test_closures.py:N]:     out[tile] = fn(torch.sin(a[tile]), tile)
     _launcher(_helion_sin_func_arg, (triton.cdiv(a.size(0), _BLOCK_SIZE_0),), a, fn.__closure__[0].cell_contents.__closure__[0].cell_contents, out, a.size(0), a.stride(0), fn.__closure__[0].cell_contents.__closure__[0].cell_contents.stride(0), out.stride(0), fn.__closure__[1].cell_contents, _BLOCK_SIZE_0, num_warps=4, num_stages=1)
@@ -215,7 +215,7 @@ def call_func_arg_on_host(a, alloc, *, _launcher=_default_launcher):
     # src[test_closures.py:N]: out = alloc(a)
     out = alloc(a)
     # src[test_closures.py:N]: for tile in hl.tile(a.size()):
-    _BLOCK_SIZE_0 = 256
+    _BLOCK_SIZE_0 = 32
     # src[test_closures.py:N]: for tile in hl.tile(a.size()):
     # src[test_closures.py:N]:     out[tile] = a[tile].sin()
     _launcher(_helion_call_func_arg_on_host, (triton.cdiv(512, _BLOCK_SIZE_0),), a, out, _BLOCK_SIZE_0, num_warps=4, num_stages=1)
diff --git a/test/test_examples.expected b/test/test_examples.expected
index 30f3965f3..bd43104da 100644
--- a/test/test_examples.expected
+++ b/test/test_examples.expected
@@ -4136,7 +4136,7 @@ def low_mem_dropout(p: float, x: torch.Tensor, seed: int, *, _launcher=_default_
     # src[low_mem_dropout.py:N]: out_flat = torch.empty_like(x_flat)
     out_flat = torch.empty_like(x_flat)
     # src[low_mem_dropout.py:N]: for tidx in hl.tile(n):
-    _BLOCK_SIZE_0 = 256
+    _BLOCK_SIZE_0 = 32
     # src[low_mem_dropout.py:N]: for tidx in hl.tile(n):
     # src[low_mem_dropout.py:N]:     xi = x_flat[tidx].to(torch.float32)
     # src[low_mem_dropout.py:N]:     r = hl.rand([tidx], seed=seed)
@@ -6077,7 +6077,7 @@ def swiglu_bwd(gout: Tensor, x1: Tensor, x2: Tensor, *, _launcher=_default_launc
     # src[swiglu.py:N]: dx2_flat = dx2.view(-1)
     dx2_flat = dx2.view(-1)
     # src[swiglu.py:N]: for tile in hl.tile(x1.numel()):
-    _BLOCK_SIZE_0 = 256
+    _BLOCK_SIZE_0 = 32
     # src[swiglu.py:N]: for tile in hl.tile(x1.numel()):
     # src[swiglu.py:N]:     x1_vals = x1_flat[tile].to(torch.float32)
     # src[swiglu.py:N]:     gout_vals = gout_flat[tile].to(torch.float32)
diff --git a/test/test_graph_module.expected b/test/test_graph_module.expected
index f207ffea1..32bc1afd4 100644
--- a/test/test_graph_module.expected
+++ b/test/test_graph_module.expected
@@ -29,7 +29,7 @@ def apply_graph_module(func_m, x, *, _launcher=_default_launcher):
     # src[test_graph_module.py:N]: out = torch.empty_like(x)
     out = torch.empty_like(x)
     # src[test_graph_module.py:N]: for tile in hl.tile(out.size()):
-    _BLOCK_SIZE_0 = 256
+    _BLOCK_SIZE_0 = 32
     # src[test_graph_module.py:N]: for tile in hl.tile(out.size()):
     # src[test_graph_module.py:N]:     out[tile] = func_m(x[tile])
     _launcher(_helion_apply_graph_module, (triton.cdiv(1000, _BLOCK_SIZE_0),), x, out, _BLOCK_SIZE_0, num_warps=4, num_stages=1)
@@ -68,7 +68,7 @@ def apply_graph_module(func_m, x, *, _launcher=_default_launcher):
     # src[test_graph_module.py:N]: out = torch.empty_like(x)
     out = torch.empty_like(x)
     # src[test_graph_module.py:N]: for tile in hl.tile(out.size()):
-    _BLOCK_SIZE_0 = 256
+    _BLOCK_SIZE_0 = 32
     # src[test_graph_module.py:N]: for tile in hl.tile(out.size()):
     # src[test_graph_module.py:N]:     out[tile] = func_m(x[tile])
     _launcher(_helion_apply_graph_module, (triton.cdiv(512, _BLOCK_SIZE_0),), x, out, _BLOCK_SIZE_0, num_warps=4, num_stages=1)
diff --git a/test/test_indexing.expected b/test/test_indexing.expected
index 202c85971..0a82023d9 100644
--- a/test/test_indexing.expected
+++ b/test/test_indexing.expected
@@ -56,7 +56,7 @@ def arange_block_size_mul(x: torch.Tensor, *, _launcher=_default_launcher):
     # src[test_indexing.py:N]: out = torch.zeros([x.size(0) * 2], dtype=torch.int32, device=x.device)
     out = torch.zeros([x.size(0) * 2], dtype=torch.int32, device=x.device)
     # src[test_indexing.py:N]: for tile in hl.tile(x.size(0)):
-    _BLOCK_SIZE_0 = 64
+    _BLOCK_SIZE_0 = 32
     # src[test_indexing.py:N]: for tile in hl.tile(x.size(0)):
     # src[test_indexing.py:N]:     indices = hl.arange(
     # src[test_indexing.py:N]:         tile.begin * 2, tile.begin * 2 + tile.block_size * 2
@@ -816,7 +816,7 @@ def arange_block_size_mul(x: torch.Tensor, *, _launcher=_default_launcher):
     # src[test_indexing.py:N]: ones = torch.ones_like(out)
     ones = torch.ones_like(out)
     # src[test_indexing.py:N]: for tile in hl.tile(x.size(0)):
-    _BLOCK_SIZE_0 = 64
+    _BLOCK_SIZE_0 = 32
     # src[test_indexing.py:N]: for tile in hl.tile(x.size(0)):
     # src[test_indexing.py:N]:     indices_start = tile.begin * 2
     # src[test_indexing.py:N]:     indices_end = indices_start + tile.block_size * 2
diff --git a/test/test_inline_asm_elementwise.expected b/test/test_inline_asm_elementwise.expected
index e2ccca1ab..8942e70b1 100644
--- a/test/test_inline_asm_elementwise.expected
+++ b/test/test_inline_asm_elementwise.expected
@@ -108,7 +108,7 @@ def kernel_multiple_outputs(a: torch.Tensor, b: torch.Tensor, *, _launcher=_defa
     # src[test_inline_asm_elementwise.py:N]: result_d = torch.empty_like(a)
     result_d = torch.empty_like(a)
     # src[test_inline_asm_elementwise.py:N]: for tile in hl.tile(a.shape):
-    _BLOCK_SIZE_0 = 64
+    _BLOCK_SIZE_0 = 32
     # src[test_inline_asm_elementwise.py:N]: for tile in hl.tile(a.shape):
     # src[test_inline_asm_elementwise.py:N]:     val_a = a[tile]
     # src[test_inline_asm_elementwise.py:N]:     val_b = b[tile]
@@ -146,7 +146,7 @@ def kernel_packed_asm(x: torch.Tensor, *, _launcher=_default_launcher):
     # src[test_inline_asm_elementwise.py:N]: result = torch.empty_like(x)
     result = torch.empty_like(x)
     # src[test_inline_asm_elementwise.py:N]: for tile in hl.tile(x.shape):
-    _BLOCK_SIZE_0 = 256
+    _BLOCK_SIZE_0 = 32
     # src[test_inline_asm_elementwise.py:N]: for tile in hl.tile(x.shape):
     # src[test_inline_asm_elementwise.py:N]:     val = x[tile]
     # src[test_inline_asm_elementwise.py:N]:     # Shift 4x8bit values together, pack=4
@@ -187,7 +187,7 @@ def kernel_shift_asm(x: torch.Tensor, y: torch.Tensor, n: int, *, _launcher=_def
     # src[test_inline_asm_elementwise.py:N]: result = torch.empty_like(x)
     result = torch.empty_like(x)
     # src[test_inline_asm_elementwise.py:N]: for tile in hl.tile(x.shape):
-    _BLOCK_SIZE_0 = 128
+    _BLOCK_SIZE_0 = 32
     # src[test_inline_asm_elementwise.py:N]: for tile in hl.tile(x.shape):
     # src[test_inline_asm_elementwise.py:N]:     val_x = x[tile]
     # src[test_inline_asm_elementwise.py:N]:     val_y = y[tile]
diff --git a/test/test_inline_triton.expected b/test/test_inline_triton.expected
index 450eae4a0..d4c9b7c70 100644
--- a/test/test_inline_triton.expected
+++ b/test/test_inline_triton.expected
@@ -88,7 +88,7 @@ def kernel(a: torch.Tensor, b: torch.Tensor, *, _launcher=_default_launcher):
     # src[test_inline_triton.py:N]: diff_out = torch.empty_like(a)
     diff_out = torch.empty_like(a)
     # src[test_inline_triton.py:N]: for tile in hl.tile(a.shape):
-    _BLOCK_SIZE_0 = 64
+    _BLOCK_SIZE_0 = 32
     # src[test_inline_triton.py:N]: for tile in hl.tile(a.shape):
     # src[test_inline_triton.py:N]:     a_val = a[tile]
     # src[test_inline_triton.py:N]:     b_val = b[tile]
@@ -131,7 +131,7 @@ def kernel(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launcher):
     # src[test_inline_triton.py:N]: out = torch.empty_like(x)
     out = torch.empty_like(x)
     # src[test_inline_triton.py:N]: for tile in hl.tile(x.shape):
-    _BLOCK_SIZE_0 = 128
+    _BLOCK_SIZE_0 = 32
     # src[test_inline_triton.py:N]: for tile in hl.tile(x.shape):
     # src[test_inline_triton.py:N]:     x_val = x[tile]
     # src[test_inline_triton.py:N]:     y_val = y[tile]
diff --git a/test/test_misc.expected b/test/test_misc.expected
index 281159b24..1fc8e8a86 100644
--- a/test/test_misc.expected
+++ b/test/test_misc.expected
@@ -113,7 +113,7 @@ def kernel_with_scalar_item(x: torch.Tensor, scalar_tensor: torch.Tensor, *, _la
     # src[test_misc.py:N]: scalar_val = scalar_tensor.item()
     scalar_val = scalar_tensor.item()
     # src[test_misc.py:N]: for tile in hl.tile(x.shape):
-    _BLOCK_SIZE_0 = 128
+    _BLOCK_SIZE_0 = 32
     # src[test_misc.py:N]: for tile in hl.tile(x.shape):
     # src[test_misc.py:N]:     result[tile] = x[tile] + scalar_val
     _launcher(_helion_kernel_with_scalar_item, (triton.cdiv(100, _BLOCK_SIZE_0),), x, result, scalar_val, _BLOCK_SIZE_0, num_warps=4, num_stages=1)
diff --git a/test/test_random.expected b/test/test_random.expected
index 67f949c76..e2535c665 100644
--- a/test/test_random.expected
+++ b/test/test_random.expected
@@ -26,7 +26,7 @@ def rand_kernel_tiled_1d(x: torch.Tensor, seed: int, *, _launcher=_default_launc
     # src[test_random.py:N]: (m,) = x.shape
     m, = x.shape
     # src[test_random.py:N]: for tile_m in hl.tile(m):
-    _BLOCK_SIZE_0 = 128
+    _BLOCK_SIZE_0 = 32
     # src[test_random.py:N]: for tile_m in hl.tile(m):
     # src[test_random.py:N]:     output[tile_m] = hl.rand([tile_m], seed=seed)
     _launcher(_helion_rand_kernel_tiled_1d, (triton.cdiv(m, _BLOCK_SIZE_0),), output, output.stride(0), m, seed, _BLOCK_SIZE_0, num_warps=4, num_stages=1)
diff --git a/test/test_register_tunable.expected b/test/test_register_tunable.expected
index fe76170b2..a71a84290 100644
--- a/test/test_register_tunable.expected
+++ b/test/test_register_tunable.expected
@@ -2,7 +2,7 @@ This file is automatically generated by assertExpectedJournal calls in test_regi
 Update expected outputs by running tests with the EXPECTTEST_ACCEPT=1 environment variable set.
 
 --- assertExpectedJournal(TestRegisterTunable.test_integer_fragment)
-helion.Config(block_sizes=[128], indexing=['pointer', 'pointer'], load_eviction_policies=[''], multiplier=3, num_stages=1, num_warps=4, pid_type='flat', range_flattens=[None], range_multi_buffers=[None], range_num_stages=[0], range_unroll_factors=[0], range_warp_specializes=[])
+helion.Config(block_sizes=[32], indexing=['pointer', 'pointer'], load_eviction_policies=[''], multiplier=3, num_stages=1, num_warps=4, pid_type='flat', range_flattens=[None], range_multi_buffers=[None], range_num_stages=[0], range_unroll_factors=[0], range_warp_specializes=[])
 
 --- assertExpectedJournal(TestRegisterTunable.test_integer_fragment)
 from __future__ import annotations
diff --git a/test/test_unroll_tuples.expected b/test/test_unroll_tuples.expected
index 29761b6db..77bdef257 100644
--- a/test/test_unroll_tuples.expected
+++ b/test/test_unroll_tuples.expected
@@ -656,7 +656,7 @@ def kernel_nested_tuple_iteration(a_tuple: tuple[torch.Tensor, torch.Tensor], b_
     # src[test_unroll_tuples.py:N]: result = torch.zeros_like(a_tuple[0])
     result = torch.zeros_like(a_tuple[0])
     # src[test_unroll_tuples.py:N]: for tile_idx in hl.tile(result.size(0)):
-    _BLOCK_SIZE_0 = 64
+    _BLOCK_SIZE_0 = 32
     # src[test_unroll_tuples.py:N]: for tile_idx in hl.tile(result.size(0)):
     # src[test_unroll_tuples.py:N]:     temp = torch.zeros([tile_idx], dtype=torch.float32, device=result.device)
     # src[test_unroll_tuples.py:N-N]: ...
@@ -881,7 +881,7 @@ def kernel_tuple_with_scaling(tensor1: torch.Tensor, tensor2: torch.Tensor, tens
     # src[test_unroll_tuples.py:N]: output = torch.zeros_like(tensor1)
     output = torch.zeros_like(tensor1)
     # src[test_unroll_tuples.py:N]: for tile_idx in hl.tile(output.size(0)):
-    _BLOCK_SIZE_0 = 64
+    _BLOCK_SIZE_0 = 32
     # src[test_unroll_tuples.py:N]: for tile_idx in hl.tile(output.size(0)):
     # src[test_unroll_tuples.py:N]:     temp = torch.zeros([tile_idx], dtype=torch.float32, device=output.device)
     # src[test_unroll_tuples.py:N]:     for tensor, scale in zip(tensors, scales, strict=True):
@@ -924,7 +924,7 @@ def kernel_zip_iteration(tensors_a: tuple[torch.Tensor, torch.Tensor], tensors_b
     # src[test_unroll_tuples.py:N]: result = torch.zeros_like(tensors_a[0])
     result = torch.zeros_like(tensors_a[0])
     # src[test_unroll_tuples.py:N]: for tile_idx in hl.tile(result.size(0)):
-    _BLOCK_SIZE_0 = 64
+    _BLOCK_SIZE_0 = 32
     # src[test_unroll_tuples.py:N]: for tile_idx in hl.tile(result.size(0)):
     # src[test_unroll_tuples.py:N]:     acc = torch.zeros([tile_idx], dtype=torch.float32, device=result.device)
     # src[test_unroll_tuples.py:N]:     # Iterate over zip of tensors