Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions helion/autotuner/config_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
from ..runtime.config import PidTypeLiteral

DEFAULT_NUM_WARPS = 4
DEFAULT_NUM_STAGES = 2
DEFAULT_NUM_STAGES = 1
VALID_KEYS: frozenset[str] = frozenset(
[
"block_sizes",
Expand Down Expand Up @@ -400,7 +400,7 @@ def _fragment(self, base: ConfigSpec) -> BlockSizeFragment:
[next_power_of_2(spec.size_hint) for spec in base.reduction_loops]
)
if total_ndim <= 1 and reduction_numel <= 1:
default = 1024
default = 256
elif total_ndim <= 2 and reduction_numel <= 128:
default = 32
elif reduction_numel <= 256:
Expand Down
84 changes: 42 additions & 42 deletions test/test_associative_scan.expected

Large diffs are not rendered by default.

28 changes: 14 additions & 14 deletions test/test_atomic_ops.expected
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def atomic_add_2d_kernel(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default
_BLOCK_SIZE_1 = 8
# src[test_atomic_ops.py:N]: for i, j in hl.tile([y.size(0), y.size(1)]):
# src[test_atomic_ops.py:N]: hl.atomic_add(x, [i, j], y[i, j])
_launcher(_helion_atomic_add_2d_kernel, (triton.cdiv(3, _BLOCK_SIZE_0) * triton.cdiv(4, _BLOCK_SIZE_1),), y, x, _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=2)
_launcher(_helion_atomic_add_2d_kernel, (triton.cdiv(3, _BLOCK_SIZE_0) * triton.cdiv(4, _BLOCK_SIZE_1),), y, x, _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=1)
# src[test_atomic_ops.py:N]: return x
return x

Expand Down Expand Up @@ -77,7 +77,7 @@ def atomic_add_1d_tensor_kernel(x: torch.Tensor, y: torch.Tensor, *, _launcher=_
# src[test_atomic_ops.py:N]: x_tile = x[tile_m, :].to(torch.float32)
# src[test_atomic_ops.py:N]: y_tile = y[tile_m, :].to(torch.float32)
# src[test_atomic_ops.py:N-N]: ...
_launcher(_helion_atomic_add_1d_tensor_kernel, (triton.cdiv(32, _BLOCK_SIZE_0),), x, y, z, _BLOCK_SIZE_0, _RDIM_SIZE_1, num_warps=4, num_stages=2)
_launcher(_helion_atomic_add_1d_tensor_kernel, (triton.cdiv(32, _BLOCK_SIZE_0),), x, y, z, _BLOCK_SIZE_0, _RDIM_SIZE_1, num_warps=4, num_stages=1)
# src[test_atomic_ops.py:N]: return z
return z

Expand Down Expand Up @@ -108,7 +108,7 @@ def atomic_add_float_kernel(x: torch.Tensor, indices: torch.Tensor, *, _launcher
# src[test_atomic_ops.py:N]: for i in hl.tile(indices.size(0)):
# src[test_atomic_ops.py:N]: idx = indices[i]
# src[test_atomic_ops.py:N]: hl.atomic_add(x, [idx], 2.0)
_launcher(_helion_atomic_add_float_kernel, (triton.cdiv(8, _BLOCK_SIZE_0),), indices, x, _BLOCK_SIZE_0, num_warps=4, num_stages=2)
_launcher(_helion_atomic_add_float_kernel, (triton.cdiv(8, _BLOCK_SIZE_0),), indices, x, _BLOCK_SIZE_0, num_warps=4, num_stages=1)
# src[test_atomic_ops.py:N]: return x
return x

Expand Down Expand Up @@ -140,7 +140,7 @@ def k(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launcher):
# src[test_atomic_ops.py:N]: for i in hl.tile(x.size(0)):
# src[test_atomic_ops.py:N]: old = hl.atomic_add(x, [i], y[i])
# src[test_atomic_ops.py:N]: prev[i] = old
_launcher(_helion_k, (triton.cdiv(8, _BLOCK_SIZE_0),), y, x, prev, _BLOCK_SIZE_0, num_warps=4, num_stages=2)
_launcher(_helion_k, (triton.cdiv(8, _BLOCK_SIZE_0),), y, x, prev, _BLOCK_SIZE_0, num_warps=4, num_stages=1)
# src[test_atomic_ops.py:N]: return x, prev
return (x, prev)

Expand Down Expand Up @@ -168,7 +168,7 @@ def atomic_add_w_tile_attr(x: torch.Tensor, *, _launcher=_default_launcher):
_BLOCK_SIZE_0 = 2
# src[test_atomic_ops.py:N]: for tile in hl.tile(x.size(0)):
# src[test_atomic_ops.py:N]: hl.atomic_add(y, [tile.begin], 1)
_launcher(_helion_atomic_add_w_tile_attr, (triton.cdiv(20, _BLOCK_SIZE_0),), y, _BLOCK_SIZE_0, num_warps=4, num_stages=2)
_launcher(_helion_atomic_add_w_tile_attr, (triton.cdiv(20, _BLOCK_SIZE_0),), y, _BLOCK_SIZE_0, num_warps=4, num_stages=1)
# src[test_atomic_ops.py:N]: return y
return y

Expand All @@ -195,7 +195,7 @@ def atomic_and_kernel(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_la
_BLOCK_SIZE_0 = 8
# src[test_atomic_ops.py:N]: for i in hl.tile(x.size(0)):
# src[test_atomic_ops.py:N]: hl.atomic_and(x, [i], y[i])
_launcher(_helion_atomic_and_kernel, (triton.cdiv(8, _BLOCK_SIZE_0),), y, x, _BLOCK_SIZE_0, num_warps=4, num_stages=2)
_launcher(_helion_atomic_and_kernel, (triton.cdiv(8, _BLOCK_SIZE_0),), y, x, _BLOCK_SIZE_0, num_warps=4, num_stages=1)
# src[test_atomic_ops.py:N]: return x
return x

Expand Down Expand Up @@ -223,7 +223,7 @@ def atomic_cas_kernel(x: torch.Tensor, y: torch.Tensor, expect: torch.Tensor, *,
_BLOCK_SIZE_0 = 4
# src[test_atomic_ops.py:N]: for i in hl.tile(x.size(0)):
# src[test_atomic_ops.py:N]: hl.atomic_cas(x, [i], expect[i], y[i])
_launcher(_helion_atomic_cas_kernel, (triton.cdiv(4, _BLOCK_SIZE_0),), expect, y, x, _BLOCK_SIZE_0, num_warps=4, num_stages=2)
_launcher(_helion_atomic_cas_kernel, (triton.cdiv(4, _BLOCK_SIZE_0),), expect, y, x, _BLOCK_SIZE_0, num_warps=4, num_stages=1)
# src[test_atomic_ops.py:N]: return x
return x

Expand All @@ -250,7 +250,7 @@ def atomic_max_kernel(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_la
_BLOCK_SIZE_0 = 4
# src[test_atomic_ops.py:N]: for i in hl.tile(x.size(0)):
# src[test_atomic_ops.py:N]: hl.atomic_max(x, [i], y[i])
_launcher(_helion_atomic_max_kernel, (triton.cdiv(4, _BLOCK_SIZE_0),), y, x, _BLOCK_SIZE_0, num_warps=4, num_stages=2)
_launcher(_helion_atomic_max_kernel, (triton.cdiv(4, _BLOCK_SIZE_0),), y, x, _BLOCK_SIZE_0, num_warps=4, num_stages=1)
# src[test_atomic_ops.py:N]: return x
return x

Expand All @@ -277,7 +277,7 @@ def atomic_min_kernel(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_la
_BLOCK_SIZE_0 = 4
# src[test_atomic_ops.py:N]: for i in hl.tile(x.size(0)):
# src[test_atomic_ops.py:N]: hl.atomic_min(x, [i], y[i])
_launcher(_helion_atomic_min_kernel, (triton.cdiv(4, _BLOCK_SIZE_0),), y, x, _BLOCK_SIZE_0, num_warps=4, num_stages=2)
_launcher(_helion_atomic_min_kernel, (triton.cdiv(4, _BLOCK_SIZE_0),), y, x, _BLOCK_SIZE_0, num_warps=4, num_stages=1)
# src[test_atomic_ops.py:N]: return x
return x

Expand All @@ -304,7 +304,7 @@ def atomic_or_kernel(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_lau
_BLOCK_SIZE_0 = 8
# src[test_atomic_ops.py:N]: for i in hl.tile(x.size(0)):
# src[test_atomic_ops.py:N]: hl.atomic_or(x, [i], y[i])
_launcher(_helion_atomic_or_kernel, (triton.cdiv(8, _BLOCK_SIZE_0),), y, x, _BLOCK_SIZE_0, num_warps=4, num_stages=2)
_launcher(_helion_atomic_or_kernel, (triton.cdiv(8, _BLOCK_SIZE_0),), y, x, _BLOCK_SIZE_0, num_warps=4, num_stages=1)
# src[test_atomic_ops.py:N]: return x
return x

Expand All @@ -331,7 +331,7 @@ def atomic_xchg_kernel(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_l
_BLOCK_SIZE_0 = 8
# src[test_atomic_ops.py:N]: for i in hl.tile(x.size(0)):
# src[test_atomic_ops.py:N]: hl.atomic_xchg(x, [i], y[i])
_launcher(_helion_atomic_xchg_kernel, (triton.cdiv(8, _BLOCK_SIZE_0),), y, x, _BLOCK_SIZE_0, num_warps=4, num_stages=2)
_launcher(_helion_atomic_xchg_kernel, (triton.cdiv(8, _BLOCK_SIZE_0),), y, x, _BLOCK_SIZE_0, num_warps=4, num_stages=1)
# src[test_atomic_ops.py:N]: return x
return x

Expand All @@ -358,7 +358,7 @@ def atomic_xor_kernel(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_la
_BLOCK_SIZE_0 = 8
# src[test_atomic_ops.py:N]: for i in hl.tile(x.size(0)):
# src[test_atomic_ops.py:N]: hl.atomic_xor(x, [i], y[i])
_launcher(_helion_atomic_xor_kernel, (triton.cdiv(8, _BLOCK_SIZE_0),), y, x, _BLOCK_SIZE_0, num_warps=4, num_stages=2)
_launcher(_helion_atomic_xor_kernel, (triton.cdiv(8, _BLOCK_SIZE_0),), y, x, _BLOCK_SIZE_0, num_warps=4, num_stages=1)
# src[test_atomic_ops.py:N]: return x
return x

Expand Down Expand Up @@ -387,7 +387,7 @@ def atomic_add_kernel(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_la
_BLOCK_SIZE_0 = 32
# src[test_atomic_ops.py:N]: for i in hl.tile(x.size(0)):
# src[test_atomic_ops.py:N]: hl.atomic_add(x, [i], y[i])
_launcher(_helion_atomic_add_kernel, (triton.cdiv(10, _BLOCK_SIZE_0),), y, x, _BLOCK_SIZE_0, num_warps=4, num_stages=2)
_launcher(_helion_atomic_add_kernel, (triton.cdiv(10, _BLOCK_SIZE_0),), y, x, _BLOCK_SIZE_0, num_warps=4, num_stages=1)
# src[test_atomic_ops.py:N]: return x
return x

Expand Down Expand Up @@ -419,6 +419,6 @@ def atomic_add_overlap_kernel(x: torch.Tensor, y: torch.Tensor, indices: torch.T
# src[test_atomic_ops.py:N]: for i in hl.tile([y.size(0)]):
# src[test_atomic_ops.py:N]: idx = indices[i]
# src[test_atomic_ops.py:N]: hl.atomic_add(x, [idx], y[i])
_launcher(_helion_atomic_add_overlap_kernel, (triton.cdiv(10, _BLOCK_SIZE_0),), indices, y, x, _BLOCK_SIZE_0, num_warps=4, num_stages=2)
_launcher(_helion_atomic_add_overlap_kernel, (triton.cdiv(10, _BLOCK_SIZE_0),), indices, y, x, _BLOCK_SIZE_0, num_warps=4, num_stages=1)
# src[test_atomic_ops.py:N]: return x
return x
6 changes: 3 additions & 3 deletions test/test_autotuner.expected
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ This file is automatically generated by assertExpectedJournal calls in test_auto
Update expected outputs by running tests with the EXPECTTEST_ACCEPT=1 environment variable set.

--- assertExpectedJournal(TestAutotuner.test_config_fragment0)
helion.Config(block_sizes=[16, 16, 16], indexing=['pointer', 'pointer', 'pointer'], l2_groupings=[1], load_eviction_policies=['', ''], loop_orders=[[0, 1]], num_stages=2, num_warps=4, pid_type='flat', range_flattens=[None, None], range_multi_buffers=[None, None], range_num_stages=[0, 0], range_unroll_factors=[0, 0], range_warp_specializes=[None, None])
helion.Config(block_sizes=[16, 16, 16], indexing=['pointer', 'pointer', 'pointer'], l2_groupings=[1], load_eviction_policies=['', ''], loop_orders=[[0, 1]], num_stages=1, num_warps=4, pid_type='flat', range_flattens=[None, None], range_multi_buffers=[None, None], range_num_stages=[0, 0], range_unroll_factors=[0, 0], range_warp_specializes=[None, None])
helion.Config(block_sizes=[32, 128, 64], indexing=['tensor_descriptor', 'tensor_descriptor', 'pointer'], l2_groupings=[8], load_eviction_policies=['last', 'last'], loop_orders=[[1, 0]], num_stages=8, num_warps=8, pid_type='flat', range_flattens=[None, True], range_multi_buffers=[None, True], range_num_stages=[0, 0], range_unroll_factors=[0, 0], range_warp_specializes=[None, True])
helion.Config(block_sizes=[64, 16, 64], indexing=['pointer', 'pointer', 'pointer'], l2_groupings=[8], load_eviction_policies=['last', 'last'], loop_orders=[[1, 0]], num_stages=2, num_warps=32, pid_type='persistent_blocked', range_flattens=[False, False], range_multi_buffers=[True, None], range_num_stages=[0, 0], range_unroll_factors=[4, 3], range_warp_specializes=[False, None])
helion.Config(block_sizes=[16, 64, 512], indexing=['tensor_descriptor', 'tensor_descriptor', 'pointer'], l2_groupings=[4], load_eviction_policies=['last', ''], loop_orders=[[1, 0]], num_stages=3, num_warps=16, pid_type='persistent_interleaved', range_flattens=[False, None], range_multi_buffers=[None, True], range_num_stages=[4, 0], range_unroll_factors=[4, 0], range_warp_specializes=[None, True])
Expand All @@ -14,7 +14,7 @@ helion.Config(block_sizes=[64, 64, 16], indexing=['tensor_descriptor', 'tensor_d
helion.Config(block_sizes=[16, 16, 16], indexing=['pointer', 'tensor_descriptor', 'pointer'], l2_groupings=[1], load_eviction_policies=['first', 'first'], loop_orders=[[0, 1]], num_stages=7, num_warps=1, pid_type='persistent_interleaved', range_flattens=[True, False], range_multi_buffers=[None, None], range_num_stages=[4, 0], range_unroll_factors=[1, 3], range_warp_specializes=[True, None])

--- assertExpectedJournal(TestAutotuner.test_config_fragment1)
helion.Config(block_sizes=[8, 16, 16], flatten_loops=[False], indexing=['pointer', 'pointer', 'pointer'], l2_groupings=[1], load_eviction_policies=['', ''], loop_orders=[[0, 1, 2]], num_stages=2, num_warps=4, pid_type='flat', range_flattens=[None], range_multi_buffers=[None], range_num_stages=[], range_unroll_factors=[0], range_warp_specializes=[None])
helion.Config(block_sizes=[8, 16, 16], flatten_loops=[False], indexing=['pointer', 'pointer', 'pointer'], l2_groupings=[1], load_eviction_policies=['', ''], loop_orders=[[0, 1, 2]], num_stages=1, num_warps=4, pid_type='flat', range_flattens=[None], range_multi_buffers=[None], range_num_stages=[], range_unroll_factors=[0], range_warp_specializes=[None])
helion.Config(block_sizes=[4, 256, 256], flatten_loops=[False], indexing=['tensor_descriptor', 'tensor_descriptor', 'tensor_descriptor'], l2_groupings=[4], load_eviction_policies=['', ''], loop_orders=[[1, 2, 0]], num_stages=4, num_warps=8, pid_type='persistent_blocked', range_flattens=[None], range_multi_buffers=[False], range_unroll_factors=[1], range_warp_specializes=[True])
helion.Config(block_sizes=[1, 64, 128], flatten_loops=[True], indexing=['pointer', 'pointer', 'pointer'], l2_groupings=[1], load_eviction_policies=['first', 'last'], loop_orders=[[1, 2, 0]], num_stages=7, num_warps=4, pid_type='flat', range_flattens=[None], range_multi_buffers=[None], range_num_stages=[], range_unroll_factors=[0], range_warp_specializes=[None])
helion.Config(block_sizes=[8, 1, 16], flatten_loops=[True], indexing=['pointer', 'tensor_descriptor', 'pointer'], l2_groupings=[16], load_eviction_policies=['last', 'last'], loop_orders=[[2, 1, 0]], num_stages=2, num_warps=16, pid_type='persistent_interleaved', range_flattens=[True], range_multi_buffers=[True], range_unroll_factors=[3], range_warp_specializes=[None])
Expand All @@ -26,7 +26,7 @@ helion.Config(block_sizes=[4, 16, 16], flatten_loops=[False], indexing=['pointer
helion.Config(block_sizes=[4, 1, 2], flatten_loops=[True], indexing=['tensor_descriptor', 'tensor_descriptor', 'pointer'], l2_groupings=[16], load_eviction_policies=['', 'last'], loop_orders=[[0, 2, 1]], num_stages=7, num_warps=32, pid_type='persistent_blocked', range_flattens=[False], range_multi_buffers=[None], range_unroll_factors=[1], range_warp_specializes=[True])

--- assertExpectedJournal(TestAutotuner.test_config_warp_specialize_unroll)
helion.Config(block_sizes=[8, 16, 16], flatten_loops=[False], indexing=['pointer', 'pointer', 'pointer'], l2_groupings=[1], load_eviction_policies=['', ''], loop_orders=[[0, 1, 2]], num_stages=2, num_warps=4, pid_type='flat', range_flattens=[None], range_multi_buffers=[None], range_num_stages=[], range_unroll_factors=[0], range_warp_specializes=[None])
helion.Config(block_sizes=[8, 16, 16], flatten_loops=[False], indexing=['pointer', 'pointer', 'pointer'], l2_groupings=[1], load_eviction_policies=['', ''], loop_orders=[[0, 1, 2]], num_stages=1, num_warps=4, pid_type='flat', range_flattens=[None], range_multi_buffers=[None], range_num_stages=[], range_unroll_factors=[0], range_warp_specializes=[None])
helion.Config(block_sizes=[4, 256, 256], flatten_loops=[False], indexing=['tensor_descriptor', 'tensor_descriptor', 'tensor_descriptor'], l2_groupings=[4], load_eviction_policies=['', ''], loop_orders=[[1, 2, 0]], num_stages=4, num_warps=8, pid_type='persistent_blocked', range_flattens=[None], range_multi_buffers=[False], range_unroll_factors=[0], range_warp_specializes=[True])
helion.Config(block_sizes=[1, 64, 128], flatten_loops=[True], indexing=['pointer', 'pointer', 'pointer'], l2_groupings=[1], load_eviction_policies=['first', 'last'], loop_orders=[[1, 2, 0]], num_stages=7, num_warps=4, pid_type='flat', range_flattens=[None], range_multi_buffers=[None], range_num_stages=[], range_unroll_factors=[0], range_warp_specializes=[None])
helion.Config(block_sizes=[8, 1, 16], flatten_loops=[True], indexing=['pointer', 'tensor_descriptor', 'pointer'], l2_groupings=[16], load_eviction_policies=['last', 'last'], loop_orders=[[2, 1, 0]], num_stages=2, num_warps=16, pid_type='persistent_interleaved', range_flattens=[True], range_multi_buffers=[True], range_unroll_factors=[0], range_warp_specializes=[True])
Expand Down
Loading
Loading