From 9cc272cba26911bfcd15564103bb101e360a0be5 Mon Sep 17 00:00:00 2001
From: Oguz Ulgen
Date: Sun, 31 Aug 2025 10:40:03 -0700
Subject: [PATCH] Update expected test results

stack-info: PR: https://github.com/pytorch/helion/pull/541, branch: oulgen/stack/69
---
 test/test_examples.expected   | 41 +++++++++++++++++++++--------------------
 test/test_reductions.expected |  2 +-
 test/test_views.expected      |  8 ++++----
 3 files changed, 26 insertions(+), 25 deletions(-)

diff --git a/test/test_examples.expected b/test/test_examples.expected
index 16ba95d3a..7d70581b2 100644
--- a/test/test_examples.expected
+++ b/test/test_examples.expected
@@ -568,6 +568,7 @@ import torch
 import triton
 import triton.language as tl
 from torch._inductor.runtime.triton_helpers import math as tl_math
+from torch._inductor.runtime.triton_compat import libdevice
 from helion.runtime import default_launcher as _default_launcher
 
 @triton.jit
@@ -587,7 +588,7 @@ def _helion_cross_entropy(labels, logits_flat, logits, losses, labels_stride_0,
     _mask_to = tl.where(tl.broadcast_to(mask_1[None, :], [1, _RDIM_SIZE_1]), logits_rows, float('-inf'))
     max_logits = tl.reshape(tl.max(_mask_to, 1), [1, 1])
     v_4 = logits_rows - max_logits
-    v_5 = tl_math.exp(v_4)
+    v_5 = libdevice.exp(v_4)
     _mask_to_1 = tl.where(tl.broadcast_to(mask_1[None, :], [1, _RDIM_SIZE_1]), v_5, 0)
     sum_exp = tl.reshape(tl.sum(_mask_to_1, 1), [1, 1])
     squeeze = tl.reshape(max_logits, [1])
@@ -1020,7 +1021,7 @@ import torch
 import triton
 import triton.language as tl
 from torch._inductor.runtime import triton_helpers
-from torch._inductor.runtime.triton_helpers import math as tl_math
+from torch._inductor.runtime.triton_compat import libdevice
 from helion.runtime import default_launcher as _default_launcher
 
 @triton.jit
@@ -1085,14 +1086,14 @@ def _helion_jagged_softmax_kernel(x_offsets, x_flat, out, out_stride_0, x_flat_s
         slice_max = tl.max(_mask_to_1, 1)
         block_new_max = triton_helpers.maximum(block_max_copy_0, slice_max)
         v_17 = block_max_copy_0 - block_new_max
-        v_18 = tl_math.exp(v_17)
+        v_18 = libdevice.exp(v_17)
         v_19 = block_L_copy_0 * v_18
         subscript_8 = block_new_max[:, None, :]
         v_20 = x_slice - subscript_8
         v_21 = float('-inf')
         v_22 = v_21[None, None, None]
         v_23 = tl.where(v_12, v_20, v_22)
-        v_24 = tl_math.exp(v_23)
+        v_24 = libdevice.exp(v_23)
         _mask_to_2 = tl.where(mask_0[:, None, None] & mask_2[None, :, None] & mask_1[None, None, :], v_24, 0)
         sum_1 = tl.sum(_mask_to_2, 1)
         block_L = v_19 + sum_1
@@ -1129,7 +1130,7 @@ def _helion_jagged_softmax_kernel(x_offsets, x_flat, out, out_stride_0, x_flat_s
         x_slice_1 = tl.load(x_flat + v_30 * x_flat_stride_0, mask_0[:, None, None] & mask_3[None, :, None] & mask_1[None, None, :] & v_35, other=0)
         subscript_17 = block_max_copy_1_0[:, None, :]
         v_36 = x_slice_1 - subscript_17
-        v_37 = tl_math.exp(v_36)
+        v_37 = libdevice.exp(v_36)
         subscript_18 = block_L_copy_1_0[:, None, :]
         v_38 = v_37 / subscript_18
         tl.store(out + v_30 * out_stride_0, v_38, mask_0[:, None, None] & mask_3[None, :, None] & mask_1[None, None, :] & v_35)
@@ -1713,7 +1714,7 @@ from __future__ import annotations
 import torch
 import triton
 import triton.language as tl
-from torch._inductor.runtime.triton_helpers import math as tl_math
+from torch._inductor.runtime.triton_compat import libdevice
 from helion.runtime import default_launcher as _default_launcher
 
 @triton.jit
@@ -1726,7 +1727,7 @@ def _helion_softmax(x, out, out_size_0, out_size_1, x_size_0, x_size_1, out_stri
     _mask_to = tl.where(tl.broadcast_to(mask_1[None, :], [1, _RDIM_SIZE_1]), load, float('-inf'))
     amax = tl.reshape(tl.max(_mask_to, 1), [1, 1])
     v_0 = load - amax
-    v_1 = tl_math.exp(v_0)
+    v_1 = libdevice.exp(v_0)
     _mask_to_1 = tl.where(tl.broadcast_to(mask_1[None, :], [1, _RDIM_SIZE_1]), v_1, 0)
     sum_1 = tl.reshape(tl.sum(_mask_to_1, 1), [1, 1])
     v_2 = v_1 / sum_1
@@ -1752,7 +1753,7 @@ from __future__ import annotations
 import torch
 import triton
 import triton.language as tl
-from torch._inductor.runtime.triton_helpers import math as tl_math
+from torch._inductor.runtime.triton_compat import libdevice
 from helion.runtime import default_launcher as _default_launcher
 
 @triton.jit
@@ -1765,7 +1766,7 @@ def _helion_softmax_decomposed(x, out, out_size_0, out_size_1, x_size_0, x_size_
     _mask_to = tl.where(tl.broadcast_to(mask_1[None, :], [1, _RDIM_SIZE_1]), values, float('-inf'))
     amax = tl.reshape(tl.max(_mask_to, 1), [1, 1])
     v_0 = values - amax
-    v_1 = tl_math.exp(v_0)
+    v_1 = libdevice.exp(v_0)
     _mask_to_1 = tl.where(tl.broadcast_to(mask_1[None, :], [1, _RDIM_SIZE_1]), v_1, 0)
     sum_exp = tl.reshape(tl.sum(_mask_to_1, 1), [1, 1])
     v_2 = v_1 / sum_exp
@@ -1793,7 +1794,7 @@ import torch
 import triton
 import triton.language as tl
 from torch._inductor.runtime import triton_helpers
-from torch._inductor.runtime.triton_helpers import math as tl_math
+from torch._inductor.runtime.triton_compat import libdevice
 from helion.runtime import default_launcher as _default_launcher
 
 @triton.jit
@@ -1816,7 +1817,7 @@ def _helion_softmax(x, out, out_size_0, out_size_1, x_size_0, x_size_1, out_stri
         amax_copy = amax
         load_1 = tl.load(tl.make_block_ptr(x, [x_size_0, x_size_1], [x_stride_0, x_stride_1], [offset_0, roffset_1], [1, _REDUCTION_BLOCK_1], [1, 0]), boundary_check=[0, 1], padding_option='zero')
         v_1 = load_1 - amax_copy
-        v_2 = tl_math.exp(v_1)
+        v_2 = libdevice.exp(v_1)
         _mask_to_1 = tl.where(tl.broadcast_to(mask_1[None, :], [1, _REDUCTION_BLOCK_1]), v_2, 0)
         v_3 = sum_1_acc + _mask_to_1
         sum_1_acc = v_3
@@ -1828,7 +1829,7 @@ def _helion_softmax(x, out, out_size_0, out_size_1, x_size_0, x_size_1, out_stri
         sum_1_copy = sum_1
         load_2 = tl.load(tl.make_block_ptr(x, [x_size_0, x_size_1], [x_stride_0, x_stride_1], [offset_0, roffset_1], [1, _REDUCTION_BLOCK_1], [1, 0]), boundary_check=[0, 1], padding_option='zero')
         v_4 = load_2 - amax_copy_1
-        v_5 = tl_math.exp(v_4)
+        v_5 = libdevice.exp(v_4)
         v_6 = v_5 / sum_1_copy
         tl.store(tl.make_block_ptr(out, [out_size_0, out_size_1], [out_stride_0, out_stride_1], [offset_0, roffset_1], [1, _REDUCTION_BLOCK_1], [1, 0]), v_6, boundary_check=[0, 1])
 
@@ -1853,7 +1854,7 @@ import torch
 import triton
 import triton.language as tl
 from torch._inductor.runtime import triton_helpers
-from torch._inductor.runtime.triton_helpers import math as tl_math
+from torch._inductor.runtime.triton_compat import libdevice
 from helion.runtime import default_launcher as _default_launcher
 
 @triton.jit
@@ -1876,11 +1877,11 @@ def _helion_softmax_two_pass(x, out, out_stride_0, out_stride_1, x_stride_0, x_s
         local_amax = tl.max(_mask_to, 1)
         v_0 = triton_helpers.maximum(mi_copy_0, local_amax)
         v_1 = mi_copy_0 - v_0
-        v_2 = tl_math.exp(v_1)
+        v_2 = libdevice.exp(v_1)
         v_3 = di_copy_0 * v_2
         subscript = v_0[:, None]
         v_4 = values - subscript
-        v_5 = tl_math.exp(v_4)
+        v_5 = libdevice.exp(v_4)
         _mask_to_1 = tl.where(mask_0[:, None] & mask_1[None, :], v_5, 0)
         sum_1 = tl.sum(_mask_to_1, 1)
         di = v_3 + sum_1
@@ -1895,7 +1896,7 @@ def _helion_softmax_two_pass(x, out, out_stride_0, out_stride_1, x_stride_0, x_s
         values_1 = tl.load(x + (indices_0[:, None] * x_stride_0 + indices_2[None, :] * x_stride_1), mask_0[:, None] & mask_2[None, :], other=0)
         subscript_1 = mi_copy_1_0[:, None]
         v_7 = values_1 - subscript_1
-        v_8 = tl_math.exp(v_7)
+        v_8 = libdevice.exp(v_7)
         subscript_2 = di_copy_1_0[:, None]
         v_9 = v_8 / subscript_2
         tl.store(out + (indices_0[:, None] * out_stride_0 + indices_2[None, :] * out_stride_1), v_9, mask_0[:, None] & mask_2[None, :])
@@ -1923,7 +1924,7 @@ import torch
 import triton
 import triton.language as tl
 from torch._inductor.runtime import triton_helpers
-from torch._inductor.runtime.triton_helpers import math as tl_math
+from torch._inductor.runtime.triton_compat import libdevice
 from helion.runtime import default_launcher as _default_launcher
 
 @triton.jit
@@ -1946,11 +1947,11 @@ def _helion_softmax_two_pass(x, out, out_size_0, out_size_1, x_size_0, x_size_1,
         local_amax = tl.max(_mask_to, 1)
         v_0 = triton_helpers.maximum(mi_copy_0, local_amax)
         v_1 = mi_copy_0 - v_0
-        v_2 = tl_math.exp(v_1)
+        v_2 = libdevice.exp(v_1)
         v_3 = di_copy_0 * v_2
         subscript = v_0[:, None]
         v_4 = values - subscript
-        v_5 = tl_math.exp(v_4)
+        v_5 = libdevice.exp(v_4)
         _mask_to_1 = tl.where(mask_0[:, None] & mask_1[None, :], v_5, 0)
         sum_1 = tl.sum(_mask_to_1, 1)
         di = v_3 + sum_1
@@ -1964,7 +1965,7 @@ def _helion_softmax_two_pass(x, out, out_size_0, out_size_1, x_size_0, x_size_1,
         values_1 = tl.load(tl.make_block_ptr(x, [x_size_0, x_size_1], [x_stride_0, x_stride_1], [offset_0, offset_2], [_BLOCK_SIZE_0, _BLOCK_SIZE_1], [1, 0]), boundary_check=[0, 1], padding_option='zero')
         subscript_1 = mi_copy_1_0[:, None]
         v_7 = values_1 - subscript_1
-        v_8 = tl_math.exp(v_7)
+        v_8 = libdevice.exp(v_7)
         subscript_2 = di_copy_1_0[:, None]
         v_9 = v_8 / subscript_2
         tl.store(tl.make_block_ptr(out, [out_size_0, out_size_1], [out_stride_0, out_stride_1], [offset_0, offset_2], [_BLOCK_SIZE_0, _BLOCK_SIZE_1], [1, 0]), v_9, boundary_check=[0, 1])
diff --git a/test/test_reductions.expected b/test/test_reductions.expected
index 5b976b0ca..cfde12dfe 100644
--- a/test/test_reductions.expected
+++ b/test/test_reductions.expected
@@ -194,7 +194,7 @@ def _helion_multi_math_ops_fp16_kernel(x, result, x_size_0, result_stride_0, res
     tl.store(result + (indices_0 * result_stride_0 + 6 * result_stride_1), v_20, mask_0)
     load_7 = tl.load(x + indices_0 * x_stride_0, mask_0, other=0)
     v_21 = load_7.to(tl.float32)
-    v_22 = tl_math.exp(v_21)
+    v_22 = libdevice.exp(v_21)
     v_23 = v_22.to(tl.float16)
     tl.store(result + (indices_0 * result_stride_0 + 7 * result_stride_1), v_23, mask_0)
 
diff --git a/test/test_views.expected b/test/test_views.expected
index 0429d4cc6..6f71fe1a8 100644
--- a/test/test_views.expected
+++ b/test/test_views.expected
@@ -44,7 +44,7 @@ from __future__ import annotations
 import torch
 import triton
 import triton.language as tl
-from torch._inductor.runtime.triton_helpers import math as tl_math
+from torch._inductor.runtime.triton_compat import libdevice
 from helion.runtime import default_launcher as _default_launcher
 
 @triton.jit
@@ -60,7 +60,7 @@ def _helion_softmax(x, out, out_stride_0, out_stride_1, x_stride_0, x_stride_1,
     amax_1 = amax[:, None]
     v_0 = values - amax_1
     v_1 = v_0.to(tl.float32)
-    v_2 = tl_math.exp(v_1)
+    v_2 = libdevice.exp(v_1)
     v_3 = v_2.to(tl.float16)
     _mask_to_1 = tl.where(tl.broadcast_to(mask_1[None, :], [1, _RDIM_SIZE_1]), v_3, 0)
     sum_1 = tl.sum(_mask_to_1, 1)
@@ -81,7 +81,7 @@ from __future__ import annotations
 import torch
 import triton
 import triton.language as tl
-from torch._inductor.runtime.triton_helpers import math as tl_math
+from torch._inductor.runtime.triton_compat import libdevice
 from helion.runtime import default_launcher as _default_launcher
 
 @triton.jit
@@ -97,7 +97,7 @@ def _helion_softmax(x, out, out_stride_0, out_stride_1, x_stride_0, x_stride_1,
     amax_1 = tl.reshape(amax, [1, 1])
     v_0 = values - amax_1
     v_1 = v_0.to(tl.float32)
-    v_2 = tl_math.exp(v_1)
+    v_2 = libdevice.exp(v_1)
     v_3 = v_2.to(tl.float16)
     _mask_to_1 = tl.where(tl.broadcast_to(mask_1[None, :], [1, _RDIM_SIZE_1]), v_3, 0)
     sum_1 = tl.sum(_mask_to_1, 1)