diff --git a/.github/matrix.json b/.github/matrix.json
index ff2f5baf3..24b151abb 100644
--- a/.github/matrix.json
+++ b/.github/matrix.json
@@ -4,8 +4,8 @@
     "runner": "linux.g5.4xlarge.nvidia.gpu",
     "python-version": "3.10",
     "ref-eager": false,
-    "image": "nvidia/cuda:12.9.1-devel-ubuntu24.04",
-    "runtime-version": "cu129",
+    "image": "nvidia/cuda:12.8.1-devel-ubuntu24.04",
+    "runtime-version": "cu128",
     "container-options": "--gpus all",
     "alias": "a10g"
   },
@@ -13,8 +13,8 @@
     "runner": "linux.g5.4xlarge.nvidia.gpu",
     "python-version": "3.12",
     "ref-eager": false,
-    "image": "nvidia/cuda:12.9.1-devel-ubuntu24.04",
-    "runtime-version": "cu129",
+    "image": "nvidia/cuda:12.8.1-devel-ubuntu24.04",
+    "runtime-version": "cu128",
     "container-options": "--gpus all",
     "alias": "a10g-dtype-asserts",
     "dtype-asserts": true,
@@ -24,8 +24,8 @@
     "runner": "linux.g5.4xlarge.nvidia.gpu",
     "python-version": "3.12",
     "ref-eager": false,
-    "image": "nvidia/cuda:12.9.1-devel-ubuntu24.04",
-    "runtime-version": "cu129",
+    "image": "nvidia/cuda:12.8.1-devel-ubuntu24.04",
+    "runtime-version": "cu128",
     "container-options": "--gpus all",
     "alias": "a10g"
   },
@@ -33,8 +33,8 @@
     "runner": "linux.g5.4xlarge.nvidia.gpu",
     "python-version": "3.12",
    "ref-eager": true,
-    "image": "nvidia/cuda:12.9.1-devel-ubuntu24.04",
-    "runtime-version": "cu129",
+    "image": "nvidia/cuda:12.8.1-devel-ubuntu24.04",
+    "runtime-version": "cu128",
     "container-options": "--gpus all",
     "alias": "a10g-ref-eager"
   },
@@ -42,8 +42,8 @@
     "runner": "linux.aws.h100",
     "python-version": "3.12",
     "ref-eager": false,
-    "image": "nvidia/cuda:12.9.1-devel-ubuntu24.04",
-    "runtime-version": "cu129",
+    "image": "nvidia/cuda:12.8.1-devel-ubuntu24.04",
+    "runtime-version": "cu128",
     "container-options": "--gpus all",
     "alias": "h100"
   },
@@ -51,8 +51,8 @@
     "runner": "linux.dgx.b200",
     "python-version": "3.12",
     "ref-eager": false,
-    "image": "nvidia/cuda:12.9.1-devel-ubuntu24.04",
-    "runtime-version": "cu129",
+    "image": "nvidia/cuda:13.0.1-devel-ubuntu24.04",
+    "runtime-version": "cu130",
     "container-options": "--gpus all",
     "alias": "b200"
   },
diff --git a/test/test_loops.expected b/test/test_loops.expected
index 7a93427d8..b2d65abe7 100644
--- a/test/test_loops.expected
+++ b/test/test_loops.expected
@@ -1079,7 +1079,6 @@
 from __future__ import annotations
 import torch
 import triton
 import triton.language as tl
-from torch._inductor.runtime.triton_compat import libdevice
 from helion.runtime import default_launcher as _default_launcher
 @triton.jit
@@ -1115,7 +1114,7 @@ def _helion_three_pass_kernel(x, out, out_stride_0, out_stride_1, x_stride_0, x_
     v_8 = v_6 - v_7
     v_9 = 1e-06
     v_10 = v_8 + v_9
-    v_11 = libdevice.sqrt(v_10)
+    v_11 = tl.sqrt_rn(v_10)
     for offset_3 in tl.range(0, M.to(tl.int32), _BLOCK_SIZE_3):
         indices_3 = offset_3 + tl.arange(0, _BLOCK_SIZE_3).to(tl.int32)
         mask_3 = indices_3 < M
diff --git a/test/test_reductions.expected b/test/test_reductions.expected
index 0003aca5f..7d347b69e 100644
--- a/test/test_reductions.expected
+++ b/test/test_reductions.expected
@@ -164,7 +164,7 @@ def _helion_multi_math_ops_fp16_kernel(x, result, x_size_0, result_stride_0, res
     tl.store(result + (indices_0 * result_stride_0 + 0 * result_stride_1), v_2, mask_0)
     load_1 = tl.load(x + indices_0 * x_stride_0, mask_0, other=0)
     v_3 = tl.cast(load_1, tl.float32)
-    v_4 = libdevice.sqrt(v_3)
+    v_4 = tl.sqrt_rn(v_3)
     v_5 = tl.cast(v_4, tl.float16)
     tl.store(result + (indices_0 * result_stride_0 + 1 * result_stride_1), v_5, mask_0)
     load_2 = tl.load(x + indices_0 * x_stride_0, mask_0, other=0)