pytorch · oulgen · Sep 24, 2025 · Sep 24, 2025
diff --git a/.github/matrix.json b/.github/matrix.json
@@ -4,17 +4,17 @@
       "runner": "linux.g5.4xlarge.nvidia.gpu",
       "python-version": "3.10",
       "ref-eager": false,
-      "image": "nvidia/cuda:12.9.1-devel-ubuntu24.04",
-      "runtime-version": "cu129",
+      "image": "nvidia/cuda:12.8.1-devel-ubuntu24.04",
+      "runtime-version": "cu128",
       "container-options": "--gpus all",
       "alias": "a10g"
     },
     {
       "runner": "linux.g5.4xlarge.nvidia.gpu",
       "python-version": "3.12",
       "ref-eager": false,
-      "image": "nvidia/cuda:12.9.1-devel-ubuntu24.04",
-      "runtime-version": "cu129",
+      "image": "nvidia/cuda:12.8.1-devel-ubuntu24.04",
+      "runtime-version": "cu128",
       "container-options": "--gpus all",
       "alias": "a10g-dtype-asserts",
       "dtype-asserts": true,
@@ -24,35 +24,35 @@
       "runner": "linux.g5.4xlarge.nvidia.gpu",
       "python-version": "3.12",
       "ref-eager": false,
-      "image": "nvidia/cuda:12.9.1-devel-ubuntu24.04",
-      "runtime-version": "cu129",
+      "image": "nvidia/cuda:12.8.1-devel-ubuntu24.04",
+      "runtime-version": "cu128",
       "container-options": "--gpus all",
       "alias": "a10g"
     },
     {
       "runner": "linux.g5.4xlarge.nvidia.gpu",
       "python-version": "3.12",
       "ref-eager": true,
-      "image": "nvidia/cuda:12.9.1-devel-ubuntu24.04",
-      "runtime-version": "cu129",
+      "image": "nvidia/cuda:12.8.1-devel-ubuntu24.04",
+      "runtime-version": "cu128",
       "container-options": "--gpus all",
       "alias": "a10g-ref-eager"
     },
     {
       "runner": "linux.aws.h100",
       "python-version": "3.12",
       "ref-eager": false,
-      "image": "nvidia/cuda:12.9.1-devel-ubuntu24.04",
-      "runtime-version": "cu129",
+      "image": "nvidia/cuda:12.8.1-devel-ubuntu24.04",
+      "runtime-version": "cu128",
       "container-options": "--gpus all",
       "alias": "h100"
     },
     {
       "runner": "linux.dgx.b200",
       "python-version": "3.12",
       "ref-eager": false,
-      "image": "nvidia/cuda:12.9.1-devel-ubuntu24.04",
-      "runtime-version": "cu129",
+      "image": "nvidia/cuda:13.0.1-devel-ubuntu24.04",
+      "runtime-version": "cu130",
       "container-options": "--gpus all",
       "alias": "b200"
     },

diff --git a/test/test_loops.expected b/test/test_loops.expected
@@ -1079,7 +1079,6 @@ from __future__ import annotations
 import torch
 import triton
 import triton.language as tl
-from torch._inductor.runtime.triton_compat import libdevice
 from helion.runtime import default_launcher as _default_launcher
 
 @triton.jit
@@ -1115,7 +1114,7 @@ def _helion_three_pass_kernel(x, out, out_stride_0, out_stride_1, x_stride_0, x_
     v_8 = v_6 - v_7
     v_9 = 1e-06
     v_10 = v_8 + v_9
-    v_11 = libdevice.sqrt(v_10)
+    v_11 = tl.sqrt_rn(v_10)
     for offset_3 in tl.range(0, M.to(tl.int32), _BLOCK_SIZE_3):
         indices_3 = offset_3 + tl.arange(0, _BLOCK_SIZE_3).to(tl.int32)
         mask_3 = indices_3 < M

diff --git a/test/test_reductions.expected b/test/test_reductions.expected
@@ -164,7 +164,7 @@ def _helion_multi_math_ops_fp16_kernel(x, result, x_size_0, result_stride_0, res
     tl.store(result + (indices_0 * result_stride_0 + 0 * result_stride_1), v_2, mask_0)
     load_1 = tl.load(x + indices_0 * x_stride_0, mask_0, other=0)
     v_3 = tl.cast(load_1, tl.float32)
-    v_4 = libdevice.sqrt(v_3)
+    v_4 = tl.sqrt_rn(v_3)
     v_5 = tl.cast(v_4, tl.float16)
     tl.store(result + (indices_0 * result_stride_0 + 1 * result_stride_1), v_5, mask_0)
     load_2 = tl.load(x + indices_0 * x_stride_0, mask_0, other=0)