From 2cad665a25a418cd41a6b944531a7ce44496719b Mon Sep 17 00:00:00 2001
From: jainapurva
Date: Thu, 9 Oct 2025 07:06:03 -0700
Subject: [PATCH 1/8] Upgrade rocm

---
 .github/workflows/regression_test_rocm.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/regression_test_rocm.yml b/.github/workflows/regression_test_rocm.yml
index a9fdb68c13..5df97e1673 100644
--- a/.github/workflows/regression_test_rocm.yml
+++ b/.github/workflows/regression_test_rocm.yml
@@ -24,8 +24,8 @@ jobs:
             runs-on: linux.rocm.gpu.gfx942.2
             torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3'
             gpu-arch-type: "rocm"
-            gpu-arch-version: "6.3"
-            docker-image: pytorch/manylinux2_28-builder:rocm6.3
+            gpu-arch-version: "7.0"
+            docker-image: pytorch/manylinux2_28-builder:rocm7.0

     permissions:
       id-token: write

From 01a9e19b0473a4b054115fa348cf1d3f3fbfddc1 Mon Sep 17 00:00:00 2001
From: jainapurva
Date: Thu, 9 Oct 2025 07:10:07 -0700
Subject: [PATCH 2/8] Upgrade rocm

---
 .github/workflows/regression_test_rocm.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/regression_test_rocm.yml b/.github/workflows/regression_test_rocm.yml
index 5df97e1673..c12f2a61da 100644
--- a/.github/workflows/regression_test_rocm.yml
+++ b/.github/workflows/regression_test_rocm.yml
@@ -22,7 +22,7 @@ jobs:
         include:
           - name: ROCM Nightly
             runs-on: linux.rocm.gpu.gfx942.2
-            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3'
+            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm7.0'
             gpu-arch-type: "rocm"
             gpu-arch-version: "7.0"
             docker-image: pytorch/manylinux2_28-builder:rocm7.0

From 48f7fa4616ab473dd4d66eac22424a65be97224f Mon Sep 17 00:00:00 2001
From: jainapurva
Date: Thu, 9 Oct 2025 14:21:37 -0700
Subject: [PATCH 3/8] Add 6.3, 6.4 to rocm

---
 .github/workflows/regression_test_rocm.yml | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/regression_test_rocm.yml b/.github/workflows/regression_test_rocm.yml
index c12f2a61da..0326b2e912 100644
--- a/.github/workflows/regression_test_rocm.yml
+++ b/.github/workflows/regression_test_rocm.yml
@@ -20,12 +20,18 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - name: ROCM Nightly
+          - name: ROCM Nightly 6.3
             runs-on: linux.rocm.gpu.gfx942.2
-            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm7.0'
+            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3'
             gpu-arch-type: "rocm"
-            gpu-arch-version: "7.0"
-            docker-image: pytorch/manylinux2_28-builder:rocm7.0
+            gpu-arch-version: "6.3"
+            docker-image: pytorch/manylinux2_28-builder:rocm6.3
+          - name: ROCM Nightly 6.4
+            runs-on: linux.rocm.gpu.gfx942.2
+            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.4'
+            gpu-arch-type: "rocm"
+            gpu-arch-version: "6.4"
+            docker-image: pytorch/manylinux2_28-builder:rocm6.4

     permissions:
       id-token: write

From a4547891c146f8256f1e3eb8e684b2a38aa893bd Mon Sep 17 00:00:00 2001
From: jainapurva
Date: Thu, 9 Oct 2025 14:27:34 -0700
Subject: [PATCH 4/8] Revert 7.0 to 6.3

---
 .github/workflows/regression_test_rocm.yml | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/.github/workflows/regression_test_rocm.yml b/.github/workflows/regression_test_rocm.yml
index 0326b2e912..b98a052343 100644
--- a/.github/workflows/regression_test_rocm.yml
+++ b/.github/workflows/regression_test_rocm.yml
@@ -26,12 +26,6 @@ jobs:
             gpu-arch-type: "rocm"
             gpu-arch-version: "6.3"
             docker-image: pytorch/manylinux2_28-builder:rocm6.3
-          - name: ROCM Nightly 6.4
-            runs-on: linux.rocm.gpu.gfx942.2
-            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.4'
-            gpu-arch-type: "rocm"
-            gpu-arch-version: "6.4"
-            docker-image: pytorch/manylinux2_28-builder:rocm6.4

     permissions:
       id-token: write

From 45269e0a07ed02bd2001f7920ed7f8e97be8dae3 Mon Sep 17 00:00:00 2001
From: jainapurva
Date: Thu, 9 Oct 2025 19:11:13 -0700
Subject: [PATCH 5/8] Skip failing rocm tests

---
 .github/workflows/regression_test_rocm.yml  |  6 ++++++
 test/dtypes/test_floatx.py                  |  1 +
 test/dtypes/test_nf4.py                     |  2 ++
 test/dtypes/test_uintx.py                   |  3 +++
 test/integration/test_integration.py        | 21 +++++++++++++++++++++
 test/prototype/mx_formats/test_kernels.py   |  4 ++++
 test/prototype/mx_formats/test_mx_linear.py |  2 ++
 test/prototype/mx_formats/test_mx_tensor.py |  5 +++++
 test/prototype/test_blockwise_triton.py     |  3 +++
 test/prototype/test_quantized_training.py   |  5 +++++
 test/sparsity/test_sparse_api.py            |  3 +++
 11 files changed, 55 insertions(+)

diff --git a/.github/workflows/regression_test_rocm.yml b/.github/workflows/regression_test_rocm.yml
index b98a052343..b9f8d1f423 100644
--- a/.github/workflows/regression_test_rocm.yml
+++ b/.github/workflows/regression_test_rocm.yml
@@ -26,6 +26,12 @@ jobs:
             gpu-arch-type: "rocm"
             gpu-arch-version: "6.3"
             docker-image: pytorch/manylinux2_28-builder:rocm6.3
+          - name: ROCM Nightly 7.0
+            runs-on: linux.rocm.gpu.gfx942.2
+            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm7.0'
+            gpu-arch-type: "rocm"
+            gpu-arch-version: "7.0"
+            docker-image: pytorch/manylinux2_28-builder:rocm7.0

     permissions:
       id-token: write
diff --git a/test/dtypes/test_floatx.py b/test/dtypes/test_floatx.py
index ab4a13d24c..0cff75d309 100644
--- a/test/dtypes/test_floatx.py
+++ b/test/dtypes/test_floatx.py
@@ -73,6 +73,7 @@ def test_from_tc_floatx_correctness(self, ebits, mbits, device):

     @parametrize("ebits,mbits", _Floatx_DTYPES)
     @parametrize("device", _DEVICES)
+    @skip_if_rocm("ROCm enablement in progress")
     def test_from_scaled_tc_floatx_compile(self, ebits, mbits, device):
         M, N = 256, 64
         nbits = 1 + ebits + mbits
diff --git a/test/dtypes/test_nf4.py b/test/dtypes/test_nf4.py
index 2a711413f0..7dddf4b476 100644
--- a/test/dtypes/test_nf4.py
+++ b/test/dtypes/test_nf4.py
@@ -263,6 +263,7 @@ def test_smoketest_linear(self, dtype: torch.dtype):

     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     @parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32])
+    @skip_if_rocm("ROCm enablement in progress")
     def test_smoketest_linear_compile(self, dtype: torch.dtype):
         if (
             torch.cuda.is_available()
@@ -301,6 +302,7 @@ def test_empty_like(self, input_size: Union[Tuple[int], int]):

     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     @parametrize("compile", [False, True])
+    @skip_if_rocm("ROCm enablement in progress")
     def test_quantize_api(self, compile):
         nf4_linear = nn.Linear(512, 512, device="cuda")
         torchao.quantize_(nf4_linear, nf4_weight_only())
diff --git a/test/dtypes/test_uintx.py b/test/dtypes/test_uintx.py
index cb0c88b21c..ab417ab417 100644
--- a/test/dtypes/test_uintx.py
+++ b/test/dtypes/test_uintx.py
@@ -14,6 +14,7 @@
     dequantize_affine,
     quantize_affine,
 )
+from torchao.testing.utils import skip_if_rocm

 dtypes = (
     torch.uint1,
@@ -75,6 +76,7 @@ def test_uintx_quant_on_cpu_then_move_to_cuda(dtype, group_size):
 @pytest.mark.parametrize("group_size", group_sizes)
 @pytest.mark.parametrize("device", devices)
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@skip_if_rocm("ROCm enablement in progress")
 def test_uintx_weight_only_model_quant(dtype, group_size, device):
     scale = 512
     fp16 = Linear16(scale, device)
@@ -132,6 +134,7 @@ def test_uintx_target_dtype(dtype):

 @pytest.mark.parametrize("dtype", dtypes)
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="Need CUDA available")
+@skip_if_rocm("ROCm enablement in progress")
 def test_uintx_target_dtype_compile(dtype):
     linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
     # make sure it runs
diff --git a/test/integration/test_integration.py b/test/integration/test_integration.py
index f99cf4a1b4..b58e3eeb03 100644
--- a/test/integration/test_integration.py
+++ b/test/integration/test_integration.py
@@ -376,6 +376,7 @@ def test_swap(self):
         assert torch.allclose(y_ref, y)

     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @skip_if_rocm("ROCm enablement in progress")
     def test_weight_t_and_non_t_numerics_match(self):
         # verify that numerics match whether weight is stored
         # in transposed format (for cuBLAS) vs non-transposed format
@@ -586,6 +587,7 @@ def test_per_token_linear_cuda(self):
             self._test_per_token_linear_impl("cuda", dtype)

     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @skip_if_rocm("ROCm enablement in progress")
     def test__int_mm(self):
         # TODO(future): figure out what here needs to move to PT core,
         # if it's not already tested there
@@ -607,6 +609,7 @@ def test__int_mm(self):
         torch.testing.assert_close(y_ref, y_opt, atol=0, rtol=0)

     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @skip_if_rocm("ROCm enablement in progress")
     def test__int_mm_eager_and_torch_compile_numerics(self):
         def __int_mm_ref(x, w):
             x = x.cpu().to(torch.int32)
@@ -772,6 +775,7 @@ def _test_lin_weight_subclass_impl(
         )

     @parameterized.expand(COMMON_DEVICE_DTYPE)
+    @skip_if_rocm("ROCm enablement in progress")
     def test_int8_dynamic_quant_subclass(self, device, dtype):
         self._test_lin_weight_subclass_impl(
             Int8DynamicallyQuantizedLinearWeight.from_float,
@@ -781,6 +785,7 @@ def test_int8_dynamic_quant_subclass(self, device, dtype):
         )

     @parameterized.expand(COMMON_DEVICE_DTYPE)
+    @skip_if_rocm("ROCm enablement in progress")
     def test_int8_weight_only_quant_subclass(self, device, dtype):
         undo_recommended_configs()
         self._test_lin_weight_subclass_impl(
@@ -788,6 +793,7 @@ def test_int8_weight_only_quant_subclass(self, device, dtype):
         )

     @parameterized.expand(COMMON_DEVICE_DTYPE)
+    @skip_if_rocm("ROCm enablement in progress")
     def test_aq_int8_dynamic_quant_subclass(self, device, dtype):
         self._test_lin_weight_subclass_impl(
             AQInt8DynamicallyQuantizedLinearWeight.from_float,
@@ -810,6 +816,7 @@ def test_aq_int8_weight_only_quant_subclass(self, device, dtype):
         )

     @parameterized.expand(COMMON_DEVICE_DTYPE)
+    @skip_if_rocm("ROCm enablement in progress")
     def test_aq_int8_weight_only_quant_2_subclass(self, device, dtype):
         self._test_lin_weight_subclass_impl(
             AQInt8WeightOnlyQuantizedLinearWeight2.from_float,
@@ -819,6 +826,7 @@ def test_aq_int8_weight_only_quant_2_subclass(self, device, dtype):
         )

     @parameterized.expand(COMMON_DEVICE_DTYPE)
+    @skip_if_rocm("ROCm enablement in progress")
     def test_aq_int8_weight_only_quant_3_subclass(self, device, dtype):
         self._test_lin_weight_subclass_impl(
             AQInt8WeightOnlyQuantizedLinearWeight3.from_float,
@@ -990,6 +998,7 @@ def test_int8_dynamic_quant_subclass_api(self, device, dtype, act_mapping):

     @parameterized.expand(COMMON_DEVICE_DTYPE)
     @unittest.skipIf(is_fbcode(), "broken in fbcode")
+    @skip_if_rocm("ROCm enablement in progress")
     def test_int8_weight_only_quant_subclass_api(self, device, dtype):
         undo_recommended_configs()
         self._test_lin_weight_subclass_api_impl(
@@ -1006,6 +1015,7 @@ def test_int8_weight_only_quant_with_freeze(self, device, dtype):
         )

     @parameterized.expand(COMMON_DEVICE_DTYPE)
+    @skip_if_rocm("ROCm enablement in progress")
     def test_int4_weight_only_quant_subclass_api(self, device, dtype):
         if dtype != torch.bfloat16:
             self.skipTest(f"Fails for {dtype}")
@@ -1017,6 +1027,7 @@ def test_int4_weight_only_quant_subclass_api(self, device, dtype):
         )

     @parameterized.expand(COMMON_DEVICE_DTYPE)
+    @skip_if_rocm("ROCm enablement in progress")
     def test_int4_weight_only_hqq_quant_subclass_api(self, device, dtype):
         if dtype != torch.bfloat16:
             self.skipTest(f"Fails for {dtype}")
@@ -1168,6 +1179,7 @@ def test_weight_only_groupwise_embedding_quant(self):
     @parameterized.expand(COMMON_DEVICE_DTYPE)
     @torch.no_grad()
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @skip_if_rocm("ROCm enablement in progress")
     def test_weight_only_quant_force_mixed_mm(self, device, dtype):
         undo_recommended_configs()
         if device != "cuda":
@@ -1200,6 +1212,7 @@ def test_weight_only_quant_force_mixed_mm(self, device, dtype):

     @parameterized.expand(COMMON_DEVICE_DTYPE)
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @skip_if_rocm("ROCm enablement in progress")
     def test_weight_only_quant_use_mixed_mm(self, device, dtype):
         undo_recommended_configs()
         if device != "cuda":
@@ -1302,6 +1315,7 @@ def forward(self, x):
         is_fbcode(), "'PlainAQTTensorImpl' object has no attribute 'int_data'"
     )
     @torch.no_grad()
+    @skip_if_rocm("ROCm enablement in progress")
     def test_save_load_dqtensors(self, device, dtype):
         if device == "cpu":
             self.skipTest("indcutor failed for cpu right now")
@@ -1312,12 +1326,14 @@ def test_save_load_dqtensors(self, device, dtype):
     @parameterized.expand(COMMON_DEVICE_DTYPE)
     @torch.no_grad()
     @unittest.skipIf(is_fbcode(), "broken in fbcode")
+    @skip_if_rocm("ROCm enablement in progress")
     def test_save_load_int8woqtensors(self, device, dtype):
         undo_recommended_configs()
         self._test_handle_save_load_meta_impl(_int8wo_api, device, test_dtype=dtype)

     @parameterized.expand(COMMON_DEVICE_DTYPE)
     @torch.no_grad()
+    @skip_if_rocm("ROCm enablement in progress")
     def test_save_load_int4woqtensors(self, device, dtype):
         if dtype != torch.bfloat16:
             self.skipTest(f"Fails for {dtype}")
@@ -1326,6 +1342,7 @@ def test_save_load_int4woqtensors(self, device, dtype):

 class TorchCompileUnitTest(unittest.TestCase):
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @skip_if_rocm("ROCm enablement in progress")
     def test_fullgraph(self):
         lin_fp16 = nn.Linear(32, 16, device="cuda", dtype=torch.float16)
         lin_smooth = SmoothFakeDynamicallyQuantizedLinear.from_float(
@@ -1510,6 +1527,7 @@ def test_autoquant_one_input(self, device, dtype, m, k, n):
             ],
         )
     )
+    @skip_if_rocm("ROCm enablement in progress")
     def test_autoquant_compile(self, device, dtype, m1, m2, k, n):
         undo_recommended_configs()

@@ -1586,6 +1604,7 @@ def forward(self, x):
         assert len(_AUTOQUANT_CACHE) > 0

     @parameterized.expand(COMMON_DEVICE_DTYPE)
+    @skip_if_rocm("ROCm enablement in progress")
     def test_autoquant_manual(self, device, dtype):
         undo_recommended_configs()
         if device != "cuda" or not torch.cuda.is_available():
@@ -1635,6 +1654,7 @@ def test_autoquant_manual(self, device, dtype):
             ],
         )
     )
+    @skip_if_rocm("ROCm enablement in progress")
     def test_autoquant_kwargs(self, device, dtype, m1, m2, k, n):
         undo_recommended_configs()
         if device != "cuda" or not torch.cuda.is_available():
@@ -2057,6 +2077,7 @@ def run_benchmark_model(self, device):
         return benchmark_model(m_bf16, num_runs, example_inputs)

     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @skip_if_rocm("ROCm enablement in progress")
     def test_benchmark_model_cuda(self):
         assert self.run_benchmark_model("cuda") is not None

diff --git a/test/prototype/mx_formats/test_kernels.py b/test/prototype/mx_formats/test_kernels.py
index 1bd55e28f5..2752cf159e 100644
--- a/test/prototype/mx_formats/test_kernels.py
+++ b/test/prototype/mx_formats/test_kernels.py
@@ -43,6 +43,7 @@
 )
 from torchao.prototype.mx_formats.mx_tensor import ScaleCalculationMode, to_mx
 from torchao.prototype.mx_formats.utils import to_blocked
+from torchao.testing.utils import skip_if_rocm
 from torchao.utils import (
     is_sm_at_least_89,
     is_sm_at_least_100,
@@ -423,6 +424,7 @@ def test_fp6_e3m2_rounding(f32_val, f6_e3m2_enc, device):

 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.skipif(not has_triton(), reason="unsupported without triton")
+@skip_if_rocm("ROCm enablement in progress")
 def test_fp6_e2m3_pack_unpack():
     orig_vals = torch.Tensor([[0.0, 0.5, 7.5, -0.0], [-0.875, 1.0, -6.0, 0.125]]).to(
         "cuda"
@@ -438,6 +440,7 @@ def test_fp6_e2m3_pack_unpack():

 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.skipif(not has_triton(), reason="unsupported without triton")
+@skip_if_rocm("ROCm enablement in progress")
 def test_fp6_e3m2_pack_unpack():
     orig_vals = torch.Tensor([[0.0, 5.0, 28.0, -0.0], [-0.25, 0.1875, 0.0625, 8.0]]).to(
         "cuda"
@@ -480,6 +483,7 @@ def test_triton_mxfp8_dim1_randn(M, K):
         (128, 1),
     ],
 )
+@skip_if_rocm("ROCm enablement in progress")
 def test_rearrange(shape):
     scales = torch.randint(256, size=shape, device="cuda", dtype=torch.uint8)
     eager = to_blocked(scales, False)
diff --git a/test/prototype/mx_formats/test_mx_linear.py b/test/prototype/mx_formats/test_mx_linear.py
index c858657af6..096d65b0b3 100644
--- a/test/prototype/mx_formats/test_mx_linear.py
+++ b/test/prototype/mx_formats/test_mx_linear.py
@@ -25,6 +25,7 @@
 )
 from torchao.quantization import quantize_
 from torchao.quantization.utils import compute_error
+from torchao.testing.utils import skip_if_rocm
 from torchao.utils import (
     is_sm_at_least_89,
     is_sm_at_least_100,
@@ -265,6 +266,7 @@ def test_activation_checkpointing():
         ScaleCalculationMode.RCEIL,
     ],
 )
+@skip_if_rocm("ROCm enablement in progress")
 def test_linear_compile(
     hp_dtype, recipe_name, bias, mxfp8_cast_kernel_choice, scale_calculation_mode
 ):
diff --git a/test/prototype/mx_formats/test_mx_tensor.py b/test/prototype/mx_formats/test_mx_tensor.py
index aa02c11b3a..dab519f1f3 100644
--- a/test/prototype/mx_formats/test_mx_tensor.py
+++ b/test/prototype/mx_formats/test_mx_tensor.py
@@ -23,6 +23,7 @@
     to_dtype,
 )
 from torchao.quantization.utils import compute_error
+from torchao.testing.utils import skip_if_rocm
 from torchao.utils import (
     is_sm_at_least_89,
     is_sm_at_least_90,
@@ -347,6 +348,7 @@ def test_exponent_nan_in(elem_dtype):
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.parametrize("elem_dtype", SUPPORTED_ELEM_DTYPES)
 @pytest.mark.parametrize("pack_fp6", [False, True])
+@skip_if_rocm("ROCm enablement in progress")
 def test_exponent_nan_out(elem_dtype, pack_fp6):
     """
     If block exponent value is NaN, the MX tensor block value is NaN
@@ -471,6 +473,7 @@ def test_clone():
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.parametrize("elem_dtype", [DTYPE_FP6_E2M3, DTYPE_FP6_E3M2])
 @pytest.mark.parametrize("pack_fp6", [False, True])
+@skip_if_rocm("ROCm enablement in progress")
 def test_fp6_packing(elem_dtype, pack_fp6):
     x = torch.randn(1, 2, 4, device="cuda")
     block_size = 4
@@ -487,6 +490,7 @@ def test_fp6_packing(elem_dtype, pack_fp6):
 @pytest.mark.parametrize("elem_dtype", SUPPORTED_ELEM_DTYPES)
 @pytest.mark.parametrize("hp_dtype", [torch.float32, torch.bfloat16])
 @pytest.mark.parametrize("all_zeros", [False, True])
+@skip_if_rocm("ROCm enablement in progress")
 def test_to_mx_from_mx_compile_numerics(elem_dtype, hp_dtype, all_zeros):
     """
     Verifies that compile does not change numerics of MX casts
@@ -644,6 +648,7 @@ def to_f8(x):
 @pytest.mark.skipif(
     not torch_version_at_least("2.8.0"), reason="torch.compile requires PyTorch 2.8+"
 )
+@skip_if_rocm("ROCm enablement in progress")
 def test_to_blocked_from_blocked_roundtrip(shape, use_triton_kernel: bool):
     from torchao.prototype.mx_formats.utils import from_blocked, to_blocked

diff --git a/test/prototype/test_blockwise_triton.py b/test/prototype/test_blockwise_triton.py
index 89f8cf869e..06d8b3b0c8 100644
--- a/test/prototype/test_blockwise_triton.py
+++ b/test/prototype/test_blockwise_triton.py
@@ -17,6 +17,7 @@
     fp8_blockwise_weight_dequant,
     fp8_blockwise_weight_quant,
 )
+from torchao.testing.utils import skip_if_rocm
 from torchao.utils import is_sm_at_least_89

 BLOCKWISE_SIZE_MNK = [
@@ -37,6 +38,7 @@
     if is_sm_at_least_89()
     else [torch.float8_e5m2],
 )
+@skip_if_rocm("ROCm enablement in progress")
 def test_blockwise_quant_dequant(_, N, K, dtype):
     x = torch.randn(N, K).cuda()
     qx, s = fp8_blockwise_weight_quant(x, dtype=dtype)
@@ -59,6 +61,7 @@ def test_blockwise_quant_dequant(_, N, K, dtype):
     if is_sm_at_least_89()
     else [torch.float8_e5m2],
 )
+@skip_if_rocm("ROCm enablement in progress")
 def test_blockwise_fp8_gemm(M, N, K, dtype):
     A = torch.randn(M, K).cuda()
     B = torch.randn(N, K).cuda()
diff --git a/test/prototype/test_quantized_training.py b/test/prototype/test_quantized_training.py
index fa0edd694b..e73ce16c6e 100644
--- a/test/prototype/test_quantized_training.py
+++ b/test/prototype/test_quantized_training.py
@@ -34,6 +34,7 @@
     quantize_int8_rowwise,
 )
 from torchao.quantization.quant_api import quantize_
+from torchao.testing.utils import skip_if_rocm

 if common_utils.SEED is None:
     common_utils.SEED = 1234
@@ -101,6 +102,7 @@ def test_int8_weight_only_correctness(self, leading_dims, bias, device):
     @parametrize("leading_dims", [(), (2,), (2, 4)])
     @parametrize("bias", [False, True])
     @parametrize("device", _DEVICES)
+    @skip_if_rocm("ROCm enablement in progress")
     def test_int8_weight_only_compile(self, leading_dims, bias, device):
         _reset()
         embed_dim = 128
@@ -133,6 +135,7 @@ def test_int8_weight_only_compile(self, leading_dims, bias, device):

     @parametrize("compile", [False, True])
     @parametrize("device", _DEVICES)
+    @skip_if_rocm("ROCm enablement in progress")
     def test_int8_weight_only_training(self, compile, device):
         _reset()
         bsize = 4
@@ -183,6 +186,7 @@ def test_int8_weight_only_training(self, compile, device):
     )
     @parametrize("module_swap", [False, True])
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    @skip_if_rocm("ROCm enablement in progress")
     def test_int8_mixed_precision_training(self, compile, config, module_swap):
         _reset()
         bsize = 64
@@ -296,6 +300,7 @@ def world_size(self) -> int:
         return _FSDP_WORLD_SIZE

     @skip_if_lt_x_gpu(_FSDP_WORLD_SIZE)
+    @skip_if_rocm("ROCm enablement in progress")
     def test_fsdp2_correctness(self):
         mp_policy = MixedPrecisionPolicy()

diff --git a/test/sparsity/test_sparse_api.py b/test/sparsity/test_sparse_api.py
index 003a50c4d1..4ee5b4530e 100644
--- a/test/sparsity/test_sparse_api.py
+++ b/test/sparsity/test_sparse_api.py
@@ -22,6 +22,7 @@
     quantize_,
 )
 from torchao.sparsity import apply_fake_sparsity, semi_sparse_weight, sparsify_
+from torchao.testing.utils import skip_if_rocm
 from torchao.utils import is_sm_at_least_90

 logging.basicConfig(
@@ -194,6 +195,7 @@ class TestBlockSparseWeight(common_utils.TestCase):
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     @common_utils.parametrize("compile", [True, False])
     @common_utils.parametrize("input_shape", [1, 1024])
+    @skip_if_rocm("ROCm enablement in progress")
     def test_sparse(self, compile, input_shape):
         input = torch.rand((input_shape, 1024)).half().cuda()
         model = (
@@ -227,6 +229,7 @@ def test_sparse(self, compile, input_shape):
 class TestQuantBlockSparseWeight(common_utils.TestCase):
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     @common_utils.parametrize("compile", [True, False])
+    @skip_if_rocm("ROCm enablement in progress")
     def test_sparse(self, compile):
         input = torch.rand((256, 128)).to(torch.bfloat16).cuda()
         model = (

From 205e2b9c89739d865893dadb603f629581a103c9 Mon Sep 17 00:00:00 2001
From: jainapurva
Date: Thu, 9 Oct 2025 19:12:48 -0700
Subject: [PATCH 6/8] remove temp code

---
 .github/workflows/regression_test_rocm.yml | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/.github/workflows/regression_test_rocm.yml b/.github/workflows/regression_test_rocm.yml
index b9f8d1f423..a9fdb68c13 100644
--- a/.github/workflows/regression_test_rocm.yml
+++ b/.github/workflows/regression_test_rocm.yml
@@ -20,18 +20,12 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - name: ROCM Nightly 6.3
+          - name: ROCM Nightly
             runs-on: linux.rocm.gpu.gfx942.2
             torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3'
             gpu-arch-type: "rocm"
             gpu-arch-version: "6.3"
             docker-image: pytorch/manylinux2_28-builder:rocm6.3
-          - name: ROCM Nightly 7.0
-            runs-on: linux.rocm.gpu.gfx942.2
-            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm7.0'
-            gpu-arch-type: "rocm"
-            gpu-arch-version: "7.0"
-            docker-image: pytorch/manylinux2_28-builder:rocm7.0

     permissions:
       id-token: write

From f57e21fd888c8a6f53ccdc5689e37c7762d223ea Mon Sep 17 00:00:00 2001
From: jainapurva
Date: Thu, 9 Oct 2025 19:32:10 -0700
Subject: [PATCH 7/8] Remove skipped tests

---
 test/dtypes/test_floatx.py                  |  1 -
 test/dtypes/test_nf4.py                     |  2 --
 test/dtypes/test_uintx.py                   |  3 ---
 test/integration/test_integration.py        | 21 ---------------------
 test/prototype/mx_formats/test_kernels.py   |  4 ----
 test/prototype/mx_formats/test_mx_linear.py |  2 --
 test/prototype/mx_formats/test_mx_tensor.py |  5 -----
 test/prototype/test_blockwise_triton.py     |  3 ---
 test/prototype/test_quantized_training.py   |  5 -----
 test/sparsity/test_sparse_api.py            |  3 ---
 10 files changed, 49 deletions(-)

diff --git a/test/dtypes/test_floatx.py b/test/dtypes/test_floatx.py
index 0cff75d309..ab4a13d24c 100644
--- a/test/dtypes/test_floatx.py
+++ b/test/dtypes/test_floatx.py
@@ -73,7 +73,6 @@ def test_from_tc_floatx_correctness(self, ebits, mbits, device):

     @parametrize("ebits,mbits", _Floatx_DTYPES)
     @parametrize("device", _DEVICES)
-    @skip_if_rocm("ROCm enablement in progress")
     def test_from_scaled_tc_floatx_compile(self, ebits, mbits, device):
         M, N = 256, 64
         nbits = 1 + ebits + mbits
diff --git a/test/dtypes/test_nf4.py b/test/dtypes/test_nf4.py
index 7dddf4b476..2a711413f0 100644
--- a/test/dtypes/test_nf4.py
+++ b/test/dtypes/test_nf4.py
@@ -263,7 +263,6 @@ def test_smoketest_linear(self, dtype: torch.dtype):

     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     @parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32])
-    @skip_if_rocm("ROCm enablement in progress")
     def test_smoketest_linear_compile(self, dtype: torch.dtype):
         if (
             torch.cuda.is_available()
@@ -302,7 +301,6 @@ def test_empty_like(self, input_size: Union[Tuple[int], int]):

     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     @parametrize("compile", [False, True])
-    @skip_if_rocm("ROCm enablement in progress")
     def test_quantize_api(self, compile):
         nf4_linear = nn.Linear(512, 512, device="cuda")
         torchao.quantize_(nf4_linear, nf4_weight_only())
diff --git a/test/dtypes/test_uintx.py b/test/dtypes/test_uintx.py
index ab417ab417..cb0c88b21c 100644
--- a/test/dtypes/test_uintx.py
+++ b/test/dtypes/test_uintx.py
@@ -14,7 +14,6 @@
     dequantize_affine,
     quantize_affine,
 )
-from torchao.testing.utils import skip_if_rocm

 dtypes = (
     torch.uint1,
@@ -76,7 +75,6 @@ def test_uintx_quant_on_cpu_then_move_to_cuda(dtype, group_size):
 @pytest.mark.parametrize("group_size", group_sizes)
 @pytest.mark.parametrize("device", devices)
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
-@skip_if_rocm("ROCm enablement in progress")
 def test_uintx_weight_only_model_quant(dtype, group_size, device):
     scale = 512
     fp16 = Linear16(scale, device)
@@ -134,7 +132,6 @@ def test_uintx_target_dtype(dtype):

 @pytest.mark.parametrize("dtype", dtypes)
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="Need CUDA available")
-@skip_if_rocm("ROCm enablement in progress")
 def test_uintx_target_dtype_compile(dtype):
     linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
     # make sure it runs
diff --git a/test/integration/test_integration.py b/test/integration/test_integration.py
index b58e3eeb03..f99cf4a1b4 100644
--- a/test/integration/test_integration.py
+++ b/test/integration/test_integration.py
@@ -376,7 +376,6 @@ def test_swap(self):
         assert torch.allclose(y_ref, y)

     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
-    @skip_if_rocm("ROCm enablement in progress")
     def test_weight_t_and_non_t_numerics_match(self):
         # verify that numerics match whether weight is stored
         # in transposed format (for cuBLAS) vs non-transposed format
@@ -587,7 +586,6 @@ def test_per_token_linear_cuda(self):
             self._test_per_token_linear_impl("cuda", dtype)

     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
-    @skip_if_rocm("ROCm enablement in progress")
     def test__int_mm(self):
         # TODO(future): figure out what here needs to move to PT core,
         # if it's not already tested there
@@ -609,7 +607,6 @@ def test__int_mm(self):
         torch.testing.assert_close(y_ref, y_opt, atol=0, rtol=0)

     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
-    @skip_if_rocm("ROCm enablement in progress")
     def test__int_mm_eager_and_torch_compile_numerics(self):
         def __int_mm_ref(x, w):
             x = x.cpu().to(torch.int32)
@@ -775,7 +772,6 @@ def _test_lin_weight_subclass_impl(
         )

     @parameterized.expand(COMMON_DEVICE_DTYPE)
-    @skip_if_rocm("ROCm enablement in progress")
     def test_int8_dynamic_quant_subclass(self, device, dtype):
         self._test_lin_weight_subclass_impl(
             Int8DynamicallyQuantizedLinearWeight.from_float,
@@ -785,7 +781,6 @@ def test_int8_dynamic_quant_subclass(self, device, dtype):
         )

     @parameterized.expand(COMMON_DEVICE_DTYPE)
-    @skip_if_rocm("ROCm enablement in progress")
     def test_int8_weight_only_quant_subclass(self, device, dtype):
         undo_recommended_configs()
         self._test_lin_weight_subclass_impl(
@@ -793,7 +788,6 @@ def test_int8_weight_only_quant_subclass(self, device, dtype):
         )

     @parameterized.expand(COMMON_DEVICE_DTYPE)
-    @skip_if_rocm("ROCm enablement in progress")
     def test_aq_int8_dynamic_quant_subclass(self, device, dtype):
         self._test_lin_weight_subclass_impl(
             AQInt8DynamicallyQuantizedLinearWeight.from_float,
@@ -816,7 +810,6 @@ def test_aq_int8_weight_only_quant_subclass(self, device, dtype):
         )

     @parameterized.expand(COMMON_DEVICE_DTYPE)
-    @skip_if_rocm("ROCm enablement in progress")
     def test_aq_int8_weight_only_quant_2_subclass(self, device, dtype):
         self._test_lin_weight_subclass_impl(
             AQInt8WeightOnlyQuantizedLinearWeight2.from_float,
@@ -826,7 +819,6 @@ def test_aq_int8_weight_only_quant_2_subclass(self, device, dtype):
         )

     @parameterized.expand(COMMON_DEVICE_DTYPE)
-    @skip_if_rocm("ROCm enablement in progress")
     def test_aq_int8_weight_only_quant_3_subclass(self, device, dtype):
         self._test_lin_weight_subclass_impl(
             AQInt8WeightOnlyQuantizedLinearWeight3.from_float,
@@ -998,7 +990,6 @@ def test_int8_dynamic_quant_subclass_api(self, device, dtype, act_mapping):

     @parameterized.expand(COMMON_DEVICE_DTYPE)
     @unittest.skipIf(is_fbcode(), "broken in fbcode")
-    @skip_if_rocm("ROCm enablement in progress")
     def test_int8_weight_only_quant_subclass_api(self, device, dtype):
         undo_recommended_configs()
         self._test_lin_weight_subclass_api_impl(
@@ -1015,7 +1006,6 @@ def test_int8_weight_only_quant_with_freeze(self, device, dtype):
         )

     @parameterized.expand(COMMON_DEVICE_DTYPE)
-    @skip_if_rocm("ROCm enablement in progress")
     def test_int4_weight_only_quant_subclass_api(self, device, dtype):
         if dtype != torch.bfloat16:
             self.skipTest(f"Fails for {dtype}")
@@ -1027,7 +1017,6 @@ def test_int4_weight_only_quant_subclass_api(self, device, dtype):
         )

     @parameterized.expand(COMMON_DEVICE_DTYPE)
-    @skip_if_rocm("ROCm enablement in progress")
     def test_int4_weight_only_hqq_quant_subclass_api(self, device, dtype):
         if dtype != torch.bfloat16:
             self.skipTest(f"Fails for {dtype}")
@@ -1179,7 +1168,6 @@ def test_weight_only_groupwise_embedding_quant(self):
     @parameterized.expand(COMMON_DEVICE_DTYPE)
     @torch.no_grad()
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
-    @skip_if_rocm("ROCm enablement in progress")
     def test_weight_only_quant_force_mixed_mm(self, device, dtype):
         undo_recommended_configs()
         if device != "cuda":
@@ -1212,7 +1200,6 @@ def test_weight_only_quant_force_mixed_mm(self, device, dtype):

     @parameterized.expand(COMMON_DEVICE_DTYPE)
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
-    @skip_if_rocm("ROCm enablement in progress")
     def test_weight_only_quant_use_mixed_mm(self, device, dtype):
         undo_recommended_configs()
         if device != "cuda":
@@ -1315,7 +1302,6 @@ def forward(self, x):
         is_fbcode(), "'PlainAQTTensorImpl' object has no attribute 'int_data'"
     )
     @torch.no_grad()
-    @skip_if_rocm("ROCm enablement in progress")
     def test_save_load_dqtensors(self, device, dtype):
         if device == "cpu":
             self.skipTest("indcutor failed for cpu right now")
@@ -1326,14 +1312,12 @@ def test_save_load_dqtensors(self, device, dtype):
     @parameterized.expand(COMMON_DEVICE_DTYPE)
     @torch.no_grad()
     @unittest.skipIf(is_fbcode(), "broken in fbcode")
-    @skip_if_rocm("ROCm enablement in progress")
     def test_save_load_int8woqtensors(self, device, dtype):
         undo_recommended_configs()
         self._test_handle_save_load_meta_impl(_int8wo_api, device, test_dtype=dtype)

     @parameterized.expand(COMMON_DEVICE_DTYPE)
     @torch.no_grad()
-    @skip_if_rocm("ROCm enablement in progress")
     def test_save_load_int4woqtensors(self, device, dtype):
         if dtype != torch.bfloat16:
             self.skipTest(f"Fails for {dtype}")
@@ -1342,7 +1326,6 @@ def test_save_load_int4woqtensors(self, device, dtype):

 class TorchCompileUnitTest(unittest.TestCase):
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
-    @skip_if_rocm("ROCm enablement in progress")
     def test_fullgraph(self):
         lin_fp16 = nn.Linear(32, 16, device="cuda", dtype=torch.float16)
         lin_smooth = SmoothFakeDynamicallyQuantizedLinear.from_float(
@@ -1527,7 +1510,6 @@ def test_autoquant_one_input(self, device, dtype, m, k, n):
             ],
         )
     )
-    @skip_if_rocm("ROCm enablement in progress")
     def test_autoquant_compile(self, device, dtype, m1, m2, k, n):
         undo_recommended_configs()

@@ -1604,7 +1586,6 @@ def forward(self, x):
         assert len(_AUTOQUANT_CACHE) > 0

     @parameterized.expand(COMMON_DEVICE_DTYPE)
-    @skip_if_rocm("ROCm enablement in progress")
     def test_autoquant_manual(self, device, dtype):
         undo_recommended_configs()
         if device != "cuda" or not torch.cuda.is_available():
@@ -1654,7 +1635,6 @@ def test_autoquant_manual(self, device, dtype):
             ],
         )
     )
-    @skip_if_rocm("ROCm enablement in progress")
     def test_autoquant_kwargs(self, device, dtype, m1, m2, k, n):
         undo_recommended_configs()
         if device != "cuda" or not torch.cuda.is_available():
@@ -2077,7 +2057,6 @@ def run_benchmark_model(self, device):
         return benchmark_model(m_bf16, num_runs, example_inputs)

     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
-    @skip_if_rocm("ROCm enablement in progress")
     def test_benchmark_model_cuda(self):
         assert self.run_benchmark_model("cuda") is not None

diff --git a/test/prototype/mx_formats/test_kernels.py b/test/prototype/mx_formats/test_kernels.py
index 2752cf159e..1bd55e28f5 100644
--- a/test/prototype/mx_formats/test_kernels.py
+++ b/test/prototype/mx_formats/test_kernels.py
@@ -43,7 +43,6 @@
 )
 from torchao.prototype.mx_formats.mx_tensor import ScaleCalculationMode, to_mx
 from torchao.prototype.mx_formats.utils import to_blocked
-from torchao.testing.utils import skip_if_rocm
 from torchao.utils import (
     is_sm_at_least_89,
     is_sm_at_least_100,
@@ -424,7 +423,6 @@ def test_fp6_e3m2_rounding(f32_val, f6_e3m2_enc, device):

 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.skipif(not has_triton(), reason="unsupported without triton")
-@skip_if_rocm("ROCm enablement in progress")
 def test_fp6_e2m3_pack_unpack():
     orig_vals = torch.Tensor([[0.0, 0.5, 7.5, -0.0], [-0.875, 1.0, -6.0, 0.125]]).to(
         "cuda"
@@ -440,7 +438,6 @@ def test_fp6_e2m3_pack_unpack():

 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.skipif(not has_triton(), reason="unsupported without triton")
-@skip_if_rocm("ROCm enablement in progress")
 def test_fp6_e3m2_pack_unpack():
     orig_vals = torch.Tensor([[0.0, 5.0, 28.0, -0.0], [-0.25, 0.1875, 0.0625, 8.0]]).to(
         "cuda"
@@ -483,7 +480,6 @@ def test_triton_mxfp8_dim1_randn(M, K):
         (128, 1),
     ],
 )
-@skip_if_rocm("ROCm enablement in progress")
 def test_rearrange(shape):
     scales = torch.randint(256, size=shape, device="cuda", dtype=torch.uint8)
     eager = to_blocked(scales, False)
diff --git a/test/prototype/mx_formats/test_mx_linear.py b/test/prototype/mx_formats/test_mx_linear.py
index 096d65b0b3..c858657af6 100644
--- a/test/prototype/mx_formats/test_mx_linear.py
+++ b/test/prototype/mx_formats/test_mx_linear.py
@@ -25,7 +25,6 @@
 )
 from torchao.quantization import quantize_
 from torchao.quantization.utils import compute_error
-from torchao.testing.utils import skip_if_rocm
 from torchao.utils import (
     is_sm_at_least_89,
     is_sm_at_least_100,
@@ -266,7 +265,6 @@ def test_activation_checkpointing():
         ScaleCalculationMode.RCEIL,
     ],
 )
-@skip_if_rocm("ROCm enablement in progress")
 def test_linear_compile(
     hp_dtype, recipe_name, bias, mxfp8_cast_kernel_choice, scale_calculation_mode
 ):
diff --git a/test/prototype/mx_formats/test_mx_tensor.py b/test/prototype/mx_formats/test_mx_tensor.py
index dab519f1f3..aa02c11b3a 100644
--- a/test/prototype/mx_formats/test_mx_tensor.py
+++ b/test/prototype/mx_formats/test_mx_tensor.py
@@ -23,7 +23,6 @@
     to_dtype,
 )
 from torchao.quantization.utils import compute_error
-from torchao.testing.utils import skip_if_rocm
 from torchao.utils import (
     is_sm_at_least_89,
     is_sm_at_least_90,
@@ -348,7 +347,6 @@ def test_exponent_nan_in(elem_dtype):
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.parametrize("elem_dtype", SUPPORTED_ELEM_DTYPES)
 @pytest.mark.parametrize("pack_fp6", [False, True])
-@skip_if_rocm("ROCm enablement in progress")
 def test_exponent_nan_out(elem_dtype, pack_fp6):
     """
     If block exponent value is NaN, the MX tensor block value is NaN
@@ -473,7 +471,6 @@ def test_clone():
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.parametrize("elem_dtype", [DTYPE_FP6_E2M3, DTYPE_FP6_E3M2])
 @pytest.mark.parametrize("pack_fp6", [False, True])
-@skip_if_rocm("ROCm enablement in progress")
 def test_fp6_packing(elem_dtype, pack_fp6):
     x = torch.randn(1, 2, 4, device="cuda")
     block_size = 4
@@ -490,7 +487,6 @@ def test_fp6_packing(elem_dtype, pack_fp6):
 @pytest.mark.parametrize("elem_dtype", SUPPORTED_ELEM_DTYPES)
 @pytest.mark.parametrize("hp_dtype", [torch.float32, torch.bfloat16])
 @pytest.mark.parametrize("all_zeros", [False, True])
-@skip_if_rocm("ROCm enablement in progress")
 def test_to_mx_from_mx_compile_numerics(elem_dtype, hp_dtype, all_zeros):
     """
     Verifies that compile does not change numerics of MX casts
@@ -648,7 +644,6 @@ def to_f8(x):
 @pytest.mark.skipif(
     not torch_version_at_least("2.8.0"), reason="torch.compile requires PyTorch 2.8+"
 )
-@skip_if_rocm("ROCm enablement in progress")
 def test_to_blocked_from_blocked_roundtrip(shape, use_triton_kernel: bool):
     from torchao.prototype.mx_formats.utils import from_blocked, to_blocked

diff --git a/test/prototype/test_blockwise_triton.py b/test/prototype/test_blockwise_triton.py
index 06d8b3b0c8..89f8cf869e 100644
--- a/test/prototype/test_blockwise_triton.py
+++ b/test/prototype/test_blockwise_triton.py
@@ -17,7 +17,6 @@
     fp8_blockwise_weight_dequant,
     fp8_blockwise_weight_quant,
 )
-from torchao.testing.utils import skip_if_rocm
 from torchao.utils import is_sm_at_least_89

 BLOCKWISE_SIZE_MNK = [
@@ -38,7 +37,6 @@
     if is_sm_at_least_89()
     else [torch.float8_e5m2],
 )
-@skip_if_rocm("ROCm enablement in progress")
 def test_blockwise_quant_dequant(_, N, K, dtype):
     x = torch.randn(N, K).cuda()
     qx, s = fp8_blockwise_weight_quant(x, dtype=dtype)
@@ -61,7 +59,6 @@ def test_blockwise_quant_dequant(_, N, K, dtype):
     if is_sm_at_least_89()
     else [torch.float8_e5m2],
 )
-@skip_if_rocm("ROCm enablement in progress")
 def test_blockwise_fp8_gemm(M, N, K, dtype):
     A = torch.randn(M, K).cuda()
     B = torch.randn(N, K).cuda()
diff --git a/test/prototype/test_quantized_training.py b/test/prototype/test_quantized_training.py
index e73ce16c6e..fa0edd694b 100644
--- a/test/prototype/test_quantized_training.py
+++ b/test/prototype/test_quantized_training.py
@@ -34,7 +34,6 @@
     quantize_int8_rowwise,
 )
 from torchao.quantization.quant_api import quantize_
-from torchao.testing.utils import skip_if_rocm

 if common_utils.SEED is None:
     common_utils.SEED = 1234
@@ -102,7 +101,6 @@ def test_int8_weight_only_correctness(self, leading_dims, bias, device):
     @parametrize("leading_dims", [(), (2,), (2, 4)])
     @parametrize("bias", [False, True])
     @parametrize("device", _DEVICES)
-    @skip_if_rocm("ROCm enablement in progress")
     def test_int8_weight_only_compile(self, leading_dims, bias, device):
         _reset()
         embed_dim = 128
@@ -135,7 +133,6 @@ def test_int8_weight_only_compile(self, leading_dims, bias, device):

     @parametrize("compile", [False, True])
     @parametrize("device", _DEVICES)
-    @skip_if_rocm("ROCm enablement in progress")
     def test_int8_weight_only_training(self, compile, device):
         _reset()
         bsize = 4
@@ -186,7 +183,6 @@ def test_int8_weight_only_training(self, compile, device):
     )
     @parametrize("module_swap", [False, True])
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
-    @skip_if_rocm("ROCm enablement in progress")
     def test_int8_mixed_precision_training(self, compile, config, module_swap):
         _reset()
         bsize = 64
@@ -300,7 +296,6 @@ def world_size(self) -> int:
         return _FSDP_WORLD_SIZE

     @skip_if_lt_x_gpu(_FSDP_WORLD_SIZE)
-    @skip_if_rocm("ROCm enablement in progress")
     def test_fsdp2_correctness(self):
         mp_policy = MixedPrecisionPolicy()

diff --git a/test/sparsity/test_sparse_api.py b/test/sparsity/test_sparse_api.py
index 4ee5b4530e..003a50c4d1 100644
--- a/test/sparsity/test_sparse_api.py
+++ b/test/sparsity/test_sparse_api.py
@@ -22,7 +22,6 @@
     quantize_,
 )
 from torchao.sparsity import apply_fake_sparsity, semi_sparse_weight, sparsify_
-from torchao.testing.utils import skip_if_rocm
 from torchao.utils import is_sm_at_least_90

 logging.basicConfig(
@@ -195,7 +194,6 @@ class TestBlockSparseWeight(common_utils.TestCase):
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     @common_utils.parametrize("compile", [True, False])
     @common_utils.parametrize("input_shape", [1, 1024])
-    @skip_if_rocm("ROCm enablement in progress")
     def test_sparse(self, compile, input_shape):
         input = torch.rand((input_shape, 1024)).half().cuda()
         model = (
@@ -229,7 +227,6 @@ def test_sparse(self, compile, input_shape):
 class TestQuantBlockSparseWeight(common_utils.TestCase):
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     @common_utils.parametrize("compile", [True, False])
-    @skip_if_rocm("ROCm enablement in progress")
     def test_sparse(self, compile):
         input = torch.rand((256, 128)).to(torch.bfloat16).cuda()
         model = (

From 9396fc8222c52b595bed1bbe53cb4ba4bd4d38cc Mon Sep 17 00:00:00 2001
From: jainapurva
Date: Thu, 9 Oct 2025 19:34:14 -0700
Subject: [PATCH 8/8] Add 7.0 rocm

---
 .github/workflows/regression_test_rocm.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/regression_test_rocm.yml b/.github/workflows/regression_test_rocm.yml
index a9fdb68c13..c12f2a61da 100644
--- a/.github/workflows/regression_test_rocm.yml
+++ b/.github/workflows/regression_test_rocm.yml
@@ -22,10 +22,10 @@ jobs:
         include:
           - name: ROCM Nightly
             runs-on: linux.rocm.gpu.gfx942.2
-            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3'
+            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm7.0'
             gpu-arch-type: "rocm"
-            gpu-arch-version: "6.3"
-            docker-image: pytorch/manylinux2_28-builder:rocm6.3
+            gpu-arch-version: "7.0"
+            docker-image: pytorch/manylinux2_28-builder:rocm7.0
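
Note: patches 5 and 7 above add and then remove a skip_if_rocm marker imported from
torchao.testing.utils; its definition is not part of this series. As a minimal, hedged
sketch only (an assumption for illustration, not the actual torchao implementation), such
a decorator could look like the following:

    import functools
    import unittest

    import torch

    def skip_if_rocm(message="ROCm enablement in progress"):
        """Skip the decorated test when running on a ROCm (HIP) build of PyTorch."""

        def decorator(test_func):
            @functools.wraps(test_func)
            def wrapper(*args, **kwargs):
                # torch.version.hip is a version string on ROCm builds and None otherwise.
                if torch.version.hip is not None:
                    raise unittest.SkipTest(message)
                return test_func(*args, **kwargs)

            return wrapper

        return decorator

It is applied as @skip_if_rocm("ROCm enablement in progress") on a test function or method,
which is how the patches above use it; raising unittest.SkipTest inside the wrapper works
under both unittest and pytest runners.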