From 2cad665a25a418cd41a6b944531a7ce44496719b Mon Sep 17 00:00:00 2001
From: jainapurva
Date: Thu, 9 Oct 2025 07:06:03 -0700
Subject: [PATCH 1/8] Upgrade rocm

---
 .github/workflows/regression_test_rocm.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/regression_test_rocm.yml b/.github/workflows/regression_test_rocm.yml
index a9fdb68c13..5df97e1673 100644
--- a/.github/workflows/regression_test_rocm.yml
+++ b/.github/workflows/regression_test_rocm.yml
@@ -24,8 +24,8 @@ jobs:
             runs-on: linux.rocm.gpu.gfx942.2
             torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3'
             gpu-arch-type: "rocm"
-            gpu-arch-version: "6.3"
-            docker-image: pytorch/manylinux2_28-builder:rocm6.3
+            gpu-arch-version: "7.0"
+            docker-image: pytorch/manylinux2_28-builder:rocm7.0

     permissions:
       id-token: write

From 01a9e19b0473a4b054115fa348cf1d3f3fbfddc1 Mon Sep 17 00:00:00 2001
From: jainapurva
Date: Thu, 9 Oct 2025 07:10:07 -0700
Subject: [PATCH 2/8] Upgrade rocm

---
 .github/workflows/regression_test_rocm.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/regression_test_rocm.yml b/.github/workflows/regression_test_rocm.yml
index 5df97e1673..c12f2a61da 100644
--- a/.github/workflows/regression_test_rocm.yml
+++ b/.github/workflows/regression_test_rocm.yml
@@ -22,7 +22,7 @@ jobs:
         include:
           - name: ROCM Nightly
             runs-on: linux.rocm.gpu.gfx942.2
-            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3'
+            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm7.0'
             gpu-arch-type: "rocm"
             gpu-arch-version: "7.0"
             docker-image: pytorch/manylinux2_28-builder:rocm7.0

From 48f7fa4616ab473dd4d66eac22424a65be97224f Mon Sep 17 00:00:00 2001
From: jainapurva
Date: Thu, 9 Oct 2025 14:21:37 -0700
Subject: [PATCH 3/8] Add 6.3, 6.4 to rocm

---
 .github/workflows/regression_test_rocm.yml | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/regression_test_rocm.yml b/.github/workflows/regression_test_rocm.yml
index c12f2a61da..0326b2e912 100644
--- a/.github/workflows/regression_test_rocm.yml
+++ b/.github/workflows/regression_test_rocm.yml
@@ -20,12 +20,18 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - name: ROCM Nightly
+          - name: ROCM Nightly 6.3
             runs-on: linux.rocm.gpu.gfx942.2
-            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm7.0'
+            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3'
             gpu-arch-type: "rocm"
-            gpu-arch-version: "7.0"
-            docker-image: pytorch/manylinux2_28-builder:rocm7.0
+            gpu-arch-version: "6.3"
+            docker-image: pytorch/manylinux2_28-builder:rocm6.3
+          - name: ROCM Nightly 6.4
+            runs-on: linux.rocm.gpu.gfx942.2
+            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.4'
+            gpu-arch-type: "rocm"
+            gpu-arch-version: "6.4"
+            docker-image: pytorch/manylinux2_28-builder:rocm6.4

     permissions:
       id-token: write

From a4547891c146f8256f1e3eb8e684b2a38aa893bd Mon Sep 17 00:00:00 2001
From: jainapurva
Date: Thu, 9 Oct 2025 14:27:34 -0700
Subject: [PATCH 4/8] Revert 7.0 to 6.3

---
 .github/workflows/regression_test_rocm.yml | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/.github/workflows/regression_test_rocm.yml b/.github/workflows/regression_test_rocm.yml
index 0326b2e912..b98a052343 100644
--- a/.github/workflows/regression_test_rocm.yml
+++ b/.github/workflows/regression_test_rocm.yml
@@ -26,12 +26,6 @@ jobs:
             gpu-arch-type: "rocm"
             gpu-arch-version: "6.3"
             docker-image: pytorch/manylinux2_28-builder:rocm6.3
-          - name: ROCM Nightly 6.4
-            runs-on: linux.rocm.gpu.gfx942.2
-            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.4'
-            gpu-arch-type: "rocm"
-            gpu-arch-version: "6.4"
-            docker-image: pytorch/manylinux2_28-builder:rocm6.4

     permissions:
       id-token: write

From 45269e0a07ed02bd2001f7920ed7f8e97be8dae3 Mon Sep 17 00:00:00 2001
From: jainapurva
Date: Thu, 9 Oct 2025 19:11:13 -0700
Subject: [PATCH 5/8] Skip failing rocm tests

---
 .github/workflows/regression_test_rocm.yml  |  6 ++++++
 test/dtypes/test_floatx.py                  |  1 +
 test/dtypes/test_nf4.py                     |  2 ++
 test/dtypes/test_uintx.py                   |  3 +++
 test/integration/test_integration.py        | 21 +++++++++++++++++++++
 test/prototype/mx_formats/test_kernels.py   |  4 ++++
 test/prototype/mx_formats/test_mx_linear.py |  2 ++
 test/prototype/mx_formats/test_mx_tensor.py |  5 +++++
 test/prototype/test_blockwise_triton.py     |  3 +++
 test/prototype/test_quantized_training.py   |  5 +++++
 test/sparsity/test_sparse_api.py            |  3 +++
 11 files changed, 55 insertions(+)

diff --git a/.github/workflows/regression_test_rocm.yml b/.github/workflows/regression_test_rocm.yml
index b98a052343..b9f8d1f423 100644
--- a/.github/workflows/regression_test_rocm.yml
+++ b/.github/workflows/regression_test_rocm.yml
@@ -26,6 +26,12 @@ jobs:
             gpu-arch-type: "rocm"
             gpu-arch-version: "6.3"
             docker-image: pytorch/manylinux2_28-builder:rocm6.3
+          - name: ROCM Nightly 7.0
+            runs-on: linux.rocm.gpu.gfx942.2
+            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm7.0'
+            gpu-arch-type: "rocm"
+            gpu-arch-version: "7.0"
+            docker-image: pytorch/manylinux2_28-builder:rocm7.0

     permissions:
       id-token: write
diff --git a/test/dtypes/test_floatx.py b/test/dtypes/test_floatx.py
index ab4a13d24c..0cff75d309 100644
--- a/test/dtypes/test_floatx.py
+++ b/test/dtypes/test_floatx.py
@@ -73,6 +73,7 @@ def test_from_tc_floatx_correctness(self, ebits, mbits, device):

     @parametrize("ebits,mbits", _Floatx_DTYPES)
     @parametrize("device", _DEVICES)
+    @skip_if_rocm("ROCm enablement in progress")
     def test_from_scaled_tc_floatx_compile(self, ebits, mbits, device):
         M, N = 256, 64
         nbits = 1 + ebits + mbits
diff --git a/test/dtypes/test_nf4.py b/test/dtypes/test_nf4.py
index 2a711413f0..7dddf4b476 100644
--- a/test/dtypes/test_nf4.py
+++ b/test/dtypes/test_nf4.py
@@ -263,6 +263,7 @@ def test_smoketest_linear(self, dtype: torch.dtype):

     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     @parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32])
+    @skip_if_rocm("ROCm enablement in progress")
     def test_smoketest_linear_compile(self, dtype: torch.dtype):
         if (
             torch.cuda.is_available()
@@ -301,6 +302,7 @@ def test_empty_like(self, input_size: Union[Tuple[int], int]):

     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     @parametrize("compile", [False, True])
+    @skip_if_rocm("ROCm enablement in progress")
     def test_quantize_api(self, compile):
         nf4_linear = nn.Linear(512, 512, device="cuda")
         torchao.quantize_(nf4_linear, nf4_weight_only())
diff --git a/test/dtypes/test_uintx.py b/test/dtypes/test_uintx.py
index cb0c88b21c..ab417ab417 100644
--- a/test/dtypes/test_uintx.py
+++ b/test/dtypes/test_uintx.py
@@ -14,6 +14,7 @@
     dequantize_affine,
     quantize_affine,
 )
+from torchao.testing.utils import skip_if_rocm

 dtypes = (
     torch.uint1,
@@ -75,6 +76,7 @@ def test_uintx_quant_on_cpu_then_move_to_cuda(dtype, group_size):
 @pytest.mark.parametrize("group_size", group_sizes)
 @pytest.mark.parametrize("device", devices)
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@skip_if_rocm("ROCm enablement in progress")
 def test_uintx_weight_only_model_quant(dtype, group_size, device):
     scale = 512
     fp16 = Linear16(scale, device)
@@ -132,6 +134,7 @@ def test_uintx_target_dtype(dtype):

 @pytest.mark.parametrize("dtype", dtypes)
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="Need CUDA available")
+@skip_if_rocm("ROCm enablement in progress")
 def test_uintx_target_dtype_compile(dtype):
     linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
     # make sure it runs
diff --git a/test/integration/test_integration.py b/test/integration/test_integration.py
index f99cf4a1b4..b58e3eeb03 100644
--- a/test/integration/test_integration.py
+++ b/test/integration/test_integration.py
@@ -376,6 +376,7 @@ def test_swap(self):
         assert torch.allclose(y_ref, y)

     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @skip_if_rocm("ROCm enablement in progress")
     def test_weight_t_and_non_t_numerics_match(self):
         # verify that numerics match whether weight is stored
         # in transposed format (for cuBLAS) vs non-transposed format
@@ -586,6 +587,7 @@ def test_per_token_linear_cuda(self):
             self._test_per_token_linear_impl("cuda", dtype)

     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @skip_if_rocm("ROCm enablement in progress")
     def test__int_mm(self):
         # TODO(future): figure out what here needs to move to PT core,
         # if it's not already tested there
@@ -607,6 +609,7 @@ def test__int_mm(self):
         torch.testing.assert_close(y_ref, y_opt, atol=0, rtol=0)

     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @skip_if_rocm("ROCm enablement in progress")
     def test__int_mm_eager_and_torch_compile_numerics(self):
         def __int_mm_ref(x, w):
             x = x.cpu().to(torch.int32)
@@ -772,6 +775,7 @@ def _test_lin_weight_subclass_impl(
         )

     @parameterized.expand(COMMON_DEVICE_DTYPE)
+    @skip_if_rocm("ROCm enablement in progress")
     def test_int8_dynamic_quant_subclass(self, device, dtype):
         self._test_lin_weight_subclass_impl(
             Int8DynamicallyQuantizedLinearWeight.from_float,
@@ -781,6 +785,7 @@ def test_int8_dynamic_quant_subclass(self, device, dtype):
         )

     @parameterized.expand(COMMON_DEVICE_DTYPE)
+    @skip_if_rocm("ROCm enablement in progress")
     def test_int8_weight_only_quant_subclass(self, device, dtype):
         undo_recommended_configs()
         self._test_lin_weight_subclass_impl(
@@ -788,6 +793,7 @@ def test_int8_weight_only_quant_subclass(self, device, dtype):
         )

     @parameterized.expand(COMMON_DEVICE_DTYPE)
+    @skip_if_rocm("ROCm enablement in progress")
     def test_aq_int8_dynamic_quant_subclass(self, device, dtype):
         self._test_lin_weight_subclass_impl(
             AQInt8DynamicallyQuantizedLinearWeight.from_float,
@@ -810,6 +816,7 @@ def test_aq_int8_weight_only_quant_subclass(self, device, dtype):
         )

     @parameterized.expand(COMMON_DEVICE_DTYPE)
+    @skip_if_rocm("ROCm enablement in progress")
     def test_aq_int8_weight_only_quant_2_subclass(self, device, dtype):
         self._test_lin_weight_subclass_impl(
             AQInt8WeightOnlyQuantizedLinearWeight2.from_float,
@@ -819,6 +826,7 @@ def test_aq_int8_weight_only_quant_2_subclass(self, device, dtype):
         )

     @parameterized.expand(COMMON_DEVICE_DTYPE)
+    @skip_if_rocm("ROCm enablement in progress")
     def test_aq_int8_weight_only_quant_3_subclass(self, device, dtype):
         self._test_lin_weight_subclass_impl(
             AQInt8WeightOnlyQuantizedLinearWeight3.from_float,
@@ -990,6 +998,7 @@ def test_int8_dynamic_quant_subclass_api(self, device, dtype, act_mapping):

     @parameterized.expand(COMMON_DEVICE_DTYPE)
     @unittest.skipIf(is_fbcode(), "broken in fbcode")
+    @skip_if_rocm("ROCm enablement in progress")
     def test_int8_weight_only_quant_subclass_api(self, device, dtype):
         undo_recommended_configs()
         self._test_lin_weight_subclass_api_impl(
@@ -1006,6 +1015,7 @@ def test_int8_weight_only_quant_with_freeze(self, device, dtype):
         )

     @parameterized.expand(COMMON_DEVICE_DTYPE)
+    @skip_if_rocm("ROCm enablement in progress")
     def test_int4_weight_only_quant_subclass_api(self, device, dtype):
         if dtype != torch.bfloat16:
             self.skipTest(f"Fails for {dtype}")
@@ -1017,6 +1027,7 @@ def test_int4_weight_only_quant_subclass_api(self, device, dtype):
         )

     @parameterized.expand(COMMON_DEVICE_DTYPE)
+    @skip_if_rocm("ROCm enablement in progress")
     def test_int4_weight_only_hqq_quant_subclass_api(self, device, dtype):
         if dtype != torch.bfloat16:
             self.skipTest(f"Fails for {dtype}")
@@ -1168,6 +1179,7 @@ def test_weight_only_groupwise_embedding_quant(self):
     @parameterized.expand(COMMON_DEVICE_DTYPE)
     @torch.no_grad()
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @skip_if_rocm("ROCm enablement in progress")
     def test_weight_only_quant_force_mixed_mm(self, device, dtype):
         undo_recommended_configs()
         if device != "cuda":
@@ -1200,6 +1212,7 @@ def test_weight_only_quant_force_mixed_mm(self, device, dtype):

     @parameterized.expand(COMMON_DEVICE_DTYPE)
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @skip_if_rocm("ROCm enablement in progress")
     def test_weight_only_quant_use_mixed_mm(self, device, dtype):
         undo_recommended_configs()
         if device != "cuda":
@@ -1302,6 +1315,7 @@ def forward(self, x):
         is_fbcode(), "'PlainAQTTensorImpl' object has no attribute 'int_data'"
     )
     @torch.no_grad()
+    @skip_if_rocm("ROCm enablement in progress")
     def test_save_load_dqtensors(self, device, dtype):
         if device == "cpu":
             self.skipTest("indcutor failed for cpu right now")
@@ -1312,12 +1326,14 @@ def test_save_load_dqtensors(self, device, dtype):
     @parameterized.expand(COMMON_DEVICE_DTYPE)
     @torch.no_grad()
     @unittest.skipIf(is_fbcode(), "broken in fbcode")
+    @skip_if_rocm("ROCm enablement in progress")
     def test_save_load_int8woqtensors(self, device, dtype):
         undo_recommended_configs()
         self._test_handle_save_load_meta_impl(_int8wo_api, device, test_dtype=dtype)

     @parameterized.expand(COMMON_DEVICE_DTYPE)
     @torch.no_grad()
+    @skip_if_rocm("ROCm enablement in progress")
     def test_save_load_int4woqtensors(self, device, dtype):
         if dtype != torch.bfloat16:
             self.skipTest(f"Fails for {dtype}")
@@ -1326,6 +1342,7 @@ def test_save_load_int4woqtensors(self, device, dtype):

 class TorchCompileUnitTest(unittest.TestCase):
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @skip_if_rocm("ROCm enablement in progress")
     def test_fullgraph(self):
         lin_fp16 = nn.Linear(32, 16, device="cuda", dtype=torch.float16)
         lin_smooth = SmoothFakeDynamicallyQuantizedLinear.from_float(
@@ -1510,6 +1527,7 @@ def test_autoquant_one_input(self, device, dtype, m, k, n):
             ],
         )
     )
+    @skip_if_rocm("ROCm enablement in progress")
     def test_autoquant_compile(self, device, dtype, m1, m2, k, n):
         undo_recommended_configs()

@@ -1586,6 +1604,7 @@ def forward(self, x):
         assert len(_AUTOQUANT_CACHE) > 0

     @parameterized.expand(COMMON_DEVICE_DTYPE)
+    @skip_if_rocm("ROCm enablement in progress")
     def test_autoquant_manual(self, device, dtype):
         undo_recommended_configs()
         if device != "cuda" or not torch.cuda.is_available():
@@ -1635,6 +1654,7 @@ def test_autoquant_manual(self, device, dtype):
             ],
         )
     )
+    @skip_if_rocm("ROCm enablement in progress")
     def test_autoquant_kwargs(self, device, dtype, m1, m2, k, n):
         undo_recommended_configs()
         if device != "cuda" or not torch.cuda.is_available():
@@ -2057,6 +2077,7 @@ def run_benchmark_model(self, device):
         return benchmark_model(m_bf16, num_runs, example_inputs)

     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @skip_if_rocm("ROCm enablement in progress")
     def test_benchmark_model_cuda(self):
         assert self.run_benchmark_model("cuda") is not None

diff --git a/test/prototype/mx_formats/test_kernels.py b/test/prototype/mx_formats/test_kernels.py
index 1bd55e28f5..2752cf159e 100644
--- a/test/prototype/mx_formats/test_kernels.py
+++ b/test/prototype/mx_formats/test_kernels.py
@@ -43,6 +43,7 @@
 )
 from torchao.prototype.mx_formats.mx_tensor import ScaleCalculationMode, to_mx
 from torchao.prototype.mx_formats.utils import to_blocked
+from torchao.testing.utils import skip_if_rocm
 from torchao.utils import (
     is_sm_at_least_89,
     is_sm_at_least_100,
@@ -423,6 +424,7 @@ def test_fp6_e3m2_rounding(f32_val, f6_e3m2_enc, device):

 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.skipif(not has_triton(), reason="unsupported without triton")
+@skip_if_rocm("ROCm enablement in progress")
 def test_fp6_e2m3_pack_unpack():
     orig_vals = torch.Tensor([[0.0, 0.5, 7.5, -0.0], [-0.875, 1.0, -6.0, 0.125]]).to(
         "cuda"
@@ -438,6 +440,7 @@ def test_fp6_e2m3_pack_unpack():

 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.skipif(not has_triton(), reason="unsupported without triton")
+@skip_if_rocm("ROCm enablement in progress")
 def test_fp6_e3m2_pack_unpack():
     orig_vals = torch.Tensor([[0.0, 5.0, 28.0, -0.0], [-0.25, 0.1875, 0.0625, 8.0]]).to(
         "cuda"
@@ -480,6 +483,7 @@ def test_triton_mxfp8_dim1_randn(M, K):
         (128, 1),
     ],
 )
+@skip_if_rocm("ROCm enablement in progress")
 def test_rearrange(shape):
     scales = torch.randint(256, size=shape, device="cuda", dtype=torch.uint8)
     eager = to_blocked(scales, False)
diff --git a/test/prototype/mx_formats/test_mx_linear.py b/test/prototype/mx_formats/test_mx_linear.py
index c858657af6..096d65b0b3 100644
--- a/test/prototype/mx_formats/test_mx_linear.py
+++ b/test/prototype/mx_formats/test_mx_linear.py
@@ -25,6 +25,7 @@
 )
 from torchao.quantization import quantize_
 from torchao.quantization.utils import compute_error
+from torchao.testing.utils import skip_if_rocm
 from torchao.utils import (
     is_sm_at_least_89,
     is_sm_at_least_100,
@@ -265,6 +266,7 @@ def test_activation_checkpointing():
         ScaleCalculationMode.RCEIL,
     ],
 )
+@skip_if_rocm("ROCm enablement in progress")
 def test_linear_compile(
     hp_dtype, recipe_name, bias, mxfp8_cast_kernel_choice, scale_calculation_mode
 ):
diff --git a/test/prototype/mx_formats/test_mx_tensor.py b/test/prototype/mx_formats/test_mx_tensor.py
index aa02c11b3a..dab519f1f3 100644
--- a/test/prototype/mx_formats/test_mx_tensor.py
+++ b/test/prototype/mx_formats/test_mx_tensor.py
@@ -23,6 +23,7 @@
     to_dtype,
 )
 from torchao.quantization.utils import compute_error
+from torchao.testing.utils import skip_if_rocm
 from torchao.utils import (
     is_sm_at_least_89,
     is_sm_at_least_90,
@@ -347,6 +348,7 @@ def test_exponent_nan_in(elem_dtype):
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.parametrize("elem_dtype", SUPPORTED_ELEM_DTYPES)
 @pytest.mark.parametrize("pack_fp6", [False, True])
+@skip_if_rocm("ROCm enablement in progress")
 def test_exponent_nan_out(elem_dtype, pack_fp6):
     """
     If block exponent value is NaN, the MX tensor block value is NaN
@@ -471,6 +473,7 @@ def test_clone():
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.parametrize("elem_dtype", [DTYPE_FP6_E2M3, DTYPE_FP6_E3M2])
 @pytest.mark.parametrize("pack_fp6", [False, True])
+@skip_if_rocm("ROCm enablement in progress")
 def test_fp6_packing(elem_dtype, pack_fp6):
     x = torch.randn(1, 2, 4, device="cuda")
     block_size = 4
@@ -487,6 +490,7 @@ def test_fp6_packing(elem_dtype, pack_fp6):
 @pytest.mark.parametrize("elem_dtype", SUPPORTED_ELEM_DTYPES)
 @pytest.mark.parametrize("hp_dtype", [torch.float32, torch.bfloat16])
 @pytest.mark.parametrize("all_zeros", [False, True])
+@skip_if_rocm("ROCm enablement in progress")
 def test_to_mx_from_mx_compile_numerics(elem_dtype, hp_dtype, all_zeros):
     """
     Verifies that compile does not change numerics of MX casts
@@ -644,6 +648,7 @@ def to_f8(x):
 @pytest.mark.skipif(
     not torch_version_at_least("2.8.0"), reason="torch.compile requires PyTorch 2.8+"
 )
+@skip_if_rocm("ROCm enablement in progress")
 def test_to_blocked_from_blocked_roundtrip(shape, use_triton_kernel: bool):
     from torchao.prototype.mx_formats.utils import from_blocked, to_blocked

diff --git a/test/prototype/test_blockwise_triton.py b/test/prototype/test_blockwise_triton.py
index 89f8cf869e..06d8b3b0c8 100644
--- a/test/prototype/test_blockwise_triton.py
+++ b/test/prototype/test_blockwise_triton.py
@@ -17,6 +17,7 @@
     fp8_blockwise_weight_dequant,
     fp8_blockwise_weight_quant,
 )
+from torchao.testing.utils import skip_if_rocm
 from torchao.utils import is_sm_at_least_89

 BLOCKWISE_SIZE_MNK = [
@@ -37,6 +38,7 @@
     if is_sm_at_least_89()
     else [torch.float8_e5m2],
 )
+@skip_if_rocm("ROCm enablement in progress")
 def test_blockwise_quant_dequant(_, N, K, dtype):
     x = torch.randn(N, K).cuda()
     qx, s = fp8_blockwise_weight_quant(x, dtype=dtype)
@@ -59,6 +61,7 @@ def test_blockwise_quant_dequant(_, N, K, dtype):
     if is_sm_at_least_89()
     else [torch.float8_e5m2],
 )
+@skip_if_rocm("ROCm enablement in progress")
 def test_blockwise_fp8_gemm(M, N, K, dtype):
     A = torch.randn(M, K).cuda()
     B = torch.randn(N, K).cuda()
diff --git a/test/prototype/test_quantized_training.py b/test/prototype/test_quantized_training.py
index fa0edd694b..e73ce16c6e 100644
--- a/test/prototype/test_quantized_training.py
+++ b/test/prototype/test_quantized_training.py
@@ -34,6 +34,7 @@
     quantize_int8_rowwise,
 )
 from torchao.quantization.quant_api import quantize_
+from torchao.testing.utils import skip_if_rocm

 if common_utils.SEED is None:
     common_utils.SEED = 1234
@@ -101,6 +102,7 @@ def test_int8_weight_only_correctness(self, leading_dims, bias, device):
     @parametrize("leading_dims", [(), (2,), (2, 4)])
     @parametrize("bias", [False, True])
     @parametrize("device", _DEVICES)
+    @skip_if_rocm("ROCm enablement in progress")
     def test_int8_weight_only_compile(self, leading_dims, bias, device):
         _reset()
         embed_dim = 128
@@ -133,6 +135,7 @@ def test_int8_weight_only_compile(self, leading_dims, bias, device):

     @parametrize("compile", [False, True])
     @parametrize("device", _DEVICES)
+    @skip_if_rocm("ROCm enablement in progress")
     def test_int8_weight_only_training(self, compile, device):
         _reset()
         bsize = 4
@@ -183,6 +186,7 @@ def test_int8_weight_only_training(self, compile, device):
     )
     @parametrize("module_swap", [False, True])
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    @skip_if_rocm("ROCm enablement in progress")
     def test_int8_mixed_precision_training(self, compile, config, module_swap):
         _reset()
         bsize = 64
@@ -296,6 +300,7 @@ def world_size(self) -> int:
         return _FSDP_WORLD_SIZE

     @skip_if_lt_x_gpu(_FSDP_WORLD_SIZE)
+    @skip_if_rocm("ROCm enablement in progress")
     def test_fsdp2_correctness(self):
         mp_policy = MixedPrecisionPolicy()

diff --git a/test/sparsity/test_sparse_api.py b/test/sparsity/test_sparse_api.py
index 003a50c4d1..4ee5b4530e 100644
--- a/test/sparsity/test_sparse_api.py
+++ b/test/sparsity/test_sparse_api.py
@@ -22,6 +22,7 @@
     quantize_,
 )
 from torchao.sparsity import apply_fake_sparsity, semi_sparse_weight, sparsify_
+from torchao.testing.utils import skip_if_rocm
 from torchao.utils import is_sm_at_least_90

 logging.basicConfig(
@@ -194,6 +195,7 @@ class TestBlockSparseWeight(common_utils.TestCase):
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     @common_utils.parametrize("compile", [True, False])
     @common_utils.parametrize("input_shape", [1, 1024])
+    @skip_if_rocm("ROCm enablement in progress")
     def test_sparse(self, compile, input_shape):
         input = torch.rand((input_shape, 1024)).half().cuda()
         model = (
@@ -227,6 +229,7 @@ def test_sparse(self, compile, input_shape):
 class TestQuantBlockSparseWeight(common_utils.TestCase):
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     @common_utils.parametrize("compile", [True, False])
+    @skip_if_rocm("ROCm enablement in progress")
     def test_sparse(self, compile):
         input = torch.rand((256, 128)).to(torch.bfloat16).cuda()
         model = (

From 205e2b9c89739d865893dadb603f629581a103c9 Mon Sep 17 00:00:00 2001
From: jainapurva
Date: Thu, 9 Oct 2025 19:12:48 -0700
Subject: [PATCH 6/8] remove temp code

---
 .github/workflows/regression_test_rocm.yml | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/.github/workflows/regression_test_rocm.yml b/.github/workflows/regression_test_rocm.yml
index b9f8d1f423..a9fdb68c13 100644
--- a/.github/workflows/regression_test_rocm.yml
+++ b/.github/workflows/regression_test_rocm.yml
@@ -20,18 +20,12 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - name: ROCM Nightly 6.3
+          - name: ROCM Nightly
             runs-on: linux.rocm.gpu.gfx942.2
             torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3'
             gpu-arch-type: "rocm"
             gpu-arch-version: "6.3"
             docker-image: pytorch/manylinux2_28-builder:rocm6.3
-          - name: ROCM Nightly 7.0
-            runs-on: linux.rocm.gpu.gfx942.2
-            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm7.0'
-            gpu-arch-type: "rocm"
-            gpu-arch-version: "7.0"
-            docker-image: pytorch/manylinux2_28-builder:rocm7.0

     permissions:
       id-token: write

From f57e21fd888c8a6f53ccdc5689e37c7762d223ea Mon Sep 17 00:00:00 2001
From: jainapurva
Date: Thu, 9 Oct 2025 19:32:10 -0700
Subject: [PATCH 7/8] Remove skipped tests

---
 test/dtypes/test_floatx.py                  |  1 -
 test/dtypes/test_nf4.py                     |  2 --
 test/dtypes/test_uintx.py                   |  3 ---
 test/integration/test_integration.py        | 21 ---------------------
 test/prototype/mx_formats/test_kernels.py   |  4 ----
 test/prototype/mx_formats/test_mx_linear.py |  2 --
 test/prototype/mx_formats/test_mx_tensor.py |  5 -----
 test/prototype/test_blockwise_triton.py     |  3 ---
 test/prototype/test_quantized_training.py   |  5 -----
 test/sparsity/test_sparse_api.py            |  3 ---
 10 files changed, 49 deletions(-)

diff --git a/test/dtypes/test_floatx.py b/test/dtypes/test_floatx.py
index 0cff75d309..ab4a13d24c 100644
--- a/test/dtypes/test_floatx.py
+++ b/test/dtypes/test_floatx.py
@@ -73,7 +73,6 @@ def test_from_tc_floatx_correctness(self, ebits, mbits, device):

     @parametrize("ebits,mbits", _Floatx_DTYPES)
     @parametrize("device", _DEVICES)
-    @skip_if_rocm("ROCm enablement in progress")
     def test_from_scaled_tc_floatx_compile(self, ebits, mbits, device):
         M, N = 256, 64
         nbits = 1 + ebits + mbits
diff --git a/test/dtypes/test_nf4.py b/test/dtypes/test_nf4.py
index 7dddf4b476..2a711413f0 100644
--- a/test/dtypes/test_nf4.py
+++ b/test/dtypes/test_nf4.py
@@ -263,7 +263,6 @@ def test_smoketest_linear(self, dtype: torch.dtype):

     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     @parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32])
-    @skip_if_rocm("ROCm enablement in progress")
     def test_smoketest_linear_compile(self, dtype: torch.dtype):
         if (
             torch.cuda.is_available()
@@ -302,7 +301,6 @@ def test_empty_like(self, input_size: Union[Tuple[int], int]):

     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     @parametrize("compile", [False, True])
-    @skip_if_rocm("ROCm enablement in progress")
     def test_quantize_api(self, compile):
         nf4_linear = nn.Linear(512, 512, device="cuda")
         torchao.quantize_(nf4_linear, nf4_weight_only())
diff --git a/test/dtypes/test_uintx.py b/test/dtypes/test_uintx.py
index ab417ab417..cb0c88b21c 100644
--- a/test/dtypes/test_uintx.py
+++ b/test/dtypes/test_uintx.py
@@ -14,7 +14,6 @@
     dequantize_affine,
     quantize_affine,
 )
-from torchao.testing.utils import skip_if_rocm

 dtypes = (
     torch.uint1,
@@ -76,7 +75,6 @@ def test_uintx_quant_on_cpu_then_move_to_cuda(dtype, group_size):
 @pytest.mark.parametrize("group_size", group_sizes)
 @pytest.mark.parametrize("device", devices)
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
-@skip_if_rocm("ROCm enablement in progress")
 def test_uintx_weight_only_model_quant(dtype, group_size, device):
     scale = 512
     fp16 = Linear16(scale, device)
@@ -134,7 +132,6 @@ def test_uintx_target_dtype(dtype):

 @pytest.mark.parametrize("dtype", dtypes)
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="Need CUDA available")
-@skip_if_rocm("ROCm enablement in progress")
 def test_uintx_target_dtype_compile(dtype):
     linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
     # make sure it runs
diff --git a/test/integration/test_integration.py b/test/integration/test_integration.py
index b58e3eeb03..f99cf4a1b4 100644
--- a/test/integration/test_integration.py
+++ b/test/integration/test_integration.py
@@ -376,7 +376,6 @@ def test_swap(self):
         assert torch.allclose(y_ref, y)

     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
-    @skip_if_rocm("ROCm enablement in progress")
     def test_weight_t_and_non_t_numerics_match(self):
         # verify that numerics match whether weight is stored
         # in transposed format (for cuBLAS) vs non-transposed format
@@ -587,7 +586,6 @@ def test_per_token_linear_cuda(self):
             self._test_per_token_linear_impl("cuda", dtype)

     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
-    @skip_if_rocm("ROCm enablement in progress")
     def test__int_mm(self):
         # TODO(future): figure out what here needs to move to PT core,
         # if it's not already tested there
@@ -609,7 +607,6 @@ def test__int_mm(self):
         torch.testing.assert_close(y_ref, y_opt, atol=0, rtol=0)

     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
-    @skip_if_rocm("ROCm enablement in progress")
     def test__int_mm_eager_and_torch_compile_numerics(self):
         def __int_mm_ref(x, w):
             x = x.cpu().to(torch.int32)
@@ -775,7 +772,6 @@ def _test_lin_weight_subclass_impl(
         )

     @parameterized.expand(COMMON_DEVICE_DTYPE)
-    @skip_if_rocm("ROCm enablement in progress")
     def test_int8_dynamic_quant_subclass(self, device, dtype):
         self._test_lin_weight_subclass_impl(
             Int8DynamicallyQuantizedLinearWeight.from_float,
@@ -785,7 +781,6 @@ def test_int8_dynamic_quant_subclass(self, device, dtype):
         )

     @parameterized.expand(COMMON_DEVICE_DTYPE)
-    @skip_if_rocm("ROCm enablement in progress")
     def test_int8_weight_only_quant_subclass(self, device, dtype):
         undo_recommended_configs()
         self._test_lin_weight_subclass_impl(
@@ -793,7 +788,6 @@ def test_int8_weight_only_quant_subclass(self, device, dtype):
         )

     @parameterized.expand(COMMON_DEVICE_DTYPE)
-    @skip_if_rocm("ROCm enablement in progress")
     def test_aq_int8_dynamic_quant_subclass(self, device, dtype):
         self._test_lin_weight_subclass_impl(
             AQInt8DynamicallyQuantizedLinearWeight.from_float,
@@ -816,7 +810,6 @@ def test_aq_int8_weight_only_quant_subclass(self, device, dtype):
         )

     @parameterized.expand(COMMON_DEVICE_DTYPE)
-    @skip_if_rocm("ROCm enablement in progress")
     def test_aq_int8_weight_only_quant_2_subclass(self, device, dtype):
         self._test_lin_weight_subclass_impl(
             AQInt8WeightOnlyQuantizedLinearWeight2.from_float,
@@ -826,7 +819,6 @@ def test_aq_int8_weight_only_quant_2_subclass(self, device, dtype):
         )

     @parameterized.expand(COMMON_DEVICE_DTYPE)
-    @skip_if_rocm("ROCm enablement in progress")
     def test_aq_int8_weight_only_quant_3_subclass(self, device, dtype):
         self._test_lin_weight_subclass_impl(
             AQInt8WeightOnlyQuantizedLinearWeight3.from_float,
@@ -998,7 +990,6 @@ def test_int8_dynamic_quant_subclass_api(self, device, dtype, act_mapping):

     @parameterized.expand(COMMON_DEVICE_DTYPE)
     @unittest.skipIf(is_fbcode(), "broken in fbcode")
-    @skip_if_rocm("ROCm enablement in progress")
     def test_int8_weight_only_quant_subclass_api(self, device, dtype):
         undo_recommended_configs()
         self._test_lin_weight_subclass_api_impl(
@@ -1015,7 +1006,6 @@ def test_int8_weight_only_quant_with_freeze(self, device, dtype):
         )

     @parameterized.expand(COMMON_DEVICE_DTYPE)
-    @skip_if_rocm("ROCm enablement in progress")
     def test_int4_weight_only_quant_subclass_api(self, device, dtype):
         if dtype != torch.bfloat16:
             self.skipTest(f"Fails for {dtype}")
@@ -1027,7 +1017,6 @@ def test_int4_weight_only_quant_subclass_api(self, device, dtype):
         )

     @parameterized.expand(COMMON_DEVICE_DTYPE)
-    @skip_if_rocm("ROCm enablement in progress")
     def test_int4_weight_only_hqq_quant_subclass_api(self, device, dtype):
         if dtype != torch.bfloat16:
             self.skipTest(f"Fails for {dtype}")
@@ -1179,7 +1168,6 @@ def test_weight_only_groupwise_embedding_quant(self):
     @parameterized.expand(COMMON_DEVICE_DTYPE)
     @torch.no_grad()
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
-    @skip_if_rocm("ROCm enablement in progress")
     def test_weight_only_quant_force_mixed_mm(self, device, dtype):
         undo_recommended_configs()
         if device != "cuda":
@@ -1212,7 +1200,6 @@ def test_weight_only_quant_force_mixed_mm(self, device, dtype):

     @parameterized.expand(COMMON_DEVICE_DTYPE)
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
-    @skip_if_rocm("ROCm enablement in progress")
     def test_weight_only_quant_use_mixed_mm(self, device, dtype):
         undo_recommended_configs()
         if device != "cuda":
@@ -1315,7 +1302,6 @@ def forward(self, x):
         is_fbcode(), "'PlainAQTTensorImpl' object has no attribute 'int_data'"
     )
     @torch.no_grad()
-    @skip_if_rocm("ROCm enablement in progress")
     def test_save_load_dqtensors(self, device, dtype):
         if device == "cpu":
             self.skipTest("indcutor failed for cpu right now")
@@ -1326,14 +1312,12 @@ def test_save_load_dqtensors(self, device, dtype):
     @parameterized.expand(COMMON_DEVICE_DTYPE)
     @torch.no_grad()
     @unittest.skipIf(is_fbcode(), "broken in fbcode")
-    @skip_if_rocm("ROCm enablement in progress")
     def test_save_load_int8woqtensors(self, device, dtype):
         undo_recommended_configs()
         self._test_handle_save_load_meta_impl(_int8wo_api, device, test_dtype=dtype)

     @parameterized.expand(COMMON_DEVICE_DTYPE)
     @torch.no_grad()
-    @skip_if_rocm("ROCm enablement in progress")
     def test_save_load_int4woqtensors(self, device, dtype):
         if dtype != torch.bfloat16:
             self.skipTest(f"Fails for {dtype}")
@@ -1342,7 +1326,6 @@ def test_save_load_int4woqtensors(self, device, dtype):

 class TorchCompileUnitTest(unittest.TestCase):
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
-    @skip_if_rocm("ROCm enablement in progress")
     def test_fullgraph(self):
         lin_fp16 = nn.Linear(32, 16, device="cuda", dtype=torch.float16)
         lin_smooth = SmoothFakeDynamicallyQuantizedLinear.from_float(
@@ -1527,7 +1510,6 @@ def test_autoquant_one_input(self, device, dtype, m, k, n):
             ],
         )
     )
-    @skip_if_rocm("ROCm enablement in progress")
     def test_autoquant_compile(self, device, dtype, m1, m2, k, n):
         undo_recommended_configs()

@@ -1604,7 +1586,6 @@ def forward(self, x):
         assert len(_AUTOQUANT_CACHE) > 0

     @parameterized.expand(COMMON_DEVICE_DTYPE)
-    @skip_if_rocm("ROCm enablement in progress")
     def test_autoquant_manual(self, device, dtype):
         undo_recommended_configs()
         if device != "cuda" or not torch.cuda.is_available():
@@ -1654,7 +1635,6 @@ def test_autoquant_manual(self, device, dtype):
             ],
         )
     )
-    @skip_if_rocm("ROCm enablement in progress")
     def test_autoquant_kwargs(self, device, dtype, m1, m2, k, n):
         undo_recommended_configs()
         if device != "cuda" or not torch.cuda.is_available():
@@ -2077,7 +2057,6 @@ def run_benchmark_model(self, device):
         return benchmark_model(m_bf16, num_runs, example_inputs)

     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
-    @skip_if_rocm("ROCm enablement in progress")
     def test_benchmark_model_cuda(self):
         assert self.run_benchmark_model("cuda") is not None

diff --git a/test/prototype/mx_formats/test_kernels.py b/test/prototype/mx_formats/test_kernels.py
index 2752cf159e..1bd55e28f5 100644
--- a/test/prototype/mx_formats/test_kernels.py
+++ b/test/prototype/mx_formats/test_kernels.py
@@ -43,7 +43,6 @@
 )
 from torchao.prototype.mx_formats.mx_tensor import ScaleCalculationMode, to_mx
 from torchao.prototype.mx_formats.utils import to_blocked
-from torchao.testing.utils import skip_if_rocm
 from torchao.utils import (
     is_sm_at_least_89,
     is_sm_at_least_100,
@@ -424,7 +423,6 @@ def test_fp6_e3m2_rounding(f32_val, f6_e3m2_enc, device):

 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.skipif(not has_triton(), reason="unsupported without triton")
-@skip_if_rocm("ROCm enablement in progress")
 def test_fp6_e2m3_pack_unpack():
     orig_vals = torch.Tensor([[0.0, 0.5, 7.5, -0.0], [-0.875, 1.0, -6.0, 0.125]]).to(
         "cuda"
@@ -440,7 +438,6 @@ def test_fp6_e2m3_pack_unpack():

 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.skipif(not has_triton(), reason="unsupported without triton")
-@skip_if_rocm("ROCm enablement in progress")
 def test_fp6_e3m2_pack_unpack():
     orig_vals = torch.Tensor([[0.0, 5.0, 28.0, -0.0], [-0.25, 0.1875, 0.0625, 8.0]]).to(
         "cuda"
@@ -483,7 +480,6 @@ def test_triton_mxfp8_dim1_randn(M, K):
         (128, 1),
     ],
 )
-@skip_if_rocm("ROCm enablement in progress")
 def test_rearrange(shape):
     scales = torch.randint(256, size=shape, device="cuda", dtype=torch.uint8)
     eager = to_blocked(scales, False)
diff --git a/test/prototype/mx_formats/test_mx_linear.py b/test/prototype/mx_formats/test_mx_linear.py
index 096d65b0b3..c858657af6 100644
--- a/test/prototype/mx_formats/test_mx_linear.py
+++ b/test/prototype/mx_formats/test_mx_linear.py
@@ -25,7 +25,6 @@
 )
 from torchao.quantization import quantize_
 from torchao.quantization.utils import compute_error
-from torchao.testing.utils import skip_if_rocm
 from torchao.utils import (
     is_sm_at_least_89,
     is_sm_at_least_100,
@@ -266,7 +265,6 @@ def test_activation_checkpointing():
         ScaleCalculationMode.RCEIL,
     ],
 )
-@skip_if_rocm("ROCm enablement in progress")
 def test_linear_compile(
     hp_dtype, recipe_name, bias, mxfp8_cast_kernel_choice, scale_calculation_mode
 ):
diff --git a/test/prototype/mx_formats/test_mx_tensor.py b/test/prototype/mx_formats/test_mx_tensor.py
index dab519f1f3..aa02c11b3a 100644
--- a/test/prototype/mx_formats/test_mx_tensor.py
+++ b/test/prototype/mx_formats/test_mx_tensor.py
@@ -23,7 +23,6 @@
     to_dtype,
 )
 from torchao.quantization.utils import compute_error
-from torchao.testing.utils import skip_if_rocm
 from torchao.utils import (
     is_sm_at_least_89,
     is_sm_at_least_90,
@@ -348,7 +347,6 @@ def test_exponent_nan_in(elem_dtype):
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.parametrize("elem_dtype", SUPPORTED_ELEM_DTYPES)
 @pytest.mark.parametrize("pack_fp6", [False, True])
-@skip_if_rocm("ROCm enablement in progress")
 def test_exponent_nan_out(elem_dtype, pack_fp6):
     """
     If block exponent value is NaN, the MX tensor block value is NaN
@@ -473,7 +471,6 @@ def test_clone():
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.parametrize("elem_dtype", [DTYPE_FP6_E2M3, DTYPE_FP6_E3M2])
 @pytest.mark.parametrize("pack_fp6", [False, True])
-@skip_if_rocm("ROCm enablement in progress")
 def test_fp6_packing(elem_dtype, pack_fp6):
     x = torch.randn(1, 2, 4, device="cuda")
     block_size = 4
@@ -490,7 +487,6 @@ def test_fp6_packing(elem_dtype, pack_fp6):
 @pytest.mark.parametrize("elem_dtype", SUPPORTED_ELEM_DTYPES)
 @pytest.mark.parametrize("hp_dtype", [torch.float32, torch.bfloat16])
 @pytest.mark.parametrize("all_zeros", [False, True])
-@skip_if_rocm("ROCm enablement in progress")
 def test_to_mx_from_mx_compile_numerics(elem_dtype, hp_dtype, all_zeros):
     """
     Verifies that compile does not change numerics of MX casts
@@ -648,7 +644,6 @@ def to_f8(x):
 @pytest.mark.skipif(
     not torch_version_at_least("2.8.0"), reason="torch.compile requires PyTorch 2.8+"
 )
-@skip_if_rocm("ROCm enablement in progress")
 def test_to_blocked_from_blocked_roundtrip(shape, use_triton_kernel: bool):
     from torchao.prototype.mx_formats.utils import from_blocked, to_blocked

diff --git a/test/prototype/test_blockwise_triton.py b/test/prototype/test_blockwise_triton.py
index 06d8b3b0c8..89f8cf869e 100644
--- a/test/prototype/test_blockwise_triton.py
+++ b/test/prototype/test_blockwise_triton.py
@@ -17,7 +17,6 @@
     fp8_blockwise_weight_dequant,
     fp8_blockwise_weight_quant,
 )
-from torchao.testing.utils import skip_if_rocm
 from torchao.utils import is_sm_at_least_89

 BLOCKWISE_SIZE_MNK = [
@@ -38,7 +37,6 @@
     if is_sm_at_least_89()
     else [torch.float8_e5m2],
 )
-@skip_if_rocm("ROCm enablement in progress")
 def test_blockwise_quant_dequant(_, N, K, dtype):
     x = torch.randn(N, K).cuda()
     qx, s = fp8_blockwise_weight_quant(x, dtype=dtype)
@@ -61,7 +59,6 @@ def test_blockwise_quant_dequant(_, N, K, dtype):
     if is_sm_at_least_89()
     else [torch.float8_e5m2],
 )
-@skip_if_rocm("ROCm enablement in progress")
 def test_blockwise_fp8_gemm(M, N, K, dtype):
     A = torch.randn(M, K).cuda()
     B = torch.randn(N, K).cuda()
diff --git a/test/prototype/test_quantized_training.py b/test/prototype/test_quantized_training.py
index e73ce16c6e..fa0edd694b 100644
--- a/test/prototype/test_quantized_training.py
+++ b/test/prototype/test_quantized_training.py
@@ -34,7 +34,6 @@
     quantize_int8_rowwise,
 )
 from torchao.quantization.quant_api import quantize_
-from torchao.testing.utils import skip_if_rocm

 if common_utils.SEED is None:
     common_utils.SEED = 1234
@@ -102,7 +101,6 @@ def test_int8_weight_only_correctness(self, leading_dims, bias, device):
     @parametrize("leading_dims", [(), (2,), (2, 4)])
     @parametrize("bias", [False, True])
     @parametrize("device", _DEVICES)
-    @skip_if_rocm("ROCm enablement in progress")
     def test_int8_weight_only_compile(self, leading_dims, bias, device):
         _reset()
         embed_dim = 128
@@ -135,7 +133,6 @@ def test_int8_weight_only_compile(self, leading_dims, bias, device):

     @parametrize("compile", [False, True])
     @parametrize("device", _DEVICES)
-    @skip_if_rocm("ROCm enablement in progress")
     def test_int8_weight_only_training(self, compile, device):
         _reset()
         bsize = 4
@@ -186,7 +183,6 @@ def test_int8_weight_only_training(self, compile, device):
     )
     @parametrize("module_swap", [False, True])
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
-    @skip_if_rocm("ROCm enablement in progress")
     def test_int8_mixed_precision_training(self, compile, config, module_swap):
         _reset()
         bsize = 64
@@ -300,7 +296,6 @@ def world_size(self) -> int:
         return _FSDP_WORLD_SIZE

     @skip_if_lt_x_gpu(_FSDP_WORLD_SIZE)
-    @skip_if_rocm("ROCm enablement in progress")
     def test_fsdp2_correctness(self):
         mp_policy = MixedPrecisionPolicy()

diff --git a/test/sparsity/test_sparse_api.py b/test/sparsity/test_sparse_api.py
index 4ee5b4530e..003a50c4d1 100644
--- a/test/sparsity/test_sparse_api.py
+++ b/test/sparsity/test_sparse_api.py
@@ -22,7 +22,6 @@
     quantize_,
 )
 from torchao.sparsity import apply_fake_sparsity, semi_sparse_weight, sparsify_
-from torchao.testing.utils import skip_if_rocm
 from torchao.utils import is_sm_at_least_90

 logging.basicConfig(
@@ -195,7 +194,6 @@ class TestBlockSparseWeight(common_utils.TestCase):
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     @common_utils.parametrize("compile", [True, False])
     @common_utils.parametrize("input_shape", [1, 1024])
-    @skip_if_rocm("ROCm enablement in progress")
     def test_sparse(self, compile, input_shape):
         input = torch.rand((input_shape, 1024)).half().cuda()
         model = (
@@ -229,7 +227,6 @@ def test_sparse(self, compile, input_shape):
 class TestQuantBlockSparseWeight(common_utils.TestCase):
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     @common_utils.parametrize("compile", [True, False])
-    @skip_if_rocm("ROCm enablement in progress")
     def test_sparse(self, compile):
         input = torch.rand((256, 128)).to(torch.bfloat16).cuda()
         model = (

From 9396fc8222c52b595bed1bbe53cb4ba4bd4d38cc Mon Sep 17 00:00:00 2001
From: jainapurva
Date: Thu, 9 Oct 2025 19:34:14 -0700
Subject: [PATCH 8/8] Add 7.0 rocm

---
 .github/workflows/regression_test_rocm.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/regression_test_rocm.yml b/.github/workflows/regression_test_rocm.yml
index a9fdb68c13..c12f2a61da 100644
--- a/.github/workflows/regression_test_rocm.yml
+++ b/.github/workflows/regression_test_rocm.yml
@@ -22,10 +22,10 @@ jobs:
         include:
           - name: ROCM Nightly
             runs-on: linux.rocm.gpu.gfx942.2
-            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3'
+            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm7.0'
             gpu-arch-type: "rocm"
-            gpu-arch-version: "6.3"
-            docker-image: pytorch/manylinux2_28-builder:rocm6.3
+            gpu-arch-version: "7.0"
+            docker-image: pytorch/manylinux2_28-builder:rocm7.0
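
Note: patches 5 and 7 above add and then remove a skip_if_rocm marker imported from
torchao.testing.utils; its definition is not part of this series. As a minimal, hedged
sketch only (an assumption for illustration, not the actual torchao implementation), such
a decorator could look like the following:

    import functools
    import unittest

    import torch

    def skip_if_rocm(message="ROCm enablement in progress"):
        """Skip the decorated test when running on a ROCm (HIP) build of PyTorch."""

        def decorator(test_func):
            @functools.wraps(test_func)
            def wrapper(*args, **kwargs):
                # torch.version.hip is a version string on ROCm builds and None otherwise.
                if torch.version.hip is not None:
                    raise unittest.SkipTest(message)
                return test_func(*args, **kwargs)

            return wrapper

        return decorator

It is applied as @skip_if_rocm("ROCm enablement in progress") on a test function or method,
which is how the patches above use it; raising unittest.SkipTest inside the wrapper works
under both unittest and pytest runners.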