From 8a7153fcb27cb00ec942ba6c498cf43b4eeabb93 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Fri, 21 Nov 2025 11:34:24 +0800 Subject: [PATCH 01/10] enable test/dtypes/test_affine_quantized.py on intel XPU --- test/dtypes/test_affine_quantized.py | 57 ++++++++++++++-------------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/test/dtypes/test_affine_quantized.py b/test/dtypes/test_affine_quantized.py index 9679f6975a..e0669a576f 100644 --- a/test/dtypes/test_affine_quantized.py +++ b/test/dtypes/test_affine_quantized.py @@ -42,15 +42,16 @@ is_fbcode, is_ROCM, is_sm_at_least_89, + get_current_accelerator_device, ) is_cusparselt_available = ( hasattr(torch.backends, "cusparselt") and torch.backends.cusparselt.is_available() ) - +_DEVICE = get_current_accelerator_device() def get_quantization_functions( - do_sparse: bool, do_int4: bool, device: str = "cuda", int4_zp_int: bool = False + do_sparse: bool, do_int4: bool, device: str = _DEVICE, int4_zp_int: bool = False ): base_functions = [ Int8WeightOnlyConfig(), @@ -105,9 +106,9 @@ class TestAffineQuantized(TestCase): ["xpu"] if torch.xpu.is_available() else [] ) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_tensor_core_layout_transpose(self): - linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda") + linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device=_DEVICE) t = linear.weight shape = t.shape apply_int4_weight_only_quant = Int4WeightOnlyConfig(group_size=32, version=1) @@ -169,7 +170,7 @@ def _apply(module, config_or_subclass_inserter): ql = _apply(linear, apply_quant) ql.to(device) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_register_new_dispatch(self): from torchao.dtypes import AffineQuantizedTensor from torchao.dtypes.affine_quantized_tensor_ops import ( @@ -206,10 +207,10 @@ def apply_uint6_weight_only_quant(linear): ) return linear - linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda") + linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device=_DEVICE) apply_uint6_weight_only_quant(linear) - example_input = torch.randn(1, 128, dtype=torch.bfloat16, device="cuda") + example_input = torch.randn(1, 128, dtype=torch.bfloat16, device=_DEVICE) with self.assertRaisesRegex( AssertionError, "dispatching to my impl for uint6 weight only quant" ): @@ -232,13 +233,13 @@ def test_print_quantized_module(self): ql = apply_quant(linear) assert "AffineQuantizedTensor" in str(ql) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") @common_utils.parametrize( - "apply_quant", get_quantization_functions(False, True, "cuda", False) + "apply_quant", get_quantization_functions(False, True, _DEVICE, False) ) def test_test_copy__apply(self, apply_quant): - linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda") - linear2 = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda") + linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device=_DEVICE) + linear2 = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device=_DEVICE) if isinstance(apply_quant, AOBaseConfig): quantize_(linear, apply_quant) @@ -249,20 +250,20 @@ def test_test_copy__apply(self, apply_quant): ql = apply_quant(linear) ql2 = apply_quant(linear2) - example_input = 
torch.randn(1, 128, dtype=torch.bfloat16, device="cuda") + example_input = torch.randn(1, 128, dtype=torch.bfloat16, device=_DEVICE) output = ql(example_input) ql2.weight.copy_(ql.weight) ql2.bias = ql.bias output2 = ql2(example_input) self.assertEqual(output, output2) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") @common_utils.parametrize( - "apply_quant", get_quantization_functions(False, True, "cuda", False) + "apply_quant", get_quantization_functions(False, True, _DEVICE, False) ) def test_copy__mismatch_metadata(self, apply_quant): - linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda") - linear2 = torch.nn.Linear(128, 512, dtype=torch.bfloat16, device="cuda") + linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device=_DEVICE) + linear2 = torch.nn.Linear(128, 512, dtype=torch.bfloat16, device=_DEVICE) if isinstance(apply_quant, AOBaseConfig): quantize_(linear, apply_quant) @@ -336,9 +337,9 @@ def test_alias(self, device, dtype): quantize_(dummy, Int8DynamicActivationInt8WeightConfig()) _ = dummy.weight[...] - @common_utils.parametrize("device", ["cuda"]) + @common_utils.parametrize("device", [_DEVICE]) @common_utils.parametrize("dtype", [torch.bfloat16]) - @skip_if_no_cuda() + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") @skip_if_rocm("ROCm enablement in progress") def test_slice_int4wo(self, device, dtype): # in_feature not divisible by 1024 @@ -350,9 +351,9 @@ def test_slice_int4wo(self, device, dtype): _ = dummy.weight.narrow(0, 0, 64) _ = dummy.weight.narrow(1, 0, 128) - @common_utils.parametrize("device", ["cuda"]) + @common_utils.parametrize("device", [_DEVICE]) @common_utils.parametrize("dtype", [torch.float16, torch.bfloat16]) - @skip_if_no_cuda() + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") @skip_if_no_gemlite() def test_slice_gemlite(self, device, dtype): # in_feature not divisible by 1024 @@ -433,7 +434,7 @@ def dequant(input_layer, in_features, orig_shape): ) self.assertEqual((W_slice_ref - W_slice).abs().mean().item(), 0) - @common_utils.parametrize("device", ["cuda"]) + @common_utils.parametrize("device", [_DEVICE]) @common_utils.parametrize("dtype", [torch.bfloat16]) def test_matmul(self, device, dtype): x = torch.randn(53, 2048) @@ -450,14 +451,14 @@ def test_matmul(self, device, dtype): # make sure it runs torch.matmul(x, w.t()) - @common_utils.parametrize("device", ["cuda"]) + @common_utils.parametrize("device", [_DEVICE]) @common_utils.parametrize("dtype", [torch.bfloat16]) - @skip_if_no_cuda() + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") @skip_if_rocm("ROCm enablement in progress") def test_slice_and_copy_int4wo(self, device, dtype): - l = torch.nn.Linear(1024, 1024).to("cuda").to(torch.bfloat16) + l = torch.nn.Linear(1024, 1024).to(_DEVICE).to(torch.bfloat16) l.weight = torch.nn.Parameter( - torch.zeros(1024, 1024, dtype=torch.bfloat16, device="cuda") + torch.zeros(1024, 1024, dtype=torch.bfloat16, device=_DEVICE) ) quantize_(l, Int4WeightOnlyConfig(version=1)) param = l.weight @@ -474,7 +475,7 @@ def test_slice_and_copy_int4wo(self, device, dtype): assert param.data.dequantize()[0][0] == 0 # dummy_l has random input (shouldn't be 0) - dummy_l = torch.nn.Linear(1024, 1024).to("cuda").to(torch.bfloat16) + dummy_l = torch.nn.Linear(1024, 1024).to(_DEVICE).to(torch.bfloat16) quantize_(dummy_l, Int4WeightOnlyConfig(version=1)) quantized = 
dummy_l.weight quantized = quantized.narrow(0, 0, 512) @@ -484,9 +485,9 @@ def test_slice_and_copy_int4wo(self, device, dtype): # making sure param.data is updated assert param.data.dequantize()[0][0] != 0 - @common_utils.parametrize("device", ["cuda"]) + @common_utils.parametrize("device", [_DEVICE]) @common_utils.parametrize("dtype", [torch.bfloat16]) - @skip_if_no_cuda() + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") @skip_if_rocm("ROCm enablement in progress") def test_mm_int4wo(self, device, dtype): weight = torch.randn(512, 1024).to(device).to(dtype) From fc23fba78de7e9dfe96803137aca999fc4291bfc Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Fri, 21 Nov 2025 12:10:35 +0800 Subject: [PATCH 02/10] enable test/dtypes/test_affine_quantized.py on intel XPU --- test/dtypes/test_affine_quantized.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/dtypes/test_affine_quantized.py b/test/dtypes/test_affine_quantized.py index e0669a576f..416e2547df 100644 --- a/test/dtypes/test_affine_quantized.py +++ b/test/dtypes/test_affine_quantized.py @@ -233,7 +233,7 @@ def test_print_quantized_module(self): ql = apply_quant(linear) assert "AffineQuantizedTensor" in str(ql) - @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") + @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") @common_utils.parametrize( "apply_quant", get_quantization_functions(False, True, _DEVICE, False) ) @@ -339,7 +339,7 @@ def test_alias(self, device, dtype): @common_utils.parametrize("device", [_DEVICE]) @common_utils.parametrize("dtype", [torch.bfloat16]) - @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") + @skip_if_no_cuda() @skip_if_rocm("ROCm enablement in progress") def test_slice_int4wo(self, device, dtype): # in_feature not divisible by 1024 @@ -453,7 +453,7 @@ def test_matmul(self, device, dtype): @common_utils.parametrize("device", [_DEVICE]) @common_utils.parametrize("dtype", [torch.bfloat16]) - @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") + @skip_if_no_cuda() @skip_if_rocm("ROCm enablement in progress") def test_slice_and_copy_int4wo(self, device, dtype): l = torch.nn.Linear(1024, 1024).to(_DEVICE).to(torch.bfloat16) From c1d33ccb6080d89043ce9a13bff4e10f08fdbb1b Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Fri, 21 Nov 2025 12:25:58 +0800 Subject: [PATCH 03/10] enable test/dtypes/test_affine_quantized_float.py on intel XPU --- test/dtypes/test_affine_quantized_float.py | 95 +++++++++++----------- 1 file changed, 48 insertions(+), 47 deletions(-) diff --git a/test/dtypes/test_affine_quantized_float.py b/test/dtypes/test_affine_quantized_float.py index 738e9b6164..4ab630f125 100644 --- a/test/dtypes/test_affine_quantized_float.py +++ b/test/dtypes/test_affine_quantized_float.py @@ -42,11 +42,12 @@ is_sm_at_least_89, is_sm_at_least_90, is_sm_version, + get_current_accelerator_device, ) random.seed(0) torch.manual_seed(0) - +_DEVICE = get_current_accelerator_device() class ToyLinearModel(torch.nn.Module): def __init__(self, in_features, out_features): @@ -61,7 +62,7 @@ def forward(self, x): class TestAffineQuantizedFloat8Compile(InductorTestCase): - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") @unittest.skipIf( not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" ) @@ -97,7 +98,7 @@ def test_fp8_linear_variants( with 
error_context: M, N, K = sizes - input_tensor = torch.randn(*M, K, dtype=dtype, device="cuda") + input_tensor = torch.randn(*M, K, dtype=dtype, device=_DEVICE) # Get a "reasonable" scale for the input tensor even though # we use the same scale for multiple activations scale, _ = choose_qparams_affine( @@ -122,7 +123,7 @@ def test_fp8_linear_variants( } # Create a linear layer with bfloat16 dtype - model = ToyLinearModel(K, N).eval().to(dtype).to("cuda") + model = ToyLinearModel(K, N).eval().to(dtype).to(_DEVICE) quantized_model = copy.deepcopy(model) factory = mode_map[mode]() @@ -170,29 +171,29 @@ class UnsupportedGranularity: granularity=(UnsupportedGranularity(), UnsupportedGranularity()), ) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") @unittest.skipIf( - not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" ) def test_per_row_with_float32(self): with pytest.raises( AssertionError, match="PerRow quantization only works for bfloat16 precision", ): - model = ToyLinearModel(64, 64).eval().to(torch.float32).to("cuda") + model = ToyLinearModel(64, 64).eval().to(torch.float32).to(_DEVICE) quantize_( model, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()), ) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") @unittest.skipIf( - not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" ) @common_utils.parametrize("mode", ["dynamic", "weight-only", "static"]) def test_serialization(self, mode: str): # Create and quantize the model - model = ToyLinearModel(16, 32).to(device="cuda") + model = ToyLinearModel(16, 32).to(device=_DEVICE) mode_map = { "dynamic": partial( @@ -203,7 +204,7 @@ def test_serialization(self, mode: str): "weight-only": partial(Float8WeightOnlyConfig, version=1), "static": partial( Float8StaticActivationFloat8WeightConfig, - scale=torch.tensor(1.0, dtype=torch.float32, device="cuda"), + scale=torch.tensor(1.0, dtype=torch.float32, device=_DEVICE), granularity=PerTensor(), ), } @@ -260,13 +261,13 @@ def test_serialization(self, mode: str): original_layer.weight.scale, new_layer.weight.scale ), f"Scales do not match for {layer_name}" - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") @unittest.skipIf( - not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" ) def test_fp8_weight_dimension_warning(self): # Create model with incompatible dimensions (not multiples of 16) - model = ToyLinearModel(10, 25).cuda() # 10x25 and 25x10 weights + model = ToyLinearModel(10, 25).to(_DEVICE) # 10x25 and 25x10 weights # Set up logging capture with self.assertLogs( @@ -298,9 +299,9 @@ def test_fp8_weight_dimension_warning(self): f"Expected warning message containing: {expected}", ) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") @unittest.skipIf( - not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + _DEVICE == "cuda" and not 
is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" ) @common_utils.parametrize( "in_features,out_features", [(512, 1024), (256, 768), (1024, 512)] @@ -312,7 +313,7 @@ def test_fp8_weight_dimension_warning(self): def test_mm_float8dq_per_row( self, in_features, out_features, leading_shape, bias: bool ): - device = "cuda" + device = _DEVICE dtype = torch.bfloat16 input_shape = leading_shape + (in_features,) @@ -353,15 +354,15 @@ def test_mm_float8dq_per_row( error = compute_error(ref_output, quant_output) assert error > 20, f"Quantization error is too high got a SQNR of {error}" - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") @unittest.skipIf( - not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" ) @common_utils.parametrize("float8_dtype", [torch.float8_e4m3fn, torch.float8_e5m2]) @common_utils.parametrize("output_dtype", [torch.float32, torch.bfloat16]) def test_choose_scale_float8_bounds(self, float8_dtype, output_dtype): block_size = () - device = "cuda" + device = _DEVICE input_tensor = torch.randn(8, 64, device=device, dtype=torch.float32) # testing upper bounds @@ -396,9 +397,9 @@ def test_choose_scale_float8_bounds(self, float8_dtype, output_dtype): # since scale = abs_max / quant_max, larger abs_max means scale is larger self.assertTrue(scale_ref < scale_with_lb) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") @unittest.skipIf( - not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" ) @common_utils.parametrize("float8_dtype", [torch.float8_e4m3fn, torch.float8_e5m2]) @common_utils.parametrize("output_dtype", [torch.float32, torch.bfloat16]) @@ -406,7 +407,7 @@ def test_choose_scale_float8_bounds(self, float8_dtype, output_dtype): def test_dequantize_affine_float8(self, float8_dtype, output_dtype, block_size): """Test _dequantize_affine_float8 with various configurations""" - device = "cuda" + device = _DEVICE input_tensor = torch.randn(8, 64, device=device, dtype=torch.float32) # Choose quantization parameters @@ -429,13 +430,13 @@ def test_dequantize_affine_float8(self, float8_dtype, output_dtype, block_size): error = torch.abs(input_tensor.to(output_dtype) - dequantized).mean() self.assertLess(error, 0.1, "Quantization error too high") - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") @unittest.skipIf( - not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" ) def test_dequantize_affine_float8_scale_broadcasting(self): """Test that scale broadcasting works correctly for block-wise quantization""" - device = "cuda" + device = _DEVICE # Create input tensor with known block structure input_tensor = torch.randn(4, 32, device=device, dtype=torch.float32) block_size = (2, 16) # 2x2 blocks in first dim, 2x16 blocks in second dim @@ -461,14 +462,14 @@ def test_dequantize_affine_float8_scale_broadcasting(self): # Verify shapes match self.assertEqual(dequantized.shape, input_tensor.shape) - @unittest.skipIf(not torch.cuda.is_available(), 
"Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") @unittest.skipIf( - not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" ) @common_utils.parametrize("granularity", [PerTensor(), PerRow()]) def test_float8_tensor_slicing_basic(self, granularity): """Test basic slicing operations on Float8 tensors""" - device = "cuda" + device = _DEVICE dtype = torch.bfloat16 # Create and quantize a model @@ -499,13 +500,13 @@ def test_float8_tensor_slicing_basic(self, granularity): self.assertTrue(isinstance(sliced_1, Float8AQTTensorImpl)) self.assertTrue(isinstance(sliced_both, Float8AQTTensorImpl)) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") @unittest.skipIf( - not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" ) def test_float8_tensor_slicing_per_tensor(self): """Test slicing with per-tensor quantization (scale should not change)""" - device = "cuda" + device = _DEVICE dtype = torch.bfloat16 # Create and quantize with per-tensor granularity @@ -529,17 +530,17 @@ def test_float8_tensor_slicing_per_tensor(self): self.assertTrue(torch.equal(original_scale, sliced_impl.scale)) self.assertEqual(sliced_impl.scale.numel(), 1) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") @unittest.skipIf( - not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" ) @unittest.skipIf( - not is_sm_at_least_90(), + _DEVICE == "cuda" and not is_sm_at_least_90(), "Per-row quantization requires compute capability >= 9.0", ) def test_float8_tensor_slicing_per_row(self): """Test slicing with per-row quantization (scale should be sliced appropriately)""" - device = "cuda" + device = _DEVICE dtype = torch.bfloat16 # Create and quantize with per-row granularity @@ -572,13 +573,13 @@ def test_float8_tensor_slicing_per_row(self): self.assertEqual(sliced_cols_impl.scale.shape, (32, 1)) self.assertTrue(torch.equal(sliced_cols_impl.scale, original_scale)) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") @unittest.skipIf( - not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" ) def test_float8_tensor_slicing_edge_cases(self): """Test edge cases in slicing""" - device = "cuda" + device = _DEVICE dtype = torch.bfloat16 # Create and quantize a model @@ -604,18 +605,18 @@ def test_float8_tensor_slicing_edge_cases(self): large_slice = original_weight[:100] # More than available rows self.assertEqual(large_slice.shape, (32, 64)) # Should clamp to available - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") @unittest.skipIf( - not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" ) @common_utils.parametrize("granularity", [PerTensor(), 
PerRow()]) @unittest.skipIf( - is_sm_version(8, 9), + _DEVICE == "cuda" and is_sm_version(8, 9), "TODO: AssertionError: tensor(-2.1562, device='cuda:0', dtype=torch.bfloat16) not greater than 15", ) def test_float8_tensor_slicing_functional_correctness(self, granularity): """Test that sliced tensors produce correct results in computations""" - device = "cuda" + device = _DEVICE dtype = torch.bfloat16 # Create reference and quantized models with dimensions that are multiples of 16 @@ -776,9 +777,9 @@ def test_quantize_dequantize_fp8_inductor(self, float8_dtype, hp_dtype): torch.testing.assert_close(expected_dequantized, test_dq) @torch.no_grad() - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") @unittest.skipIf( - not is_sm_at_least_90(), "Requires GPU with compute capability >= 9.0" + _DEVICE == "cuda" and not is_sm_at_least_90(), "Requires GPU with compute capability >= 9.0" ) @common_utils.parametrize("granularity", [PerTensor(), PerRow()]) @common_utils.parametrize("float8_config_version", [1, 2]) @@ -791,7 +792,7 @@ def test_expected_kernels_on_gpu(self, granularity, float8_config_version): M, K, N = 128, 256, 512 m = torch.nn.Sequential( - torch.nn.Linear(K, N, device="cuda", dtype=torch.bfloat16) + torch.nn.Linear(K, N, device=_DEVICE, dtype=torch.bfloat16) ) if float8_config_version == 1: config = Float8DynamicActivationFloat8WeightConfig( @@ -810,7 +811,7 @@ def test_expected_kernels_on_gpu(self, granularity, float8_config_version): ) m = torch.compile(m) - x = torch.randn(M, K, device="cuda", dtype=torch.bfloat16) + x = torch.randn(M, K, device=_DEVICE, dtype=torch.bfloat16) out, code = run_and_get_code(m, x) # triton kernel call looks like: From 96ae48e97ebeb8087dba7d44f020e7eb21ac53a1 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Fri, 21 Nov 2025 13:15:44 +0800 Subject: [PATCH 04/10] enable test/dtypes/test_affine_quantized_float.py on intel XPU --- test/dtypes/test_affine_quantized_float.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/dtypes/test_affine_quantized_float.py b/test/dtypes/test_affine_quantized_float.py index 4ab630f125..9ef2a06574 100644 --- a/test/dtypes/test_affine_quantized_float.py +++ b/test/dtypes/test_affine_quantized_float.py @@ -299,9 +299,9 @@ def test_fp8_weight_dimension_warning(self): f"Expected warning message containing: {expected}", ) - @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") + @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") @unittest.skipIf( - _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" ) @common_utils.parametrize( "in_features,out_features", [(512, 1024), (256, 768), (1024, 512)] @@ -462,9 +462,9 @@ def test_dequantize_affine_float8_scale_broadcasting(self): # Verify shapes match self.assertEqual(dequantized.shape, input_tensor.shape) - @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") + @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") @unittest.skipIf( - _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" ) @common_utils.parametrize("granularity", [PerTensor(), PerRow()]) def test_float8_tensor_slicing_basic(self, granularity): @@ -777,9 +777,9 @@ def 
test_quantize_dequantize_fp8_inductor(self, float8_dtype, hp_dtype): torch.testing.assert_close(expected_dequantized, test_dq) @torch.no_grad() - @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") + @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") @unittest.skipIf( - _DEVICE == "cuda" and not is_sm_at_least_90(), "Requires GPU with compute capability >= 9.0" + not is_sm_at_least_90(), "Requires GPU with compute capability >= 9.0" ) @common_utils.parametrize("granularity", [PerTensor(), PerRow()]) @common_utils.parametrize("float8_config_version", [1, 2]) From 32df181678eee5a57eebd7fa5b82dbae6be2055c Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Fri, 21 Nov 2025 13:21:46 +0800 Subject: [PATCH 05/10] enable test/dtypes/test_affine_quantized_float.py on intel XPU --- test/dtypes/test_affine_quantized_float.py | 24 +++++++++++----------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/test/dtypes/test_affine_quantized_float.py b/test/dtypes/test_affine_quantized_float.py index 9ef2a06574..40e1620536 100644 --- a/test/dtypes/test_affine_quantized_float.py +++ b/test/dtypes/test_affine_quantized_float.py @@ -186,9 +186,9 @@ def test_per_row_with_float32(self): Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()), ) - @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") + @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") @unittest.skipIf( - _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" ) @common_utils.parametrize("mode", ["dynamic", "weight-only", "static"]) def test_serialization(self, mode: str): @@ -500,9 +500,9 @@ def test_float8_tensor_slicing_basic(self, granularity): self.assertTrue(isinstance(sliced_1, Float8AQTTensorImpl)) self.assertTrue(isinstance(sliced_both, Float8AQTTensorImpl)) - @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") + @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") @unittest.skipIf( - _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" ) def test_float8_tensor_slicing_per_tensor(self): """Test slicing with per-tensor quantization (scale should not change)""" @@ -530,12 +530,12 @@ def test_float8_tensor_slicing_per_tensor(self): self.assertTrue(torch.equal(original_scale, sliced_impl.scale)) self.assertEqual(sliced_impl.scale.numel(), 1) - @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") + @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") @unittest.skipIf( - _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" ) @unittest.skipIf( - _DEVICE == "cuda" and not is_sm_at_least_90(), + not is_sm_at_least_90(), "Per-row quantization requires compute capability >= 9.0", ) def test_float8_tensor_slicing_per_row(self): @@ -573,9 +573,9 @@ def test_float8_tensor_slicing_per_row(self): self.assertEqual(sliced_cols_impl.scale.shape, (32, 1)) self.assertTrue(torch.equal(sliced_cols_impl.scale, original_scale)) - @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") + @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") @unittest.skipIf( - _DEVICE == "cuda" and not 
is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" ) def test_float8_tensor_slicing_edge_cases(self): """Test edge cases in slicing""" @@ -605,13 +605,13 @@ def test_float8_tensor_slicing_edge_cases(self): large_slice = original_weight[:100] # More than available rows self.assertEqual(large_slice.shape, (32, 64)) # Should clamp to available - @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") + @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") @unittest.skipIf( - _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" ) @common_utils.parametrize("granularity", [PerTensor(), PerRow()]) @unittest.skipIf( - _DEVICE == "cuda" and is_sm_version(8, 9), + is_sm_version(8, 9), "TODO: AssertionError: tensor(-2.1562, device='cuda:0', dtype=torch.bfloat16) not greater than 15", ) def test_float8_tensor_slicing_functional_correctness(self, granularity): From 2ed02da7899df3026e9d0cd64169a42a44f43aa8 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Fri, 21 Nov 2025 13:24:19 +0800 Subject: [PATCH 06/10] enable test/dtypes/test_affine_quantized_float.py on intel XPU --- test/dtypes/test_affine_quantized_float.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/dtypes/test_affine_quantized_float.py b/test/dtypes/test_affine_quantized_float.py index 40e1620536..4d3d9d0e2a 100644 --- a/test/dtypes/test_affine_quantized_float.py +++ b/test/dtypes/test_affine_quantized_float.py @@ -171,9 +171,9 @@ class UnsupportedGranularity: granularity=(UnsupportedGranularity(), UnsupportedGranularity()), ) - @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") + @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") @unittest.skipIf( - _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" ) def test_per_row_with_float32(self): with pytest.raises( @@ -261,9 +261,9 @@ def test_serialization(self, mode: str): original_layer.weight.scale, new_layer.weight.scale ), f"Scales do not match for {layer_name}" - @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") + @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") @unittest.skipIf( - _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" ) def test_fp8_weight_dimension_warning(self): # Create model with incompatible dimensions (not multiples of 16) From 234f13a85875a891362d6a9af0fadbaf58bc452b Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Fri, 21 Nov 2025 13:30:38 +0800 Subject: [PATCH 07/10] enable test/dtypes/test_affine_quantized_float.py on intel XPU --- test/dtypes/test_affine_quantized_float.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/dtypes/test_affine_quantized_float.py b/test/dtypes/test_affine_quantized_float.py index 4d3d9d0e2a..3d87f7e884 100644 --- a/test/dtypes/test_affine_quantized_float.py +++ b/test/dtypes/test_affine_quantized_float.py @@ -141,14 +141,14 @@ def test_fp8_linear_variants( ) @unittest.skipIf( - not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU 
with compute capability >= 8.9" ) def test_invalid_granularity(self): with pytest.raises(ValueError, match="Invalid granularity specification"): Float8DynamicActivationFloat8WeightConfig(granularity="invalid") @unittest.skipIf( - not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" ) def test_mismatched_granularity(self): with pytest.raises( @@ -160,7 +160,7 @@ def test_mismatched_granularity(self): ) @unittest.skipIf( - not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" ) def test_unsupported_granularity(self): class UnsupportedGranularity: From 4885a1acecc59fa511256efc869411ab06fca38b Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Fri, 21 Nov 2025 13:32:10 +0800 Subject: [PATCH 08/10] fix format issue --- test/dtypes/test_affine_quantized.py | 1 + test/dtypes/test_affine_quantized_float.py | 19 +++++++++++++------ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/test/dtypes/test_affine_quantized.py b/test/dtypes/test_affine_quantized.py index 416e2547df..3d33374924 100644 --- a/test/dtypes/test_affine_quantized.py +++ b/test/dtypes/test_affine_quantized.py @@ -50,6 +50,7 @@ ) _DEVICE = get_current_accelerator_device() + def get_quantization_functions( do_sparse: bool, do_int4: bool, device: str = _DEVICE, int4_zp_int: bool = False ): diff --git a/test/dtypes/test_affine_quantized_float.py b/test/dtypes/test_affine_quantized_float.py index 3d87f7e884..2d1a642c42 100644 --- a/test/dtypes/test_affine_quantized_float.py +++ b/test/dtypes/test_affine_quantized_float.py @@ -49,6 +49,7 @@ torch.manual_seed(0) _DEVICE = get_current_accelerator_device() + class ToyLinearModel(torch.nn.Module): def __init__(self, in_features, out_features): super().__init__() @@ -141,14 +142,16 @@ def test_fp8_linear_variants( ) @unittest.skipIf( - _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + _DEVICE == "cuda" and not is_sm_at_least_89(), + "Requires GPU with compute capability >= 8.9", ) def test_invalid_granularity(self): with pytest.raises(ValueError, match="Invalid granularity specification"): Float8DynamicActivationFloat8WeightConfig(granularity="invalid") @unittest.skipIf( - _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + _DEVICE == "cuda" and not is_sm_at_least_89(), + "Requires GPU with compute capability >= 8.9", ) def test_mismatched_granularity(self): with pytest.raises( @@ -160,7 +163,8 @@ def test_mismatched_granularity(self): ) @unittest.skipIf( - _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + _DEVICE == "cuda" and not is_sm_at_least_89(), + "Requires GPU with compute capability >= 8.9", ) def test_unsupported_granularity(self): class UnsupportedGranularity: @@ -356,7 +360,8 @@ def test_mm_float8dq_per_row( @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") @unittest.skipIf( - _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + _DEVICE == "cuda" and not is_sm_at_least_89(), + "Requires GPU with compute capability >= 8.9", ) @common_utils.parametrize("float8_dtype", [torch.float8_e4m3fn, torch.float8_e5m2]) @common_utils.parametrize("output_dtype", [torch.float32, torch.bfloat16]) @@ -399,7 +404,8 @@ def test_choose_scale_float8_bounds(self, 
float8_dtype, output_dtype): @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") @unittest.skipIf( - _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + _DEVICE == "cuda" and not is_sm_at_least_89(), + "Requires GPU with compute capability >= 8.9", ) @common_utils.parametrize("float8_dtype", [torch.float8_e4m3fn, torch.float8_e5m2]) @common_utils.parametrize("output_dtype", [torch.float32, torch.bfloat16]) @@ -432,7 +438,8 @@ def test_dequantize_affine_float8(self, float8_dtype, output_dtype, block_size): @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") @unittest.skipIf( - _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + _DEVICE == "cuda" and not is_sm_at_least_89(), + "Requires GPU with compute capability >= 8.9", ) def test_dequantize_affine_float8_scale_broadcasting(self): """Test that scale broadcasting works correctly for block-wise quantization""" From 4e46f122caacbf44dcc74f85b15715fb8a95286e Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Fri, 21 Nov 2025 13:37:48 +0800 Subject: [PATCH 09/10] fix format issue --- test/dtypes/test_affine_quantized.py | 3 +-- test/dtypes/test_affine_quantized_float.py | 21 +++++++-------------- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/test/dtypes/test_affine_quantized.py b/test/dtypes/test_affine_quantized.py index 3d33374924..00cfa272d8 100644 --- a/test/dtypes/test_affine_quantized.py +++ b/test/dtypes/test_affine_quantized.py @@ -39,10 +39,10 @@ from torchao.utils import ( check_cpu_version, check_xpu_version, + get_current_accelerator_device, is_fbcode, is_ROCM, is_sm_at_least_89, - get_current_accelerator_device, ) is_cusparselt_available = ( @@ -50,7 +50,6 @@ ) _DEVICE = get_current_accelerator_device() - def get_quantization_functions( do_sparse: bool, do_int4: bool, device: str = _DEVICE, int4_zp_int: bool = False ): diff --git a/test/dtypes/test_affine_quantized_float.py b/test/dtypes/test_affine_quantized_float.py index 2d1a642c42..c75e29b66b 100644 --- a/test/dtypes/test_affine_quantized_float.py +++ b/test/dtypes/test_affine_quantized_float.py @@ -39,17 +39,16 @@ ) from torchao.quantization.quantize_.common import KernelPreference from torchao.utils import ( + get_current_accelerator_device, is_sm_at_least_89, is_sm_at_least_90, is_sm_version, - get_current_accelerator_device, ) random.seed(0) torch.manual_seed(0) _DEVICE = get_current_accelerator_device() - class ToyLinearModel(torch.nn.Module): def __init__(self, in_features, out_features): super().__init__() @@ -142,16 +141,14 @@ def test_fp8_linear_variants( ) @unittest.skipIf( - _DEVICE == "cuda" and not is_sm_at_least_89(), - "Requires GPU with compute capability >= 8.9", + _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" ) def test_invalid_granularity(self): with pytest.raises(ValueError, match="Invalid granularity specification"): Float8DynamicActivationFloat8WeightConfig(granularity="invalid") @unittest.skipIf( - _DEVICE == "cuda" and not is_sm_at_least_89(), - "Requires GPU with compute capability >= 8.9", + _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" ) def test_mismatched_granularity(self): with pytest.raises( @@ -163,8 +160,7 @@ def test_mismatched_granularity(self): ) @unittest.skipIf( - _DEVICE == "cuda" and not is_sm_at_least_89(), - "Requires GPU with compute capability >= 8.9", + _DEVICE == "cuda" and not 
is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" ) def test_unsupported_granularity(self): class UnsupportedGranularity: @@ -360,8 +356,7 @@ def test_mm_float8dq_per_row( @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") @unittest.skipIf( - _DEVICE == "cuda" and not is_sm_at_least_89(), - "Requires GPU with compute capability >= 8.9", + _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" ) @common_utils.parametrize("float8_dtype", [torch.float8_e4m3fn, torch.float8_e5m2]) @common_utils.parametrize("output_dtype", [torch.float32, torch.bfloat16]) @@ -404,8 +399,7 @@ def test_choose_scale_float8_bounds(self, float8_dtype, output_dtype): @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") @unittest.skipIf( - _DEVICE == "cuda" and not is_sm_at_least_89(), - "Requires GPU with compute capability >= 8.9", + _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" ) @common_utils.parametrize("float8_dtype", [torch.float8_e4m3fn, torch.float8_e5m2]) @common_utils.parametrize("output_dtype", [torch.float32, torch.bfloat16]) @@ -438,8 +432,7 @@ def test_dequantize_affine_float8(self, float8_dtype, output_dtype, block_size): @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") @unittest.skipIf( - _DEVICE == "cuda" and not is_sm_at_least_89(), - "Requires GPU with compute capability >= 8.9", + _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" ) def test_dequantize_affine_float8_scale_broadcasting(self): """Test that scale broadcasting works correctly for block-wise quantization""" From 060c9e9370bc00518481af659df0bbab7e88889b Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Fri, 21 Nov 2025 13:39:40 +0800 Subject: [PATCH 10/10] fix format issue --- test/dtypes/test_affine_quantized.py | 1 + test/dtypes/test_affine_quantized_float.py | 19 +++++++++++++------ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/test/dtypes/test_affine_quantized.py b/test/dtypes/test_affine_quantized.py index 00cfa272d8..20cf1c6311 100644 --- a/test/dtypes/test_affine_quantized.py +++ b/test/dtypes/test_affine_quantized.py @@ -50,6 +50,7 @@ ) _DEVICE = get_current_accelerator_device() + def get_quantization_functions( do_sparse: bool, do_int4: bool, device: str = _DEVICE, int4_zp_int: bool = False ): diff --git a/test/dtypes/test_affine_quantized_float.py b/test/dtypes/test_affine_quantized_float.py index c75e29b66b..54820cb5b3 100644 --- a/test/dtypes/test_affine_quantized_float.py +++ b/test/dtypes/test_affine_quantized_float.py @@ -49,6 +49,7 @@ torch.manual_seed(0) _DEVICE = get_current_accelerator_device() + class ToyLinearModel(torch.nn.Module): def __init__(self, in_features, out_features): super().__init__() @@ -141,14 +142,16 @@ def test_fp8_linear_variants( ) @unittest.skipIf( - _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + _DEVICE == "cuda" and not is_sm_at_least_89(), + "Requires GPU with compute capability >= 8.9", ) def test_invalid_granularity(self): with pytest.raises(ValueError, match="Invalid granularity specification"): Float8DynamicActivationFloat8WeightConfig(granularity="invalid") @unittest.skipIf( - _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + _DEVICE == "cuda" and not is_sm_at_least_89(), + "Requires GPU with compute capability >= 8.9", ) def 
test_mismatched_granularity(self): with pytest.raises( @@ -160,7 +163,8 @@ def test_mismatched_granularity(self): ) @unittest.skipIf( - _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + _DEVICE == "cuda" and not is_sm_at_least_89(), + "Requires GPU with compute capability >= 8.9", ) def test_unsupported_granularity(self): class UnsupportedGranularity: @@ -356,7 +360,8 @@ def test_mm_float8dq_per_row( @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") @unittest.skipIf( - _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + _DEVICE == "cuda" and not is_sm_at_least_89(), + "Requires GPU with compute capability >= 8.9", ) @common_utils.parametrize("float8_dtype", [torch.float8_e4m3fn, torch.float8_e5m2]) @common_utils.parametrize("output_dtype", [torch.float32, torch.bfloat16]) @@ -399,7 +404,8 @@ def test_choose_scale_float8_bounds(self, float8_dtype, output_dtype): @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") @unittest.skipIf( - _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + _DEVICE == "cuda" and not is_sm_at_least_89(), + "Requires GPU with compute capability >= 8.9", ) @common_utils.parametrize("float8_dtype", [torch.float8_e4m3fn, torch.float8_e5m2]) @common_utils.parametrize("output_dtype", [torch.float32, torch.bfloat16]) @@ -432,7 +438,8 @@ def test_dequantize_affine_float8(self, float8_dtype, output_dtype, block_size): @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") @unittest.skipIf( - _DEVICE == "cuda" and not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" + _DEVICE == "cuda" and not is_sm_at_least_89(), + "Requires GPU with compute capability >= 8.9", ) def test_dequantize_affine_float8_scale_broadcasting(self): """Test that scale broadcasting works correctly for block-wise quantization"""
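
Note: every patch in this series imports get_current_accelerator_device from
torchao.utils, but the helper's definition is not part of these diffs. For
readers following along, here is a minimal sketch of what it is assumed to do,
inferred from how _DEVICE is used in the tests ("cuda" on NVIDIA, "xpu" on
Intel); the actual torchao implementation may differ:

    import torch

    def get_current_accelerator_device() -> str:
        """Return the device type string for the current accelerator.

        Relies on the same torch.accelerator API that the patched skipIf
        checks use, and falls back to "cpu" when no accelerator backend
        is available.
        """
        if torch.accelerator.is_available():
            # current_accelerator() returns a torch.device such as
            # device(type="cuda") or device(type="xpu"); .type yields the
            # plain string the tests pass as device=...
            return torch.accelerator.current_accelerator().type
        return "cpu"

With a helper along these lines, the module-level constant
_DEVICE = get_current_accelerator_device() resolves to "xpu" on Intel GPUs and
"cuda" on NVIDIA GPUs, so tensors created with device=_DEVICE land on whichever
accelerator is present. For tests that additionally depend on NVIDIA compute
capability, the series converges on stacking a generic accelerator check with
an sm-version gate that only applies when the accelerator is CUDA (test name
below is illustrative):

    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
    @unittest.skipIf(
        _DEVICE == "cuda" and not is_sm_at_least_89(),
        "Requires GPU with compute capability >= 8.9",
    )
    def test_example(self):
        ...

Tests that exercise CUDA-only kernel paths (e.g. test_mm_float8dq_per_row and
test_expected_kernels_on_gpu) instead keep the plain torch.cuda.is_available()
guard, which is why patches 02 and 04-06 revert some of the broader
torch.accelerator checks introduced earlier in the series.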