From 00124ffb9e3cc5b93857089210a230d0c9ab0535 Mon Sep 17 00:00:00 2001
From: "Zeng, Xiangdong"
Date: Wed, 19 Nov 2025 11:40:06 +0800
Subject: [PATCH 1/5] refine the pytest command

---
 .github/scripts/ci_test_xpu.sh | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/.github/scripts/ci_test_xpu.sh b/.github/scripts/ci_test_xpu.sh
index 05089db7c8..6ce91664b7 100644
--- a/.github/scripts/ci_test_xpu.sh
+++ b/.github/scripts/ci_test_xpu.sh
@@ -14,14 +14,9 @@ python3 -c "import torch; import torchao; print(f'Torch version: {torch.__versio
 
 pip install pytest expecttest parameterized accelerate hf_transfer 'modelscope!=1.15.0'
 
-pytest -v -s torchao/test/quantization/
-
-pytest -v -s torchao/test/dtypes/
-
-pytest -v -s torchao/test/float8/
-
-pytest -v -s torchao/test/integration/test_integration.py
-
-pytest -v -s torchao/test/prototype/
-
-pytest -v -s torchao/test/test_ao_models.py
+pytest -v -s torchao/test/quantization/ \
+    torchao/test/dtypes/ \
+    torchao/test/float8/ \
+    torchao/test/integration/test_integration.py \
+    torchao/test/prototype/ \
+    torchao/test/test_ao_models.py

From 521920d7c9768aa8e005855b11cef2c48d5bff52 Mon Sep 17 00:00:00 2001
From: "Zeng, Xiangdong"
Date: Wed, 19 Nov 2025 15:49:36 +0800
Subject: [PATCH 2/5] add dependency

---
 .github/scripts/ci_test_xpu.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/scripts/ci_test_xpu.sh b/.github/scripts/ci_test_xpu.sh
index 6ce91664b7..d38705d5b9 100644
--- a/.github/scripts/ci_test_xpu.sh
+++ b/.github/scripts/ci_test_xpu.sh
@@ -12,7 +12,7 @@ cd torchao && pip install . --no-build-isolation && cd ..
 
 python3 -c "import torch; import torchao; print(f'Torch version: {torch.__version__}')"
 
-pip install pytest expecttest parameterized accelerate hf_transfer 'modelscope!=1.15.0'
+pip install pytest expecttest parameterized accelerate hf_transfer 'modelscope!=1.15.0' transformers tabulate fire
 
 pytest -v -s torchao/test/quantization/ \
     torchao/test/dtypes/ \

From 2a860307d16c94df8475cb3a2d6cc4b6bde874c7 Mon Sep 17 00:00:00 2001
From: "Zeng, Xiangdong"
Date: Wed, 19 Nov 2025 16:02:43 +0800
Subject: [PATCH 3/5] update

---
 test/dtypes/test_nf4.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/dtypes/test_nf4.py b/test/dtypes/test_nf4.py
index 2a711413f0..221eed8216 100644
--- a/test/dtypes/test_nf4.py
+++ b/test/dtypes/test_nf4.py
@@ -756,6 +756,7 @@ def world_size(self) -> int:
         return 2
 
     @skip_if_lt_x_gpu(2)
+    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     def test_comm(self):
         self.run_subtests(
             {"input_size": [512, 2048]},

From bef32645b8d7a111206df81de2290348bca3b8b3 Mon Sep 17 00:00:00 2001
From: "Zeng, Xiangdong"
Date: Thu, 20 Nov 2025 09:37:51 +0800
Subject: [PATCH 4/5] update device

---
 test/prototype/test_quantized_training.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/test/prototype/test_quantized_training.py b/test/prototype/test_quantized_training.py
index fa0edd694b..c27523d690 100644
--- a/test/prototype/test_quantized_training.py
+++ b/test/prototype/test_quantized_training.py
@@ -34,11 +34,13 @@
     quantize_int8_rowwise,
 )
 from torchao.quantization.quant_api import quantize_
+from torchao.utils import get_current_accelerator_device
 
 if common_utils.SEED is None:
     common_utils.SEED = 1234
 
 _DEVICES = ["cpu"] + (["cuda"] if torch.cuda.is_available() else [])
+_DEVICE = get_current_accelerator_device()
 
 
 def _reset():
@@ -341,7 +343,7 @@ def _run_subtest(self, args):
             dropout_p=0,
         )
         torch.manual_seed(42)
-        base_model = Transformer(model_args).cuda()
+        base_model = Transformer(model_args).to(_DEVICE)
 
         fsdp_model = copy.deepcopy(base_model)
         quantize_(base_model.layers, quantize_fn)
@@ -361,7 +363,7 @@ def _run_subtest(self, args):
 
         torch.manual_seed(42 + self.rank + 1)
         for iter_idx in range(5):
-            inp = torch.randint(0, vocab_size, (batch_size, seq_len), device="cuda")
+            inp = torch.randint(0, vocab_size, (batch_size, seq_len), device=_DEVICE)
             fsdp_optim.zero_grad(set_to_none=(iter_idx % 2 == 0))
             fsdp_loss = fsdp_model(inp).sum()
             fsdp_loss.backward()
@@ -392,7 +394,9 @@ def test_precompute_bitnet_scale(self):
             precompute_bitnet_scale_for_fsdp,
         )
 
-        model = nn.Sequential(nn.Linear(32, 64), nn.GELU(), nn.Linear(64, 32)).cuda()
+        model = nn.Sequential(nn.Linear(32, 64), nn.GELU(), nn.Linear(64, 32)).to(
+            _DEVICE
+        )
         model_fsdp = copy.deepcopy(model)
         quantize_(model_fsdp, bitnet_training())
         fully_shard(model_fsdp)

From 51e9a3e3afb91e22d31c050c90d32299a0f85efc Mon Sep 17 00:00:00 2001
From: "Zeng, Xiangdong"
Date: Thu, 20 Nov 2025 09:42:32 +0800
Subject: [PATCH 5/5] update skip

---
 test/prototype/test_quantized_training.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/test/prototype/test_quantized_training.py b/test/prototype/test_quantized_training.py
index c27523d690..ab25a38bb3 100644
--- a/test/prototype/test_quantized_training.py
+++ b/test/prototype/test_quantized_training.py
@@ -34,13 +34,11 @@
     quantize_int8_rowwise,
 )
 from torchao.quantization.quant_api import quantize_
-from torchao.utils import get_current_accelerator_device
 
 if common_utils.SEED is None:
     common_utils.SEED = 1234
 
 _DEVICES = ["cpu"] + (["cuda"] if torch.cuda.is_available() else [])
-_DEVICE = get_current_accelerator_device()
 
 
 def _reset():
@@ -298,6 +296,7 @@ def world_size(self) -> int:
         return _FSDP_WORLD_SIZE
 
     @skip_if_lt_x_gpu(_FSDP_WORLD_SIZE)
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
     def test_fsdp2_correctness(self):
         mp_policy = MixedPrecisionPolicy()
 
@@ -343,7 +342,7 @@ def _run_subtest(self, args):
             dropout_p=0,
         )
         torch.manual_seed(42)
-        base_model = Transformer(model_args).to(_DEVICE)
+        base_model = Transformer(model_args).cuda()
 
         fsdp_model = copy.deepcopy(base_model)
         quantize_(base_model.layers, quantize_fn)
@@ -363,7 +362,7 @@ def _run_subtest(self, args):
 
         torch.manual_seed(42 + self.rank + 1)
         for iter_idx in range(5):
-            inp = torch.randint(0, vocab_size, (batch_size, seq_len), device=_DEVICE)
+            inp = torch.randint(0, vocab_size, (batch_size, seq_len), device="cuda")
             fsdp_optim.zero_grad(set_to_none=(iter_idx % 2 == 0))
             fsdp_loss = fsdp_model(inp).sum()
             fsdp_loss.backward()
@@ -388,15 +387,14 @@ def _run_subtest(self, args):
         )
 
     @skip_if_lt_x_gpu(_FSDP_WORLD_SIZE)
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
     def test_precompute_bitnet_scale(self):
        from torchao.prototype.quantized_training.bitnet import (
             get_bitnet_scale,
             precompute_bitnet_scale_for_fsdp,
         )
 
-        model = nn.Sequential(nn.Linear(32, 64), nn.GELU(), nn.Linear(64, 32)).to(
-            _DEVICE
-        )
+        model = nn.Sequential(nn.Linear(32, 64), nn.GELU(), nn.Linear(64, 32)).cuda()
         model_fsdp = copy.deepcopy(model)
         quantize_(model_fsdp, bitnet_training())
         fully_shard(model_fsdp)
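
Note on the device helper: patch 4 routes the non-FSDP test setup through torchao.utils.get_current_accelerator_device so the same tests can target XPU, and patch 5 backs that out for the FSDP2 tests in favor of explicit CUDA skip markers. For readers unfamiliar with the helper, the sketch below shows the selection logic these patches assume; it is a minimal illustration, not torchao's actual implementation, which may consult other backends.

    import torch

    def get_current_accelerator_device() -> torch.device:
        # Sketch only: pick the available accelerator, else fall back to CPU.
        # torchao's real helper may differ in order and supported backends.
        if torch.cuda.is_available():
            return torch.device("cuda")
        # torch.xpu ships in recent PyTorch builds with Intel GPU support.
        if hasattr(torch, "xpu") and torch.xpu.is_available():
            return torch.device("xpu")
        return torch.device("cpu")

    # Usage mirroring patch 4's tests:
    #   _DEVICE = get_current_accelerator_device()
    #   model = model.to(_DEVICE)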