From 00124ffb9e3cc5b93857089210a230d0c9ab0535 Mon Sep 17 00:00:00 2001
From: "Zeng, Xiangdong"
Date: Wed, 19 Nov 2025 11:40:06 +0800
Subject: [PATCH 1/5] refine the pytest command

---
 .github/scripts/ci_test_xpu.sh | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/.github/scripts/ci_test_xpu.sh b/.github/scripts/ci_test_xpu.sh
index 05089db7c8..6ce91664b7 100644
--- a/.github/scripts/ci_test_xpu.sh
+++ b/.github/scripts/ci_test_xpu.sh
@@ -14,14 +14,9 @@ python3 -c "import torch; import torchao; print(f'Torch version: {torch.__versio
 
 pip install pytest expecttest parameterized accelerate hf_transfer 'modelscope!=1.15.0'
 
-pytest -v -s torchao/test/quantization/
-
-pytest -v -s torchao/test/dtypes/
-
-pytest -v -s torchao/test/float8/
-
-pytest -v -s torchao/test/integration/test_integration.py
-
-pytest -v -s torchao/test/prototype/
-
-pytest -v -s torchao/test/test_ao_models.py
+pytest -v -s torchao/test/quantization/ \
+    torchao/test/dtypes/ \
+    torchao/test/float8/ \
+    torchao/test/integration/test_integration.py \
+    torchao/test/prototype/ \
+    torchao/test/test_ao_models.py

From 521920d7c9768aa8e005855b11cef2c48d5bff52 Mon Sep 17 00:00:00 2001
From: "Zeng, Xiangdong"
Date: Wed, 19 Nov 2025 15:49:36 +0800
Subject: [PATCH 2/5] add dependency

---
 .github/scripts/ci_test_xpu.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/scripts/ci_test_xpu.sh b/.github/scripts/ci_test_xpu.sh
index 6ce91664b7..d38705d5b9 100644
--- a/.github/scripts/ci_test_xpu.sh
+++ b/.github/scripts/ci_test_xpu.sh
@@ -12,7 +12,7 @@ cd torchao && pip install . --no-build-isolation && cd ..
 
 python3 -c "import torch; import torchao; print(f'Torch version: {torch.__version__}')"
 
-pip install pytest expecttest parameterized accelerate hf_transfer 'modelscope!=1.15.0'
+pip install pytest expecttest parameterized accelerate hf_transfer 'modelscope!=1.15.0' transformers tabulate fire
 
 pytest -v -s torchao/test/quantization/ \
     torchao/test/dtypes/ \

From 2a860307d16c94df8475cb3a2d6cc4b6bde874c7 Mon Sep 17 00:00:00 2001
From: "Zeng, Xiangdong"
Date: Wed, 19 Nov 2025 16:02:43 +0800
Subject: [PATCH 3/5] update

---
 test/dtypes/test_nf4.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/dtypes/test_nf4.py b/test/dtypes/test_nf4.py
index 2a711413f0..221eed8216 100644
--- a/test/dtypes/test_nf4.py
+++ b/test/dtypes/test_nf4.py
@@ -756,6 +756,7 @@ def world_size(self) -> int:
         return 2
 
     @skip_if_lt_x_gpu(2)
+    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     def test_comm(self):
         self.run_subtests(
             {"input_size": [512, 2048]},

From bef32645b8d7a111206df81de2290348bca3b8b3 Mon Sep 17 00:00:00 2001
From: "Zeng, Xiangdong"
Date: Thu, 20 Nov 2025 09:37:51 +0800
Subject: [PATCH 4/5] update device

---
 test/prototype/test_quantized_training.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/test/prototype/test_quantized_training.py b/test/prototype/test_quantized_training.py
index fa0edd694b..c27523d690 100644
--- a/test/prototype/test_quantized_training.py
+++ b/test/prototype/test_quantized_training.py
@@ -34,11 +34,13 @@
     quantize_int8_rowwise,
 )
 from torchao.quantization.quant_api import quantize_
+from torchao.utils import get_current_accelerator_device
 
 if common_utils.SEED is None:
     common_utils.SEED = 1234
 
 _DEVICES = ["cpu"] + (["cuda"] if torch.cuda.is_available() else [])
+_DEVICE = get_current_accelerator_device()
 
 
 def _reset():
@@ -341,7 +343,7 @@ def _run_subtest(self, args):
             dropout_p=0,
         )
         torch.manual_seed(42)
-        base_model = Transformer(model_args).cuda()
+        base_model = Transformer(model_args).to(_DEVICE)
 
         fsdp_model = copy.deepcopy(base_model)
         quantize_(base_model.layers, quantize_fn)
@@ -361,7 +363,7 @@ def _run_subtest(self, args):
 
         torch.manual_seed(42 + self.rank + 1)
         for iter_idx in range(5):
-            inp = torch.randint(0, vocab_size, (batch_size, seq_len), device="cuda")
+            inp = torch.randint(0, vocab_size, (batch_size, seq_len), device=_DEVICE)
             fsdp_optim.zero_grad(set_to_none=(iter_idx % 2 == 0))
             fsdp_loss = fsdp_model(inp).sum()
             fsdp_loss.backward()
@@ -392,7 +394,9 @@ def test_precompute_bitnet_scale(self):
             precompute_bitnet_scale_for_fsdp,
         )
 
-        model = nn.Sequential(nn.Linear(32, 64), nn.GELU(), nn.Linear(64, 32)).cuda()
+        model = nn.Sequential(nn.Linear(32, 64), nn.GELU(), nn.Linear(64, 32)).to(
+            _DEVICE
+        )
         model_fsdp = copy.deepcopy(model)
         quantize_(model_fsdp, bitnet_training())
         fully_shard(model_fsdp)

From 51e9a3e3afb91e22d31c050c90d32299a0f85efc Mon Sep 17 00:00:00 2001
From: "Zeng, Xiangdong"
Date: Thu, 20 Nov 2025 09:42:32 +0800
Subject: [PATCH 5/5] update skip

---
 test/prototype/test_quantized_training.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/test/prototype/test_quantized_training.py b/test/prototype/test_quantized_training.py
index c27523d690..ab25a38bb3 100644
--- a/test/prototype/test_quantized_training.py
+++ b/test/prototype/test_quantized_training.py
@@ -34,13 +34,11 @@
     quantize_int8_rowwise,
 )
 from torchao.quantization.quant_api import quantize_
-from torchao.utils import get_current_accelerator_device
 
 if common_utils.SEED is None:
     common_utils.SEED = 1234
 
 _DEVICES = ["cpu"] + (["cuda"] if torch.cuda.is_available() else [])
-_DEVICE = get_current_accelerator_device()
 
 
 def _reset():
@@ -298,6 +296,7 @@ def world_size(self) -> int:
         return _FSDP_WORLD_SIZE
 
     @skip_if_lt_x_gpu(_FSDP_WORLD_SIZE)
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
     def test_fsdp2_correctness(self):
         mp_policy = MixedPrecisionPolicy()
 
@@ -343,7 +342,7 @@ def _run_subtest(self, args):
             dropout_p=0,
         )
         torch.manual_seed(42)
-        base_model = Transformer(model_args).to(_DEVICE)
+        base_model = Transformer(model_args).cuda()
 
         fsdp_model = copy.deepcopy(base_model)
         quantize_(base_model.layers, quantize_fn)
@@ -363,7 +362,7 @@ def _run_subtest(self, args):
 
         torch.manual_seed(42 + self.rank + 1)
         for iter_idx in range(5):
-            inp = torch.randint(0, vocab_size, (batch_size, seq_len), device=_DEVICE)
+            inp = torch.randint(0, vocab_size, (batch_size, seq_len), device="cuda")
             fsdp_optim.zero_grad(set_to_none=(iter_idx % 2 == 0))
             fsdp_loss = fsdp_model(inp).sum()
             fsdp_loss.backward()
@@ -388,15 +387,14 @@ def _run_subtest(self, args):
         )
 
     @skip_if_lt_x_gpu(_FSDP_WORLD_SIZE)
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
     def test_precompute_bitnet_scale(self):
        from torchao.prototype.quantized_training.bitnet import (
             get_bitnet_scale,
             precompute_bitnet_scale_for_fsdp,
         )
 
-        model = nn.Sequential(nn.Linear(32, 64), nn.GELU(), nn.Linear(64, 32)).to(
-            _DEVICE
-        )
+        model = nn.Sequential(nn.Linear(32, 64), nn.GELU(), nn.Linear(64, 32)).cuda()
         model_fsdp = copy.deepcopy(model)
         quantize_(model_fsdp, bitnet_training())
         fully_shard(model_fsdp)
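
Note on the device helper: patch 4 routes the non-FSDP test setup through torchao.utils.get_current_accelerator_device so the same tests can target XPU, and patch 5 backs that out for the FSDP2 tests in favor of explicit CUDA skip markers. For readers unfamiliar with the helper, the sketch below shows the selection logic these patches assume; it is a minimal illustration, not torchao's actual implementation, which may consult other backends.

    import torch

    def get_current_accelerator_device() -> torch.device:
        # Sketch only: pick the available accelerator, else fall back to CPU.
        # torchao's real helper may differ in order and supported backends.
        if torch.cuda.is_available():
            return torch.device("cuda")
        # torch.xpu ships in recent PyTorch builds with Intel GPU support.
        if hasattr(torch, "xpu") and torch.xpu.is_available():
            return torch.device("xpu")
        return torch.device("cpu")

    # Usage mirroring patch 4's tests:
    #   _DEVICE = get_current_accelerator_device()
    #   model = model.to(_DEVICE)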