From c62c01c035aa84ad523ebdabb58995631d90d6b1 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Tue, 18 Nov 2025 09:58:36 +0800 Subject: [PATCH 1/3] port 2 test/quantization UTs to intel XPU --- test/quantization/test_gptq.py | 34 ++++++++++++++-------- test/quantization/test_quant_primitives.py | 12 +++++--- 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/test/quantization/test_gptq.py b/test/quantization/test_gptq.py index 6f7ac10d45..7774c3d7f7 100644 --- a/test/quantization/test_gptq.py +++ b/test/quantization/test_gptq.py @@ -18,13 +18,16 @@ from torchao._models.llama.tokenizer import get_tokenizer from torchao.quantization import Int4WeightOnlyConfig, quantize_ from torchao.quantization.utils import compute_error +from torchao.utils import get_current_accelerator_device torch.manual_seed(0) +_DEVICE = get_current_accelerator_device() + class TestGPTQ(TestCase): @unittest.skip("skipping until we get checkpoints for gpt-fast") - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_gptq_quantizer_int4_weight_only(self): from torchao._models._eval import ( LMEvalInputRecorder, @@ -33,7 +36,7 @@ def test_gptq_quantizer_int4_weight_only(self): from torchao.quantization.GPTQ import Int4WeightOnlyGPTQQuantizer precision = torch.bfloat16 - device = "cuda" + device = _DEVICE checkpoint_path = Path( "../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth" ) @@ -80,15 +83,15 @@ def test_gptq_quantizer_int4_weight_only(self): ) model.setup_caches(max_batch_size=1, max_seq_length=calibration_seq_length) - model = quantizer.quantize(model, *inputs).cuda() + model = quantizer.quantize(model, *inputs).to(_DEVICE) model.reset_caches() - with torch.device("cuda"): + with torch.device(_DEVICE): model.setup_caches(max_batch_size=1, max_seq_length=model.config.block_size) limit = 1 result = TransformerEvalWrapper( - model.cuda(), + model.to(_DEVICE), tokenizer, model.config.block_size, prepare_inputs_for_model, @@ -104,7 +107,7 @@ def test_gptq_quantizer_int4_weight_only(self): class TestMultiTensorFlow(TestCase): - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_multitensor_add_tensors(self): from torchao.quantization.GPTQ import MultiTensor @@ -116,7 +119,7 @@ def test_multitensor_add_tensors(self): self.assertTrue(torch.equal(mt.values[0], tensor1)) self.assertTrue(torch.equal(mt.values[1], tensor2)) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_multitensor_pad_unpad(self): from torchao.quantization.GPTQ import MultiTensor @@ -127,7 +130,7 @@ def test_multitensor_pad_unpad(self): mt.unpad() self.assertEqual(mt.count, 1) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_multitensor_inplace_operation(self): from torchao.quantization.GPTQ import MultiTensor @@ -138,7 +141,7 @@ def test_multitensor_inplace_operation(self): class TestMultiTensorInputRecorder(TestCase): - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_multitensor_input_recorder(self): from torchao.quantization.GPTQ import MultiTensor, MultiTensorInputRecorder @@ -159,7 
+162,7 @@ def test_multitensor_input_recorder(self): self.assertTrue(isinstance(MT_input[2][2], MultiTensor)) self.assertEqual(MT_input[3], torch.float) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_gptq_with_input_recorder(self): from torchao.quantization.GPTQ import ( Int4WeightOnlyGPTQQuantizer, @@ -170,7 +173,7 @@ def test_gptq_with_input_recorder(self): config = ModelArgs(n_layer=2) - with torch.device("cuda"): + with torch.device(_DEVICE): model = Transformer(config) model.setup_caches(max_batch_size=2, max_seq_length=100) idx = torch.randint(1, 10000, (10, 2, 50)).to(torch.int32) @@ -191,7 +194,14 @@ def test_gptq_with_input_recorder(self): args = input_recorder.get_recorded_inputs() - quantizer = Int4WeightOnlyGPTQQuantizer() + if _DEVICE.type == "xpu": + from torchao.dtypes import Int4XPULayout + + quantizer = Int4WeightOnlyGPTQQuantizer( + device=torch.device("xpu"), layout=Int4XPULayout() + ) + else: + quantizer = Int4WeightOnlyGPTQQuantizer() quantizer.quantize(model, *args) diff --git a/test/quantization/test_quant_primitives.py b/test/quantization/test_quant_primitives.py index cc6b7fff91..5f383ba582 100644 --- a/test/quantization/test_quant_primitives.py +++ b/test/quantization/test_quant_primitives.py @@ -37,11 +37,14 @@ check_cpu_version, check_xpu_version, is_fbcode, + get_current_accelerator_device, ) _SEED = 1234 torch.manual_seed(_SEED) +_DEVICE = get_current_accelerator_device() + # Helper function to run a function twice # and verify that the result is the same. @@ -592,16 +595,17 @@ def test_choose_qparams_tensor_asym_eps(self): self.assertEqual(scale, eps) @unittest.skipIf( - not torch.cuda.is_available(), "skipping when cuda is not available" + not torch.accelerator.is_available(), "skipping when gpu is not available" ) def test_get_group_qparams_symmetric_memory(self): """Check the memory usage of the op""" - weight = torch.randn(1024, 1024).to(device="cuda") - original_mem_use = torch.cuda.memory_allocated() + weight = torch.randn(1024, 1024).to(device=_DEVICE) + device_module = torch.get_device_module(_DEVICE) + original_mem_use = device_module.memory_allocated() n_bit = 4 groupsize = 128 (scale_ao, _) = get_group_qparams_symmetric(weight, n_bit, groupsize) - after_choose_qparams_mem_use = torch.cuda.memory_allocated() + after_choose_qparams_mem_use = device_module.memory_allocated() self.assertTrue(after_choose_qparams_mem_use < 1.2 * original_mem_use) def test_raises(self): From 6210606c46521a9f0f0ae4eba6ba5e1a4d586898 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Tue, 18 Nov 2025 11:15:52 +0800 Subject: [PATCH 2/3] update format --- test/quantization/test_quant_primitives.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/quantization/test_quant_primitives.py b/test/quantization/test_quant_primitives.py index 5f383ba582..c224a6bab6 100644 --- a/test/quantization/test_quant_primitives.py +++ b/test/quantization/test_quant_primitives.py @@ -24,7 +24,6 @@ dequantize_affine, quantize_affine, ) - # TODO: remove test for utils? 
from torchao.quantization.utils import ( _quantize_activation_per_token_absmax, @@ -36,8 +35,8 @@ from torchao.utils import ( check_cpu_version, check_xpu_version, - is_fbcode, get_current_accelerator_device, + is_fbcode, ) _SEED = 1234 From ee782a9ea35503cb15478fd4f1577f81d856d73d Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Tue, 18 Nov 2025 11:23:19 +0800 Subject: [PATCH 3/3] update format --- test/quantization/test_quant_primitives.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/quantization/test_quant_primitives.py b/test/quantization/test_quant_primitives.py index c224a6bab6..31b15839e5 100644 --- a/test/quantization/test_quant_primitives.py +++ b/test/quantization/test_quant_primitives.py @@ -24,6 +24,7 @@ dequantize_affine, quantize_affine, ) + # TODO: remove test for utils? from torchao.quantization.utils import ( _quantize_activation_per_token_absmax,
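
Note on the device-agnostic pattern the series relies on: instead of
hard-coding "cuda", the tests resolve a target device once at module scope via
torchao.utils.get_current_accelerator_device(), gate on
torch.accelerator.is_available() rather than torch.cuda.is_available(), and
reach backend-specific APIs through torch.get_device_module(). A minimal
sketch of that pattern (assuming PyTorch >= 2.6 for the torch.accelerator
namespace; torchao's actual helper may differ in detail):

    import torch

    def get_current_accelerator_device() -> torch.device:
        # torch.accelerator.current_accelerator() returns the active
        # accelerator as a torch.device (e.g. cuda or xpu), or None on a
        # CPU-only build.
        acc = torch.accelerator.current_accelerator()
        return acc if acc is not None else torch.device("cpu")

    _DEVICE = get_current_accelerator_device()

    # torch.get_device_module() maps a device to its backend module
    # (torch.cuda, torch.xpu, ...), so calls like memory_allocated() need no
    # per-backend branching.
    if _DEVICE.type in ("cuda", "xpu"):
        device_module = torch.get_device_module(_DEVICE)
        print(_DEVICE, device_module.memory_allocated())

With this in place the same test body runs unchanged on CUDA and XPU; only
backend-specific configuration, such as passing Int4XPULayout to the GPTQ
quantizer, still branches on _DEVICE.type.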