From c62c01c035aa84ad523ebdabb58995631d90d6b1 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Tue, 18 Nov 2025 09:58:36 +0800 Subject: [PATCH 1/3] port 2 test/quantization UTs to intel XPU --- test/quantization/test_gptq.py | 34 ++++++++++++++-------- test/quantization/test_quant_primitives.py | 12 +++++--- 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/test/quantization/test_gptq.py b/test/quantization/test_gptq.py index 6f7ac10d45..7774c3d7f7 100644 --- a/test/quantization/test_gptq.py +++ b/test/quantization/test_gptq.py @@ -18,13 +18,16 @@ from torchao._models.llama.tokenizer import get_tokenizer from torchao.quantization import Int4WeightOnlyConfig, quantize_ from torchao.quantization.utils import compute_error +from torchao.utils import get_current_accelerator_device torch.manual_seed(0) +_DEVICE = get_current_accelerator_device() + class TestGPTQ(TestCase): @unittest.skip("skipping until we get checkpoints for gpt-fast") - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_gptq_quantizer_int4_weight_only(self): from torchao._models._eval import ( LMEvalInputRecorder, @@ -33,7 +36,7 @@ def test_gptq_quantizer_int4_weight_only(self): from torchao.quantization.GPTQ import Int4WeightOnlyGPTQQuantizer precision = torch.bfloat16 - device = "cuda" + device = _DEVICE checkpoint_path = Path( "../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth" ) @@ -80,15 +83,15 @@ def test_gptq_quantizer_int4_weight_only(self): ) model.setup_caches(max_batch_size=1, max_seq_length=calibration_seq_length) - model = quantizer.quantize(model, *inputs).cuda() + model = quantizer.quantize(model, *inputs).to(_DEVICE) model.reset_caches() - with torch.device("cuda"): + with torch.device(_DEVICE): model.setup_caches(max_batch_size=1, max_seq_length=model.config.block_size) limit = 1 result = TransformerEvalWrapper( - model.cuda(), + model.to(_DEVICE), tokenizer, model.config.block_size, prepare_inputs_for_model, @@ -104,7 +107,7 @@ def test_gptq_quantizer_int4_weight_only(self): class TestMultiTensorFlow(TestCase): - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_multitensor_add_tensors(self): from torchao.quantization.GPTQ import MultiTensor @@ -116,7 +119,7 @@ def test_multitensor_add_tensors(self): self.assertTrue(torch.equal(mt.values[0], tensor1)) self.assertTrue(torch.equal(mt.values[1], tensor2)) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_multitensor_pad_unpad(self): from torchao.quantization.GPTQ import MultiTensor @@ -127,7 +130,7 @@ def test_multitensor_pad_unpad(self): mt.unpad() self.assertEqual(mt.count, 1) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_multitensor_inplace_operation(self): from torchao.quantization.GPTQ import MultiTensor @@ -138,7 +141,7 @@ def test_multitensor_inplace_operation(self): class TestMultiTensorInputRecorder(TestCase): - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_multitensor_input_recorder(self): from torchao.quantization.GPTQ import MultiTensor, MultiTensorInputRecorder @@ -159,7 
+162,7 @@ def test_multitensor_input_recorder(self): self.assertTrue(isinstance(MT_input[2][2], MultiTensor)) self.assertEqual(MT_input[3], torch.float) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_gptq_with_input_recorder(self): from torchao.quantization.GPTQ import ( Int4WeightOnlyGPTQQuantizer, @@ -170,7 +173,7 @@ def test_gptq_with_input_recorder(self): config = ModelArgs(n_layer=2) - with torch.device("cuda"): + with torch.device(_DEVICE): model = Transformer(config) model.setup_caches(max_batch_size=2, max_seq_length=100) idx = torch.randint(1, 10000, (10, 2, 50)).to(torch.int32) @@ -191,7 +194,14 @@ def test_gptq_with_input_recorder(self): args = input_recorder.get_recorded_inputs() - quantizer = Int4WeightOnlyGPTQQuantizer() + if _DEVICE.type == "xpu": + from torchao.dtypes import Int4XPULayout + + quantizer = Int4WeightOnlyGPTQQuantizer( + device=torch.device("xpu"), layout=Int4XPULayout() + ) + else: + quantizer = Int4WeightOnlyGPTQQuantizer() quantizer.quantize(model, *args) diff --git a/test/quantization/test_quant_primitives.py b/test/quantization/test_quant_primitives.py index cc6b7fff91..5f383ba582 100644 --- a/test/quantization/test_quant_primitives.py +++ b/test/quantization/test_quant_primitives.py @@ -37,11 +37,14 @@ check_cpu_version, check_xpu_version, is_fbcode, + get_current_accelerator_device, ) _SEED = 1234 torch.manual_seed(_SEED) +_DEVICE = get_current_accelerator_device() + # Helper function to run a function twice # and verify that the result is the same. @@ -592,16 +595,17 @@ def test_choose_qparams_tensor_asym_eps(self): self.assertEqual(scale, eps) @unittest.skipIf( - not torch.cuda.is_available(), "skipping when cuda is not available" + not torch.accelerator.is_available(), "skipping when gpu is not available" ) def test_get_group_qparams_symmetric_memory(self): """Check the memory usage of the op""" - weight = torch.randn(1024, 1024).to(device="cuda") - original_mem_use = torch.cuda.memory_allocated() + weight = torch.randn(1024, 1024).to(device=_DEVICE) + device_module = torch.get_device_module(_DEVICE) + original_mem_use = device_module.memory_allocated() n_bit = 4 groupsize = 128 (scale_ao, _) = get_group_qparams_symmetric(weight, n_bit, groupsize) - after_choose_qparams_mem_use = torch.cuda.memory_allocated() + after_choose_qparams_mem_use = device_module.memory_allocated() self.assertTrue(after_choose_qparams_mem_use < 1.2 * original_mem_use) def test_raises(self): From 6210606c46521a9f0f0ae4eba6ba5e1a4d586898 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Tue, 18 Nov 2025 11:15:52 +0800 Subject: [PATCH 2/3] update format --- test/quantization/test_quant_primitives.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/quantization/test_quant_primitives.py b/test/quantization/test_quant_primitives.py index 5f383ba582..c224a6bab6 100644 --- a/test/quantization/test_quant_primitives.py +++ b/test/quantization/test_quant_primitives.py @@ -24,7 +24,6 @@ dequantize_affine, quantize_affine, ) - # TODO: remove test for utils? 
from torchao.quantization.utils import ( _quantize_activation_per_token_absmax, @@ -36,8 +35,8 @@ from torchao.utils import ( check_cpu_version, check_xpu_version, - is_fbcode, get_current_accelerator_device, + is_fbcode, ) _SEED = 1234 From ee782a9ea35503cb15478fd4f1577f81d856d73d Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Tue, 18 Nov 2025 11:23:19 +0800 Subject: [PATCH 3/3] update format --- test/quantization/test_quant_primitives.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/quantization/test_quant_primitives.py b/test/quantization/test_quant_primitives.py index c224a6bab6..31b15839e5 100644 --- a/test/quantization/test_quant_primitives.py +++ b/test/quantization/test_quant_primitives.py @@ -24,6 +24,7 @@ dequantize_affine, quantize_affine, ) + # TODO: remove test for utils? from torchao.quantization.utils import ( _quantize_activation_per_token_absmax,
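
Note on the device-agnostic pattern the series relies on: instead of
hard-coding "cuda", the tests resolve a target device once at module scope via
torchao.utils.get_current_accelerator_device(), gate on
torch.accelerator.is_available() rather than torch.cuda.is_available(), and
reach backend-specific APIs through torch.get_device_module(). A minimal
sketch of that pattern (assuming PyTorch >= 2.6 for the torch.accelerator
namespace; torchao's actual helper may differ in detail):

    import torch

    def get_current_accelerator_device() -> torch.device:
        # torch.accelerator.current_accelerator() returns the active
        # accelerator as a torch.device (e.g. cuda or xpu), or None on a
        # CPU-only build.
        acc = torch.accelerator.current_accelerator()
        return acc if acc is not None else torch.device("cpu")

    _DEVICE = get_current_accelerator_device()

    # torch.get_device_module() maps a device to its backend module
    # (torch.cuda, torch.xpu, ...), so calls like memory_allocated() need no
    # per-backend branching.
    if _DEVICE.type in ("cuda", "xpu"):
        device_module = torch.get_device_module(_DEVICE)
        print(_DEVICE, device_module.memory_allocated())

With this in place the same test body runs unchanged on CUDA and XPU; only
backend-specific configuration, such as passing Int4XPULayout to the GPTQ
quantizer, still branches on _DEVICE.type.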