From a736c41187363d39cf72d1b4a3bda5b62554f3a1 Mon Sep 17 00:00:00 2001 From: "Sun, Diwei" Date: Tue, 19 Aug 2025 08:28:02 +0000 Subject: [PATCH 1/9] enable xpu ci test --- .github/workflows/pr-test-xpu.yml | 156 ++++++++++++++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 .github/workflows/pr-test-xpu.yml diff --git a/.github/workflows/pr-test-xpu.yml b/.github/workflows/pr-test-xpu.yml new file mode 100644 index 0000000000..79621a06d1 --- /dev/null +++ b/.github/workflows/pr-test-xpu.yml @@ -0,0 +1,156 @@ +# TODO: this looks sort of similar to _linux-test, but there are like a dozen +# places where you would have to insert an if statement. Probably it's better to +# just use a different workflow altogether + +name: xpu-test + +on: + push: + branches: + - main + - 'gh/**' + pull_request: + branches: + - main + - 'gh/**' + +concurrency: + group: xpu_ci_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} + cancel-in-progress: true + +jobs: + test: + # Don't run on forked repos or empty test matrix + # if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]' + timeout-minutes: 60 + runs-on: ao-pvc + env: + DOCKER_IMAGE: ghcr.io/pytorch/ci-image:pytorch-linux-jammy-xpu-2025.1-py3-b388c12018df5d6ce2f94b7fb337fa3729978ab3 + TEST_COMMAND: .github/scripts/ci_test_xpu.sh + PYTORCH_RETRY_TEST_CASES: 1 + PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1 + XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla + steps: + # [see note: pytorch repo ref] + - name: Checkout Torchao + uses: actions/checkout@v4 + + - name: Clean all stopped docker containers + if: always() + shell: bash + run: | + # Prune all stopped containers. + # If other runner is pruning on this node, will skip. + nprune=$(ps -ef | grep -c "docker container prune") + if [[ $nprune -eq 1 ]]; then + docker container prune -f + fi + + - name: Runner health check GPU count + if: always() + shell: bash + run: | + ngpu=$(timeout 30 clinfo -l | grep -c -E 'Device' || true) + msg="Please file an issue on pytorch/ao reporting the faulty runner. 
Include a link to the runner logs so the runner can be identified" + if [[ $ngpu -eq 0 ]]; then + echo "Error: Failed to detect any GPUs on the runner" + echo "$msg" + exit 1 + fi + + - name: Use following to pull public copy of the image + id: print-ghcr-mirror + shell: bash + run: | + echo "docker pull ${DOCKER_IMAGE}" + docker pull ${DOCKER_IMAGE} + + - name: Test + id: test + env: + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + PR_NUMBER: ${{ github.event.pull_request.number }} + GITHUB_REPOSITORY: ${{ github.repository }} + GITHUB_WORKFLOW: ${{ github.workflow }} + GITHUB_JOB: ${{ github.job }} + GITHUB_RUN_ID: ${{ github.run_id }} + GITHUB_RUN_NUMBER: ${{ github.run_number }} + GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + timeout-minutes: 60 + run: | + set -x + + # detached container should get cleaned up by teardown_ec2_linux + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e GITHUB_ACTIONS \ + -e GITHUB_REPOSITORY \ + -e GITHUB_WORKFLOW \ + -e GITHUB_JOB \ + -e GITHUB_RUN_ID \ + -e GITHUB_RUN_NUMBER \ + -e GITHUB_RUN_ATTEMPT \ + -e JOB_ID \ + -e BRANCH \ + -e SHA1 \ + --user $(id -u):$(id -g) \ + --ulimit stack=10485760:83886080 \ + --ulimit core=0 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --shm-size="8g" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + --privileged \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + # save container name for later step + echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV" + # jenkins user does not have write permission to mounted workspace; work-around by copying within container to jenkins home + docker exec -t "${container_name}" sh -c "bash ${TEST_COMMAND}" + + - name: Change permissions + if: ${{ always() && steps.test.conclusion }} + run: | + docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test" + + - name: Collect backtraces from coredumps (if any) + if: always() + run: | + # shellcheck disable=SC2156 + find . -iname "core.[1-9]*" -exec docker exec "${CONTAINER_NAME}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \; + + - name: Stop container before exit + if: always() + run: | + # Workaround for multiple runners on same IDC node + docker stop "${{ env.CONTAINER_NAME }}" + + - name: Store Core dumps on GitHub + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + if: failure() + with: + name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }} + retention-days: 14 + if-no-files-found: ignore + path: ./**/core.[1-9]* + + - name: Teardown XPU + if: always() + shell: bash + run: | + # Prune all stopped containers. + # If other runner is pruning on this node, will skip. + nprune=$(ps -ef | grep -c "docker container prune") + if [[ $nprune -eq 1 ]]; then + docker container prune -f + fi From 7c96ad46d16039d37a685fa9ed28010d1a3ab0aa Mon Sep 17 00:00:00 2001 From: "Sun, Diwei" Date: Wed, 20 Aug 2025 01:52:54 +0000 Subject: [PATCH 2/9] Revert "enable xpu ci test" This reverts commit a736c41187363d39cf72d1b4a3bda5b62554f3a1. 
--- .github/workflows/pr-test-xpu.yml | 156 ------------------------------ 1 file changed, 156 deletions(-) delete mode 100644 .github/workflows/pr-test-xpu.yml diff --git a/.github/workflows/pr-test-xpu.yml b/.github/workflows/pr-test-xpu.yml deleted file mode 100644 index 79621a06d1..0000000000 --- a/.github/workflows/pr-test-xpu.yml +++ /dev/null @@ -1,156 +0,0 @@ -# TODO: this looks sort of similar to _linux-test, but there are like a dozen -# places where you would have to insert an if statement. Probably it's better to -# just use a different workflow altogether - -name: xpu-test - -on: - push: - branches: - - main - - 'gh/**' - pull_request: - branches: - - main - - 'gh/**' - -concurrency: - group: xpu_ci_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} - cancel-in-progress: true - -jobs: - test: - # Don't run on forked repos or empty test matrix - # if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]' - timeout-minutes: 60 - runs-on: ao-pvc - env: - DOCKER_IMAGE: ghcr.io/pytorch/ci-image:pytorch-linux-jammy-xpu-2025.1-py3-b388c12018df5d6ce2f94b7fb337fa3729978ab3 - TEST_COMMAND: .github/scripts/ci_test_xpu.sh - PYTORCH_RETRY_TEST_CASES: 1 - PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - steps: - # [see note: pytorch repo ref] - - name: Checkout Torchao - uses: actions/checkout@v4 - - - name: Clean all stopped docker containers - if: always() - shell: bash - run: | - # Prune all stopped containers. - # If other runner is pruning on this node, will skip. - nprune=$(ps -ef | grep -c "docker container prune") - if [[ $nprune -eq 1 ]]; then - docker container prune -f - fi - - - name: Runner health check GPU count - if: always() - shell: bash - run: | - ngpu=$(timeout 30 clinfo -l | grep -c -E 'Device' || true) - msg="Please file an issue on pytorch/ao reporting the faulty runner. 
Include a link to the runner logs so the runner can be identified" - if [[ $ngpu -eq 0 ]]; then - echo "Error: Failed to detect any GPUs on the runner" - echo "$msg" - exit 1 - fi - - - name: Use following to pull public copy of the image - id: print-ghcr-mirror - shell: bash - run: | - echo "docker pull ${DOCKER_IMAGE}" - docker pull ${DOCKER_IMAGE} - - - name: Test - id: test - env: - BUILD_ENVIRONMENT: ${{ inputs.build-environment }} - PR_NUMBER: ${{ github.event.pull_request.number }} - GITHUB_REPOSITORY: ${{ github.repository }} - GITHUB_WORKFLOW: ${{ github.workflow }} - GITHUB_JOB: ${{ github.job }} - GITHUB_RUN_ID: ${{ github.run_id }} - GITHUB_RUN_NUMBER: ${{ github.run_number }} - GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - timeout-minutes: 60 - run: | - set -x - - # detached container should get cleaned up by teardown_ec2_linux - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e GITHUB_ACTIONS \ - -e GITHUB_REPOSITORY \ - -e GITHUB_WORKFLOW \ - -e GITHUB_JOB \ - -e GITHUB_RUN_ID \ - -e GITHUB_RUN_NUMBER \ - -e GITHUB_RUN_ATTEMPT \ - -e JOB_ID \ - -e BRANCH \ - -e SHA1 \ - --user $(id -u):$(id -g) \ - --ulimit stack=10485760:83886080 \ - --ulimit core=0 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --shm-size="8g" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - --privileged \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - # save container name for later step - echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV" - # jenkins user does not have write permission to mounted workspace; work-around by copying within container to jenkins home - docker exec -t "${container_name}" sh -c "bash ${TEST_COMMAND}" - - - name: Change permissions - if: ${{ always() && steps.test.conclusion }} - run: | - docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test" - - - name: Collect backtraces from coredumps (if any) - if: always() - run: | - # shellcheck disable=SC2156 - find . -iname "core.[1-9]*" -exec docker exec "${CONTAINER_NAME}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \; - - - name: Stop container before exit - if: always() - run: | - # Workaround for multiple runners on same IDC node - docker stop "${{ env.CONTAINER_NAME }}" - - - name: Store Core dumps on GitHub - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 - if: failure() - with: - name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }} - retention-days: 14 - if-no-files-found: ignore - path: ./**/core.[1-9]* - - - name: Teardown XPU - if: always() - shell: bash - run: | - # Prune all stopped containers. - # If other runner is pruning on this node, will skip. 
- nprune=$(ps -ef | grep -c "docker container prune") - if [[ $nprune -eq 1 ]]; then - docker container prune -f - fi From 9d1cc1f6fda1aaddcebbd8adfb5d28c8931b38b1 Mon Sep 17 00:00:00 2001 From: "Sun, Diwei" Date: Mon, 27 Oct 2025 09:03:51 +0000 Subject: [PATCH 3/9] enabel quantiation ut cases in xpu ci --- .github/scripts/ci_test_xpu.sh | 2 + test/quantization/test_gptq.py | 30 ++++++------ test/quantization/test_moe_quant.py | 38 +++++++--------- test/quantization/test_qat.py | 14 ++++-- test/quantization/test_quant_api.py | 53 ++++++++++++---------- test/quantization/test_quant_primitives.py | 5 +- torchao/testing/utils.py | 41 +++++++++++++++++ torchao/utils.py | 7 +++ 8 files changed, 128 insertions(+), 62 deletions(-) diff --git a/.github/scripts/ci_test_xpu.sh b/.github/scripts/ci_test_xpu.sh index d765696b40..79114d01c0 100644 --- a/.github/scripts/ci_test_xpu.sh +++ b/.github/scripts/ci_test_xpu.sh @@ -15,3 +15,5 @@ python3 -c "import torch; import torchao; print(f'Torch version: {torch.__versio pip install pytest expecttest parameterized accelerate hf_transfer 'modelscope!=1.15.0' pytest -v -s torchao/test/quantization/quantize_/workflows/int4/test_int4_plain_int32_tensor.py + +pytest -v -s torchao/test/quantization/ diff --git a/test/quantization/test_gptq.py b/test/quantization/test_gptq.py index 6f7ac10d45..746800abeb 100644 --- a/test/quantization/test_gptq.py +++ b/test/quantization/test_gptq.py @@ -18,13 +18,15 @@ from torchao._models.llama.tokenizer import get_tokenizer from torchao.quantization import Int4WeightOnlyConfig, quantize_ from torchao.quantization.utils import compute_error +from torchao.utils import auto_detect_device torch.manual_seed(0) +_DEVICE = auto_detect_device() + class TestGPTQ(TestCase): @unittest.skip("skipping until we get checkpoints for gpt-fast") - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") def test_gptq_quantizer_int4_weight_only(self): from torchao._models._eval import ( LMEvalInputRecorder, @@ -33,7 +35,6 @@ def test_gptq_quantizer_int4_weight_only(self): from torchao.quantization.GPTQ import Int4WeightOnlyGPTQQuantizer precision = torch.bfloat16 - device = "cuda" checkpoint_path = Path( "../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth" ) @@ -80,19 +81,19 @@ def test_gptq_quantizer_int4_weight_only(self): ) model.setup_caches(max_batch_size=1, max_seq_length=calibration_seq_length) - model = quantizer.quantize(model, *inputs).cuda() + model = quantizer.quantize(model, *inputs).to(_DEVICE) model.reset_caches() - with torch.device("cuda"): + with torch.device(_DEVICE): model.setup_caches(max_batch_size=1, max_seq_length=model.config.block_size) limit = 1 result = TransformerEvalWrapper( - model.cuda(), + model.to(_DEVICE), tokenizer, model.config.block_size, prepare_inputs_for_model, - device, + _DEVICE, ).run_eval( ["wikitext"], limit, @@ -104,7 +105,6 @@ def test_gptq_quantizer_int4_weight_only(self): class TestMultiTensorFlow(TestCase): - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") def test_multitensor_add_tensors(self): from torchao.quantization.GPTQ import MultiTensor @@ -116,7 +116,6 @@ def test_multitensor_add_tensors(self): self.assertTrue(torch.equal(mt.values[0], tensor1)) self.assertTrue(torch.equal(mt.values[1], tensor2)) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") def test_multitensor_pad_unpad(self): from torchao.quantization.GPTQ import MultiTensor @@ -127,7 +126,6 @@ def test_multitensor_pad_unpad(self): mt.unpad() 
self.assertEqual(mt.count, 1) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") def test_multitensor_inplace_operation(self): from torchao.quantization.GPTQ import MultiTensor @@ -138,7 +136,6 @@ def test_multitensor_inplace_operation(self): class TestMultiTensorInputRecorder(TestCase): - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") def test_multitensor_input_recorder(self): from torchao.quantization.GPTQ import MultiTensor, MultiTensorInputRecorder @@ -159,7 +156,7 @@ def test_multitensor_input_recorder(self): self.assertTrue(isinstance(MT_input[2][2], MultiTensor)) self.assertEqual(MT_input[3], torch.float) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_gptq_with_input_recorder(self): from torchao.quantization.GPTQ import ( Int4WeightOnlyGPTQQuantizer, @@ -170,7 +167,7 @@ def test_gptq_with_input_recorder(self): config = ModelArgs(n_layer=2) - with torch.device("cuda"): + with torch.device(_DEVICE): model = Transformer(config) model.setup_caches(max_batch_size=2, max_seq_length=100) idx = torch.randint(1, 10000, (10, 2, 50)).to(torch.int32) @@ -191,7 +188,14 @@ def test_gptq_with_input_recorder(self): args = input_recorder.get_recorded_inputs() - quantizer = Int4WeightOnlyGPTQQuantizer() + if _DEVICE == "xpu": + from torchao.dtypes import Int4XPULayout + + quantizer = Int4WeightOnlyGPTQQuantizer( + device=torch.device("xpu"), layout=Int4XPULayout() + ) + else: + quantizer = Int4WeightOnlyGPTQQuantizer() quantizer.quantize(model, *args) diff --git a/test/quantization/test_moe_quant.py b/test/quantization/test_moe_quant.py index 61000babc1..55a6a87e24 100644 --- a/test/quantization/test_moe_quant.py +++ b/test/quantization/test_moe_quant.py @@ -33,7 +33,13 @@ quantize_, ) from torchao.quantization.utils import compute_error -from torchao.utils import is_sm_at_least_90 +from torchao.testing.utils import skip_if_no_cuda +from torchao.utils import ( + auto_detect_device, + is_sm_at_least_90, +) + +_DEVICE = auto_detect_device() if torch.version.hip is not None: pytest.skip( @@ -54,7 +60,7 @@ def _test_impl_moe_quant( base_class=AffineQuantizedTensor, tensor_impl_class=None, dtype=torch.bfloat16, - device="cuda", + device=_DEVICE, fullgraph=False, ): """ @@ -115,10 +121,8 @@ def _test_impl_moe_quant( ("multiple_tokens", 8, False), ] ) + @skip_if_no_cuda() def test_int4wo_fake_dim(self, name, num_tokens, fullgraph): - if not torch.cuda.is_available(): - self.skipTest("Need CUDA available") - config = MoEQuantConfig( Int4WeightOnlyConfig(version=1), use_fake_extra_dim_tensor=UseFakeExtraDimTensor.TRUE, @@ -138,6 +142,7 @@ def test_int4wo_fake_dim(self, name, num_tokens, fullgraph): ("multiple_tokens", 8, False), ] ) + @skip_if_no_cuda() def test_int4wo_base(self, name, num_tokens, fullgraph): if not torch.cuda.is_available(): self.skipTest("Need CUDA available") @@ -160,10 +165,8 @@ def test_int4wo_base(self, name, num_tokens, fullgraph): ("multiple_tokens", 8, False), ] ) + @skip_if_no_cuda() def test_int8wo_fake_dim(self, name, num_tokens, fullgraph): - if not torch.cuda.is_available(): - self.skipTest("Need CUDA available") - config = MoEQuantConfig( Int8WeightOnlyConfig(), use_fake_extra_dim_tensor=UseFakeExtraDimTensor.TRUE ) @@ -182,10 +185,8 @@ def test_int8wo_fake_dim(self, name, num_tokens, fullgraph): ("multiple_tokens", 8, False), ] ) + @skip_if_no_cuda() def test_int8wo_base(self, name, num_tokens, fullgraph): - if not 
torch.cuda.is_available(): - self.skipTest("Need CUDA available") - config = MoEQuantConfig(Int8WeightOnlyConfig()) tensor_impl_class = PlainAQTTensorImpl @@ -202,6 +203,7 @@ def test_int8wo_base(self, name, num_tokens, fullgraph): ("multiple_tokens", 8, False), ] ) + @skip_if_no_cuda() def test_int8wo_base_cpu(self, name, num_tokens, fullgraph): config = MoEQuantConfig(Int8WeightOnlyConfig()) tensor_impl_class = PlainAQTTensorImpl @@ -219,10 +221,8 @@ def test_int8wo_base_cpu(self, name, num_tokens, fullgraph): ("multiple_tokens", 32, False), ] ) + @skip_if_no_cuda() def test_int8dq_fake_dim(self, name, num_tokens, fullgraph): - if not torch.cuda.is_available(): - self.skipTest("Need CUDA available") - config = MoEQuantConfig( Int8DynamicActivationInt8WeightConfig(), use_fake_extra_dim_tensor=UseFakeExtraDimTensor.TRUE, @@ -242,10 +242,8 @@ def test_int8dq_fake_dim(self, name, num_tokens, fullgraph): ("multiple_tokens", 32, False), ] ) + @skip_if_no_cuda() def test_int8dq_base(self, name, num_tokens, fullgraph): - if not torch.cuda.is_available(): - self.skipTest("Need CUDA available") - config = MoEQuantConfig(Int8DynamicActivationInt8WeightConfig()) base_class = LinearActivationQuantizedTensor @@ -263,9 +261,8 @@ def test_int8dq_base(self, name, num_tokens, fullgraph): ("multiple_tokens", 8, False), ] ) + @skip_if_no_cuda() def test_fp8wo_fake_dim(self, name, num_tokens, fullgraph): - if not torch.cuda.is_available(): - self.skipTest("Need CUDA available") if not is_sm_at_least_90(): self.skipTest("Requires CUDA capability >= 9.0") @@ -335,9 +332,8 @@ def test_fp8dq_fake_dim(self, name, num_tokens, fullgraph): ("multiple_tokens", 8, False), ] ) + @skip_if_no_cuda() def test_fp8dq_base(self, name, num_tokens, fullgraph): - if not torch.cuda.is_available(): - self.skipTest("Need CUDA available") if not is_sm_at_least_90(): self.skipTest("Requires CUDA capability >= 9.0") diff --git a/test/quantization/test_qat.py b/test/quantization/test_qat.py index f523cb091c..73b8009a81 100644 --- a/test/quantization/test_qat.py +++ b/test/quantization/test_qat.py @@ -98,12 +98,15 @@ ) from torchao.utils import ( _is_fbgemm_gpu_genai_available, + auto_detect_device, is_fbcode, is_sm_at_least_89, ) # TODO: put this in a common test utils file _CUDA_IS_AVAILABLE = torch.cuda.is_available() +_GPU_IS_AVAILABLE = torch.accelerator.is_available() +_DEVICE = auto_detect_device() class Sub(torch.nn.Module): @@ -347,7 +350,7 @@ def _set_ptq_weight( group_size, ) q_weight = torch.ops.aten._convert_weight_to_int4pack( - q_weight.to("cuda"), + q_weight.to(_DEVICE), qat_linear.inner_k_tiles, ) ptq_linear.weight = q_weight @@ -600,13 +603,15 @@ def _assert_close_4w(self, val, ref): print(mean_err) self.assertTrue(mean_err < 0.05) - @unittest.skipIf(not _CUDA_IS_AVAILABLE, "skipping when cuda is not available") + @unittest.skipIf( + not _GPU_IS_AVAILABLE, "skipping when cuda or xpu is not available" + ) def test_qat_4w_primitives(self): n_bit = 4 group_size = 32 inner_k_tiles = 8 scales_precision = torch.bfloat16 - device = torch.device("cuda") + device = torch.device(_DEVICE) dtype = torch.bfloat16 torch.manual_seed(self.SEED) x = torch.randn(100, 256, dtype=dtype, device=device) @@ -699,11 +704,12 @@ def test_qat_4w_quantizer(self): group_size = 32 inner_k_tiles = 8 - device = torch.device("cuda") + device = torch.device(_DEVICE) dtype = torch.bfloat16 torch.manual_seed(self.SEED) m = M().to(device).to(dtype) m2 = copy.deepcopy(m) + qat_quantizer = Int4WeightOnlyQATQuantizer( groupsize=group_size, 
inner_k_tiles=inner_k_tiles, diff --git a/test/quantization/test_quant_api.py b/test/quantization/test_quant_api.py index 577ca6789a..164cf6bad0 100644 --- a/test/quantization/test_quant_api.py +++ b/test/quantization/test_quant_api.py @@ -60,14 +60,17 @@ ) from torchao.quantization.quant_primitives import MappingType from torchao.quantization.utils import compute_error -from torchao.testing.utils import skip_if_rocm +from torchao.testing.utils import skip_if_rocm, skip_if_xpu from torchao.utils import ( + auto_detect_device, is_sm_at_least_89, is_sm_at_least_90, torch_version_at_least, unwrap_tensor_subclass, ) +_DEVICE = auto_detect_device() + try: import gemlite # noqa: F401 @@ -258,7 +261,7 @@ def api(model): m2.load_state_dict(state_dict) m2 = m2.to(device="cuda") - example_inputs = map(lambda x: x.cuda(), example_inputs) + example_inputs = map(lambda x: x.to(_DEVICE), example_inputs) res = m2(*example_inputs) # TODO: figure out why ROCm has a larger error @@ -290,12 +293,13 @@ def test_8da4w_quantizer_linear_bias(self): m(*example_inputs) @unittest.skip("skipping until we get checkpoints for gpt-fast") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_quantizer_int4_weight_only(self): from torchao._models._eval import TransformerEvalWrapper from torchao.quantization.linear_quant_modules import Int4WeightOnlyQuantizer precision = torch.bfloat16 - device = "cuda" + device = _DEVICE checkpoint_path = Path("../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth") model = Transformer.from_name(checkpoint_path.parent.name) checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=True) @@ -312,7 +316,7 @@ def test_quantizer_int4_weight_only(self): quantizer = Int4WeightOnlyQuantizer( groupsize, ) - model = quantizer.quantize(model).cuda() + model = quantizer.quantize(model).to(_DEVICE) result = TransformerEvalWrapper( model, tokenizer, @@ -328,11 +332,12 @@ def test_quantizer_int4_weight_only(self): ) @unittest.skip("skipping until we get checkpoints for gpt-fast") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_eval_wrapper(self): from torchao._models._eval import TransformerEvalWrapper precision = torch.bfloat16 - device = "cuda" + device = _DEVICE checkpoint_path = Path("../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth") model = Transformer.from_name(checkpoint_path.parent.name) checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=True) @@ -361,11 +366,12 @@ def test_eval_wrapper(self): # EVAL IS CURRENTLY BROKEN FOR LLAMA 3, VERY LOW ACCURACY @unittest.skip("skipping until we get checkpoints for gpt-fast") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_eval_wrapper_llama3(self): from torchao._models._eval import TransformerEvalWrapper precision = torch.bfloat16 - device = "cuda" + device = _DEVICE checkpoint_path = Path( ".../gpt-fast/checkpoints/meta-llama/Meta-Llama-3-8B/model.pth" ) @@ -534,7 +540,7 @@ def test_int4wo_cpu(self, dtype, x_dim, use_hqq): assert "aten.mm.default" not in code[0] # TODO(#1690): move to new config names - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") @common_utils.parametrize( "config", [ @@ -551,6 +557,7 @@ def test_int4wo_cpu(self, dtype, x_dim, use_hqq): UIntXWeightOnlyConfig(dtype=torch.uint4), ], ) + @skip_if_xpu("XPU enablement in progress") @skip_if_rocm("ROCm enablement in progress") def 
test_workflow_e2e_numerics(self, config): """ @@ -579,17 +586,17 @@ def test_workflow_e2e_numerics(self, config): # scale has to be moved to cuda here because the parametrization init # code happens before gating for cuda availability if isinstance(config, Float8StaticActivationFloat8WeightConfig): - config.scale = config.scale.to("cuda") + config.scale = config.scale.to(_DEVICE) dtype = torch.bfloat16 if isinstance(config, GemliteUIntXWeightOnlyConfig): dtype = torch.float16 # set up inputs - x = torch.randn(128, 128, device="cuda", dtype=dtype) + x = torch.randn(128, 128, device=_DEVICE, dtype=dtype) # TODO(future): model in float32 leads to error: https://gist.github.com/vkuzo/63b3bcd7818393021a6e3fb4ccf3c469 # is that expected? - m_ref = torch.nn.Sequential(torch.nn.Linear(128, 128)).cuda().to(dtype) + m_ref = torch.nn.Sequential(torch.nn.Linear(128, 128)).to(_DEVICE).to(dtype) m_q = copy.deepcopy(m_ref) # quantize @@ -602,13 +609,13 @@ def test_workflow_e2e_numerics(self, config): sqnr = compute_error(y_ref, y_q) assert sqnr >= 16.5, f"SQNR {sqnr} is too low" - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_module_fqn_to_config_default(self): config1 = Int4WeightOnlyConfig(group_size=32, version=1) config2 = Int8WeightOnlyConfig() config = ModuleFqnToConfig({"_default": config1, "linear2": config2}) - model = ToyLinearModel().cuda().to(dtype=torch.bfloat16) - example_inputs = model.example_inputs(device="cuda", dtype=torch.bfloat16) + model = ToyLinearModel().to(_DEVICE).to(dtype=torch.bfloat16) + example_inputs = model.example_inputs(device=_DEVICE, dtype=torch.bfloat16) quantize_(model, config) model(*example_inputs) assert isinstance(model.linear1.weight, AffineQuantizedTensor) @@ -616,13 +623,13 @@ def test_module_fqn_to_config_default(self): assert isinstance(model.linear2.weight, AffineQuantizedTensor) assert isinstance(model.linear2.weight._layout, PlainLayout) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_module_fqn_to_config_module_name(self): config1 = Int4WeightOnlyConfig(group_size=32, version=1) config2 = Int8WeightOnlyConfig() config = ModuleFqnToConfig({"linear1": config1, "linear2": config2}) - model = ToyLinearModel().cuda().to(dtype=torch.bfloat16) - example_inputs = model.example_inputs(device="cuda", dtype=torch.bfloat16) + model = ToyLinearModel().to(_DEVICE).to(dtype=torch.bfloat16) + example_inputs = model.example_inputs(device=_DEVICE, dtype=torch.bfloat16) quantize_(model, config) model(*example_inputs) assert isinstance(model.linear1.weight, AffineQuantizedTensor) @@ -756,25 +763,25 @@ def test_module_fqn_to_config_embedding_linear(self): assert isinstance(model.emb.weight, IntxUnpackedToInt8Tensor) assert isinstance(model.linear.weight, IntxUnpackedToInt8Tensor) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_module_fqn_to_config_skip(self): config1 = Int4WeightOnlyConfig(group_size=32, version=1) config = ModuleFqnToConfig({"_default": config1, "linear2": None}) - model = ToyLinearModel().cuda().to(dtype=torch.bfloat16) - example_inputs = model.example_inputs(device="cuda", dtype=torch.bfloat16) + model = ToyLinearModel().to(_DEVICE).to(dtype=torch.bfloat16) + example_inputs = model.example_inputs(device=_DEVICE, 
dtype=torch.bfloat16) quantize_(model, config) model(*example_inputs) assert isinstance(model.linear1.weight, AffineQuantizedTensor) assert isinstance(model.linear1.weight._layout, TensorCoreTiledLayout) assert not isinstance(model.linear2.weight, AffineQuantizedTensor) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_int4wo_cuda_serialization(self): config = Int4WeightOnlyConfig(group_size=32, version=1) - model = ToyLinearModel().cuda().to(dtype=torch.bfloat16) + model = ToyLinearModel().to(_DEVICE).to(dtype=torch.bfloat16) # quantize in cuda quantize_(model, config) - example_inputs = model.example_inputs(device="cuda", dtype=torch.bfloat16) + example_inputs = model.example_inputs(device=_DEVICE, dtype=torch.bfloat16) model(*example_inputs) with tempfile.NamedTemporaryFile() as ckpt: # save checkpoint in cuda @@ -783,7 +790,7 @@ def test_int4wo_cuda_serialization(self): # This is what torchtune does: https://github.com/pytorch/torchtune/blob/v0.6.1/torchtune/training/checkpointing/_utils.py#L253 sd = torch.load(ckpt.name, weights_only=False, map_location="cpu") for k, v in sd.items(): - sd[k] = v.to("cuda") + sd[k] = v.to(_DEVICE) # load state_dict in cuda model.load_state_dict(sd, assign=True) diff --git a/test/quantization/test_quant_primitives.py b/test/quantization/test_quant_primitives.py index bed8421671..c251d71915 100644 --- a/test/quantization/test_quant_primitives.py +++ b/test/quantization/test_quant_primitives.py @@ -30,6 +30,7 @@ groupwise_affine_quantize_tensor_from_qparams, ) from torchao.utils import ( + auto_detect_device, check_cpu_version, check_xpu_version, is_fbcode, @@ -38,6 +39,8 @@ _SEED = 1234 torch.manual_seed(_SEED) +_DEVICE = auto_detect_device() + # Helper function to run a function twice # and verify that the result is the same. @@ -575,7 +578,7 @@ def test_choose_qparams_tensor_asym_eps(self): ) def test_get_group_qparams_symmetric_memory(self): """Check the memory usage of the op""" - weight = torch.randn(1024, 1024).to(device="cuda") + weight = torch.randn(1024, 1024).to(device=_DEVICE) original_mem_use = torch.cuda.memory_allocated() n_bit = 4 groupsize = 128 diff --git a/torchao/testing/utils.py b/torchao/testing/utils.py index a1dc40fdd3..aef3ea3ecf 100644 --- a/torchao/testing/utils.py +++ b/torchao/testing/utils.py @@ -98,6 +98,47 @@ def wrapper(*args, **kwargs): return decorator +def skip_if_no_xpu(message=None): + """Decorator to skip tests on ROCm platform with custom message. + + Args: + message (str, optional): Additional information about why the test is skipped. + """ + import unittest + + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + if not torch.xpu.is_available(): + skip_message = "Skipping the test in XPU" + if message: + skip_message += f": {message}" + unittest.skip(skip_message) + return func(*args, **kwargs) + + return wrapper + + return decorator + + +def skip_if_xpu(message=None): + """ + Decorator to skip tests if XPU is available. + + Args: + message (str, optional): Additional information about why the test is skipped. 
+ """ + + def decorator(func): + reason = "Skipping the test on XPU" + if message: + reason += f": {message}" + + return unittest.skipIf(torch.xpu.is_available(), reason)(func) + + return decorator + + def skip_if_no_cuda(): import unittest diff --git a/torchao/utils.py b/torchao/utils.py index 5af3e00cfa..4ebd2d781c 100644 --- a/torchao/utils.py +++ b/torchao/utils.py @@ -147,6 +147,13 @@ def get_available_devices(): return devices +def auto_detect_device(): + if torch.accelerator.is_available(): + return torch.accelerator.current_accelerator() + else: + return "cpu" + + def get_compute_capability(): if torch.cuda.is_available(): capability = torch.cuda.get_device_capability() From e54bda38a15d103897bb4dc5a200661fe13f1125 Mon Sep 17 00:00:00 2001 From: DiweiSun <105627594+DiweiSun@users.noreply.github.com> Date: Tue, 28 Oct 2025 15:35:42 +0800 Subject: [PATCH 4/9] fix xpu int4 ut cases --- test/quantization/test_gptq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/quantization/test_gptq.py b/test/quantization/test_gptq.py index 746800abeb..34dafcdbc4 100644 --- a/test/quantization/test_gptq.py +++ b/test/quantization/test_gptq.py @@ -188,7 +188,7 @@ def test_gptq_with_input_recorder(self): args = input_recorder.get_recorded_inputs() - if _DEVICE == "xpu": + if _DEVICE.type == "xpu": from torchao.dtypes import Int4XPULayout quantizer = Int4WeightOnlyGPTQQuantizer( From 83afd19e11fd3f66c5713d93a06f723b79b95428 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Tue, 4 Nov 2025 16:30:35 +0800 Subject: [PATCH 5/9] debug RuntimeError --- .github/scripts/ci_test_xpu.sh | 4 ++-- test/quantization/test_observer.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/scripts/ci_test_xpu.sh b/.github/scripts/ci_test_xpu.sh index 79114d01c0..c344895015 100644 --- a/.github/scripts/ci_test_xpu.sh +++ b/.github/scripts/ci_test_xpu.sh @@ -14,6 +14,6 @@ python3 -c "import torch; import torchao; print(f'Torch version: {torch.__versio pip install pytest expecttest parameterized accelerate hf_transfer 'modelscope!=1.15.0' -pytest -v -s torchao/test/quantization/quantize_/workflows/int4/test_int4_plain_int32_tensor.py +# pytest -v -s torchao/test/quantization/quantize_/workflows/int4/test_int4_plain_int32_tensor.py -pytest -v -s torchao/test/quantization/ +pytest -v -s torchao/test/quantization/test_observer.py::TestQuantFlow::test_fixed_qparams_observer diff --git a/test/quantization/test_observer.py b/test/quantization/test_observer.py index 84428ba8d7..7326a78420 100644 --- a/test/quantization/test_observer.py +++ b/test/quantization/test_observer.py @@ -187,6 +187,7 @@ def test_fixed_qparams_observer(self): obs(example_input) obs.set_qparams(torch.ones(2048)) scale, zero_point = obs.calculate_qparams() + print("scale", scale) self.assertTrue(torch.allclose(scale, torch.ones(2048))) From 9c8e66b70e75d7a47289d9c332234469905750ee Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Tue, 4 Nov 2025 17:47:48 +0800 Subject: [PATCH 6/9] revert debug --- .github/scripts/ci_test_xpu.sh | 4 ++-- test/quantization/test_observer.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/scripts/ci_test_xpu.sh b/.github/scripts/ci_test_xpu.sh index c344895015..79114d01c0 100644 --- a/.github/scripts/ci_test_xpu.sh +++ b/.github/scripts/ci_test_xpu.sh @@ -14,6 +14,6 @@ python3 -c "import torch; import torchao; print(f'Torch version: {torch.__versio pip install pytest expecttest parameterized accelerate hf_transfer 'modelscope!=1.15.0' -# pytest 
-v -s torchao/test/quantization/quantize_/workflows/int4/test_int4_plain_int32_tensor.py +pytest -v -s torchao/test/quantization/quantize_/workflows/int4/test_int4_plain_int32_tensor.py -pytest -v -s torchao/test/quantization/test_observer.py::TestQuantFlow::test_fixed_qparams_observer +pytest -v -s torchao/test/quantization/ diff --git a/test/quantization/test_observer.py b/test/quantization/test_observer.py index 7326a78420..84428ba8d7 100644 --- a/test/quantization/test_observer.py +++ b/test/quantization/test_observer.py @@ -187,7 +187,6 @@ def test_fixed_qparams_observer(self): obs(example_input) obs.set_qparams(torch.ones(2048)) scale, zero_point = obs.calculate_qparams() - print("scale", scale) self.assertTrue(torch.allclose(scale, torch.ones(2048))) From 7b5d2c4a603bddc3e07f2397bd3fdff595333813 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Thu, 6 Nov 2025 10:35:25 +0800 Subject: [PATCH 7/9] refine the device --- test/quantization/test_gptq.py | 5 +++++ torchao/utils.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/test/quantization/test_gptq.py b/test/quantization/test_gptq.py index 34dafcdbc4..44441cca50 100644 --- a/test/quantization/test_gptq.py +++ b/test/quantization/test_gptq.py @@ -27,6 +27,7 @@ class TestGPTQ(TestCase): @unittest.skip("skipping until we get checkpoints for gpt-fast") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_gptq_quantizer_int4_weight_only(self): from torchao._models._eval import ( LMEvalInputRecorder, @@ -105,6 +106,7 @@ def test_gptq_quantizer_int4_weight_only(self): class TestMultiTensorFlow(TestCase): + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_multitensor_add_tensors(self): from torchao.quantization.GPTQ import MultiTensor @@ -116,6 +118,7 @@ def test_multitensor_add_tensors(self): self.assertTrue(torch.equal(mt.values[0], tensor1)) self.assertTrue(torch.equal(mt.values[1], tensor2)) + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_multitensor_pad_unpad(self): from torchao.quantization.GPTQ import MultiTensor @@ -126,6 +129,7 @@ def test_multitensor_pad_unpad(self): mt.unpad() self.assertEqual(mt.count, 1) + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_multitensor_inplace_operation(self): from torchao.quantization.GPTQ import MultiTensor @@ -136,6 +140,7 @@ def test_multitensor_inplace_operation(self): class TestMultiTensorInputRecorder(TestCase): + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_multitensor_input_recorder(self): from torchao.quantization.GPTQ import MultiTensor, MultiTensorInputRecorder diff --git a/torchao/utils.py b/torchao/utils.py index b97db320a1..55605dbc11 100644 --- a/torchao/utils.py +++ b/torchao/utils.py @@ -152,7 +152,7 @@ def auto_detect_device(): if torch.accelerator.is_available(): return torch.accelerator.current_accelerator() else: - return "cpu" + return None def get_compute_capability(): From 54bc5c8cf1aebb890bdac72a83066ee894d40460 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Fri, 7 Nov 2025 13:58:23 +0800 Subject: [PATCH 8/9] refine the device --- test/quantization/test_quant_api.py | 78 +++++++++++----------- test/quantization/test_quant_primitives.py | 6 +- 2 files changed, 42 insertions(+), 42 deletions(-) diff --git a/test/quantization/test_quant_api.py b/test/quantization/test_quant_api.py index c239b3c66c..cc340bdd4b 100644 --- 
a/test/quantization/test_quant_api.py +++ b/test/quantization/test_quant_api.py @@ -243,7 +243,7 @@ def api(model): torch.testing.assert_close(ref, res.cpu()) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_int8_wo_quant_save_load(self): m = ToyLinearModel().eval().cpu() @@ -264,7 +264,7 @@ def api(model): api(m2) m2.load_state_dict(state_dict) - m2 = m2.to(device="cuda") + m2 = m2.to(_DEVICE) example_inputs = map(lambda x: x.to(_DEVICE), example_inputs) res = m2(*example_inputs) @@ -444,7 +444,7 @@ def test_quantized_tensor_subclass_8da4w(self, mapping_type): ref = m_copy(*example_inputs) self.assertTrue(torch.equal(res, ref)) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_quantized_tensor_subclass_save_load(self): m = ToyLinearModel().eval().to(torch.bfloat16) m_copy = copy.deepcopy(m) @@ -462,7 +462,7 @@ def test_quantized_tensor_subclass_save_load(self): res = m_copy(*example_inputs) self.assertEqual(res, ref) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_int8wo_quantized_model_to_device(self): m = ToyLinearModel().eval().to(torch.bfloat16) example_inputs = m.example_inputs(dtype=torch.bfloat16, device="cpu") @@ -470,15 +470,15 @@ def test_int8wo_quantized_model_to_device(self): quantize_(m, Int8WeightOnlyConfig()) ref = m(*example_inputs) - example_inputs_cuda = (example_inputs[0].to("cuda"),) - m.to(device="cuda") + example_inputs_cuda = (example_inputs[0].to(_DEVICE),) + m.to(_DEVICE) cuda_res = m(*example_inputs_cuda) self.assertEqual(cuda_res.cpu(), ref) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_quantized_tensor_subclass_save_load_map_location(self): - m = ToyLinearModel().eval().to(dtype=torch.bfloat16, device="cuda") - example_inputs = m.example_inputs(dtype=torch.bfloat16, device="cuda") + m = ToyLinearModel().eval().to(dtype=torch.bfloat16, device=_DEVICE.type) + example_inputs = m.example_inputs(dtype=torch.bfloat16, device=_DEVICE.type) quantize_(m, Int8WeightOnlyConfig()) ref = m(*example_inputs) @@ -491,31 +491,31 @@ def test_quantized_tensor_subclass_save_load_map_location(self): m_copy = ToyLinearModel().eval() m_copy.load_state_dict(state_dict, assign=True) - m_copy.to(dtype=torch.bfloat16, device="cuda") + m_copy.to(dtype=torch.bfloat16, device=_DEVICE.type) res = m_copy(*example_inputs) self.assertEqual(res, ref) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_quantized_model_streaming(self): def reset_memory(): gc.collect() - torch.cuda.empty_cache() - torch.cuda.reset_peak_memory_stats() + torch.accelerator.empty_cache() + torch.accelerator.reset_peak_memory_stats() reset_memory() m = ToyLinearModel() - quantize_(m.to(device="cuda"), Int8WeightOnlyConfig()) - memory_baseline = torch.cuda.max_memory_allocated() + quantize_(m.to(device=_DEVICE.type), Int8WeightOnlyConfig()) + memory_baseline = torch.accelerator.max_memory_allocated() del m reset_memory() m = ToyLinearModel() - quantize_(m, Int8WeightOnlyConfig(), device="cuda") - memory_streaming = torch.cuda.max_memory_allocated() + quantize_(m, 
Int8WeightOnlyConfig(), device=_DEVICE.type) + memory_streaming = torch.accelerator.max_memory_allocated() for param in m.parameters(): - assert param.is_cuda + assert param.device.type == _DEVICE.type self.assertLess(memory_streaming, memory_baseline) @common_utils.parametrize("dtype", [torch.float, torch.bfloat16, torch.half]) @@ -641,20 +641,20 @@ def test_module_fqn_to_config_module_name(self): assert isinstance(model.linear2.weight, AffineQuantizedTensor) assert isinstance(model.linear2.weight._layout, PlainLayout) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_module_fqn_to_config_regex_basic(self): config1 = Int4WeightOnlyConfig( group_size=32, int4_packing_format="tile_packed_to_4d" ) config = ModuleFqnToConfig({"re:linear.": config1}) - model = ToyLinearModel().cuda().to(dtype=torch.bfloat16) - example_inputs = model.example_inputs(device="cuda", dtype=torch.bfloat16) + model = ToyLinearModel().to(_DEVICE).to(dtype=torch.bfloat16) + example_inputs = model.example_inputs(device=_DEVICE.type, dtype=torch.bfloat16) quantize_(model, config, filter_fn=None) model(*example_inputs) assert isinstance(model.linear1.weight, Int4TilePackedTo4dTensor) assert isinstance(model.linear2.weight, Int4TilePackedTo4dTensor) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_module_fqn_to_config_regex_precedence(self): """Testing that full path config takes precedence over regex config in ModuleFqnToConfig @@ -664,14 +664,14 @@ def test_module_fqn_to_config_regex_precedence(self): ) config2 = IntxWeightOnlyConfig() config = ModuleFqnToConfig({"linear1": config1, "re:linear.": config2}) - model = ToyLinearModel().cuda().to(dtype=torch.bfloat16) - example_inputs = model.example_inputs(device="cuda", dtype=torch.bfloat16) + model = ToyLinearModel().to(_DEVICE).to(dtype=torch.bfloat16) + example_inputs = model.example_inputs(device=_DEVICE.type, dtype=torch.bfloat16) quantize_(model, config, filter_fn=None) model(*example_inputs) assert isinstance(model.linear1.weight, Int4TilePackedTo4dTensor) assert isinstance(model.linear2.weight, IntxUnpackedToInt8Tensor) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_module_fqn_to_config_regex_precedence2(self): """Testing that full path config takes precedence over regex config in ModuleFqnToConfig, swapping @@ -683,14 +683,14 @@ def test_module_fqn_to_config_regex_precedence2(self): ) config2 = IntxWeightOnlyConfig() config = ModuleFqnToConfig({"re:linear.": config2, "linear1": config1}) - model = ToyLinearModel().cuda().to(dtype=torch.bfloat16) - example_inputs = model.example_inputs(device="cuda", dtype=torch.bfloat16) + model = ToyLinearModel().to(_DEVICE).to(dtype=torch.bfloat16) + example_inputs = model.example_inputs(device=_DEVICE.type, dtype=torch.bfloat16) quantize_(model, config, filter_fn=None) model(*example_inputs) assert isinstance(model.linear1.weight, Int4TilePackedTo4dTensor) assert isinstance(model.linear2.weight, IntxUnpackedToInt8Tensor) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_module_fqn_to_config_regex_fullmatch(self): """Testing that we will only match the fqns that fully matches the regex @@ 
-729,7 +729,7 @@ def example_inputs(self): "linear3_full_match.bias": None, } ) - model = M(dtype=torch.bfloat16, device="cuda") + model = M(dtype=torch.bfloat16, device=_DEVICE.type) example_inputs = model.example_inputs() quantize_(model, config, filter_fn=None) model(*example_inputs) @@ -851,7 +851,7 @@ def test_config_deprecation(self): common_utils.instantiate_parametrized_tests(TestQuantFlow) -@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") +@unittest.skipIf(not torch.accelerator.is_available(), "Need CUDA available") @unittest.skipIf(not is_sm_at_least_90(), "Checkpoints are produced in SM90+") class TestFqnToConfig(TestCase): def test_quantize_param_fqn_exact(self): @@ -861,7 +861,7 @@ def test_quantize_param_fqn_exact(self): config = AutoConfig.from_pretrained( "unsloth/Llama-4-Scout-17B-16E-Instruct" ).text_config - model = Llama4TextMoe(config).to(torch.bfloat16).cuda() + model = Llama4TextMoe(config).to(torch.bfloat16).to(_DEVICE) quant_config = FqnToConfig( { @@ -1106,27 +1106,27 @@ def test_non_fqn_config_filter_fn_none(self): assert isinstance(model.weight, Float8Tensor) assert model.weight.scale.numel() == 1 - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_quantized_model_streaming_fqn_config(self): def reset_memory(): gc.collect() - torch.cuda.empty_cache() - torch.cuda.reset_peak_memory_stats() + torch.accelerator.empty_cache() + torch.accelerator.reset_peak_memory_stats() quant_config = FqnToConfig({"_default": Int8WeightOnlyConfig()}) reset_memory() m = ToyLinearModel() - quantize_(m.to(device="cuda"), quant_config, filter_fn=None) - memory_baseline = torch.cuda.max_memory_allocated() + quantize_(m.to(device=_DEVICE.type), quant_config, filter_fn=None) + memory_baseline = torch.accelerator.max_memory_allocated() del m reset_memory() m = ToyLinearModel() - quantize_(m, quant_config, device="cuda", filter_fn=None) - memory_streaming = torch.cuda.max_memory_allocated() + quantize_(m, quant_config, device=_DEVICE.type, filter_fn=None) + memory_streaming = torch.accelerator.max_memory_allocated() for param in m.parameters(): - assert param.is_cuda + assert param.device.type == _DEVICE.type self.assertLess(memory_streaming, memory_baseline) diff --git a/test/quantization/test_quant_primitives.py b/test/quantization/test_quant_primitives.py index 2ce7cf11a2..dae40a2673 100644 --- a/test/quantization/test_quant_primitives.py +++ b/test/quantization/test_quant_primitives.py @@ -593,16 +593,16 @@ def test_choose_qparams_tensor_asym_eps(self): self.assertEqual(scale, eps) @unittest.skipIf( - not torch.cuda.is_available(), "skipping when cuda is not available" + not torch.accelerator.is_available(), "skipping when gpu is not available" ) def test_get_group_qparams_symmetric_memory(self): """Check the memory usage of the op""" weight = torch.randn(1024, 1024).to(device=_DEVICE) - original_mem_use = torch.cuda.memory_allocated() + original_mem_use = torch.accelerator.memory_allocated() n_bit = 4 groupsize = 128 (scale_ao, _) = get_group_qparams_symmetric(weight, n_bit, groupsize) - after_choose_qparams_mem_use = torch.cuda.memory_allocated() + after_choose_qparams_mem_use = torch.accelerator.memory_allocated() self.assertTrue(after_choose_qparams_mem_use < 1.2 * original_mem_use) def test_raises(self): From a9043e2f0f8ca3d0f2b76ad3a3a9bba84c28605e Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Fri, 7 Nov 2025 14:08:25 +0800 Subject: [PATCH 
9/9] refine the device --- test/quantization/test_quant_api.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/test/quantization/test_quant_api.py b/test/quantization/test_quant_api.py index cc340bdd4b..b8f56374ec 100644 --- a/test/quantization/test_quant_api.py +++ b/test/quantization/test_quant_api.py @@ -641,20 +641,20 @@ def test_module_fqn_to_config_module_name(self): assert isinstance(model.linear2.weight, AffineQuantizedTensor) assert isinstance(model.linear2.weight._layout, PlainLayout) - @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") + @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") def test_module_fqn_to_config_regex_basic(self): config1 = Int4WeightOnlyConfig( group_size=32, int4_packing_format="tile_packed_to_4d" ) config = ModuleFqnToConfig({"re:linear.": config1}) - model = ToyLinearModel().to(_DEVICE).to(dtype=torch.bfloat16) - example_inputs = model.example_inputs(device=_DEVICE.type, dtype=torch.bfloat16) + model = ToyLinearModel().cuda().to(dtype=torch.bfloat16) + example_inputs = model.example_inputs(device="cuda", dtype=torch.bfloat16) quantize_(model, config, filter_fn=None) model(*example_inputs) assert isinstance(model.linear1.weight, Int4TilePackedTo4dTensor) assert isinstance(model.linear2.weight, Int4TilePackedTo4dTensor) - @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") + @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") def test_module_fqn_to_config_regex_precedence(self): """Testing that full path config takes precedence over regex config in ModuleFqnToConfig @@ -664,14 +664,14 @@ def test_module_fqn_to_config_regex_precedence(self): ) config2 = IntxWeightOnlyConfig() config = ModuleFqnToConfig({"linear1": config1, "re:linear.": config2}) - model = ToyLinearModel().to(_DEVICE).to(dtype=torch.bfloat16) - example_inputs = model.example_inputs(device=_DEVICE.type, dtype=torch.bfloat16) + model = ToyLinearModel().cuda().to(dtype=torch.bfloat16) + example_inputs = model.example_inputs(device="cuda", dtype=torch.bfloat16) quantize_(model, config, filter_fn=None) model(*example_inputs) assert isinstance(model.linear1.weight, Int4TilePackedTo4dTensor) assert isinstance(model.linear2.weight, IntxUnpackedToInt8Tensor) - @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") + @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") def test_module_fqn_to_config_regex_precedence2(self): """Testing that full path config takes precedence over regex config in ModuleFqnToConfig, swapping @@ -683,14 +683,14 @@ def test_module_fqn_to_config_regex_precedence2(self): ) config2 = IntxWeightOnlyConfig() config = ModuleFqnToConfig({"re:linear.": config2, "linear1": config1}) - model = ToyLinearModel().to(_DEVICE).to(dtype=torch.bfloat16) - example_inputs = model.example_inputs(device=_DEVICE.type, dtype=torch.bfloat16) + model = ToyLinearModel().cuda().to(dtype=torch.bfloat16) + example_inputs = model.example_inputs(device="cuda", dtype=torch.bfloat16) quantize_(model, config, filter_fn=None) model(*example_inputs) assert isinstance(model.linear1.weight, Int4TilePackedTo4dTensor) assert isinstance(model.linear2.weight, IntxUnpackedToInt8Tensor) - @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") + @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") def test_module_fqn_to_config_regex_fullmatch(self): """Testing that we will only match the fqns that fully 
matches the regex @@ -729,7 +729,7 @@ def example_inputs(self): "linear3_full_match.bias": None, } ) - model = M(dtype=torch.bfloat16, device=_DEVICE.type) + model = M(dtype=torch.bfloat16, device="cuda") example_inputs = model.example_inputs() quantize_(model, config, filter_fn=None) model(*example_inputs)
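
A minimal sketch of how the helpers introduced in this series — auto_detect_device() from torchao/utils.py and skip_if_xpu() from torchao/testing/utils.py — could be consumed by a test. This is illustrative only, not part of the patches: the test class, tensor shapes, and skip message are invented, and it assumes a PyTorch recent enough to provide the torch.accelerator API used throughout the series.

import unittest

import torch

from torchao.testing.utils import skip_if_xpu
from torchao.utils import auto_detect_device

# torch.device("cuda") / torch.device("xpu") when an accelerator is present,
# None otherwise (tensor factories then fall back to CPU).
_DEVICE = auto_detect_device()


class ToyDeviceTest(unittest.TestCase):
    # Skip when no accelerator is present at all, and additionally skip on XPU
    # for a path that is (hypothetically) CUDA-only.
    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
    @skip_if_xpu("example of gating a CUDA-only path")
    def test_matmul_on_accelerator(self):
        x = torch.randn(8, 8, device=_DEVICE)
        y = torch.randn(8, 8, device=_DEVICE)
        self.assertEqual((x @ y).shape, (8, 8))


if __name__ == "__main__":
    unittest.main()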