From a736c41187363d39cf72d1b4a3bda5b62554f3a1 Mon Sep 17 00:00:00 2001 From: "Sun, Diwei" Date: Tue, 19 Aug 2025 08:28:02 +0000 Subject: [PATCH 1/9] enable xpu ci test --- .github/workflows/pr-test-xpu.yml | 156 ++++++++++++++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 .github/workflows/pr-test-xpu.yml diff --git a/.github/workflows/pr-test-xpu.yml b/.github/workflows/pr-test-xpu.yml new file mode 100644 index 0000000000..79621a06d1 --- /dev/null +++ b/.github/workflows/pr-test-xpu.yml @@ -0,0 +1,156 @@ +# TODO: this looks sort of similar to _linux-test, but there are like a dozen +# places where you would have to insert an if statement. Probably it's better to +# just use a different workflow altogether + +name: xpu-test + +on: + push: + branches: + - main + - 'gh/**' + pull_request: + branches: + - main + - 'gh/**' + +concurrency: + group: xpu_ci_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} + cancel-in-progress: true + +jobs: + test: + # Don't run on forked repos or empty test matrix + # if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]' + timeout-minutes: 60 + runs-on: ao-pvc + env: + DOCKER_IMAGE: ghcr.io/pytorch/ci-image:pytorch-linux-jammy-xpu-2025.1-py3-b388c12018df5d6ce2f94b7fb337fa3729978ab3 + TEST_COMMAND: .github/scripts/ci_test_xpu.sh + PYTORCH_RETRY_TEST_CASES: 1 + PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1 + XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla + steps: + # [see note: pytorch repo ref] + - name: Checkout Torchao + uses: actions/checkout@v4 + + - name: Clean all stopped docker containers + if: always() + shell: bash + run: | + # Prune all stopped containers. + # If other runner is pruning on this node, will skip. + nprune=$(ps -ef | grep -c "docker container prune") + if [[ $nprune -eq 1 ]]; then + docker container prune -f + fi + + - name: Runner health check GPU count + if: always() + shell: bash + run: | + ngpu=$(timeout 30 clinfo -l | grep -c -E 'Device' || true) + msg="Please file an issue on pytorch/ao reporting the faulty runner. 
Include a link to the runner logs so the runner can be identified" + if [[ $ngpu -eq 0 ]]; then + echo "Error: Failed to detect any GPUs on the runner" + echo "$msg" + exit 1 + fi + + - name: Use following to pull public copy of the image + id: print-ghcr-mirror + shell: bash + run: | + echo "docker pull ${DOCKER_IMAGE}" + docker pull ${DOCKER_IMAGE} + + - name: Test + id: test + env: + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + PR_NUMBER: ${{ github.event.pull_request.number }} + GITHUB_REPOSITORY: ${{ github.repository }} + GITHUB_WORKFLOW: ${{ github.workflow }} + GITHUB_JOB: ${{ github.job }} + GITHUB_RUN_ID: ${{ github.run_id }} + GITHUB_RUN_NUMBER: ${{ github.run_number }} + GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + timeout-minutes: 60 + run: | + set -x + + # detached container should get cleaned up by teardown_ec2_linux + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e GITHUB_ACTIONS \ + -e GITHUB_REPOSITORY \ + -e GITHUB_WORKFLOW \ + -e GITHUB_JOB \ + -e GITHUB_RUN_ID \ + -e GITHUB_RUN_NUMBER \ + -e GITHUB_RUN_ATTEMPT \ + -e JOB_ID \ + -e BRANCH \ + -e SHA1 \ + --user $(id -u):$(id -g) \ + --ulimit stack=10485760:83886080 \ + --ulimit core=0 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --shm-size="8g" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + --privileged \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + # save container name for later step + echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV" + # jenkins user does not have write permission to mounted workspace; work-around by copying within container to jenkins home + docker exec -t "${container_name}" sh -c "bash ${TEST_COMMAND}" + + - name: Change permissions + if: ${{ always() && steps.test.conclusion }} + run: | + docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test" + + - name: Collect backtraces from coredumps (if any) + if: always() + run: | + # shellcheck disable=SC2156 + find . -iname "core.[1-9]*" -exec docker exec "${CONTAINER_NAME}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \; + + - name: Stop container before exit + if: always() + run: | + # Workaround for multiple runners on same IDC node + docker stop "${{ env.CONTAINER_NAME }}" + + - name: Store Core dumps on GitHub + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + if: failure() + with: + name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }} + retention-days: 14 + if-no-files-found: ignore + path: ./**/core.[1-9]* + + - name: Teardown XPU + if: always() + shell: bash + run: | + # Prune all stopped containers. + # If other runner is pruning on this node, will skip. + nprune=$(ps -ef | grep -c "docker container prune") + if [[ $nprune -eq 1 ]]; then + docker container prune -f + fi From 7c96ad46d16039d37a685fa9ed28010d1a3ab0aa Mon Sep 17 00:00:00 2001 From: "Sun, Diwei" Date: Wed, 20 Aug 2025 01:52:54 +0000 Subject: [PATCH 2/9] Revert "enable xpu ci test" This reverts commit a736c41187363d39cf72d1b4a3bda5b62554f3a1. 
--- .github/workflows/pr-test-xpu.yml | 156 ------------------------------ 1 file changed, 156 deletions(-) delete mode 100644 .github/workflows/pr-test-xpu.yml diff --git a/.github/workflows/pr-test-xpu.yml b/.github/workflows/pr-test-xpu.yml deleted file mode 100644 index 79621a06d1..0000000000 --- a/.github/workflows/pr-test-xpu.yml +++ /dev/null @@ -1,156 +0,0 @@ -# TODO: this looks sort of similar to _linux-test, but there are like a dozen -# places where you would have to insert an if statement. Probably it's better to -# just use a different workflow altogether - -name: xpu-test - -on: - push: - branches: - - main - - 'gh/**' - pull_request: - branches: - - main - - 'gh/**' - -concurrency: - group: xpu_ci_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} - cancel-in-progress: true - -jobs: - test: - # Don't run on forked repos or empty test matrix - # if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]' - timeout-minutes: 60 - runs-on: ao-pvc - env: - DOCKER_IMAGE: ghcr.io/pytorch/ci-image:pytorch-linux-jammy-xpu-2025.1-py3-b388c12018df5d6ce2f94b7fb337fa3729978ab3 - TEST_COMMAND: .github/scripts/ci_test_xpu.sh - PYTORCH_RETRY_TEST_CASES: 1 - PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1 - XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla - steps: - # [see note: pytorch repo ref] - - name: Checkout Torchao - uses: actions/checkout@v4 - - - name: Clean all stopped docker containers - if: always() - shell: bash - run: | - # Prune all stopped containers. - # If other runner is pruning on this node, will skip. - nprune=$(ps -ef | grep -c "docker container prune") - if [[ $nprune -eq 1 ]]; then - docker container prune -f - fi - - - name: Runner health check GPU count - if: always() - shell: bash - run: | - ngpu=$(timeout 30 clinfo -l | grep -c -E 'Device' || true) - msg="Please file an issue on pytorch/ao reporting the faulty runner. 
Include a link to the runner logs so the runner can be identified" - if [[ $ngpu -eq 0 ]]; then - echo "Error: Failed to detect any GPUs on the runner" - echo "$msg" - exit 1 - fi - - - name: Use following to pull public copy of the image - id: print-ghcr-mirror - shell: bash - run: | - echo "docker pull ${DOCKER_IMAGE}" - docker pull ${DOCKER_IMAGE} - - - name: Test - id: test - env: - BUILD_ENVIRONMENT: ${{ inputs.build-environment }} - PR_NUMBER: ${{ github.event.pull_request.number }} - GITHUB_REPOSITORY: ${{ github.repository }} - GITHUB_WORKFLOW: ${{ github.workflow }} - GITHUB_JOB: ${{ github.job }} - GITHUB_RUN_ID: ${{ github.run_id }} - GITHUB_RUN_NUMBER: ${{ github.run_number }} - GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - timeout-minutes: 60 - run: | - set -x - - # detached container should get cleaned up by teardown_ec2_linux - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e GITHUB_ACTIONS \ - -e GITHUB_REPOSITORY \ - -e GITHUB_WORKFLOW \ - -e GITHUB_JOB \ - -e GITHUB_RUN_ID \ - -e GITHUB_RUN_NUMBER \ - -e GITHUB_RUN_ATTEMPT \ - -e JOB_ID \ - -e BRANCH \ - -e SHA1 \ - --user $(id -u):$(id -g) \ - --ulimit stack=10485760:83886080 \ - --ulimit core=0 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --shm-size="8g" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - --privileged \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - # save container name for later step - echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV" - # jenkins user does not have write permission to mounted workspace; work-around by copying within container to jenkins home - docker exec -t "${container_name}" sh -c "bash ${TEST_COMMAND}" - - - name: Change permissions - if: ${{ always() && steps.test.conclusion }} - run: | - docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test" - - - name: Collect backtraces from coredumps (if any) - if: always() - run: | - # shellcheck disable=SC2156 - find . -iname "core.[1-9]*" -exec docker exec "${CONTAINER_NAME}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \; - - - name: Stop container before exit - if: always() - run: | - # Workaround for multiple runners on same IDC node - docker stop "${{ env.CONTAINER_NAME }}" - - - name: Store Core dumps on GitHub - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 - if: failure() - with: - name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }} - retention-days: 14 - if-no-files-found: ignore - path: ./**/core.[1-9]* - - - name: Teardown XPU - if: always() - shell: bash - run: | - # Prune all stopped containers. - # If other runner is pruning on this node, will skip. 
- nprune=$(ps -ef | grep -c "docker container prune") - if [[ $nprune -eq 1 ]]; then - docker container prune -f - fi From 9d1cc1f6fda1aaddcebbd8adfb5d28c8931b38b1 Mon Sep 17 00:00:00 2001 From: "Sun, Diwei" Date: Mon, 27 Oct 2025 09:03:51 +0000 Subject: [PATCH 3/9] enabel quantiation ut cases in xpu ci --- .github/scripts/ci_test_xpu.sh | 2 + test/quantization/test_gptq.py | 30 ++++++------ test/quantization/test_moe_quant.py | 38 +++++++--------- test/quantization/test_qat.py | 14 ++++-- test/quantization/test_quant_api.py | 53 ++++++++++++---------- test/quantization/test_quant_primitives.py | 5 +- torchao/testing/utils.py | 41 +++++++++++++++++ torchao/utils.py | 7 +++ 8 files changed, 128 insertions(+), 62 deletions(-) diff --git a/.github/scripts/ci_test_xpu.sh b/.github/scripts/ci_test_xpu.sh index d765696b40..79114d01c0 100644 --- a/.github/scripts/ci_test_xpu.sh +++ b/.github/scripts/ci_test_xpu.sh @@ -15,3 +15,5 @@ python3 -c "import torch; import torchao; print(f'Torch version: {torch.__versio pip install pytest expecttest parameterized accelerate hf_transfer 'modelscope!=1.15.0' pytest -v -s torchao/test/quantization/quantize_/workflows/int4/test_int4_plain_int32_tensor.py + +pytest -v -s torchao/test/quantization/ diff --git a/test/quantization/test_gptq.py b/test/quantization/test_gptq.py index 6f7ac10d45..746800abeb 100644 --- a/test/quantization/test_gptq.py +++ b/test/quantization/test_gptq.py @@ -18,13 +18,15 @@ from torchao._models.llama.tokenizer import get_tokenizer from torchao.quantization import Int4WeightOnlyConfig, quantize_ from torchao.quantization.utils import compute_error +from torchao.utils import auto_detect_device torch.manual_seed(0) +_DEVICE = auto_detect_device() + class TestGPTQ(TestCase): @unittest.skip("skipping until we get checkpoints for gpt-fast") - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") def test_gptq_quantizer_int4_weight_only(self): from torchao._models._eval import ( LMEvalInputRecorder, @@ -33,7 +35,6 @@ def test_gptq_quantizer_int4_weight_only(self): from torchao.quantization.GPTQ import Int4WeightOnlyGPTQQuantizer precision = torch.bfloat16 - device = "cuda" checkpoint_path = Path( "../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth" ) @@ -80,19 +81,19 @@ def test_gptq_quantizer_int4_weight_only(self): ) model.setup_caches(max_batch_size=1, max_seq_length=calibration_seq_length) - model = quantizer.quantize(model, *inputs).cuda() + model = quantizer.quantize(model, *inputs).to(_DEVICE) model.reset_caches() - with torch.device("cuda"): + with torch.device(_DEVICE): model.setup_caches(max_batch_size=1, max_seq_length=model.config.block_size) limit = 1 result = TransformerEvalWrapper( - model.cuda(), + model.to(_DEVICE), tokenizer, model.config.block_size, prepare_inputs_for_model, - device, + _DEVICE, ).run_eval( ["wikitext"], limit, @@ -104,7 +105,6 @@ def test_gptq_quantizer_int4_weight_only(self): class TestMultiTensorFlow(TestCase): - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") def test_multitensor_add_tensors(self): from torchao.quantization.GPTQ import MultiTensor @@ -116,7 +116,6 @@ def test_multitensor_add_tensors(self): self.assertTrue(torch.equal(mt.values[0], tensor1)) self.assertTrue(torch.equal(mt.values[1], tensor2)) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") def test_multitensor_pad_unpad(self): from torchao.quantization.GPTQ import MultiTensor @@ -127,7 +126,6 @@ def test_multitensor_pad_unpad(self): mt.unpad() 
self.assertEqual(mt.count, 1) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") def test_multitensor_inplace_operation(self): from torchao.quantization.GPTQ import MultiTensor @@ -138,7 +136,6 @@ def test_multitensor_inplace_operation(self): class TestMultiTensorInputRecorder(TestCase): - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") def test_multitensor_input_recorder(self): from torchao.quantization.GPTQ import MultiTensor, MultiTensorInputRecorder @@ -159,7 +156,7 @@ def test_multitensor_input_recorder(self): self.assertTrue(isinstance(MT_input[2][2], MultiTensor)) self.assertEqual(MT_input[3], torch.float) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_gptq_with_input_recorder(self): from torchao.quantization.GPTQ import ( Int4WeightOnlyGPTQQuantizer, @@ -170,7 +167,7 @@ def test_gptq_with_input_recorder(self): config = ModelArgs(n_layer=2) - with torch.device("cuda"): + with torch.device(_DEVICE): model = Transformer(config) model.setup_caches(max_batch_size=2, max_seq_length=100) idx = torch.randint(1, 10000, (10, 2, 50)).to(torch.int32) @@ -191,7 +188,14 @@ def test_gptq_with_input_recorder(self): args = input_recorder.get_recorded_inputs() - quantizer = Int4WeightOnlyGPTQQuantizer() + if _DEVICE == "xpu": + from torchao.dtypes import Int4XPULayout + + quantizer = Int4WeightOnlyGPTQQuantizer( + device=torch.device("xpu"), layout=Int4XPULayout() + ) + else: + quantizer = Int4WeightOnlyGPTQQuantizer() quantizer.quantize(model, *args) diff --git a/test/quantization/test_moe_quant.py b/test/quantization/test_moe_quant.py index 61000babc1..55a6a87e24 100644 --- a/test/quantization/test_moe_quant.py +++ b/test/quantization/test_moe_quant.py @@ -33,7 +33,13 @@ quantize_, ) from torchao.quantization.utils import compute_error -from torchao.utils import is_sm_at_least_90 +from torchao.testing.utils import skip_if_no_cuda +from torchao.utils import ( + auto_detect_device, + is_sm_at_least_90, +) + +_DEVICE = auto_detect_device() if torch.version.hip is not None: pytest.skip( @@ -54,7 +60,7 @@ def _test_impl_moe_quant( base_class=AffineQuantizedTensor, tensor_impl_class=None, dtype=torch.bfloat16, - device="cuda", + device=_DEVICE, fullgraph=False, ): """ @@ -115,10 +121,8 @@ def _test_impl_moe_quant( ("multiple_tokens", 8, False), ] ) + @skip_if_no_cuda() def test_int4wo_fake_dim(self, name, num_tokens, fullgraph): - if not torch.cuda.is_available(): - self.skipTest("Need CUDA available") - config = MoEQuantConfig( Int4WeightOnlyConfig(version=1), use_fake_extra_dim_tensor=UseFakeExtraDimTensor.TRUE, @@ -138,6 +142,7 @@ def test_int4wo_fake_dim(self, name, num_tokens, fullgraph): ("multiple_tokens", 8, False), ] ) + @skip_if_no_cuda() def test_int4wo_base(self, name, num_tokens, fullgraph): if not torch.cuda.is_available(): self.skipTest("Need CUDA available") @@ -160,10 +165,8 @@ def test_int4wo_base(self, name, num_tokens, fullgraph): ("multiple_tokens", 8, False), ] ) + @skip_if_no_cuda() def test_int8wo_fake_dim(self, name, num_tokens, fullgraph): - if not torch.cuda.is_available(): - self.skipTest("Need CUDA available") - config = MoEQuantConfig( Int8WeightOnlyConfig(), use_fake_extra_dim_tensor=UseFakeExtraDimTensor.TRUE ) @@ -182,10 +185,8 @@ def test_int8wo_fake_dim(self, name, num_tokens, fullgraph): ("multiple_tokens", 8, False), ] ) + @skip_if_no_cuda() def test_int8wo_base(self, name, num_tokens, fullgraph): - if not 
torch.cuda.is_available(): - self.skipTest("Need CUDA available") - config = MoEQuantConfig(Int8WeightOnlyConfig()) tensor_impl_class = PlainAQTTensorImpl @@ -202,6 +203,7 @@ def test_int8wo_base(self, name, num_tokens, fullgraph): ("multiple_tokens", 8, False), ] ) + @skip_if_no_cuda() def test_int8wo_base_cpu(self, name, num_tokens, fullgraph): config = MoEQuantConfig(Int8WeightOnlyConfig()) tensor_impl_class = PlainAQTTensorImpl @@ -219,10 +221,8 @@ def test_int8wo_base_cpu(self, name, num_tokens, fullgraph): ("multiple_tokens", 32, False), ] ) + @skip_if_no_cuda() def test_int8dq_fake_dim(self, name, num_tokens, fullgraph): - if not torch.cuda.is_available(): - self.skipTest("Need CUDA available") - config = MoEQuantConfig( Int8DynamicActivationInt8WeightConfig(), use_fake_extra_dim_tensor=UseFakeExtraDimTensor.TRUE, @@ -242,10 +242,8 @@ def test_int8dq_fake_dim(self, name, num_tokens, fullgraph): ("multiple_tokens", 32, False), ] ) + @skip_if_no_cuda() def test_int8dq_base(self, name, num_tokens, fullgraph): - if not torch.cuda.is_available(): - self.skipTest("Need CUDA available") - config = MoEQuantConfig(Int8DynamicActivationInt8WeightConfig()) base_class = LinearActivationQuantizedTensor @@ -263,9 +261,8 @@ def test_int8dq_base(self, name, num_tokens, fullgraph): ("multiple_tokens", 8, False), ] ) + @skip_if_no_cuda() def test_fp8wo_fake_dim(self, name, num_tokens, fullgraph): - if not torch.cuda.is_available(): - self.skipTest("Need CUDA available") if not is_sm_at_least_90(): self.skipTest("Requires CUDA capability >= 9.0") @@ -335,9 +332,8 @@ def test_fp8dq_fake_dim(self, name, num_tokens, fullgraph): ("multiple_tokens", 8, False), ] ) + @skip_if_no_cuda() def test_fp8dq_base(self, name, num_tokens, fullgraph): - if not torch.cuda.is_available(): - self.skipTest("Need CUDA available") if not is_sm_at_least_90(): self.skipTest("Requires CUDA capability >= 9.0") diff --git a/test/quantization/test_qat.py b/test/quantization/test_qat.py index f523cb091c..73b8009a81 100644 --- a/test/quantization/test_qat.py +++ b/test/quantization/test_qat.py @@ -98,12 +98,15 @@ ) from torchao.utils import ( _is_fbgemm_gpu_genai_available, + auto_detect_device, is_fbcode, is_sm_at_least_89, ) # TODO: put this in a common test utils file _CUDA_IS_AVAILABLE = torch.cuda.is_available() +_GPU_IS_AVAILABLE = torch.accelerator.is_available() +_DEVICE = auto_detect_device() class Sub(torch.nn.Module): @@ -347,7 +350,7 @@ def _set_ptq_weight( group_size, ) q_weight = torch.ops.aten._convert_weight_to_int4pack( - q_weight.to("cuda"), + q_weight.to(_DEVICE), qat_linear.inner_k_tiles, ) ptq_linear.weight = q_weight @@ -600,13 +603,15 @@ def _assert_close_4w(self, val, ref): print(mean_err) self.assertTrue(mean_err < 0.05) - @unittest.skipIf(not _CUDA_IS_AVAILABLE, "skipping when cuda is not available") + @unittest.skipIf( + not _GPU_IS_AVAILABLE, "skipping when cuda or xpu is not available" + ) def test_qat_4w_primitives(self): n_bit = 4 group_size = 32 inner_k_tiles = 8 scales_precision = torch.bfloat16 - device = torch.device("cuda") + device = torch.device(_DEVICE) dtype = torch.bfloat16 torch.manual_seed(self.SEED) x = torch.randn(100, 256, dtype=dtype, device=device) @@ -699,11 +704,12 @@ def test_qat_4w_quantizer(self): group_size = 32 inner_k_tiles = 8 - device = torch.device("cuda") + device = torch.device(_DEVICE) dtype = torch.bfloat16 torch.manual_seed(self.SEED) m = M().to(device).to(dtype) m2 = copy.deepcopy(m) + qat_quantizer = Int4WeightOnlyQATQuantizer( groupsize=group_size, 
inner_k_tiles=inner_k_tiles, diff --git a/test/quantization/test_quant_api.py b/test/quantization/test_quant_api.py index 577ca6789a..164cf6bad0 100644 --- a/test/quantization/test_quant_api.py +++ b/test/quantization/test_quant_api.py @@ -60,14 +60,17 @@ ) from torchao.quantization.quant_primitives import MappingType from torchao.quantization.utils import compute_error -from torchao.testing.utils import skip_if_rocm +from torchao.testing.utils import skip_if_rocm, skip_if_xpu from torchao.utils import ( + auto_detect_device, is_sm_at_least_89, is_sm_at_least_90, torch_version_at_least, unwrap_tensor_subclass, ) +_DEVICE = auto_detect_device() + try: import gemlite # noqa: F401 @@ -258,7 +261,7 @@ def api(model): m2.load_state_dict(state_dict) m2 = m2.to(device="cuda") - example_inputs = map(lambda x: x.cuda(), example_inputs) + example_inputs = map(lambda x: x.to(_DEVICE), example_inputs) res = m2(*example_inputs) # TODO: figure out why ROCm has a larger error @@ -290,12 +293,13 @@ def test_8da4w_quantizer_linear_bias(self): m(*example_inputs) @unittest.skip("skipping until we get checkpoints for gpt-fast") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_quantizer_int4_weight_only(self): from torchao._models._eval import TransformerEvalWrapper from torchao.quantization.linear_quant_modules import Int4WeightOnlyQuantizer precision = torch.bfloat16 - device = "cuda" + device = _DEVICE checkpoint_path = Path("../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth") model = Transformer.from_name(checkpoint_path.parent.name) checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=True) @@ -312,7 +316,7 @@ def test_quantizer_int4_weight_only(self): quantizer = Int4WeightOnlyQuantizer( groupsize, ) - model = quantizer.quantize(model).cuda() + model = quantizer.quantize(model).to(_DEVICE) result = TransformerEvalWrapper( model, tokenizer, @@ -328,11 +332,12 @@ def test_quantizer_int4_weight_only(self): ) @unittest.skip("skipping until we get checkpoints for gpt-fast") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_eval_wrapper(self): from torchao._models._eval import TransformerEvalWrapper precision = torch.bfloat16 - device = "cuda" + device = _DEVICE checkpoint_path = Path("../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth") model = Transformer.from_name(checkpoint_path.parent.name) checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=True) @@ -361,11 +366,12 @@ def test_eval_wrapper(self): # EVAL IS CURRENTLY BROKEN FOR LLAMA 3, VERY LOW ACCURACY @unittest.skip("skipping until we get checkpoints for gpt-fast") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_eval_wrapper_llama3(self): from torchao._models._eval import TransformerEvalWrapper precision = torch.bfloat16 - device = "cuda" + device = _DEVICE checkpoint_path = Path( ".../gpt-fast/checkpoints/meta-llama/Meta-Llama-3-8B/model.pth" ) @@ -534,7 +540,7 @@ def test_int4wo_cpu(self, dtype, x_dim, use_hqq): assert "aten.mm.default" not in code[0] # TODO(#1690): move to new config names - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") @common_utils.parametrize( "config", [ @@ -551,6 +557,7 @@ def test_int4wo_cpu(self, dtype, x_dim, use_hqq): UIntXWeightOnlyConfig(dtype=torch.uint4), ], ) + @skip_if_xpu("XPU enablement in progress") @skip_if_rocm("ROCm enablement in progress") def 
test_workflow_e2e_numerics(self, config): """ @@ -579,17 +586,17 @@ def test_workflow_e2e_numerics(self, config): # scale has to be moved to cuda here because the parametrization init # code happens before gating for cuda availability if isinstance(config, Float8StaticActivationFloat8WeightConfig): - config.scale = config.scale.to("cuda") + config.scale = config.scale.to(_DEVICE) dtype = torch.bfloat16 if isinstance(config, GemliteUIntXWeightOnlyConfig): dtype = torch.float16 # set up inputs - x = torch.randn(128, 128, device="cuda", dtype=dtype) + x = torch.randn(128, 128, device=_DEVICE, dtype=dtype) # TODO(future): model in float32 leads to error: https://gist.github.com/vkuzo/63b3bcd7818393021a6e3fb4ccf3c469 # is that expected? - m_ref = torch.nn.Sequential(torch.nn.Linear(128, 128)).cuda().to(dtype) + m_ref = torch.nn.Sequential(torch.nn.Linear(128, 128)).to(_DEVICE).to(dtype) m_q = copy.deepcopy(m_ref) # quantize @@ -602,13 +609,13 @@ def test_workflow_e2e_numerics(self, config): sqnr = compute_error(y_ref, y_q) assert sqnr >= 16.5, f"SQNR {sqnr} is too low" - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_module_fqn_to_config_default(self): config1 = Int4WeightOnlyConfig(group_size=32, version=1) config2 = Int8WeightOnlyConfig() config = ModuleFqnToConfig({"_default": config1, "linear2": config2}) - model = ToyLinearModel().cuda().to(dtype=torch.bfloat16) - example_inputs = model.example_inputs(device="cuda", dtype=torch.bfloat16) + model = ToyLinearModel().to(_DEVICE).to(dtype=torch.bfloat16) + example_inputs = model.example_inputs(device=_DEVICE, dtype=torch.bfloat16) quantize_(model, config) model(*example_inputs) assert isinstance(model.linear1.weight, AffineQuantizedTensor) @@ -616,13 +623,13 @@ def test_module_fqn_to_config_default(self): assert isinstance(model.linear2.weight, AffineQuantizedTensor) assert isinstance(model.linear2.weight._layout, PlainLayout) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_module_fqn_to_config_module_name(self): config1 = Int4WeightOnlyConfig(group_size=32, version=1) config2 = Int8WeightOnlyConfig() config = ModuleFqnToConfig({"linear1": config1, "linear2": config2}) - model = ToyLinearModel().cuda().to(dtype=torch.bfloat16) - example_inputs = model.example_inputs(device="cuda", dtype=torch.bfloat16) + model = ToyLinearModel().to(_DEVICE).to(dtype=torch.bfloat16) + example_inputs = model.example_inputs(device=_DEVICE, dtype=torch.bfloat16) quantize_(model, config) model(*example_inputs) assert isinstance(model.linear1.weight, AffineQuantizedTensor) @@ -756,25 +763,25 @@ def test_module_fqn_to_config_embedding_linear(self): assert isinstance(model.emb.weight, IntxUnpackedToInt8Tensor) assert isinstance(model.linear.weight, IntxUnpackedToInt8Tensor) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_module_fqn_to_config_skip(self): config1 = Int4WeightOnlyConfig(group_size=32, version=1) config = ModuleFqnToConfig({"_default": config1, "linear2": None}) - model = ToyLinearModel().cuda().to(dtype=torch.bfloat16) - example_inputs = model.example_inputs(device="cuda", dtype=torch.bfloat16) + model = ToyLinearModel().to(_DEVICE).to(dtype=torch.bfloat16) + example_inputs = model.example_inputs(device=_DEVICE, 
dtype=torch.bfloat16) quantize_(model, config) model(*example_inputs) assert isinstance(model.linear1.weight, AffineQuantizedTensor) assert isinstance(model.linear1.weight._layout, TensorCoreTiledLayout) assert not isinstance(model.linear2.weight, AffineQuantizedTensor) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_int4wo_cuda_serialization(self): config = Int4WeightOnlyConfig(group_size=32, version=1) - model = ToyLinearModel().cuda().to(dtype=torch.bfloat16) + model = ToyLinearModel().to(_DEVICE).to(dtype=torch.bfloat16) # quantize in cuda quantize_(model, config) - example_inputs = model.example_inputs(device="cuda", dtype=torch.bfloat16) + example_inputs = model.example_inputs(device=_DEVICE, dtype=torch.bfloat16) model(*example_inputs) with tempfile.NamedTemporaryFile() as ckpt: # save checkpoint in cuda @@ -783,7 +790,7 @@ def test_int4wo_cuda_serialization(self): # This is what torchtune does: https://github.com/pytorch/torchtune/blob/v0.6.1/torchtune/training/checkpointing/_utils.py#L253 sd = torch.load(ckpt.name, weights_only=False, map_location="cpu") for k, v in sd.items(): - sd[k] = v.to("cuda") + sd[k] = v.to(_DEVICE) # load state_dict in cuda model.load_state_dict(sd, assign=True) diff --git a/test/quantization/test_quant_primitives.py b/test/quantization/test_quant_primitives.py index bed8421671..c251d71915 100644 --- a/test/quantization/test_quant_primitives.py +++ b/test/quantization/test_quant_primitives.py @@ -30,6 +30,7 @@ groupwise_affine_quantize_tensor_from_qparams, ) from torchao.utils import ( + auto_detect_device, check_cpu_version, check_xpu_version, is_fbcode, @@ -38,6 +39,8 @@ _SEED = 1234 torch.manual_seed(_SEED) +_DEVICE = auto_detect_device() + # Helper function to run a function twice # and verify that the result is the same. @@ -575,7 +578,7 @@ def test_choose_qparams_tensor_asym_eps(self): ) def test_get_group_qparams_symmetric_memory(self): """Check the memory usage of the op""" - weight = torch.randn(1024, 1024).to(device="cuda") + weight = torch.randn(1024, 1024).to(device=_DEVICE) original_mem_use = torch.cuda.memory_allocated() n_bit = 4 groupsize = 128 diff --git a/torchao/testing/utils.py b/torchao/testing/utils.py index a1dc40fdd3..aef3ea3ecf 100644 --- a/torchao/testing/utils.py +++ b/torchao/testing/utils.py @@ -98,6 +98,47 @@ def wrapper(*args, **kwargs): return decorator +def skip_if_no_xpu(message=None): + """Decorator to skip tests on ROCm platform with custom message. + + Args: + message (str, optional): Additional information about why the test is skipped. + """ + import unittest + + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + if not torch.xpu.is_available(): + skip_message = "Skipping the test in XPU" + if message: + skip_message += f": {message}" + unittest.skip(skip_message) + return func(*args, **kwargs) + + return wrapper + + return decorator + + +def skip_if_xpu(message=None): + """ + Decorator to skip tests if XPU is available. + + Args: + message (str, optional): Additional information about why the test is skipped. 
+ """ + + def decorator(func): + reason = "Skipping the test on XPU" + if message: + reason += f": {message}" + + return unittest.skipIf(torch.xpu.is_available(), reason)(func) + + return decorator + + def skip_if_no_cuda(): import unittest diff --git a/torchao/utils.py b/torchao/utils.py index 5af3e00cfa..4ebd2d781c 100644 --- a/torchao/utils.py +++ b/torchao/utils.py @@ -147,6 +147,13 @@ def get_available_devices(): return devices +def auto_detect_device(): + if torch.accelerator.is_available(): + return torch.accelerator.current_accelerator() + else: + return "cpu" + + def get_compute_capability(): if torch.cuda.is_available(): capability = torch.cuda.get_device_capability() From e54bda38a15d103897bb4dc5a200661fe13f1125 Mon Sep 17 00:00:00 2001 From: DiweiSun <105627594+DiweiSun@users.noreply.github.com> Date: Tue, 28 Oct 2025 15:35:42 +0800 Subject: [PATCH 4/9] fix xpu int4 ut cases --- test/quantization/test_gptq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/quantization/test_gptq.py b/test/quantization/test_gptq.py index 746800abeb..34dafcdbc4 100644 --- a/test/quantization/test_gptq.py +++ b/test/quantization/test_gptq.py @@ -188,7 +188,7 @@ def test_gptq_with_input_recorder(self): args = input_recorder.get_recorded_inputs() - if _DEVICE == "xpu": + if _DEVICE.type == "xpu": from torchao.dtypes import Int4XPULayout quantizer = Int4WeightOnlyGPTQQuantizer( From 83afd19e11fd3f66c5713d93a06f723b79b95428 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Tue, 4 Nov 2025 16:30:35 +0800 Subject: [PATCH 5/9] debug RuntimeError --- .github/scripts/ci_test_xpu.sh | 4 ++-- test/quantization/test_observer.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/scripts/ci_test_xpu.sh b/.github/scripts/ci_test_xpu.sh index 79114d01c0..c344895015 100644 --- a/.github/scripts/ci_test_xpu.sh +++ b/.github/scripts/ci_test_xpu.sh @@ -14,6 +14,6 @@ python3 -c "import torch; import torchao; print(f'Torch version: {torch.__versio pip install pytest expecttest parameterized accelerate hf_transfer 'modelscope!=1.15.0' -pytest -v -s torchao/test/quantization/quantize_/workflows/int4/test_int4_plain_int32_tensor.py +# pytest -v -s torchao/test/quantization/quantize_/workflows/int4/test_int4_plain_int32_tensor.py -pytest -v -s torchao/test/quantization/ +pytest -v -s torchao/test/quantization/test_observer.py::TestQuantFlow::test_fixed_qparams_observer diff --git a/test/quantization/test_observer.py b/test/quantization/test_observer.py index 84428ba8d7..7326a78420 100644 --- a/test/quantization/test_observer.py +++ b/test/quantization/test_observer.py @@ -187,6 +187,7 @@ def test_fixed_qparams_observer(self): obs(example_input) obs.set_qparams(torch.ones(2048)) scale, zero_point = obs.calculate_qparams() + print("scale", scale) self.assertTrue(torch.allclose(scale, torch.ones(2048))) From 9c8e66b70e75d7a47289d9c332234469905750ee Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Tue, 4 Nov 2025 17:47:48 +0800 Subject: [PATCH 6/9] revert debug --- .github/scripts/ci_test_xpu.sh | 4 ++-- test/quantization/test_observer.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/scripts/ci_test_xpu.sh b/.github/scripts/ci_test_xpu.sh index c344895015..79114d01c0 100644 --- a/.github/scripts/ci_test_xpu.sh +++ b/.github/scripts/ci_test_xpu.sh @@ -14,6 +14,6 @@ python3 -c "import torch; import torchao; print(f'Torch version: {torch.__versio pip install pytest expecttest parameterized accelerate hf_transfer 'modelscope!=1.15.0' -# pytest 
-v -s torchao/test/quantization/quantize_/workflows/int4/test_int4_plain_int32_tensor.py +pytest -v -s torchao/test/quantization/quantize_/workflows/int4/test_int4_plain_int32_tensor.py -pytest -v -s torchao/test/quantization/test_observer.py::TestQuantFlow::test_fixed_qparams_observer +pytest -v -s torchao/test/quantization/ diff --git a/test/quantization/test_observer.py b/test/quantization/test_observer.py index 7326a78420..84428ba8d7 100644 --- a/test/quantization/test_observer.py +++ b/test/quantization/test_observer.py @@ -187,7 +187,6 @@ def test_fixed_qparams_observer(self): obs(example_input) obs.set_qparams(torch.ones(2048)) scale, zero_point = obs.calculate_qparams() - print("scale", scale) self.assertTrue(torch.allclose(scale, torch.ones(2048))) From 7b5d2c4a603bddc3e07f2397bd3fdff595333813 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Thu, 6 Nov 2025 10:35:25 +0800 Subject: [PATCH 7/9] refine the device --- test/quantization/test_gptq.py | 5 +++++ torchao/utils.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/test/quantization/test_gptq.py b/test/quantization/test_gptq.py index 34dafcdbc4..44441cca50 100644 --- a/test/quantization/test_gptq.py +++ b/test/quantization/test_gptq.py @@ -27,6 +27,7 @@ class TestGPTQ(TestCase): @unittest.skip("skipping until we get checkpoints for gpt-fast") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_gptq_quantizer_int4_weight_only(self): from torchao._models._eval import ( LMEvalInputRecorder, @@ -105,6 +106,7 @@ def test_gptq_quantizer_int4_weight_only(self): class TestMultiTensorFlow(TestCase): + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_multitensor_add_tensors(self): from torchao.quantization.GPTQ import MultiTensor @@ -116,6 +118,7 @@ def test_multitensor_add_tensors(self): self.assertTrue(torch.equal(mt.values[0], tensor1)) self.assertTrue(torch.equal(mt.values[1], tensor2)) + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_multitensor_pad_unpad(self): from torchao.quantization.GPTQ import MultiTensor @@ -126,6 +129,7 @@ def test_multitensor_pad_unpad(self): mt.unpad() self.assertEqual(mt.count, 1) + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_multitensor_inplace_operation(self): from torchao.quantization.GPTQ import MultiTensor @@ -136,6 +140,7 @@ def test_multitensor_inplace_operation(self): class TestMultiTensorInputRecorder(TestCase): + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_multitensor_input_recorder(self): from torchao.quantization.GPTQ import MultiTensor, MultiTensorInputRecorder diff --git a/torchao/utils.py b/torchao/utils.py index b97db320a1..55605dbc11 100644 --- a/torchao/utils.py +++ b/torchao/utils.py @@ -152,7 +152,7 @@ def auto_detect_device(): if torch.accelerator.is_available(): return torch.accelerator.current_accelerator() else: - return "cpu" + return None def get_compute_capability(): From 54bc5c8cf1aebb890bdac72a83066ee894d40460 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Fri, 7 Nov 2025 13:58:23 +0800 Subject: [PATCH 8/9] refine the device --- test/quantization/test_quant_api.py | 78 +++++++++++----------- test/quantization/test_quant_primitives.py | 6 +- 2 files changed, 42 insertions(+), 42 deletions(-) diff --git a/test/quantization/test_quant_api.py b/test/quantization/test_quant_api.py index c239b3c66c..cc340bdd4b 100644 --- 
a/test/quantization/test_quant_api.py +++ b/test/quantization/test_quant_api.py @@ -243,7 +243,7 @@ def api(model): torch.testing.assert_close(ref, res.cpu()) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_int8_wo_quant_save_load(self): m = ToyLinearModel().eval().cpu() @@ -264,7 +264,7 @@ def api(model): api(m2) m2.load_state_dict(state_dict) - m2 = m2.to(device="cuda") + m2 = m2.to(_DEVICE) example_inputs = map(lambda x: x.to(_DEVICE), example_inputs) res = m2(*example_inputs) @@ -444,7 +444,7 @@ def test_quantized_tensor_subclass_8da4w(self, mapping_type): ref = m_copy(*example_inputs) self.assertTrue(torch.equal(res, ref)) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_quantized_tensor_subclass_save_load(self): m = ToyLinearModel().eval().to(torch.bfloat16) m_copy = copy.deepcopy(m) @@ -462,7 +462,7 @@ def test_quantized_tensor_subclass_save_load(self): res = m_copy(*example_inputs) self.assertEqual(res, ref) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_int8wo_quantized_model_to_device(self): m = ToyLinearModel().eval().to(torch.bfloat16) example_inputs = m.example_inputs(dtype=torch.bfloat16, device="cpu") @@ -470,15 +470,15 @@ def test_int8wo_quantized_model_to_device(self): quantize_(m, Int8WeightOnlyConfig()) ref = m(*example_inputs) - example_inputs_cuda = (example_inputs[0].to("cuda"),) - m.to(device="cuda") + example_inputs_cuda = (example_inputs[0].to(_DEVICE),) + m.to(_DEVICE) cuda_res = m(*example_inputs_cuda) self.assertEqual(cuda_res.cpu(), ref) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_quantized_tensor_subclass_save_load_map_location(self): - m = ToyLinearModel().eval().to(dtype=torch.bfloat16, device="cuda") - example_inputs = m.example_inputs(dtype=torch.bfloat16, device="cuda") + m = ToyLinearModel().eval().to(dtype=torch.bfloat16, device=_DEVICE.type) + example_inputs = m.example_inputs(dtype=torch.bfloat16, device=_DEVICE.type) quantize_(m, Int8WeightOnlyConfig()) ref = m(*example_inputs) @@ -491,31 +491,31 @@ def test_quantized_tensor_subclass_save_load_map_location(self): m_copy = ToyLinearModel().eval() m_copy.load_state_dict(state_dict, assign=True) - m_copy.to(dtype=torch.bfloat16, device="cuda") + m_copy.to(dtype=torch.bfloat16, device=_DEVICE.type) res = m_copy(*example_inputs) self.assertEqual(res, ref) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_quantized_model_streaming(self): def reset_memory(): gc.collect() - torch.cuda.empty_cache() - torch.cuda.reset_peak_memory_stats() + torch.accelerator.empty_cache() + torch.accelerator.reset_peak_memory_stats() reset_memory() m = ToyLinearModel() - quantize_(m.to(device="cuda"), Int8WeightOnlyConfig()) - memory_baseline = torch.cuda.max_memory_allocated() + quantize_(m.to(device=_DEVICE.type), Int8WeightOnlyConfig()) + memory_baseline = torch.accelerator.max_memory_allocated() del m reset_memory() m = ToyLinearModel() - quantize_(m, Int8WeightOnlyConfig(), device="cuda") - memory_streaming = torch.cuda.max_memory_allocated() + quantize_(m, 
Int8WeightOnlyConfig(), device=_DEVICE.type) + memory_streaming = torch.accelerator.max_memory_allocated() for param in m.parameters(): - assert param.is_cuda + assert param.device.type == _DEVICE.type self.assertLess(memory_streaming, memory_baseline) @common_utils.parametrize("dtype", [torch.float, torch.bfloat16, torch.half]) @@ -641,20 +641,20 @@ def test_module_fqn_to_config_module_name(self): assert isinstance(model.linear2.weight, AffineQuantizedTensor) assert isinstance(model.linear2.weight._layout, PlainLayout) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_module_fqn_to_config_regex_basic(self): config1 = Int4WeightOnlyConfig( group_size=32, int4_packing_format="tile_packed_to_4d" ) config = ModuleFqnToConfig({"re:linear.": config1}) - model = ToyLinearModel().cuda().to(dtype=torch.bfloat16) - example_inputs = model.example_inputs(device="cuda", dtype=torch.bfloat16) + model = ToyLinearModel().to(_DEVICE).to(dtype=torch.bfloat16) + example_inputs = model.example_inputs(device=_DEVICE.type, dtype=torch.bfloat16) quantize_(model, config, filter_fn=None) model(*example_inputs) assert isinstance(model.linear1.weight, Int4TilePackedTo4dTensor) assert isinstance(model.linear2.weight, Int4TilePackedTo4dTensor) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_module_fqn_to_config_regex_precedence(self): """Testing that full path config takes precedence over regex config in ModuleFqnToConfig @@ -664,14 +664,14 @@ def test_module_fqn_to_config_regex_precedence(self): ) config2 = IntxWeightOnlyConfig() config = ModuleFqnToConfig({"linear1": config1, "re:linear.": config2}) - model = ToyLinearModel().cuda().to(dtype=torch.bfloat16) - example_inputs = model.example_inputs(device="cuda", dtype=torch.bfloat16) + model = ToyLinearModel().to(_DEVICE).to(dtype=torch.bfloat16) + example_inputs = model.example_inputs(device=_DEVICE.type, dtype=torch.bfloat16) quantize_(model, config, filter_fn=None) model(*example_inputs) assert isinstance(model.linear1.weight, Int4TilePackedTo4dTensor) assert isinstance(model.linear2.weight, IntxUnpackedToInt8Tensor) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_module_fqn_to_config_regex_precedence2(self): """Testing that full path config takes precedence over regex config in ModuleFqnToConfig, swapping @@ -683,14 +683,14 @@ def test_module_fqn_to_config_regex_precedence2(self): ) config2 = IntxWeightOnlyConfig() config = ModuleFqnToConfig({"re:linear.": config2, "linear1": config1}) - model = ToyLinearModel().cuda().to(dtype=torch.bfloat16) - example_inputs = model.example_inputs(device="cuda", dtype=torch.bfloat16) + model = ToyLinearModel().to(_DEVICE).to(dtype=torch.bfloat16) + example_inputs = model.example_inputs(device=_DEVICE.type, dtype=torch.bfloat16) quantize_(model, config, filter_fn=None) model(*example_inputs) assert isinstance(model.linear1.weight, Int4TilePackedTo4dTensor) assert isinstance(model.linear2.weight, IntxUnpackedToInt8Tensor) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_module_fqn_to_config_regex_fullmatch(self): """Testing that we will only match the fqns that fully matches the regex @@ 
-729,7 +729,7 @@ def example_inputs(self): "linear3_full_match.bias": None, } ) - model = M(dtype=torch.bfloat16, device="cuda") + model = M(dtype=torch.bfloat16, device=_DEVICE.type) example_inputs = model.example_inputs() quantize_(model, config, filter_fn=None) model(*example_inputs) @@ -851,7 +851,7 @@ def test_config_deprecation(self): common_utils.instantiate_parametrized_tests(TestQuantFlow) -@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") +@unittest.skipIf(not torch.accelerator.is_available(), "Need CUDA available") @unittest.skipIf(not is_sm_at_least_90(), "Checkpoints are produced in SM90+") class TestFqnToConfig(TestCase): def test_quantize_param_fqn_exact(self): @@ -861,7 +861,7 @@ def test_quantize_param_fqn_exact(self): config = AutoConfig.from_pretrained( "unsloth/Llama-4-Scout-17B-16E-Instruct" ).text_config - model = Llama4TextMoe(config).to(torch.bfloat16).cuda() + model = Llama4TextMoe(config).to(torch.bfloat16).to(_DEVICE) quant_config = FqnToConfig( { @@ -1106,27 +1106,27 @@ def test_non_fqn_config_filter_fn_none(self): assert isinstance(model.weight, Float8Tensor) assert model.weight.scale.numel() == 1 - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") def test_quantized_model_streaming_fqn_config(self): def reset_memory(): gc.collect() - torch.cuda.empty_cache() - torch.cuda.reset_peak_memory_stats() + torch.accelerator.empty_cache() + torch.accelerator.reset_peak_memory_stats() quant_config = FqnToConfig({"_default": Int8WeightOnlyConfig()}) reset_memory() m = ToyLinearModel() - quantize_(m.to(device="cuda"), quant_config, filter_fn=None) - memory_baseline = torch.cuda.max_memory_allocated() + quantize_(m.to(device=_DEVICE.type), quant_config, filter_fn=None) + memory_baseline = torch.accelerator.max_memory_allocated() del m reset_memory() m = ToyLinearModel() - quantize_(m, quant_config, device="cuda", filter_fn=None) - memory_streaming = torch.cuda.max_memory_allocated() + quantize_(m, quant_config, device=_DEVICE.type, filter_fn=None) + memory_streaming = torch.accelerator.max_memory_allocated() for param in m.parameters(): - assert param.is_cuda + assert param.device.type == _DEVICE.type self.assertLess(memory_streaming, memory_baseline) diff --git a/test/quantization/test_quant_primitives.py b/test/quantization/test_quant_primitives.py index 2ce7cf11a2..dae40a2673 100644 --- a/test/quantization/test_quant_primitives.py +++ b/test/quantization/test_quant_primitives.py @@ -593,16 +593,16 @@ def test_choose_qparams_tensor_asym_eps(self): self.assertEqual(scale, eps) @unittest.skipIf( - not torch.cuda.is_available(), "skipping when cuda is not available" + not torch.accelerator.is_available(), "skipping when gpu is not available" ) def test_get_group_qparams_symmetric_memory(self): """Check the memory usage of the op""" weight = torch.randn(1024, 1024).to(device=_DEVICE) - original_mem_use = torch.cuda.memory_allocated() + original_mem_use = torch.accelerator.memory_allocated() n_bit = 4 groupsize = 128 (scale_ao, _) = get_group_qparams_symmetric(weight, n_bit, groupsize) - after_choose_qparams_mem_use = torch.cuda.memory_allocated() + after_choose_qparams_mem_use = torch.accelerator.memory_allocated() self.assertTrue(after_choose_qparams_mem_use < 1.2 * original_mem_use) def test_raises(self): From a9043e2f0f8ca3d0f2b76ad3a3a9bba84c28605e Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Fri, 7 Nov 2025 14:08:25 +0800 Subject: [PATCH 
9/9] refine the device --- test/quantization/test_quant_api.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/test/quantization/test_quant_api.py b/test/quantization/test_quant_api.py index cc340bdd4b..b8f56374ec 100644 --- a/test/quantization/test_quant_api.py +++ b/test/quantization/test_quant_api.py @@ -641,20 +641,20 @@ def test_module_fqn_to_config_module_name(self): assert isinstance(model.linear2.weight, AffineQuantizedTensor) assert isinstance(model.linear2.weight._layout, PlainLayout) - @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") + @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") def test_module_fqn_to_config_regex_basic(self): config1 = Int4WeightOnlyConfig( group_size=32, int4_packing_format="tile_packed_to_4d" ) config = ModuleFqnToConfig({"re:linear.": config1}) - model = ToyLinearModel().to(_DEVICE).to(dtype=torch.bfloat16) - example_inputs = model.example_inputs(device=_DEVICE.type, dtype=torch.bfloat16) + model = ToyLinearModel().cuda().to(dtype=torch.bfloat16) + example_inputs = model.example_inputs(device="cuda", dtype=torch.bfloat16) quantize_(model, config, filter_fn=None) model(*example_inputs) assert isinstance(model.linear1.weight, Int4TilePackedTo4dTensor) assert isinstance(model.linear2.weight, Int4TilePackedTo4dTensor) - @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") + @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") def test_module_fqn_to_config_regex_precedence(self): """Testing that full path config takes precedence over regex config in ModuleFqnToConfig @@ -664,14 +664,14 @@ def test_module_fqn_to_config_regex_precedence(self): ) config2 = IntxWeightOnlyConfig() config = ModuleFqnToConfig({"linear1": config1, "re:linear.": config2}) - model = ToyLinearModel().to(_DEVICE).to(dtype=torch.bfloat16) - example_inputs = model.example_inputs(device=_DEVICE.type, dtype=torch.bfloat16) + model = ToyLinearModel().cuda().to(dtype=torch.bfloat16) + example_inputs = model.example_inputs(device="cuda", dtype=torch.bfloat16) quantize_(model, config, filter_fn=None) model(*example_inputs) assert isinstance(model.linear1.weight, Int4TilePackedTo4dTensor) assert isinstance(model.linear2.weight, IntxUnpackedToInt8Tensor) - @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") + @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") def test_module_fqn_to_config_regex_precedence2(self): """Testing that full path config takes precedence over regex config in ModuleFqnToConfig, swapping @@ -683,14 +683,14 @@ def test_module_fqn_to_config_regex_precedence2(self): ) config2 = IntxWeightOnlyConfig() config = ModuleFqnToConfig({"re:linear.": config2, "linear1": config1}) - model = ToyLinearModel().to(_DEVICE).to(dtype=torch.bfloat16) - example_inputs = model.example_inputs(device=_DEVICE.type, dtype=torch.bfloat16) + model = ToyLinearModel().cuda().to(dtype=torch.bfloat16) + example_inputs = model.example_inputs(device="cuda", dtype=torch.bfloat16) quantize_(model, config, filter_fn=None) model(*example_inputs) assert isinstance(model.linear1.weight, Int4TilePackedTo4dTensor) assert isinstance(model.linear2.weight, IntxUnpackedToInt8Tensor) - @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available") + @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") def test_module_fqn_to_config_regex_fullmatch(self): """Testing that we will only match the fqns that fully 
matches the regex @@ -729,7 +729,7 @@ def example_inputs(self): "linear3_full_match.bias": None, } ) - model = M(dtype=torch.bfloat16, device=_DEVICE.type) + model = M(dtype=torch.bfloat16, device="cuda") example_inputs = model.example_inputs() quantize_(model, config, filter_fn=None) model(*example_inputs)
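
A minimal sketch of how the helpers introduced in this series — auto_detect_device() from torchao/utils.py and skip_if_xpu() from torchao/testing/utils.py — could be consumed by a test. This is illustrative only, not part of the patches: the test class, tensor shapes, and skip message are invented, and it assumes a PyTorch recent enough to provide the torch.accelerator API used throughout the series.

import unittest

import torch

from torchao.testing.utils import skip_if_xpu
from torchao.utils import auto_detect_device

# torch.device("cuda") / torch.device("xpu") when an accelerator is present,
# None otherwise (tensor factories then fall back to CPU).
_DEVICE = auto_detect_device()


class ToyDeviceTest(unittest.TestCase):
    # Skip when no accelerator is present at all, and additionally skip on XPU
    # for a path that is (hypothetically) CUDA-only.
    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
    @skip_if_xpu("example of gating a CUDA-only path")
    def test_matmul_on_accelerator(self):
        x = torch.randn(8, 8, device=_DEVICE)
        y = torch.randn(8, 8, device=_DEVICE)
        self.assertEqual((x @ y).shape, (8, 8))


if __name__ == "__main__":
    unittest.main()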