diff --git a/test/quantization/test_qat.py b/test/quantization/test_qat.py
index f523cb091c..db33561fa9 100644
--- a/test/quantization/test_qat.py
+++ b/test/quantization/test_qat.py
@@ -98,12 +98,14 @@
 )
 from torchao.utils import (
     _is_fbgemm_gpu_genai_available,
+    get_current_accelerator_device,
     is_fbcode,
     is_sm_at_least_89,
 )
 
 # TODO: put this in a common test utils file
 _CUDA_IS_AVAILABLE = torch.cuda.is_available()
+_DEVICE = get_current_accelerator_device()
 
 
 class Sub(torch.nn.Module):
@@ -347,7 +349,7 @@ def _set_ptq_weight(
             group_size,
         )
         q_weight = torch.ops.aten._convert_weight_to_int4pack(
-            q_weight.to("cuda"),
+            q_weight.to(_DEVICE),
             qat_linear.inner_k_tiles,
         )
         ptq_linear.weight = q_weight
@@ -600,13 +602,13 @@ def _assert_close_4w(self, val, ref):
         print(mean_err)
         self.assertTrue(mean_err < 0.05)
 
-    @unittest.skipIf(not _CUDA_IS_AVAILABLE, "skipping when cuda is not available")
+    @unittest.skipIf(_DEVICE is None, "skipping when GPU is not available")
     def test_qat_4w_primitives(self):
         n_bit = 4
         group_size = 32
         inner_k_tiles = 8
         scales_precision = torch.bfloat16
-        device = torch.device("cuda")
+        device = torch.device(_DEVICE)
         dtype = torch.bfloat16
         torch.manual_seed(self.SEED)
         x = torch.randn(100, 256, dtype=dtype, device=device)
@@ -699,7 +701,7 @@ def test_qat_4w_quantizer(self):
         group_size = 32
         inner_k_tiles = 8
 
-        device = torch.device("cuda")
+        device = torch.device(_DEVICE)
         dtype = torch.bfloat16
         torch.manual_seed(self.SEED)
         m = M().to(device).to(dtype)
diff --git a/test/quantization/test_quant_api.py b/test/quantization/test_quant_api.py
index e530babdb9..dc74948c40 100644
--- a/test/quantization/test_quant_api.py
+++ b/test/quantization/test_quant_api.py
@@ -64,14 +64,17 @@
 )
 from torchao.quantization.quant_primitives import MappingType
 from torchao.quantization.utils import compute_error
-from torchao.testing.utils import skip_if_rocm
+from torchao.testing.utils import skip_if_rocm, skip_if_xpu
 from torchao.utils import (
+    get_current_accelerator_device,
     is_sm_at_least_89,
     is_sm_at_least_90,
     torch_version_at_least,
     unwrap_tensor_subclass,
 )
 
+_DEVICE = get_current_accelerator_device()
+
 try:
     import gemlite  # noqa: F401
 
@@ -240,7 +243,7 @@ def api(model):
 
         torch.testing.assert_close(ref, res.cpu())
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_int8_wo_quant_save_load(self):
         m = ToyLinearModel().eval().cpu()
 
@@ -261,8 +264,8 @@ def api(model):
         api(m2)
         m2.load_state_dict(state_dict)
-        m2 = m2.to(device="cuda")
-        example_inputs = map(lambda x: x.cuda(), example_inputs)
+        m2 = m2.to(_DEVICE)
+        example_inputs = map(lambda x: x.to(_DEVICE), example_inputs)
 
         res = m2(*example_inputs)
 
         # TODO: figure out why ROCm has a larger error
@@ -294,12 +297,13 @@ def test_8da4w_quantizer_linear_bias(self):
         m(*example_inputs)
 
     @unittest.skip("skipping until we get checkpoints for gpt-fast")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
    def test_quantizer_int4_weight_only(self):
         from torchao._models._eval import TransformerEvalWrapper
         from torchao.quantization.linear_quant_modules import Int4WeightOnlyQuantizer
 
         precision = torch.bfloat16
-        device = "cuda"
+        device = _DEVICE
         checkpoint_path = Path("../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth")
         model = Transformer.from_name(checkpoint_path.parent.name)
         checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=True)
@@ -316,7 +320,7 @@ def test_quantizer_int4_weight_only(self):
         quantizer = Int4WeightOnlyQuantizer(
             groupsize,
         )
-        model = quantizer.quantize(model).cuda()
+        model = quantizer.quantize(model).to(_DEVICE)
         result = TransformerEvalWrapper(
             model,
             tokenizer,
@@ -332,11 +336,12 @@ def test_quantizer_int4_weight_only(self):
         )
 
     @unittest.skip("skipping until we get checkpoints for gpt-fast")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_eval_wrapper(self):
         from torchao._models._eval import TransformerEvalWrapper
 
         precision = torch.bfloat16
-        device = "cuda"
+        device = _DEVICE
         checkpoint_path = Path("../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth")
         model = Transformer.from_name(checkpoint_path.parent.name)
         checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=True)
@@ -365,11 +370,12 @@ def test_eval_wrapper(self):
 
     # EVAL IS CURRENTLY BROKEN FOR LLAMA 3, VERY LOW ACCURACY
     @unittest.skip("skipping until we get checkpoints for gpt-fast")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_eval_wrapper_llama3(self):
         from torchao._models._eval import TransformerEvalWrapper
 
         precision = torch.bfloat16
-        device = "cuda"
+        device = _DEVICE
         checkpoint_path = Path(
             ".../gpt-fast/checkpoints/meta-llama/Meta-Llama-3-8B/model.pth"
         )
@@ -438,7 +444,7 @@ def test_quantized_tensor_subclass_8da4w(self, mapping_type):
         ref = m_copy(*example_inputs)
         self.assertTrue(torch.equal(res, ref))
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_quantized_tensor_subclass_save_load(self):
         m = ToyLinearModel().eval().to(torch.bfloat16)
         m_copy = copy.deepcopy(m)
@@ -456,7 +462,7 @@ def test_quantized_tensor_subclass_save_load(self):
         res = m_copy(*example_inputs)
         self.assertEqual(res, ref)
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_int8wo_quantized_model_to_device(self):
         m = ToyLinearModel().eval().to(torch.bfloat16)
         example_inputs = m.example_inputs(dtype=torch.bfloat16, device="cpu")
@@ -464,15 +470,15 @@ def test_int8wo_quantized_model_to_device(self):
         quantize_(m, Int8WeightOnlyConfig())
         ref = m(*example_inputs)
 
-        example_inputs_cuda = (example_inputs[0].to("cuda"),)
-        m.to(device="cuda")
+        example_inputs_cuda = (example_inputs[0].to(_DEVICE),)
+        m.to(_DEVICE)
         cuda_res = m(*example_inputs_cuda)
         self.assertEqual(cuda_res.cpu(), ref)
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_quantized_tensor_subclass_save_load_map_location(self):
-        m = ToyLinearModel().eval().to(dtype=torch.bfloat16, device="cuda")
-        example_inputs = m.example_inputs(dtype=torch.bfloat16, device="cuda")
+        m = ToyLinearModel().eval().to(dtype=torch.bfloat16, device=_DEVICE)
+        example_inputs = m.example_inputs(dtype=torch.bfloat16, device=_DEVICE)
 
         quantize_(m, Int8WeightOnlyConfig())
         ref = m(*example_inputs)
@@ -485,31 +491,33 @@ def test_quantized_tensor_subclass_save_load_map_location(self):
         m_copy = ToyLinearModel().eval()
         m_copy.load_state_dict(state_dict, assign=True)
-        m_copy.to(dtype=torch.bfloat16, device="cuda")
+        m_copy.to(dtype=torch.bfloat16, device=_DEVICE)
 
         res = m_copy(*example_inputs)
         self.assertEqual(res, ref)
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_quantized_model_streaming(self):
+        device_module = torch.get_device_module(_DEVICE)
+
         def reset_memory():
             gc.collect()
-            torch.cuda.empty_cache()
-            torch.cuda.reset_peak_memory_stats()
+            device_module.empty_cache()
+            device_module.reset_peak_memory_stats()
 
         reset_memory()
         m = ToyLinearModel()
-        quantize_(m.to(device="cuda"), Int8WeightOnlyConfig())
-        memory_baseline = torch.cuda.max_memory_allocated()
+        quantize_(m.to(device=_DEVICE), Int8WeightOnlyConfig())
+        memory_baseline = device_module.max_memory_allocated()
 
         del m
         reset_memory()
         m = ToyLinearModel()
-        quantize_(m, Int8WeightOnlyConfig(), device="cuda")
-        memory_streaming = torch.cuda.max_memory_allocated()
+        quantize_(m, Int8WeightOnlyConfig(), device=_DEVICE)
+        memory_streaming = device_module.max_memory_allocated()
 
         for param in m.parameters():
-            assert param.is_cuda
+            assert param.device.type == _DEVICE.type
         self.assertLess(memory_streaming, memory_baseline)
 
     @common_utils.parametrize("dtype", [torch.float, torch.bfloat16, torch.half])
@@ -538,7 +546,7 @@ def test_int4wo_cpu(self, dtype, x_dim, use_hqq):
         assert "aten.mm.default" not in code[0]
 
     # TODO(#1690): move to new config names
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     @common_utils.parametrize(
         "config",
         [
@@ -555,6 +563,7 @@ def test_int4wo_cpu(self, dtype, x_dim, use_hqq):
             UIntXWeightOnlyConfig(dtype=torch.uint4),
         ],
     )
+    @skip_if_xpu("XPU enablement in progress")
     @skip_if_rocm("ROCm enablement in progress")
     def test_workflow_e2e_numerics(self, config):
         """
@@ -583,17 +592,17 @@ def test_workflow_e2e_numerics(self, config):
         # scale has to be moved to cuda here because the parametrization init
         # code happens before gating for cuda availability
         if isinstance(config, Float8StaticActivationFloat8WeightConfig):
-            config.scale = config.scale.to("cuda")
+            config.scale = config.scale.to(_DEVICE)
 
         dtype = torch.bfloat16
         if isinstance(config, GemliteUIntXWeightOnlyConfig):
             dtype = torch.float16
 
         # set up inputs
-        x = torch.randn(128, 128, device="cuda", dtype=dtype)
+        x = torch.randn(128, 128, device=_DEVICE, dtype=dtype)
         # TODO(future): model in float32 leads to error: https://gist.github.com/vkuzo/63b3bcd7818393021a6e3fb4ccf3c469
         # is that expected?
-        m_ref = torch.nn.Sequential(torch.nn.Linear(128, 128)).cuda().to(dtype)
+        m_ref = torch.nn.Sequential(torch.nn.Linear(128, 128)).to(_DEVICE).to(dtype)
         m_q = copy.deepcopy(m_ref)
 
         # quantize
@@ -606,13 +615,13 @@ def test_workflow_e2e_numerics(self, config):
         sqnr = compute_error(y_ref, y_q)
         assert sqnr >= 16.5, f"SQNR {sqnr} is too low"
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_module_fqn_to_config_default(self):
         config1 = Int4WeightOnlyConfig(group_size=32, version=1)
         config2 = Int8WeightOnlyConfig()
         config = ModuleFqnToConfig({"_default": config1, "linear2": config2})
-        model = ToyLinearModel().cuda().to(dtype=torch.bfloat16)
-        example_inputs = model.example_inputs(device="cuda", dtype=torch.bfloat16)
+        model = ToyLinearModel().to(_DEVICE).to(dtype=torch.bfloat16)
+        example_inputs = model.example_inputs(device=_DEVICE, dtype=torch.bfloat16)
         quantize_(model, config, filter_fn=None)
         model(*example_inputs)
         assert isinstance(model.linear1.weight, AffineQuantizedTensor)
@@ -620,13 +629,13 @@ def test_module_fqn_to_config_default(self):
         assert isinstance(model.linear2.weight, AffineQuantizedTensor)
         assert isinstance(model.linear2.weight._layout, PlainLayout)
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_module_fqn_to_config_module_name(self):
         config1 = Int4WeightOnlyConfig(group_size=32, version=1)
         config2 = Int8WeightOnlyConfig()
         config = ModuleFqnToConfig({"linear1": config1, "linear2": config2})
-        model = ToyLinearModel().cuda().to(dtype=torch.bfloat16)
-        example_inputs = model.example_inputs(device="cuda", dtype=torch.bfloat16)
+        model = ToyLinearModel().to(_DEVICE).to(dtype=torch.bfloat16)
+        example_inputs = model.example_inputs(device=_DEVICE, dtype=torch.bfloat16)
         quantize_(model, config, filter_fn=None)
         model(*example_inputs)
         assert isinstance(model.linear1.weight, AffineQuantizedTensor)
@@ -759,25 +768,25 @@ def test_module_fqn_to_config_embedding_linear(self):
         assert isinstance(model.emb.weight, IntxUnpackedToInt8Tensor)
         assert isinstance(model.linear.weight, IntxUnpackedToInt8Tensor)
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_module_fqn_to_config_skip(self):
         config1 = Int4WeightOnlyConfig(group_size=32, version=1)
         config = ModuleFqnToConfig({"_default": config1, "linear2": None})
-        model = ToyLinearModel().cuda().to(dtype=torch.bfloat16)
-        example_inputs = model.example_inputs(device="cuda", dtype=torch.bfloat16)
+        model = ToyLinearModel().to(_DEVICE).to(dtype=torch.bfloat16)
+        example_inputs = model.example_inputs(device=_DEVICE, dtype=torch.bfloat16)
         quantize_(model, config, filter_fn=None)
         model(*example_inputs)
         assert isinstance(model.linear1.weight, AffineQuantizedTensor)
         assert isinstance(model.linear1.weight._layout, TensorCoreTiledLayout)
         assert not isinstance(model.linear2.weight, AffineQuantizedTensor)
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_int4wo_cuda_serialization(self):
         config = Int4WeightOnlyConfig(group_size=32, version=1)
-        model = ToyLinearModel().cuda().to(dtype=torch.bfloat16)
+        model = ToyLinearModel().to(_DEVICE).to(dtype=torch.bfloat16)
         # quantize in cuda
         quantize_(model, config)
-        example_inputs = model.example_inputs(device="cuda", dtype=torch.bfloat16)
+        example_inputs = model.example_inputs(device=_DEVICE, dtype=torch.bfloat16)
         model(*example_inputs)
         with tempfile.NamedTemporaryFile() as ckpt:
             # save checkpoint in cuda
@@ -786,7 +795,7 @@ def test_int4wo_cuda_serialization(self):
             # This is what torchtune does: https://github.com/pytorch/torchtune/blob/v0.6.1/torchtune/training/checkpointing/_utils.py#L253
             sd = torch.load(ckpt.name, weights_only=False, map_location="cpu")
             for k, v in sd.items():
-                sd[k] = v.to("cuda")
+                sd[k] = v.to(_DEVICE)
 
             # load state_dict in cuda
             model.load_state_dict(sd, assign=True)
@@ -850,7 +859,7 @@ def test_config_deprecation(self):
 
 common_utils.instantiate_parametrized_tests(TestQuantFlow)
 
 
-@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+@unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
 @unittest.skipIf(not is_sm_at_least_90(), "Checkpoints are produced in SM90+")
 class TestFqnToConfig(TestCase):
     def test_quantize_param_fqn_exact(self):
@@ -860,7 +869,7 @@ def test_quantize_param_fqn_exact(self):
         config = AutoConfig.from_pretrained(
             "unsloth/Llama-4-Scout-17B-16E-Instruct"
         ).text_config
-        model = Llama4TextMoe(config).to(torch.bfloat16).cuda()
+        model = Llama4TextMoe(config).to(torch.bfloat16).to(_DEVICE)
 
         quant_config = FqnToConfig(
             {
@@ -1105,30 +1114,32 @@ def test_non_fqn_config_filter_fn_none(self):
         assert isinstance(model.weight, Float8Tensor)
         assert model.weight.scale.numel() == 1
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_quantized_model_streaming_fqn_config(self):
+        device_module = torch.get_device_module(_DEVICE)
+
         def reset_memory():
             gc.collect()
-            torch.cuda.empty_cache()
-            torch.cuda.reset_peak_memory_stats()
+            device_module.empty_cache()
+            device_module.reset_peak_memory_stats()
 
         quant_config = FqnToConfig({"_default": Int8WeightOnlyConfig()})
 
         reset_memory()
         m = ToyLinearModel()
-        quantize_(m.to(device="cuda"), quant_config, filter_fn=None)
-        memory_baseline = torch.cuda.max_memory_allocated()
+        quantize_(m.to(device=_DEVICE), quant_config, filter_fn=None)
+        memory_baseline = device_module.max_memory_allocated()
 
         del m
         reset_memory()
         m = ToyLinearModel()
-        quantize_(m, quant_config, device="cuda", filter_fn=None)
-        memory_streaming = torch.cuda.max_memory_allocated()
+        quantize_(m, quant_config, device=_DEVICE, filter_fn=None)
+        memory_streaming = device_module.max_memory_allocated()
 
         for param in m.parameters():
-            assert param.is_cuda
+            assert param.device.type == _DEVICE.type
         self.assertLess(memory_streaming, memory_baseline)
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_fqn_config_quantized_nested_module(self):
         class NestedModule(torch.nn.Module):
             def __init__(self):
@@ -1153,7 +1164,7 @@ def __init__(self):
         assert isinstance(m.nested.linear.weight, AffineQuantizedTensor)
         assert isinstance(m.linear1.weight, AffineQuantizedTensor)
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_fqn_config_quantized_nested_module_param(self):
         class NestedModule(torch.nn.Module):
             def __init__(self):
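
Note on `get_current_accelerator_device()`: the diff imports it from `torchao.utils` but its body is not shown here. A minimal sketch of what such a helper can look like, assuming PyTorch >= 2.6 for the `torch.accelerator` namespace (the actual torchao implementation may differ):

```python
# Hedged sketch only; the real torchao.utils.get_current_accelerator_device
# may be implemented differently.
from typing import Optional

import torch


def get_current_accelerator_device() -> Optional[torch.device]:
    # torch.accelerator is the backend-agnostic accelerator API: it reports
    # CUDA, XPU, MPS, etc. uniformly. Returning None when no accelerator is
    # present lets callers write `skipIf(_DEVICE is None, ...)` as in
    # test_qat.py above.
    if torch.accelerator.is_available():
        return torch.accelerator.current_accelerator()
    return None
```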
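The `device_module = torch.get_device_module(_DEVICE)` pattern in the two streaming tests resolves the per-backend module (`torch.cuda`, `torch.xpu`, ...) once, so the memory-stat calls stay device-agnostic. A small self-contained usage sketch (the device choice here is illustrative):

```python
import torch

# Pick whichever accelerator backend is present; "cuda" vs "cpu" here is
# illustrative, not torchao's selection logic.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type != "cpu":
    device_module = torch.get_device_module(device)  # e.g. torch.cuda or torch.xpu
    device_module.reset_peak_memory_stats()
    x = torch.empty(1024, 1024, device=device)
    # The same attribute names (empty_cache, reset_peak_memory_stats,
    # max_memory_allocated) exist on torch.cuda and torch.xpu, which is
    # what makes the substitution in the diff safe.
    print(device_module.max_memory_allocated())
```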
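`skip_if_xpu` is imported from `torchao.testing.utils` alongside `skip_if_rocm`, but its definition is not part of this diff. A hypothetical shape, mirroring how such skip decorators are commonly written (name and behavior are assumptions, not torchao's actual code):

```python
import functools
import unittest

import torch


def skip_if_xpu(message=None):
    # Hypothetical decorator factory: skips the wrapped test when an XPU
    # device is available, analogous to skip_if_rocm.
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if torch.xpu.is_available():
                raise unittest.SkipTest(message or "Skipped on XPU")
            return func(*args, **kwargs)

        return wrapper

    return decorator
```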