diff --git a/test/quantization/test_qat.py b/test/quantization/test_qat.py
index f523cb091c..db33561fa9 100644
--- a/test/quantization/test_qat.py
+++ b/test/quantization/test_qat.py
@@ -98,12 +98,14 @@
 )
 from torchao.utils import (
     _is_fbgemm_gpu_genai_available,
+    get_current_accelerator_device,
     is_fbcode,
     is_sm_at_least_89,
 )
 
 # TODO: put this in a common test utils file
 _CUDA_IS_AVAILABLE = torch.cuda.is_available()
+_DEVICE = get_current_accelerator_device()
 
 
 class Sub(torch.nn.Module):
@@ -347,7 +349,7 @@ def _set_ptq_weight(
             group_size,
         )
         q_weight = torch.ops.aten._convert_weight_to_int4pack(
-            q_weight.to("cuda"),
+            q_weight.to(_DEVICE),
             qat_linear.inner_k_tiles,
         )
         ptq_linear.weight = q_weight
@@ -600,13 +602,13 @@ def _assert_close_4w(self, val, ref):
         print(mean_err)
         self.assertTrue(mean_err < 0.05)
 
-    @unittest.skipIf(not _CUDA_IS_AVAILABLE, "skipping when cuda is not available")
+    @unittest.skipIf(_DEVICE is None, "skipping when GPU is not available")
     def test_qat_4w_primitives(self):
         n_bit = 4
         group_size = 32
         inner_k_tiles = 8
         scales_precision = torch.bfloat16
-        device = torch.device("cuda")
+        device = torch.device(_DEVICE)
         dtype = torch.bfloat16
         torch.manual_seed(self.SEED)
         x = torch.randn(100, 256, dtype=dtype, device=device)
@@ -699,7 +701,7 @@ def test_qat_4w_quantizer(self):
         group_size = 32
         inner_k_tiles = 8
 
-        device = torch.device("cuda")
+        device = torch.device(_DEVICE)
         dtype = torch.bfloat16
         torch.manual_seed(self.SEED)
         m = M().to(device).to(dtype)
diff --git a/test/quantization/test_quant_api.py b/test/quantization/test_quant_api.py
index e530babdb9..dc74948c40 100644
--- a/test/quantization/test_quant_api.py
+++ b/test/quantization/test_quant_api.py
@@ -64,14 +64,17 @@
 )
 from torchao.quantization.quant_primitives import MappingType
 from torchao.quantization.utils import compute_error
-from torchao.testing.utils import skip_if_rocm
+from torchao.testing.utils import skip_if_rocm, skip_if_xpu
 from torchao.utils import (
+    get_current_accelerator_device,
     is_sm_at_least_89,
     is_sm_at_least_90,
     torch_version_at_least,
     unwrap_tensor_subclass,
 )
 
+_DEVICE = get_current_accelerator_device()
+
 try:
     import gemlite  # noqa: F401
 
@@ -240,7 +243,7 @@ def api(model):
 
         torch.testing.assert_close(ref, res.cpu())
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_int8_wo_quant_save_load(self):
         m = ToyLinearModel().eval().cpu()
 
@@ -261,8 +264,8 @@ def api(model):
         api(m2)
         m2.load_state_dict(state_dict)
-        m2 = m2.to(device="cuda")
-        example_inputs = map(lambda x: x.cuda(), example_inputs)
+        m2 = m2.to(_DEVICE)
+        example_inputs = map(lambda x: x.to(_DEVICE), example_inputs)
 
         res = m2(*example_inputs)
 
         # TODO: figure out why ROCm has a larger error
@@ -294,12 +297,13 @@ def test_8da4w_quantizer_linear_bias(self):
         m(*example_inputs)
 
     @unittest.skip("skipping until we get checkpoints for gpt-fast")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
    def test_quantizer_int4_weight_only(self):
         from torchao._models._eval import TransformerEvalWrapper
         from torchao.quantization.linear_quant_modules import Int4WeightOnlyQuantizer
 
         precision = torch.bfloat16
-        device = "cuda"
+        device = _DEVICE
         checkpoint_path = Path("../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth")
         model = Transformer.from_name(checkpoint_path.parent.name)
         checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=True)
@@ -316,7 +320,7 @@ def test_quantizer_int4_weight_only(self):
         quantizer = Int4WeightOnlyQuantizer(
             groupsize,
         )
-        model = quantizer.quantize(model).cuda()
+        model = quantizer.quantize(model).to(_DEVICE)
         result = TransformerEvalWrapper(
             model,
             tokenizer,
@@ -332,11 +336,12 @@ def test_quantizer_int4_weight_only(self):
         )
 
     @unittest.skip("skipping until we get checkpoints for gpt-fast")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_eval_wrapper(self):
         from torchao._models._eval import TransformerEvalWrapper
 
         precision = torch.bfloat16
-        device = "cuda"
+        device = _DEVICE
         checkpoint_path = Path("../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth")
         model = Transformer.from_name(checkpoint_path.parent.name)
         checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=True)
@@ -365,11 +370,12 @@ def test_eval_wrapper(self):
 
     # EVAL IS CURRENTLY BROKEN FOR LLAMA 3, VERY LOW ACCURACY
     @unittest.skip("skipping until we get checkpoints for gpt-fast")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_eval_wrapper_llama3(self):
         from torchao._models._eval import TransformerEvalWrapper
 
         precision = torch.bfloat16
-        device = "cuda"
+        device = _DEVICE
         checkpoint_path = Path(
             ".../gpt-fast/checkpoints/meta-llama/Meta-Llama-3-8B/model.pth"
         )
@@ -438,7 +444,7 @@ def test_quantized_tensor_subclass_8da4w(self, mapping_type):
         ref = m_copy(*example_inputs)
         self.assertTrue(torch.equal(res, ref))
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_quantized_tensor_subclass_save_load(self):
         m = ToyLinearModel().eval().to(torch.bfloat16)
         m_copy = copy.deepcopy(m)
@@ -456,7 +462,7 @@ def test_quantized_tensor_subclass_save_load(self):
         res = m_copy(*example_inputs)
         self.assertEqual(res, ref)
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_int8wo_quantized_model_to_device(self):
         m = ToyLinearModel().eval().to(torch.bfloat16)
         example_inputs = m.example_inputs(dtype=torch.bfloat16, device="cpu")
@@ -464,15 +470,15 @@ def test_int8wo_quantized_model_to_device(self):
         quantize_(m, Int8WeightOnlyConfig())
         ref = m(*example_inputs)
 
-        example_inputs_cuda = (example_inputs[0].to("cuda"),)
-        m.to(device="cuda")
+        example_inputs_cuda = (example_inputs[0].to(_DEVICE),)
+        m.to(_DEVICE)
         cuda_res = m(*example_inputs_cuda)
         self.assertEqual(cuda_res.cpu(), ref)
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_quantized_tensor_subclass_save_load_map_location(self):
-        m = ToyLinearModel().eval().to(dtype=torch.bfloat16, device="cuda")
-        example_inputs = m.example_inputs(dtype=torch.bfloat16, device="cuda")
+        m = ToyLinearModel().eval().to(dtype=torch.bfloat16, device=_DEVICE)
+        example_inputs = m.example_inputs(dtype=torch.bfloat16, device=_DEVICE)
 
         quantize_(m, Int8WeightOnlyConfig())
         ref = m(*example_inputs)
@@ -485,31 +491,33 @@ def test_quantized_tensor_subclass_save_load_map_location(self):
         m_copy = ToyLinearModel().eval()
         m_copy.load_state_dict(state_dict, assign=True)
-        m_copy.to(dtype=torch.bfloat16, device="cuda")
+        m_copy.to(dtype=torch.bfloat16, device=_DEVICE)
 
         res = m_copy(*example_inputs)
         self.assertEqual(res, ref)
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_quantized_model_streaming(self):
+        device_module = torch.get_device_module(_DEVICE)
+
         def reset_memory():
             gc.collect()
-            torch.cuda.empty_cache()
-            torch.cuda.reset_peak_memory_stats()
+            device_module.empty_cache()
+            device_module.reset_peak_memory_stats()
 
         reset_memory()
         m = ToyLinearModel()
-        quantize_(m.to(device="cuda"), Int8WeightOnlyConfig())
-        memory_baseline = torch.cuda.max_memory_allocated()
+        quantize_(m.to(device=_DEVICE), Int8WeightOnlyConfig())
+        memory_baseline = device_module.max_memory_allocated()
 
         del m
         reset_memory()
         m = ToyLinearModel()
-        quantize_(m, Int8WeightOnlyConfig(), device="cuda")
-        memory_streaming = torch.cuda.max_memory_allocated()
+        quantize_(m, Int8WeightOnlyConfig(), device=_DEVICE)
+        memory_streaming = device_module.max_memory_allocated()
 
         for param in m.parameters():
-            assert param.is_cuda
+            assert param.device.type == _DEVICE.type
         self.assertLess(memory_streaming, memory_baseline)
 
     @common_utils.parametrize("dtype", [torch.float, torch.bfloat16, torch.half])
@@ -538,7 +546,7 @@ def test_int4wo_cpu(self, dtype, x_dim, use_hqq):
         assert "aten.mm.default" not in code[0]
 
     # TODO(#1690): move to new config names
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     @common_utils.parametrize(
         "config",
         [
@@ -555,6 +563,7 @@ def test_int4wo_cpu(self, dtype, x_dim, use_hqq):
             UIntXWeightOnlyConfig(dtype=torch.uint4),
         ],
     )
+    @skip_if_xpu("XPU enablement in progress")
     @skip_if_rocm("ROCm enablement in progress")
     def test_workflow_e2e_numerics(self, config):
         """
@@ -583,17 +592,17 @@ def test_workflow_e2e_numerics(self, config):
         # scale has to be moved to cuda here because the parametrization init
         # code happens before gating for cuda availability
         if isinstance(config, Float8StaticActivationFloat8WeightConfig):
-            config.scale = config.scale.to("cuda")
+            config.scale = config.scale.to(_DEVICE)
 
         dtype = torch.bfloat16
         if isinstance(config, GemliteUIntXWeightOnlyConfig):
             dtype = torch.float16
 
         # set up inputs
-        x = torch.randn(128, 128, device="cuda", dtype=dtype)
+        x = torch.randn(128, 128, device=_DEVICE, dtype=dtype)
         # TODO(future): model in float32 leads to error: https://gist.github.com/vkuzo/63b3bcd7818393021a6e3fb4ccf3c469
         # is that expected?
-        m_ref = torch.nn.Sequential(torch.nn.Linear(128, 128)).cuda().to(dtype)
+        m_ref = torch.nn.Sequential(torch.nn.Linear(128, 128)).to(_DEVICE).to(dtype)
         m_q = copy.deepcopy(m_ref)
 
         # quantize
@@ -606,13 +615,13 @@ def test_workflow_e2e_numerics(self, config):
         sqnr = compute_error(y_ref, y_q)
         assert sqnr >= 16.5, f"SQNR {sqnr} is too low"
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_module_fqn_to_config_default(self):
         config1 = Int4WeightOnlyConfig(group_size=32, version=1)
         config2 = Int8WeightOnlyConfig()
         config = ModuleFqnToConfig({"_default": config1, "linear2": config2})
-        model = ToyLinearModel().cuda().to(dtype=torch.bfloat16)
-        example_inputs = model.example_inputs(device="cuda", dtype=torch.bfloat16)
+        model = ToyLinearModel().to(_DEVICE).to(dtype=torch.bfloat16)
+        example_inputs = model.example_inputs(device=_DEVICE, dtype=torch.bfloat16)
         quantize_(model, config, filter_fn=None)
         model(*example_inputs)
         assert isinstance(model.linear1.weight, AffineQuantizedTensor)
@@ -620,13 +629,13 @@ def test_module_fqn_to_config_default(self):
         assert isinstance(model.linear2.weight, AffineQuantizedTensor)
         assert isinstance(model.linear2.weight._layout, PlainLayout)
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_module_fqn_to_config_module_name(self):
         config1 = Int4WeightOnlyConfig(group_size=32, version=1)
         config2 = Int8WeightOnlyConfig()
         config = ModuleFqnToConfig({"linear1": config1, "linear2": config2})
-        model = ToyLinearModel().cuda().to(dtype=torch.bfloat16)
-        example_inputs = model.example_inputs(device="cuda", dtype=torch.bfloat16)
+        model = ToyLinearModel().to(_DEVICE).to(dtype=torch.bfloat16)
+        example_inputs = model.example_inputs(device=_DEVICE, dtype=torch.bfloat16)
         quantize_(model, config, filter_fn=None)
         model(*example_inputs)
         assert isinstance(model.linear1.weight, AffineQuantizedTensor)
@@ -759,25 +768,25 @@ def test_module_fqn_to_config_embedding_linear(self):
         assert isinstance(model.emb.weight, IntxUnpackedToInt8Tensor)
         assert isinstance(model.linear.weight, IntxUnpackedToInt8Tensor)
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_module_fqn_to_config_skip(self):
         config1 = Int4WeightOnlyConfig(group_size=32, version=1)
         config = ModuleFqnToConfig({"_default": config1, "linear2": None})
-        model = ToyLinearModel().cuda().to(dtype=torch.bfloat16)
-        example_inputs = model.example_inputs(device="cuda", dtype=torch.bfloat16)
+        model = ToyLinearModel().to(_DEVICE).to(dtype=torch.bfloat16)
+        example_inputs = model.example_inputs(device=_DEVICE, dtype=torch.bfloat16)
         quantize_(model, config, filter_fn=None)
         model(*example_inputs)
         assert isinstance(model.linear1.weight, AffineQuantizedTensor)
         assert isinstance(model.linear1.weight._layout, TensorCoreTiledLayout)
         assert not isinstance(model.linear2.weight, AffineQuantizedTensor)
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_int4wo_cuda_serialization(self):
         config = Int4WeightOnlyConfig(group_size=32, version=1)
-        model = ToyLinearModel().cuda().to(dtype=torch.bfloat16)
+        model = ToyLinearModel().to(_DEVICE).to(dtype=torch.bfloat16)
         # quantize in cuda
         quantize_(model, config)
-        example_inputs = model.example_inputs(device="cuda", dtype=torch.bfloat16)
+        example_inputs = model.example_inputs(device=_DEVICE, dtype=torch.bfloat16)
         model(*example_inputs)
         with tempfile.NamedTemporaryFile() as ckpt:
             # save checkpoint in cuda
@@ -786,7 +795,7 @@ def test_int4wo_cuda_serialization(self):
             # This is what torchtune does: https://github.com/pytorch/torchtune/blob/v0.6.1/torchtune/training/checkpointing/_utils.py#L253
             sd = torch.load(ckpt.name, weights_only=False, map_location="cpu")
             for k, v in sd.items():
-                sd[k] = v.to("cuda")
+                sd[k] = v.to(_DEVICE)
 
             # load state_dict in cuda
             model.load_state_dict(sd, assign=True)
@@ -850,7 +859,7 @@ def test_config_deprecation(self):
 
 common_utils.instantiate_parametrized_tests(TestQuantFlow)
 
 
-@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+@unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
 @unittest.skipIf(not is_sm_at_least_90(), "Checkpoints are produced in SM90+")
 class TestFqnToConfig(TestCase):
     def test_quantize_param_fqn_exact(self):
@@ -860,7 +869,7 @@ def test_quantize_param_fqn_exact(self):
         config = AutoConfig.from_pretrained(
             "unsloth/Llama-4-Scout-17B-16E-Instruct"
         ).text_config
-        model = Llama4TextMoe(config).to(torch.bfloat16).cuda()
+        model = Llama4TextMoe(config).to(torch.bfloat16).to(_DEVICE)
 
         quant_config = FqnToConfig(
             {
@@ -1105,30 +1114,32 @@ def test_non_fqn_config_filter_fn_none(self):
         assert isinstance(model.weight, Float8Tensor)
         assert model.weight.scale.numel() == 1
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_quantized_model_streaming_fqn_config(self):
+        device_module = torch.get_device_module(_DEVICE)
+
         def reset_memory():
             gc.collect()
-            torch.cuda.empty_cache()
-            torch.cuda.reset_peak_memory_stats()
+            device_module.empty_cache()
+            device_module.reset_peak_memory_stats()
 
         quant_config = FqnToConfig({"_default": Int8WeightOnlyConfig()})
 
         reset_memory()
         m = ToyLinearModel()
-        quantize_(m.to(device="cuda"), quant_config, filter_fn=None)
-        memory_baseline = torch.cuda.max_memory_allocated()
+        quantize_(m.to(device=_DEVICE), quant_config, filter_fn=None)
+        memory_baseline = device_module.max_memory_allocated()
 
         del m
         reset_memory()
         m = ToyLinearModel()
-        quantize_(m, quant_config, device="cuda", filter_fn=None)
-        memory_streaming = torch.cuda.max_memory_allocated()
+        quantize_(m, quant_config, device=_DEVICE, filter_fn=None)
+        memory_streaming = device_module.max_memory_allocated()
 
         for param in m.parameters():
-            assert param.is_cuda
+            assert param.device.type == _DEVICE.type
         self.assertLess(memory_streaming, memory_baseline)
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_fqn_config_quantized_nested_module(self):
         class NestedModule(torch.nn.Module):
             def __init__(self):
@@ -1153,7 +1164,7 @@ def __init__(self):
         assert isinstance(m.nested.linear.weight, AffineQuantizedTensor)
         assert isinstance(m.linear1.weight, AffineQuantizedTensor)
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_fqn_config_quantized_nested_module_param(self):
         class NestedModule(torch.nn.Module):
             def __init__(self):
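
Note on `get_current_accelerator_device()`: the diff imports it from `torchao.utils` but its body is not shown here. A minimal sketch of what such a helper can look like, assuming PyTorch >= 2.6 for the `torch.accelerator` namespace (the actual torchao implementation may differ):

```python
# Hedged sketch only; the real torchao.utils.get_current_accelerator_device
# may be implemented differently.
from typing import Optional

import torch


def get_current_accelerator_device() -> Optional[torch.device]:
    # torch.accelerator is the backend-agnostic accelerator API: it reports
    # CUDA, XPU, MPS, etc. uniformly. Returning None when no accelerator is
    # present lets callers write `skipIf(_DEVICE is None, ...)` as in
    # test_qat.py above.
    if torch.accelerator.is_available():
        return torch.accelerator.current_accelerator()
    return None
```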
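The `device_module = torch.get_device_module(_DEVICE)` pattern in the two streaming tests resolves the per-backend module (`torch.cuda`, `torch.xpu`, ...) once, so the memory-stat calls stay device-agnostic. A small self-contained usage sketch (the device choice here is illustrative):

```python
import torch

# Pick whichever accelerator backend is present; "cuda" vs "cpu" here is
# illustrative, not torchao's selection logic.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type != "cpu":
    device_module = torch.get_device_module(device)  # e.g. torch.cuda or torch.xpu
    device_module.reset_peak_memory_stats()
    x = torch.empty(1024, 1024, device=device)
    # The same attribute names (empty_cache, reset_peak_memory_stats,
    # max_memory_allocated) exist on torch.cuda and torch.xpu, which is
    # what makes the substitution in the diff safe.
    print(device_module.max_memory_allocated())
```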
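`skip_if_xpu` is imported from `torchao.testing.utils` alongside `skip_if_rocm`, but its definition is not part of this diff. A hypothetical shape, mirroring how such skip decorators are commonly written (name and behavior are assumptions, not torchao's actual code):

```python
import functools
import unittest

import torch


def skip_if_xpu(message=None):
    # Hypothetical decorator factory: skips the wrapped test when an XPU
    # device is available, analogous to skip_if_rocm.
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if torch.xpu.is_available():
                raise unittest.SkipTest(message or "Skipped on XPU")
            return func(*args, **kwargs)

        return wrapper

    return decorator
```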