|
13 | 13 | StretchedUnifTorchaoQuantizer,
|
14 | 14 | )
|
15 | 15 | from torchao.prototype.quantization.int8_lut_tensor.int8_lut_tensor import Int8LutTensor
|
16 |
| -from torchao.prototype.tensor_conversion.api import _convert_model_for_aarch64 |
17 |
| -from torchao.quantization import MappingType |
| 16 | +from torchao.prototype.tensor_conversion.api import ( |
| 17 | + _convert_model_for_aarch64, |
| 18 | + convert_to_packed_tensor_based_on_current_hardware, |
| 19 | +) |
| 20 | +from torchao.quantization import ( |
| 21 | + Int4PreshuffledTensor, |
| 22 | + Int4Tensor, |
| 23 | + MappingType, |
| 24 | +) |
18 | 25 | from torchao.quantization.granularity import PerAxis, PerGroup
|
19 | 26 | from torchao.quantization.quant_api import (
|
| 27 | + Int4WeightOnlyConfig, |
20 | 28 | Int8DynamicActivationIntxWeightConfig,
|
21 | 29 | IntxWeightOnlyConfig,
|
22 | 30 | quantize_,
|
|
26 | 34 | _is_kernel_library_loaded,
|
27 | 35 | )
|
28 | 36 | from torchao.quantization.utils import compute_error
|
| 37 | +from torchao.utils import _is_fbgemm_genai_gpu_available |
29 | 38 |
|
30 | 39 |
|
31 | 40 | class ToyLinearModelWithTiedEmbedding(torch.nn.Module):
|
@@ -178,3 +187,24 @@ def test_aarch64_conversion(dtype, granularity, bit_width, lead_dim):
|
178 | 187 | assert ep.graph_module.code.count(line) == cnt, (
|
179 | 188 | f"expected {cnt} {line} in {ep.graph_module.code}"
|
180 | 189 | )
|
| 190 | + |
| 191 | + |
| 192 | +@pytest.mark.skipif(not torch.cuda.is_available(), reason="Need CUDA") |
| 193 | +@pytest.mark.skipif( |
| 194 | + not _is_fbgemm_genai_gpu_available(), reason="Requires fbgemm-gpu-genai >= 1.2.0" |
| 195 | +) |
| 196 | +def test_int4_tensor_conversion(): |
| 197 | + m = torch.nn.Sequential( |
| 198 | + torch.nn.Linear(256, 512, dtype=torch.bfloat16, device="cuda") |
| 199 | + ) |
| 200 | + quantize_(m, Int4WeightOnlyConfig(group_size=128)) |
| 201 | + weight = m[0].weight |
| 202 | + assert isinstance(weight, Int4Tensor) |
| 203 | + example_inputs = (torch.randn(32, 256, dtype=torch.bfloat16, device="cuda"),) |
| 204 | + before_conversion = m(*example_inputs) |
| 205 | + m[0].weight = torch.nn.Parameter( |
| 206 | + convert_to_packed_tensor_based_on_current_hardware(weight), requires_grad=False |
| 207 | + ) |
| 208 | + after_conversion = m(*example_inputs) |
| 209 | + assert isinstance(m[0].weight, Int4PreshuffledTensor) |
| 210 | + assert torch.equal(before_conversion, after_conversion) |
0 commit comments