From 9eaa4764e751e0efe52eade1e026f8d137835fe0 Mon Sep 17 00:00:00 2001
From: V-E-D
Date: Sun, 25 May 2025 10:59:03 +0530
Subject: [PATCH 1/3] test_affine_quantized_float.py pytest to unittest

---
 test/dtypes/test_affine_quantized_float.py | 108 ++++++++++-----------
 1 file changed, 52 insertions(+), 56 deletions(-)

diff --git a/test/dtypes/test_affine_quantized_float.py b/test/dtypes/test_affine_quantized_float.py
index 8c36f5ac7a..7fa14be88c 100644
--- a/test/dtypes/test_affine_quantized_float.py
+++ b/test/dtypes/test_affine_quantized_float.py
@@ -3,24 +3,22 @@
 #
 # This source code is licensed under the BSD 3-Clause license found in the
 # LICENSE file in the root directory of this source tree.
-import pytest
+import unittest

 from torchao.utils import (
     TORCH_VERSION_AT_LEAST_2_5,
 )

 if not TORCH_VERSION_AT_LEAST_2_5:
-    pytest.skip("Unsupported PyTorch version", allow_module_level=True)
+    raise unittest.SkipTest("Unsupported PyTorch version")

 import copy
 import io
 import random
-import unittest
 from contextlib import nullcontext
 from functools import partial
 from typing import Tuple

-import pytest
 import torch
 from torch._inductor.test_case import TestCase as InductorTestCase
 from torch.testing._internal import common_utils
@@ -95,68 +93,66 @@ def test_fp8_linear_variants(
             "Static quantization only supports PerTensor granularity"
         )

-        error_context = (
-            pytest.raises(AssertionError, match=error_message)
-            if error_message
-            else nullcontext()
+        if error_message:
+            with self.assertRaisesRegex(AssertionError, error_message):
+                self._run_fp8_linear_variant(dtype, mode, compile, sizes, granularity)
+        else:
+            self._run_fp8_linear_variant(dtype, mode, compile, sizes, granularity)
+
+    def _run_fp8_linear_variant(self, dtype, mode, compile, sizes, granularity):
+        M, N, K = sizes
+        input_tensor = torch.randn(*M, K, dtype=dtype, device="cuda")
+        # Get a "reasonable" scale for the input tensor even though
+        # we use the same scale for multiple activations
+        scale, _ = choose_qparams_affine(
+            input_tensor,
+            MappingType.SYMMETRIC,
+            input_tensor.shape,
+            torch.float8_e4m3fn,
+            scale_dtype=torch.float32,
         )
+        mode_map = {
+            "dynamic": partial(
+                float8_dynamic_activation_float8_weight, granularity=granularity
+            ),
+            "weight-only": float8_weight_only,
+            "static": partial(
+                float8_static_activation_float8_weight,
+                scale=scale,
+                granularity=granularity,
+            ),
+        }

-        with error_context:
-            M, N, K = sizes
-            input_tensor = torch.randn(*M, K, dtype=dtype, device="cuda")
-            # Get a "reasonable" scale for the input tensor even though
-            # we use the same scale for multiple activations
-            scale, _ = choose_qparams_affine(
-                input_tensor,
-                MappingType.SYMMETRIC,
-                input_tensor.shape,
-                torch.float8_e4m3fn,
-                scale_dtype=torch.float32,
-            )
-            mode_map = {
-                "dynamic": partial(
-                    float8_dynamic_activation_float8_weight, granularity=granularity
-                ),
-                "weight-only": float8_weight_only,
-                "static": partial(
-                    float8_static_activation_float8_weight,
-                    scale=scale,
-                    granularity=granularity,
-                ),
-            }
-
-            # Create a linear layer with bfloat16 dtype
-            model = ToyLinearModel(K, N).eval().to(dtype).to("cuda")
-
-            quantized_model = copy.deepcopy(model)
-            factory = mode_map[mode]()
-            quantize_(quantized_model, factory)
-
-            if compile:
-                quantized_model = torch.compile(quantized_model, fullgraph=True)
-
-            output_original = model(input_tensor)
-            output_quantized = quantized_model(input_tensor)
-
-            error = compute_error(output_original, output_quantized)
-            assert compute_error(output_original, output_quantized) > 20, (
-                f"Quantization error is too high got a SQNR of {error}"
error is too high got a SQNR of {error}" - ) + # Create a linear layer with bfloat16 dtype + model = ToyLinearModel(K, N).eval().to(dtype).to("cuda") + + quantized_model = copy.deepcopy(model) + factory = mode_map[mode]() + quantize_(quantized_model, factory) + + if compile: + quantized_model = torch.compile(quantized_model, fullgraph=True) + + output_original = model(input_tensor) + output_quantized = quantized_model(input_tensor) + + error = compute_error(output_original, output_quantized) + self.assertGreater(error, 20, f"Quantization error is too high got a SQNR of {error}") @unittest.skipIf( not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" ) def test_invalid_granularity(self): - with pytest.raises(ValueError, match="Invalid granularity specification"): + with self.assertRaisesRegex(ValueError, "Invalid granularity specification"): float8_dynamic_activation_float8_weight(granularity="invalid") @unittest.skipIf( not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" ) def test_mismatched_granularity(self): - with pytest.raises( + with self.assertRaisesRegex( ValueError, - match="Different granularities for activation and weight are not supported", + "Different granularities for activation and weight are not supported", ): float8_dynamic_activation_float8_weight(granularity=(PerTensor(), PerRow())) @@ -167,7 +163,7 @@ def test_unsupported_granularity(self): class UnsupportedGranularity: pass - with pytest.raises(ValueError, match="Invalid granularity types"): + with self.assertRaisesRegex(ValueError, "Invalid granularity types"): float8_dynamic_activation_float8_weight( granularity=(UnsupportedGranularity(), UnsupportedGranularity()) ) @@ -177,9 +173,9 @@ class UnsupportedGranularity: not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" ) def test_per_row_with_float32(self): - with pytest.raises( + with self.assertRaisesRegex( AssertionError, - match="PerRow quantization only works for bfloat16 precision", + "PerRow quantization only works for bfloat16 precision", ): model = ToyLinearModel(64, 64).eval().to(torch.float32).to("cuda") quantize_( @@ -317,4 +313,4 @@ def test_mm_float8dq(self): common_utils.instantiate_parametrized_tests(TestAffineQuantizedFloat8Compile) if __name__ == "__main__": - pytest.main([__file__]) + unittest.main() From 59ade08cd9e1ce7cedf2d083aa2c2b516b23ccbf Mon Sep 17 00:00:00 2001 From: V-E-D Date: Sun, 25 May 2025 11:05:38 +0530 Subject: [PATCH 2/3] test_bitpacking.py pytest too unittest --- test/dtypes/test_bitpacking.py | 139 +++++++++++++++++++-------------- 1 file changed, 82 insertions(+), 57 deletions(-) diff --git a/test/dtypes/test_bitpacking.py b/test/dtypes/test_bitpacking.py index 0ed4462d5d..80d3245401 100644 --- a/test/dtypes/test_bitpacking.py +++ b/test/dtypes/test_bitpacking.py @@ -3,9 +3,10 @@ # # This source code is licensed under the BSD 3-Clause license found in the # LICENSE file in the root directory of this source tree. 
-import pytest
+import unittest
 import torch
 from torch.utils._triton import has_triton
+from torch.testing._internal import common_utils

 from torchao.dtypes.uintx.bitpacking import pack, pack_cpu, unpack, unpack_cpu

@@ -13,68 +14,92 @@
 dimensions = (0, -1, 1)


-@pytest.fixture(autouse=True)
-def run_before_and_after_tests():
-    yield
-    torch._dynamo.reset()  # reset cache between tests
+class TestBitpacking(unittest.TestCase):
+    def setUp(self):
+        """Set up test fixtures before each test method."""
+        # Set random seeds for reproducibility
+        torch.manual_seed(0)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed(0)
+        
+        # Initialize any common test data
+        self.test_shape = (32, 32, 32)
+        
+        # Set default device
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        
+        # Reset any cached state
+        torch._dynamo.reset()
+
+    def tearDown(self):
+        """Clean up after each test method."""
+        # Reset torch._dynamo cache
+        torch._dynamo.reset()
+        
+        # Clear CUDA cache if available
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        
+        # Reset any other state if needed
+        torch._dynamo.config.specialize_int = False

-@pytest.mark.parametrize("bit_width", bit_widths)
-@pytest.mark.parametrize("dim", dimensions)
-def test_CPU(bit_width, dim):
-    test_tensor = torch.randint(
-        0, 2**bit_width, (32, 32, 32), dtype=torch.uint8, device="cpu"
-    )
-    packed = pack_cpu(test_tensor, bit_width, dim=dim)
-    unpacked = unpack_cpu(packed, bit_width, dim=dim)
-    assert unpacked.allclose(test_tensor)
+    @common_utils.parametrize("bit_width", bit_widths)
+    @common_utils.parametrize("dim", dimensions)
+    def test_CPU(self, bit_width, dim):
+        test_tensor = torch.randint(
+            0, 2**bit_width, (32, 32, 32), dtype=torch.uint8, device="cpu"
+        )
+        packed = pack_cpu(test_tensor, bit_width, dim=dim)
+        unpacked = unpack_cpu(packed, bit_width, dim=dim)
+        self.assertTrue(unpacked.allclose(test_tensor))
+
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+    @common_utils.parametrize("bit_width", bit_widths)
+    @common_utils.parametrize("dim", dimensions)
+    def test_GPU(self, bit_width, dim):
+        test_tensor = torch.randint(0, 2**bit_width, (32, 32, 32), dtype=torch.uint8).cuda()
+        packed = pack(test_tensor, bit_width, dim=dim)
+        unpacked = unpack(packed, bit_width, dim=dim)
+        self.assertTrue(unpacked.allclose(test_tensor))

-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
-@pytest.mark.parametrize("bit_width", bit_widths)
-@pytest.mark.parametrize("dim", dimensions)
-def test_GPU(bit_width, dim):
-    test_tensor = torch.randint(0, 2**bit_width, (32, 32, 32), dtype=torch.uint8).cuda()
-    packed = pack(test_tensor, bit_width, dim=dim)
-    unpacked = unpack(packed, bit_width, dim=dim)
-    assert unpacked.allclose(test_tensor)
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+    @unittest.skipIf(not has_triton(), "unsupported without triton")
+    @common_utils.parametrize("bit_width", bit_widths)
+    @common_utils.parametrize("dim", dimensions)
+    def test_compile(self, bit_width, dim):
+        torch._dynamo.config.specialize_int = True
+        torch.compile(pack, fullgraph=True)
+        torch.compile(unpack, fullgraph=True)
+        test_tensor = torch.randint(0, 2**bit_width, (32, 32, 32), dtype=torch.uint8).cuda()
+        packed = pack(test_tensor, bit_width, dim=dim)
+        unpacked = unpack(packed, bit_width, dim=dim)
+        self.assertTrue(unpacked.allclose(test_tensor))
+
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+    def test_pack_example(self):
+        test_tensor = torch.tensor(
+            [0x30, 0x29, 0x17, 0x5, 0x20, 0x16, 0x9, 0x22], dtype=torch.uint8
+        ).cuda()
+        shard_4, shard_2 = pack(test_tensor, 6)
+        print(shard_4, shard_2)
+        self.assertTrue(torch.tensor([0, 105, 151, 37], dtype=torch.uint8).cuda().allclose(shard_4))
+        self.assertTrue(torch.tensor([39, 146], dtype=torch.uint8).cuda().allclose(shard_2))
+        unpacked = unpack([shard_4, shard_2], 6)
+        self.assertTrue(unpacked.allclose(test_tensor))

-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
-@pytest.mark.skipif(not has_triton(), reason="unsupported without triton")
-@pytest.mark.parametrize("bit_width", bit_widths)
-@pytest.mark.parametrize("dim", dimensions)
-def test_compile(bit_width, dim):
-    torch._dynamo.config.specialize_int = True
-    torch.compile(pack, fullgraph=True)
-    torch.compile(unpack, fullgraph=True)
-    test_tensor = torch.randint(0, 2**bit_width, (32, 32, 32), dtype=torch.uint8).cuda()
-    packed = pack(test_tensor, bit_width, dim=dim)
-    unpacked = unpack(packed, bit_width, dim=dim)
-    assert unpacked.allclose(test_tensor)
+    def test_pack_example_CPU(self):
+        test_tensor = torch.tensor(
+            [0x30, 0x29, 0x17, 0x5, 0x20, 0x16, 0x9, 0x22], dtype=torch.uint8
+        )
+        shard_4, shard_2 = pack(test_tensor, 6)
+        print(shard_4, shard_2)
+        self.assertTrue(torch.tensor([0, 105, 151, 37], dtype=torch.uint8).allclose(shard_4))
+        self.assertTrue(torch.tensor([39, 146], dtype=torch.uint8).allclose(shard_2))
+        unpacked = unpack([shard_4, shard_2], 6)
+        self.assertTrue(unpacked.allclose(test_tensor))

-# these test cases are for the example pack walk through in the bitpacking.py file
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
-def test_pack_example():
-    test_tensor = torch.tensor(
-        [0x30, 0x29, 0x17, 0x5, 0x20, 0x16, 0x9, 0x22], dtype=torch.uint8
-    ).cuda()
-    shard_4, shard_2 = pack(test_tensor, 6)
-    print(shard_4, shard_2)
-    assert torch.tensor([0, 105, 151, 37], dtype=torch.uint8).cuda().allclose(shard_4)
-    assert torch.tensor([39, 146], dtype=torch.uint8).cuda().allclose(shard_2)
-    unpacked = unpack([shard_4, shard_2], 6)
-    assert unpacked.allclose(test_tensor)
+common_utils.instantiate_parametrized_tests(TestBitpacking)

-
-def test_pack_example_CPU():
-    test_tensor = torch.tensor(
-        [0x30, 0x29, 0x17, 0x5, 0x20, 0x16, 0x9, 0x22], dtype=torch.uint8
-    )
-    shard_4, shard_2 = pack(test_tensor, 6)
-    print(shard_4, shard_2)
-    assert torch.tensor([0, 105, 151, 37], dtype=torch.uint8).allclose(shard_4)
-    assert torch.tensor([39, 146], dtype=torch.uint8).allclose(shard_2)
-    unpacked = unpack([shard_4, shard_2], 6)
-    assert unpacked.allclose(test_tensor)
+if __name__ == "__main__":
+    unittest.main()

From e9c369ba3b97d8a221109c9ce6a4c2e7a45e3487 Mon Sep 17 00:00:00 2001
From: V-E-D
Date: Sun, 25 May 2025 11:08:20 +0530
Subject: [PATCH 3/3] precommit

---
 test/dtypes/test_affine_quantized_float.py |  5 ++--
 test/dtypes/test_bitpacking.py             | 33 ++++++++++++++--------
 2 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/test/dtypes/test_affine_quantized_float.py b/test/dtypes/test_affine_quantized_float.py
index 7fa14be88c..b9488a0617 100644
--- a/test/dtypes/test_affine_quantized_float.py
+++ b/test/dtypes/test_affine_quantized_float.py
@@ -15,7 +15,6 @@
 import copy
 import io
 import random
-from contextlib import nullcontext
 from functools import partial
 from typing import Tuple

@@ -137,7 +136,9 @@ def _run_fp8_linear_variant(self, dtype, mode, compile, sizes, granularity):
         output_quantized = quantized_model(input_tensor)

         error = compute_error(output_original, output_quantized)
-        self.assertGreater(error, 20, f"Quantization error is too high, got a SQNR of {error}")
+        self.assertGreater(
+            error, 20, f"Quantization error is too high, got a SQNR of {error}"
+        )

     @unittest.skipIf(
         not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9"
     )
diff --git a/test/dtypes/test_bitpacking.py b/test/dtypes/test_bitpacking.py
index 80d3245401..e91ad627d2 100644
--- a/test/dtypes/test_bitpacking.py
+++ b/test/dtypes/test_bitpacking.py
@@ -4,9 +4,10 @@
 # This source code is licensed under the BSD 3-Clause license found in the
 # LICENSE file in the root directory of this source tree.
 import unittest
+
 import torch
-from torch.utils._triton import has_triton
 from torch.testing._internal import common_utils
+from torch.utils._triton import has_triton

 from torchao.dtypes.uintx.bitpacking import pack, pack_cpu, unpack, unpack_cpu

@@ -21,13 +22,13 @@ def setUp(self):
         torch.manual_seed(0)
         if torch.cuda.is_available():
             torch.cuda.manual_seed(0)
-        
+
         # Initialize any common test data
         self.test_shape = (32, 32, 32)
-        
+
         # Set default device
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        
+
         # Reset any cached state
         torch._dynamo.reset()

@@ -35,11 +36,11 @@ def tearDown(self):
         """Clean up after each test method."""
         # Reset torch._dynamo cache
         torch._dynamo.reset()
-        
+
         # Clear CUDA cache if available
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
-        
+
         # Reset any other state if needed
         torch._dynamo.config.specialize_int = False

@@ -57,7 +58,9 @@ def test_CPU(self, bit_width, dim):
     @common_utils.parametrize("bit_width", bit_widths)
     @common_utils.parametrize("dim", dimensions)
     def test_GPU(self, bit_width, dim):
-        test_tensor = torch.randint(0, 2**bit_width, (32, 32, 32), dtype=torch.uint8).cuda()
+        test_tensor = torch.randint(
+            0, 2**bit_width, (32, 32, 32), dtype=torch.uint8
+        ).cuda()
         packed = pack(test_tensor, bit_width, dim=dim)
         unpacked = unpack(packed, bit_width, dim=dim)
         self.assertTrue(unpacked.allclose(test_tensor))
@@ -70,7 +73,9 @@ def test_compile(self, bit_width, dim):
         torch._dynamo.config.specialize_int = True
         torch.compile(pack, fullgraph=True)
         torch.compile(unpack, fullgraph=True)
-        test_tensor = torch.randint(0, 2**bit_width, (32, 32, 32), dtype=torch.uint8).cuda()
+        test_tensor = torch.randint(
+            0, 2**bit_width, (32, 32, 32), dtype=torch.uint8
+        ).cuda()
         packed = pack(test_tensor, bit_width, dim=dim)
         unpacked = unpack(packed, bit_width, dim=dim)
         self.assertTrue(unpacked.allclose(test_tensor))
@@ -82,8 +87,12 @@ def test_pack_example(self):
         ).cuda()
         shard_4, shard_2 = pack(test_tensor, 6)
         print(shard_4, shard_2)
-        self.assertTrue(torch.tensor([0, 105, 151, 37], dtype=torch.uint8).cuda().allclose(shard_4))
-        self.assertTrue(torch.tensor([39, 146], dtype=torch.uint8).cuda().allclose(shard_2))
+        self.assertTrue(
+            torch.tensor([0, 105, 151, 37], dtype=torch.uint8).cuda().allclose(shard_4)
+        )
+        self.assertTrue(
+            torch.tensor([39, 146], dtype=torch.uint8).cuda().allclose(shard_2)
+        )
         unpacked = unpack([shard_4, shard_2], 6)
         self.assertTrue(unpacked.allclose(test_tensor))

@@ -93,7 +102,9 @@ def test_pack_example_CPU(self):
         )
         shard_4, shard_2 = pack(test_tensor, 6)
         print(shard_4, shard_2)
-        self.assertTrue(torch.tensor([0, 105, 151, 37], dtype=torch.uint8).allclose(shard_4))
+        self.assertTrue(
+            torch.tensor([0, 105, 151, 37], dtype=torch.uint8).allclose(shard_4)
+        )
         self.assertTrue(torch.tensor([39, 146], dtype=torch.uint8).allclose(shard_2))
         unpacked = unpack([shard_4, shard_2], 6)
         self.assertTrue(unpacked.allclose(test_tensor))