From 990ef89e6b5d4a785011787c938077f1310178ae Mon Sep 17 00:00:00 2001
From: vasiliy
Date: Wed, 29 Oct 2025 04:05:16 -0700
Subject: [PATCH 1/3] Update

[ghstack-poisoned]
---
 .../quantize_/workflows/float8/test_float8_tensor.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test/quantization/quantize_/workflows/float8/test_float8_tensor.py b/test/quantization/quantize_/workflows/float8/test_float8_tensor.py
index 786e0cf59f..010682474e 100644
--- a/test/quantization/quantize_/workflows/float8/test_float8_tensor.py
+++ b/test/quantization/quantize_/workflows/float8/test_float8_tensor.py
@@ -294,6 +294,7 @@ def test_slice_and_copy_similar_to_vllm(self, granularity):
         self._test_slice_and_copy_similar_to_vllm(config)
 
     @unittest.skipIf(not is_sm_at_least_90(), "Nedd sm90+")
+    @unittest.skipIf(not _is_fbgemm_gpu_genai_available(), "Need fbgemm_gpu_genai")
     def test_bmm(self):
         # only support per row quantization
         config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
@@ -406,6 +407,7 @@ def test_cat(self, granularity, sizes):
         self.assertEqual(cat_qweight2.scale, ref_scale)
 
     @unittest.skipIf(not is_sm_at_least_90(), "Nedd sm90+")
+    @unittest.skipIf(not _is_fbgemm_gpu_genai_available(), "Need fbgemm_gpu_genai")
     def test_moe_weight_reshape_ops(self):
         # only per row quantization is supported for bmm
         granularity = PerRow()
@@ -416,6 +418,7 @@ def test_moe_weight_reshape_ops(self):
     # that should be moved here after v1 config is deprecated:
     # https://github.com/pytorch/ao/issues/2649
     @unittest.skipIf(not is_sm_at_least_90(), "Nedd sm90+")
+    @unittest.skipIf(not _is_fbgemm_gpu_genai_available(), "Need fbgemm_gpu_genai")
     def test_expected_gpu_kernel_fbgemm(self):
         """Making sure KernelPreference.FBGEMM calls correct quantize and gemm kernels
         and the bias add happens in the gemm kernel for per row quantization

From cce08f0a71883504f526c7f75594b334e325b1d6 Mon Sep 17 00:00:00 2001
From: vasiliy
Date: Wed, 29 Oct 2025 04:05:20 -0700
Subject: [PATCH 2/3] Update

[ghstack-poisoned]
---
 benchmarks/benchmark_blockwise_scaled_linear_triton.py    | 2 +-
 test/{prototype => kernel}/test_blockwise_triton.py       | 2 +-
 .../blockwise_quantization.py                             | 0
 torchao/prototype/blockwise_fp8_inference/__init__.py     | 5 +++--
 .../prototype/blockwise_fp8_inference/blockwise_linear.py | 2 +-
 5 files changed, 6 insertions(+), 5 deletions(-)
 rename test/{prototype => kernel}/test_blockwise_triton.py (96%)
 rename torchao/{prototype/blockwise_fp8_inference => kernel}/blockwise_quantization.py (100%)

diff --git a/benchmarks/benchmark_blockwise_scaled_linear_triton.py b/benchmarks/benchmark_blockwise_scaled_linear_triton.py
index ffdd63ec8d..26ba04f2ce 100644
--- a/benchmarks/benchmark_blockwise_scaled_linear_triton.py
+++ b/benchmarks/benchmark_blockwise_scaled_linear_triton.py
@@ -13,7 +13,7 @@
     from triton.testing import do_bench
 
     from torchao.float8.float8_utils import compute_error
-    from torchao.prototype.blockwise_fp8_inference.blockwise_quantization import (
+    from torchao.kernel.blockwise_quantization import (
         blockwise_fp8_gemm,
         fp8_blockwise_act_quant,
         fp8_blockwise_weight_quant,
diff --git a/test/prototype/test_blockwise_triton.py b/test/kernel/test_blockwise_triton.py
similarity index 96%
rename from test/prototype/test_blockwise_triton.py
rename to test/kernel/test_blockwise_triton.py
index 89f8cf869e..5de88ab7d9 100644
--- a/test/prototype/test_blockwise_triton.py
+++ b/test/kernel/test_blockwise_triton.py
@@ -11,7 +11,7 @@
 
 triton = pytest.importorskip("triton", reason="Triton required to run this test")
 
-from torchao.prototype.blockwise_fp8_inference.blockwise_quantization import (
+from torchao.kernel.blockwise_quantization import (
     blockwise_fp8_gemm,
     fp8_blockwise_act_quant,
     fp8_blockwise_weight_dequant,
diff --git a/torchao/prototype/blockwise_fp8_inference/blockwise_quantization.py b/torchao/kernel/blockwise_quantization.py
similarity index 100%
rename from torchao/prototype/blockwise_fp8_inference/blockwise_quantization.py
rename to torchao/kernel/blockwise_quantization.py
diff --git a/torchao/prototype/blockwise_fp8_inference/__init__.py b/torchao/prototype/blockwise_fp8_inference/__init__.py
index f2842417e4..eb6b7824bc 100644
--- a/torchao/prototype/blockwise_fp8_inference/__init__.py
+++ b/torchao/prototype/blockwise_fp8_inference/__init__.py
@@ -1,11 +1,12 @@
-from .blockwise_linear import BlockwiseQuantLinear
-from .blockwise_quantization import (
+from torchao.kernel.blockwise_quantization import (
     blockwise_fp8_gemm,
     fp8_blockwise_act_quant,
     fp8_blockwise_weight_dequant,
     fp8_blockwise_weight_quant,
 )
 
+from .blockwise_linear import BlockwiseQuantLinear
+
 __all__ = [
     "blockwise_fp8_gemm",
     "BlockwiseQuantLinear",
diff --git a/torchao/prototype/blockwise_fp8_inference/blockwise_linear.py b/torchao/prototype/blockwise_fp8_inference/blockwise_linear.py
index ebed3a84a4..a43574fa11 100644
--- a/torchao/prototype/blockwise_fp8_inference/blockwise_linear.py
+++ b/torchao/prototype/blockwise_fp8_inference/blockwise_linear.py
@@ -7,7 +7,7 @@
 import torch
 from torch import nn
 
-from torchao.prototype.blockwise_fp8_inference.blockwise_quantization import (
+from torchao.kernel.blockwise_quantization import (
     blockwise_fp8_gemm,
     fp8_blockwise_act_quant,
 )

From f76e10b9de0cd139feb6deb587b2484b960716d6 Mon Sep 17 00:00:00 2001
From: vasiliy
Date: Wed, 29 Oct 2025 07:07:09 -0700
Subject: [PATCH 3/3] Update

[ghstack-poisoned]
---
 .github/workflows/1xL4_tests.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/1xL4_tests.yml b/.github/workflows/1xL4_tests.yml
index 58980d8504..7a1c293074 100644
--- a/.github/workflows/1xL4_tests.yml
+++ b/.github/workflows/1xL4_tests.yml
@@ -51,3 +51,4 @@ jobs:
           pytest test/dtypes/test_affine_quantized_float.py --verbose -s
           ./test/float8/test_everything_single_gpu.sh
           python test/quantization/quantize_/workflows/float8/test_float8_tensor.py
+          python test/kernel/test_blockwise_triton.py --verbose -s
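
A quick way to sanity-check patch 2/3 locally: after the move, the blockwise kernels import from torchao.kernel instead of torchao.prototype.blockwise_fp8_inference. The sketch below exercises the relocated entry points; only the import path and function names come from the diff. The block_size argument, the (tensor, scale) return pairs, and the shape conventions are assumptions about the existing kernels, so treat this as illustrative, not authoritative.

# smoke_test_blockwise_move.py -- illustrative sketch, not part of the patch.
# Requires a CUDA device with Triton installed. Signatures are assumed from
# the imported names in the diff (see note above), not confirmed by it.
import torch

from torchao.kernel.blockwise_quantization import (
    blockwise_fp8_gemm,
    fp8_blockwise_act_quant,
    fp8_blockwise_weight_quant,
)

# Dimensions chosen to be divisible by the block size.
M, K, N, block_size = 256, 512, 384, 128
a = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
w = torch.randn(N, K, device="cuda", dtype=torch.bfloat16)

# Quantize the activation and the weight to fp8 with per-block scales.
a_fp8, a_scale = fp8_blockwise_act_quant(a, block_size)
w_fp8, w_scale = fp8_blockwise_weight_quant(w, block_size)

# Blockwise-scaled GEMM; the result should approximate a @ w.t().
out = blockwise_fp8_gemm(a_fp8, a_scale, w_fp8, w_scale, block_size)
print(out.shape)  # expected: torch.Size([256, 384])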