From 990ef89e6b5d4a785011787c938077f1310178ae Mon Sep 17 00:00:00 2001
From: vasiliy
Date: Wed, 29 Oct 2025 04:05:16 -0700
Subject: [PATCH 1/3] Update

[ghstack-poisoned]
---
 .../quantize_/workflows/float8/test_float8_tensor.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test/quantization/quantize_/workflows/float8/test_float8_tensor.py b/test/quantization/quantize_/workflows/float8/test_float8_tensor.py
index 786e0cf59f..010682474e 100644
--- a/test/quantization/quantize_/workflows/float8/test_float8_tensor.py
+++ b/test/quantization/quantize_/workflows/float8/test_float8_tensor.py
@@ -294,6 +294,7 @@ def test_slice_and_copy_similar_to_vllm(self, granularity):
         self._test_slice_and_copy_similar_to_vllm(config)
 
     @unittest.skipIf(not is_sm_at_least_90(), "Nedd sm90+")
+    @unittest.skipIf(not _is_fbgemm_gpu_genai_available(), "Need fbgemm_gpu_genai")
     def test_bmm(self):
         # only support per row quantization
         config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
@@ -406,6 +407,7 @@ def test_cat(self, granularity, sizes):
         self.assertEqual(cat_qweight2.scale, ref_scale)
 
     @unittest.skipIf(not is_sm_at_least_90(), "Nedd sm90+")
+    @unittest.skipIf(not _is_fbgemm_gpu_genai_available(), "Need fbgemm_gpu_genai")
     def test_moe_weight_reshape_ops(self):
         # only per row quantization is supported for bmm
         granularity = PerRow()
@@ -416,6 +418,7 @@ def test_moe_weight_reshape_ops(self):
     # that should be moved here after v1 config is deprecated:
     # https://github.com/pytorch/ao/issues/2649
     @unittest.skipIf(not is_sm_at_least_90(), "Nedd sm90+")
+    @unittest.skipIf(not _is_fbgemm_gpu_genai_available(), "Need fbgemm_gpu_genai")
     def test_expected_gpu_kernel_fbgemm(self):
         """Making sure KernelPreference.FBGEMM calls correct quantize and gemm kernels
         and the bias add happens in the gemm kernel for per row quantization

From cce08f0a71883504f526c7f75594b334e325b1d6 Mon Sep 17 00:00:00 2001
From: vasiliy
Date: Wed, 29 Oct 2025 04:05:20 -0700
Subject: [PATCH 2/3] Update

[ghstack-poisoned]
---
 benchmarks/benchmark_blockwise_scaled_linear_triton.py    | 2 +-
 test/{prototype => kernel}/test_blockwise_triton.py       | 2 +-
 .../blockwise_quantization.py                             | 0
 torchao/prototype/blockwise_fp8_inference/__init__.py     | 5 +++--
 .../prototype/blockwise_fp8_inference/blockwise_linear.py | 2 +-
 5 files changed, 6 insertions(+), 5 deletions(-)
 rename test/{prototype => kernel}/test_blockwise_triton.py (96%)
 rename torchao/{prototype/blockwise_fp8_inference => kernel}/blockwise_quantization.py (100%)

diff --git a/benchmarks/benchmark_blockwise_scaled_linear_triton.py b/benchmarks/benchmark_blockwise_scaled_linear_triton.py
index ffdd63ec8d..26ba04f2ce 100644
--- a/benchmarks/benchmark_blockwise_scaled_linear_triton.py
+++ b/benchmarks/benchmark_blockwise_scaled_linear_triton.py
@@ -13,7 +13,7 @@
     from triton.testing import do_bench
 
     from torchao.float8.float8_utils import compute_error
-    from torchao.prototype.blockwise_fp8_inference.blockwise_quantization import (
+    from torchao.kernel.blockwise_quantization import (
         blockwise_fp8_gemm,
         fp8_blockwise_act_quant,
         fp8_blockwise_weight_quant,
diff --git a/test/prototype/test_blockwise_triton.py b/test/kernel/test_blockwise_triton.py
similarity index 96%
rename from test/prototype/test_blockwise_triton.py
rename to test/kernel/test_blockwise_triton.py
index 89f8cf869e..5de88ab7d9 100644
--- a/test/prototype/test_blockwise_triton.py
+++ b/test/kernel/test_blockwise_triton.py
@@ -11,7 +11,7 @@
 
 triton = pytest.importorskip("triton", reason="Triton required to run this test")
 
-from torchao.prototype.blockwise_fp8_inference.blockwise_quantization import (
+from torchao.kernel.blockwise_quantization import (
     blockwise_fp8_gemm,
     fp8_blockwise_act_quant,
     fp8_blockwise_weight_dequant,
diff --git a/torchao/prototype/blockwise_fp8_inference/blockwise_quantization.py b/torchao/kernel/blockwise_quantization.py
similarity index 100%
rename from torchao/prototype/blockwise_fp8_inference/blockwise_quantization.py
rename to torchao/kernel/blockwise_quantization.py
diff --git a/torchao/prototype/blockwise_fp8_inference/__init__.py b/torchao/prototype/blockwise_fp8_inference/__init__.py
index f2842417e4..eb6b7824bc 100644
--- a/torchao/prototype/blockwise_fp8_inference/__init__.py
+++ b/torchao/prototype/blockwise_fp8_inference/__init__.py
@@ -1,11 +1,12 @@
-from .blockwise_linear import BlockwiseQuantLinear
-from .blockwise_quantization import (
+from torchao.kernel.blockwise_quantization import (
     blockwise_fp8_gemm,
     fp8_blockwise_act_quant,
     fp8_blockwise_weight_dequant,
     fp8_blockwise_weight_quant,
 )
 
+from .blockwise_linear import BlockwiseQuantLinear
+
 __all__ = [
     "blockwise_fp8_gemm",
     "BlockwiseQuantLinear",
diff --git a/torchao/prototype/blockwise_fp8_inference/blockwise_linear.py b/torchao/prototype/blockwise_fp8_inference/blockwise_linear.py
index ebed3a84a4..a43574fa11 100644
--- a/torchao/prototype/blockwise_fp8_inference/blockwise_linear.py
+++ b/torchao/prototype/blockwise_fp8_inference/blockwise_linear.py
@@ -7,7 +7,7 @@
 import torch
 from torch import nn
 
-from torchao.prototype.blockwise_fp8_inference.blockwise_quantization import (
+from torchao.kernel.blockwise_quantization import (
     blockwise_fp8_gemm,
     fp8_blockwise_act_quant,
 )

From f76e10b9de0cd139feb6deb587b2484b960716d6 Mon Sep 17 00:00:00 2001
From: vasiliy
Date: Wed, 29 Oct 2025 07:07:09 -0700
Subject: [PATCH 3/3] Update

[ghstack-poisoned]
---
 .github/workflows/1xL4_tests.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/1xL4_tests.yml b/.github/workflows/1xL4_tests.yml
index 58980d8504..7a1c293074 100644
--- a/.github/workflows/1xL4_tests.yml
+++ b/.github/workflows/1xL4_tests.yml
@@ -51,3 +51,4 @@ jobs:
           pytest test/dtypes/test_affine_quantized_float.py --verbose -s
           ./test/float8/test_everything_single_gpu.sh
           python test/quantization/quantize_/workflows/float8/test_float8_tensor.py
+          python test/kernel/test_blockwise_triton.py --verbose -s
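
A quick way to sanity-check patch 2/3 locally: after the move, the blockwise kernels import from torchao.kernel instead of torchao.prototype.blockwise_fp8_inference. The sketch below exercises the relocated entry points; only the import path and function names come from the diff. The block_size argument, the (tensor, scale) return pairs, and the shape conventions are assumptions about the existing kernels, so treat this as illustrative, not authoritative.

# smoke_test_blockwise_move.py -- illustrative sketch, not part of the patch.
# Requires a CUDA device with Triton installed. Signatures are assumed from
# the imported names in the diff (see note above), not confirmed by it.
import torch

from torchao.kernel.blockwise_quantization import (
    blockwise_fp8_gemm,
    fp8_blockwise_act_quant,
    fp8_blockwise_weight_quant,
)

# Dimensions chosen to be divisible by the block size.
M, K, N, block_size = 256, 512, 384, 128
a = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
w = torch.randn(N, K, device="cuda", dtype=torch.bfloat16)

# Quantize the activation and the weight to fp8 with per-block scales.
a_fp8, a_scale = fp8_blockwise_act_quant(a, block_size)
w_fp8, w_scale = fp8_blockwise_weight_quant(w, block_size)

# Blockwise-scaled GEMM; the result should approximate a @ w.t().
out = blockwise_fp8_gemm(a_fp8, a_scale, w_fp8, w_scale, block_size)
print(out.shape)  # expected: torch.Size([256, 384])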