From 70461d9404411af0990738079f1c48eff3a27ad0 Mon Sep 17 00:00:00 2001 From: andrewor14 Date: Tue, 23 Sep 2025 13:32:00 -0700 Subject: [PATCH 01/16] Support NVFP4 dynamic per tensor scale **Summary:** This commit adds an option for the existing `NVFP4InferenceConfig` to dynamically compute an appropriate fp32 per tensor scale to support the two level scaling according to the NVFP4 specification: https://developer.nvidia.com/blog/introducing-nvfp4-for-efficient-and-accurate-low-precision-inference/. While two level scaling is supported in `NVFP4Tensor`, today there is no config API for users to call this. The existing `NVFP4InferenceConfig` only supports single level scaling because including an explicit `per_tensor_scale` field would make serialization tricky. In the future, we should add an end-to-end calibration flow so users can compute an appropriate per tensor scale for the activations first, and then pass this to `NVFP4Tensor` as a static scale, similar to the proposal in https://github.com/pytorch/ao/issues/2572. **Test Plan:** ``` pytest test/prototype/mx_formats/test_inference_workflow.py -k test_inference_workflow_nvfp4 pytest test/quantization/test_qat.py -k test_quantize_api_nvfp4 ``` Also did a quick benchmark before and after: ``` import copy import time import torch from torchao.quantization import quantize_ from torchao.prototype.mx_formats import NVFP4InferenceConfig m_mx1 = torch.nn.Linear(64, 256, bias=True, dtype=torch.bfloat16, device="cuda") m_mx2 = copy.deepcopy(m_mx1) config1 = NVFP4InferenceConfig(use_dynamic_per_tensor_scale=False) config2 = NVFP4InferenceConfig(use_dynamic_per_tensor_scale=True) quantize_(m_mx1, config=config1) quantize_(m_mx2, config=config2) m_mx1 = torch.compile(m_mx1, fullgraph=True, backend="aot_eager") m_mx2 = torch.compile(m_mx2, fullgraph=True, backend="aot_eager") start = time.time() for _ in range(1000): m_mx1(torch.randn(128, 64, device="cuda", dtype=torch.bfloat16)) print("No per_tensor_scale = ", time.time() - start, "seconds") start = time.time() for _ in range(1000): m_mx2(torch.randn(128, 64, device="cuda", dtype=torch.bfloat16)) print("With per_tensor_scale = ", time.time() - start, "seconds") ``` On a single B200: ``` No per_tensor_scale = 1.2855589389801025 seconds With per_tensor_scale = 1.3009123802185059 seconds ``` [ghstack-poisoned] --- .../mx_formats/test_inference_workflow.py | 6 ++++- test/quantization/test_qat.py | 7 +++--- .../mx_formats/inference_workflow.py | 15 +++++++++-- torchao/prototype/mx_formats/nvfp4_tensor.py | 25 ++++++++++++++++--- .../quantization/qat/fake_quantize_config.py | 13 +++++----- 5 files changed, 49 insertions(+), 17 deletions(-) diff --git a/test/prototype/mx_formats/test_inference_workflow.py b/test/prototype/mx_formats/test_inference_workflow.py index 988a879b5b..90dc2700ce 100644 --- a/test/prototype/mx_formats/test_inference_workflow.py +++ b/test/prototype/mx_formats/test_inference_workflow.py @@ -105,6 +105,7 @@ def test_inference_workflow_mx(elem_dtype, bias: bool, compile: bool): ) @pytest.mark.parametrize("inpt_dtype", [torch.bfloat16, torch.float32]) @pytest.mark.parametrize("use_triton_kernel", [True, False]) +@pytest.mark.parametrize("use_dynamic_per_tensor_scale", [True, False]) @pytest.mark.parametrize( "shapes", [ @@ -126,6 +127,7 @@ def test_inference_workflow_nvfp4( mm_config: NVFP4MMConfig, inpt_dtype: torch.dtype, use_triton_kernel: bool, + use_dynamic_per_tensor_scale: bool, shapes: tuple, ): """ @@ -147,7 +149,9 @@ def test_inference_workflow_nvfp4( m_mx = 
copy.deepcopy(m) config = NVFP4InferenceConfig( - mm_config=mm_config, use_triton_kernel=use_triton_kernel + mm_config=mm_config, + use_triton_kernel=use_triton_kernel, + use_dynamic_per_tensor_scale=use_dynamic_per_tensor_scale, ) quantize_(m_mx, config=config) diff --git a/test/quantization/test_qat.py b/test/quantization/test_qat.py index a6ef09e6e8..fd4939232f 100644 --- a/test/quantization/test_qat.py +++ b/test/quantization/test_qat.py @@ -2077,7 +2077,8 @@ def test_infer_int4_weight_only_config(self): self.assertEqual(weight_config.activation_dtype, torch.bfloat16) @unittest.skipIf(not is_sm_at_least_89(), "Need sm89+") - def test_quantize_api_nvfp4(self): + @parametrize("use_per_tensor_scale", [True, False]) + def test_quantize_api_nvfp4(self, use_per_tensor_scale: bool): """ Test the following: quantize_(model, QATConfig(NVFP4InferenceConfig(), step="prepare")) @@ -2086,8 +2087,8 @@ def test_quantize_api_nvfp4(self): from torchao.prototype.mx_formats import NVFP4InferenceConfig self._test_quantize_api_against_ptq( - NVFP4InferenceConfig(), - target_prepare_sqnr=8, + NVFP4InferenceConfig(use_dynamic_per_tensor_scale=use_per_tensor_scale), + target_prepare_sqnr=12, target_convert_sqnr=float("inf"), ) diff --git a/torchao/prototype/mx_formats/inference_workflow.py b/torchao/prototype/mx_formats/inference_workflow.py index 34cf9e9506..39f0725390 100644 --- a/torchao/prototype/mx_formats/inference_workflow.py +++ b/torchao/prototype/mx_formats/inference_workflow.py @@ -22,6 +22,7 @@ NVFP4MMConfig, NVFP4Tensor, QuantizeTensorToNVFP4Kwargs, + per_tensor_amax_to_scale, ) from torchao.quantization.transform_module import ( register_quantize_module_handler, @@ -134,7 +135,8 @@ class NVFP4InferenceConfig(AOBaseConfig): This is a specialized configuration for NVIDIA's FP4 format. Configuration parameters: - mm_config: NVFP4MMConfig, which can be set to DYNAMIC or WEIGHT_ONLY (emulated mm in high precision) - - use_triton_kernel: bool, whether to use fused triton kernel for activation scaling (default: False) + - use_triton_kernel: bool, whether to use fused triton kernel for activation scaling (default: True) + - use_dynamic_per_tensor_scale: bool, whether to dynamically compute per tensor scale (default: True) - Data: float4_e2m1fn_x2 - Scales: float8_e4m3fn - Block size: 16 along the reduction dim @@ -145,6 +147,7 @@ class NVFP4InferenceConfig(AOBaseConfig): mm_config: NVFP4MMConfig = NVFP4MMConfig.DYNAMIC use_triton_kernel: bool = True + use_dynamic_per_tensor_scale: bool = True def __post_init__(self): # Validate PyTorch version @@ -175,12 +178,20 @@ def _nvfp4_inference_linear_transform( "Please use bfloat16 or float16 weights, or remove the bias from the linear layer." 
) + per_tensor_scale = None + if config.use_dynamic_per_tensor_scale: + tensor_amax = torch.max(torch.abs(weight)) + per_tensor_scale = per_tensor_amax_to_scale(tensor_amax) + act_quant_kwargs = None if config.mm_config == NVFP4MMConfig.DYNAMIC: - act_quant_kwargs = QuantizeTensorToNVFP4Kwargs() + act_quant_kwargs = QuantizeTensorToNVFP4Kwargs( + use_dynamic_per_tensor_scale=config.use_dynamic_per_tensor_scale, + ) quantized_weight = NVFP4Tensor.to_nvfp4( weight, + per_tensor_scale=per_tensor_scale, is_swizzled_scales=True, use_triton_kernel=False, # Always use traditional construction for weights act_quant_kwargs=act_quant_kwargs, diff --git a/torchao/prototype/mx_formats/nvfp4_tensor.py b/torchao/prototype/mx_formats/nvfp4_tensor.py index 3f2e8eeef3..1eb12dee90 100644 --- a/torchao/prototype/mx_formats/nvfp4_tensor.py +++ b/torchao/prototype/mx_formats/nvfp4_tensor.py @@ -47,6 +47,7 @@ class QuantizeTensorToNVFP4Kwargs(QuantizeTensorKwargs): block_size: int = 16 is_swizzled_scales: bool = False use_triton_kernel: bool = False + use_dynamic_per_tensor_scale: bool = False # TODO(future PR): move over to TorchAOBaseTensor's dispatch @@ -245,7 +246,7 @@ def get_hp_scales(self) -> torch.Tensor: return ( scale_e4m3.to(self._orig_dtype) - if not self._per_tensor_scale + if self._per_tensor_scale is None else self._per_tensor_scale * scale_e4m3.to(self._orig_dtype) ) @@ -645,10 +646,15 @@ def nvfp4_linear(func, types, args, kwargs): else: # dynamic quant k = weight_tensor.act_quant_kwargs + if k.use_dynamic_per_tensor_scale: + tensor_amax = torch.max(torch.abs(input_tensor)) + per_tensor_scale = per_tensor_amax_to_scale(tensor_amax) + else: + per_tensor_scale = weight_tensor._act_per_tensor_scale input_tensor = NVFP4Tensor.to_nvfp4( input_tensor, block_size=k.block_size, - per_tensor_scale=weight_tensor._act_per_tensor_scale, + per_tensor_scale=per_tensor_scale, is_swizzled_scales=k.is_swizzled_scales, use_triton_kernel=k.use_triton_kernel, ) @@ -672,10 +678,15 @@ def nvfp4_mm(func, types, args, kwargs): else: if not isinstance(input_tensor, NVFP4Tensor): k = weight_tensor.act_quant_kwargs + if k.use_dynamic_per_tensor_scale: + tensor_amax = torch.max(torch.abs(input_tensor)) + per_tensor_scale = per_tensor_amax_to_scale(tensor_amax) + else: + per_tensor_scale = weight_tensor._act_per_tensor_scale input_tensor = NVFP4Tensor.to_nvfp4( input_tensor, block_size=k.block_size, - per_tensor_scale=weight_tensor._act_per_tensor_scale, + per_tensor_scale=per_tensor_scale, is_swizzled_scales=k.is_swizzled_scales, use_triton_kernel=k.use_triton_kernel, ) @@ -697,12 +708,18 @@ def nvfp4_addmm(func, types, args, kwargs): else: return torch.addmm(bias, input_tensor, weight_dequant) else: + # TODO: refactor duplicate code if not isinstance(input_tensor, NVFP4Tensor): k = weight_tensor.act_quant_kwargs + if k.use_dynamic_per_tensor_scale: + tensor_amax = torch.max(torch.abs(input_tensor)) + per_tensor_scale = per_tensor_amax_to_scale(tensor_amax) + else: + per_tensor_scale = weight_tensor._act_per_tensor_scale input_tensor = NVFP4Tensor.to_nvfp4( input_tensor, block_size=k.block_size, - per_tensor_scale=weight_tensor._act_per_tensor_scale, + per_tensor_scale=per_tensor_scale, is_swizzled_scales=k.is_swizzled_scales, use_triton_kernel=k.use_triton_kernel, ) diff --git a/torchao/quantization/qat/fake_quantize_config.py b/torchao/quantization/qat/fake_quantize_config.py index ebc9864f3d..336a419af5 100644 --- a/torchao/quantization/qat/fake_quantize_config.py +++ b/torchao/quantization/qat/fake_quantize_config.py 
@@ -442,16 +442,15 @@ def _infer_fake_quantize_configs( activation_dtype=e4m3_dtype, ) elif isinstance(base_config, NVFP4InferenceConfig): - # Note: today the PTQ config does not allow the user to specify - # `per_tensor_scales` due to serialization concerns. In the future - # we may add a way to compute these dynamically (for activations), - # but for now QAT will mimic the existing behavior of not having - # `per_tensor_scales` (subject to change) if NVFP4MMConfig.DYNAMIC: - act_config = NVFP4FakeQuantizeConfig(False) + act_config = NVFP4FakeQuantizeConfig( + use_per_tensor_scale=base_config.use_dynamic_per_tensor_scale + ) else: act_config = None - weight_config = NVFP4FakeQuantizeConfig(False) + weight_config = NVFP4FakeQuantizeConfig( + use_per_tensor_scale=base_config.use_dynamic_per_tensor_scale + ) elif isinstance(base_config, Int8DynamicActivationIntxWeightConfig): assert base_config.version >= 2, "Only version 2+ is supported" assert base_config.intx_packing_format == "unpacked_to_int8", ( From cce3b22cbb2175d4632f8ba0aa08c00c25bdc688 Mon Sep 17 00:00:00 2001 From: andrewor14 Date: Tue, 23 Sep 2025 13:32:04 -0700 Subject: [PATCH 02/16] Improve QAT nvfp4 numerics **Summary:** Similar to https://github.com/pytorch/ao/pull/2986, this commit improves the prepare vs convert SQNR of NVFP4 QAT from 12 to 36 with `use_per_tensor_scale`, and 12 to inf without. Details TBD. **Test Plan:** ``` python test/quantization/test_qat.py -k test_qat_nvfp4 python test/quantization/test_qat.py -k test_quantize_api_nvfp4 ``` End-to-end tests TBD. [ghstack-poisoned] --- test/quantization/test_qat.py | 10 +++++-- torchao/prototype/custom_fp_utils.py | 30 ++++++++++++++------ torchao/prototype/mx_formats/kernels.py | 8 +++--- torchao/prototype/mx_formats/nvfp4_tensor.py | 3 +- torchao/prototype/qat/nvfp4.py | 7 ++++- 5 files changed, 40 insertions(+), 18 deletions(-) diff --git a/test/quantization/test_qat.py b/test/quantization/test_qat.py index fd4939232f..c9a93b26c0 100644 --- a/test/quantization/test_qat.py +++ b/test/quantization/test_qat.py @@ -1910,7 +1910,6 @@ def _test_quantize_api_against_ptq( quantize_(m, QATConfig(base_config, step="prepare"), filter_fn) out_prepared = m(*example_inputs) prepare_sqnr = compute_error(out_prepared, out_baseline) - self.assertGreaterEqual(prepare_sqnr, target_prepare_sqnr) # compare convert @@ -2086,9 +2085,14 @@ def test_quantize_api_nvfp4(self, use_per_tensor_scale: bool): """ from torchao.prototype.mx_formats import NVFP4InferenceConfig + if use_per_tensor_scale: + target_prepare_sqnr = 36 + else: + target_prepare_sqnr = float("inf") + self._test_quantize_api_against_ptq( NVFP4InferenceConfig(use_dynamic_per_tensor_scale=use_per_tensor_scale), - target_prepare_sqnr=12, + target_prepare_sqnr=target_prepare_sqnr, target_convert_sqnr=float("inf"), ) @@ -2116,7 +2120,7 @@ def test_qat_nvfp4(self, use_per_tensor_scale: bool): out = m(*x) baseline_out = baseline_model(*x) sqnr = compute_error(out, baseline_out).item() - self.assertGreater(sqnr, 24) + self.assertGreater(sqnr, 10) @unittest.skipIf(not _CUDA_IS_AVAILABLE, "skipping when cuda is not available") @unittest.skipIf( diff --git a/torchao/prototype/custom_fp_utils.py b/torchao/prototype/custom_fp_utils.py index 3d8de6f0de..f1afbabacc 100644 --- a/torchao/prototype/custom_fp_utils.py +++ b/torchao/prototype/custom_fp_utils.py @@ -24,7 +24,9 @@ def _n_ones(n: int) -> int: F32_EXP_BIAS = _n_ones(EBITS_F32 - 1) -def _f32_to_floatx_unpacked(x: Tensor, ebits: int, mbits: int) -> Tensor: +def 
_f32_to_floatx_unpacked( + x: Tensor, ebits: int, mbits: int, fake_quantize: bool = False +) -> Tensor: """Convert FP32 numbers to sub-byte floating point numbers with the given number of exponent and mantissa bits. @@ -105,7 +107,8 @@ def _f32_to_floatx_unpacked(x: Tensor, ebits: int, mbits: int) -> Tensor: denormal_x = x + denorm_mask_float denormal_x = denormal_x.view(torch.int32) denormal_x -= denorm_mask_int - denormal_x = denormal_x.to(torch.uint8) + if not fake_quantize: + denormal_x = denormal_x.to(torch.uint8) # # branch 3: stay in normal range, adjust the exponent and round @@ -120,18 +123,23 @@ def _f32_to_floatx_unpacked(x: Tensor, ebits: int, mbits: int) -> Tensor: normal_x += mant_odd # take the bits! normal_x = normal_x >> (MBITS_F32 - mbits) - normal_x = normal_x.to(torch.uint8) + if not fake_quantize: + normal_x = normal_x.to(torch.uint8) # # combine the branches # - x = torch.full_like(x, max_int, dtype=torch.uint8) + if fake_quantize: + x = torch.full_like(x, max_int, dtype=torch.int32) + else: + x = torch.full_like(x, max_int, dtype=torch.uint8) x = torch.where(denormal_mask, denormal_x, x) x = torch.where(normal_mask, normal_x, x) # add sign back sign_lp = sign >> (MBITS_F32 + EBITS_F32 - mbits - ebits) - sign_lp = sign_lp.to(torch.uint8) + if not fake_quantize: + sign_lp = sign_lp.to(torch.uint8) # Right shift of a negative signed integer can fill the least significant # bits with either 1s or 0s, depending on the implementation. Since PyTorch # doesn't have an uint32 dtype, we mask out these bits to get just the @@ -139,12 +147,17 @@ def _f32_to_floatx_unpacked(x: Tensor, ebits: int, mbits: int) -> Tensor: sign_lp = sign_lp & sign_mask x = x | sign_lp - return x.to(torch.uint8) + if fake_quantize: + return x + else: + return x.to(torch.uint8) # TODO(future): check if LUT for everything is faster than bit shifting, # especially for fp4 (only 2^4=16 unique values). -def _floatx_unpacked_to_f32(x: Tensor, ebits: int, mbits: int) -> Tensor: +def _floatx_unpacked_to_f32( + x: Tensor, ebits: int, mbits: int, fake_quantize: bool = False +) -> Tensor: """Convert sub-byte floating point numbers with the given number of exponent and mantissa bits to FP32. 
@@ -154,7 +167,8 @@ def _floatx_unpacked_to_f32(x: Tensor, ebits: int, mbits: int) -> Tensor: fp6: bits 0-1 empty and bits 2-7 in fp6_e2m3 or fp6_e3m2 encoding Output: torch.Tensor of dtype fp32 with the dequantized value """ - assert x.dtype == torch.uint8 + if not fake_quantize: + assert x.dtype == torch.uint8 assert 1 + ebits + mbits <= 8 sign_mask = 1 << (ebits + mbits) diff --git a/torchao/prototype/mx_formats/kernels.py b/torchao/prototype/mx_formats/kernels.py index 5811dd9d21..89370a2ad7 100644 --- a/torchao/prototype/mx_formats/kernels.py +++ b/torchao/prototype/mx_formats/kernels.py @@ -65,13 +65,13 @@ def get_bits(x: torch.Tensor) -> str: ZERO_POINT_FIVE_BITS_F32 = 0x3F000000 -def f32_to_f4_unpacked(x): +def f32_to_f4_unpacked(x, fake_quantize: bool = False): """ Input: torch.Tensor of dtype torch.float Output: torch.Tensor of dtype torch.uint8, with bits 0-3 empty and bits 4-7 in fp4_e2m1 """ - return _f32_to_floatx_unpacked(x, EBITS_F4_E2M1, MBITS_F4_E2M1) + return _f32_to_floatx_unpacked(x, EBITS_F4_E2M1, MBITS_F4_E2M1, fake_quantize) def f32_to_f6_e2m3_unpacked(x): @@ -92,13 +92,13 @@ def f32_to_f6_e3m2_unpacked(x): return _f32_to_floatx_unpacked(x, EBITS_F6_E3M2, MBITS_F6_E3M2) -def f4_unpacked_to_f32(x: torch.Tensor): +def f4_unpacked_to_f32(x: torch.Tensor, fake_quantize: bool = False): """ Input: torch.Tensor of dtype uint8, with bits 0-3 empty and bits 4-7 containing an fp4_e2m1 encoding Output: torch.Tensor of dtype fp32 with the dequantized value """ - return _floatx_unpacked_to_f32(x, EBITS_F4_E2M1, MBITS_F4_E2M1) + return _floatx_unpacked_to_f32(x, EBITS_F4_E2M1, MBITS_F4_E2M1, fake_quantize) def f6_e2m3_unpacked_to_f32(x: torch.Tensor): diff --git a/torchao/prototype/mx_formats/nvfp4_tensor.py b/torchao/prototype/mx_formats/nvfp4_tensor.py index 1eb12dee90..58a7a92234 100644 --- a/torchao/prototype/mx_formats/nvfp4_tensor.py +++ b/torchao/prototype/mx_formats/nvfp4_tensor.py @@ -798,7 +798,6 @@ def _nvfp4_quantize( assert data_hp.is_contiguous(), "Only support contiguous data for now" assert block_size == 16, "NVFP4 requires block_size=16" - orig_dtype = data_hp.dtype orig_shape = data_hp.shape # Convert to float32 early for consistent precision with Triton implementation data_hp = data_hp.float().reshape(orig_shape[0], -1, block_size) @@ -834,7 +833,7 @@ def _nvfp4_quantize( data_scaled = torch.clamp(data_scaled, -F4_E2M1_MAX, F4_E2M1_MAX) data_scaled = data_scaled.view(orig_shape) if skip_dtype_cast_and_packing: - return out_scales.to(torch.float32), data_scaled.to(orig_dtype) + return _Float8Round.apply(out_scales), data_scaled else: data_lp = f32_to_f4_unpacked(data_scaled) # TODO: NotImplementedError: "copy_kernel" not implemented for 'Float4_e2m1fn_x2' diff --git a/torchao/prototype/qat/nvfp4.py b/torchao/prototype/qat/nvfp4.py index ed709dba1d..f9ff122cbd 100644 --- a/torchao/prototype/qat/nvfp4.py +++ b/torchao/prototype/qat/nvfp4.py @@ -2,6 +2,10 @@ import torch +from torchao.prototype.mx_formats.kernels import ( + f4_unpacked_to_f32, + f32_to_f4_unpacked, +) from torchao.prototype.mx_formats.nvfp4_tensor import ( _nvfp4_quantize, per_tensor_amax_to_scale, @@ -56,13 +60,14 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: per_tensor_scale=per_tensor_scale, skip_dtype_cast_and_packing=True, ) + q = f32_to_f4_unpacked(q, fake_quantize=True) if self.config.use_per_tensor_scale: scale = scale * per_tensor_scale - assert q.dtype == x.dtype assert scale.dtype == torch.float32 # dequantize M, K = q.shape[0], q.shape[1] + q = f4_unpacked_to_f32(q, 
fake_quantize=True) q = q.view(M, K // block_size, block_size) scale = scale.view(M, K // block_size, 1) dq = q * scale From 4d7bb2ac0d8b72584951b9f58e7b50f7a3ab24d1 Mon Sep 17 00:00:00 2001 From: andrewor14 Date: Tue, 23 Sep 2025 14:48:54 -0700 Subject: [PATCH 03/16] Update base for Update on "Improve QAT nvfp4 numerics" **Summary:** Similar to https://github.com/pytorch/ao/pull/2986, this commit improves the prepare vs convert SQNR of NVFP4 QAT from 12 to 36 with `use_per_tensor_scale`, and 12 to inf without. This is achieved by mimicking the PTQ flow more closely, in particular, in descending order of significance: 1. Simulate `f4_unpacked_to_f32` and `f32_to_f4_unpacked`, but in `torch.int32` instead of `torch.uint8` 2. Do not cast intermediate fake quantized values to original dtype, e.g. bf16 which loses some fidelity from fp32 3. Fake round blockwise scales to float8 **Test Plan:** ``` python test/quantization/test_qat.py -k test_qat_nvfp4 python test/quantization/test_qat.py -k test_quantize_api_nvfp4 ``` End-to-end tests TBD. [ghstack-poisoned] From 20c36da611b06f42e01e9903f3af171e0f826fa6 Mon Sep 17 00:00:00 2001 From: andrewor14 Date: Tue, 23 Sep 2025 15:01:29 -0700 Subject: [PATCH 04/16] Update base for Update on "Improve QAT nvfp4 numerics" **Summary:** Similar to https://github.com/pytorch/ao/pull/2986, this commit improves the prepare vs convert SQNR of NVFP4 QAT from 12 to 36 with `use_per_tensor_scale`, and 12 to inf without. This is achieved by mimicking the PTQ flow more closely, in particular, in descending order of significance: 1. Simulate `f4_unpacked_to_f32` and `f32_to_f4_unpacked`, but in `torch.int32` instead of `torch.uint8` 2. Do not cast intermediate fake quantized values to original dtype, e.g. bf16 which loses some fidelity from fp32 3. Fake round blockwise scales to float8 **Test Plan:** ``` python test/quantization/test_qat.py -k test_qat_nvfp4 python test/quantization/test_qat.py -k test_quantize_api_nvfp4 ``` End-to-end tests TBD. [ghstack-poisoned] From 61dd09fe25e207e8e2b4ea55cbf25d7dbd3b4a59 Mon Sep 17 00:00:00 2001 From: andrewor14 Date: Tue, 23 Sep 2025 15:09:22 -0700 Subject: [PATCH 05/16] Update base for Update on "Improve QAT nvfp4 numerics" **Summary:** Similar to https://github.com/pytorch/ao/pull/2986, this commit improves the prepare vs convert SQNR of NVFP4 QAT from 12 to 36 with `use_per_tensor_scale`, and 12 to inf without. This is achieved by mimicking the PTQ flow more closely, in particular, in descending order of significance: 1. Simulate `f4_unpacked_to_f32` and `f32_to_f4_unpacked`, but in `torch.int32` instead of `torch.uint8` 2. Do not cast intermediate fake quantized values to original dtype, e.g. bf16 which loses some fidelity from fp32 3. Fake round blockwise scales to float8 **Test Plan:** ``` python test/quantization/test_qat.py -k test_qat_nvfp4 python test/quantization/test_qat.py -k test_quantize_api_nvfp4 ``` End-to-end tests TBD. [ghstack-poisoned] From cec1acd9ab35394ebd34394f9723d97933743d84 Mon Sep 17 00:00:00 2001 From: andrewor14 Date: Wed, 24 Sep 2025 13:07:05 -0700 Subject: [PATCH 06/16] Update base for Update on "Improve QAT nvfp4 numerics" **Summary:** Similar to https://github.com/pytorch/ao/pull/2986, this commit improves the prepare vs convert SQNR of NVFP4 QAT from 12 to 36 with `use_per_tensor_scale`, and 12 to inf without. This is achieved by mimicking the PTQ flow more closely, in particular, in descending order of significance: 1. 
Simulate `f4_unpacked_to_f32` and `f32_to_f4_unpacked`, but in `torch.int32` instead of `torch.uint8` 2. Do not cast intermediate fake quantized values to original dtype, e.g. bf16 which loses some fidelity from fp32 3. Fake round blockwise scales to float8 **Test Plan:** ``` python test/quantization/test_qat.py -k test_qat_nvfp4 python test/quantization/test_qat.py -k test_quantize_api_nvfp4 ``` End-to-end tests TBD. [ghstack-poisoned] From 6585a8c99883b2493e75db8d247904a8293eee97 Mon Sep 17 00:00:00 2001 From: andrewor14 Date: Wed, 24 Sep 2025 15:34:41 -0700 Subject: [PATCH 07/16] Update base for Update on "Improve QAT nvfp4 numerics" **Summary:** Similar to https://github.com/pytorch/ao/pull/2986, this commit improves the prepare vs convert SQNR of NVFP4 QAT from 12 to 36 with `use_per_tensor_scale`, and 12 to inf without. This is achieved by mimicking the PTQ flow more closely, in particular, in descending order of significance: 1. Simulate `f4_unpacked_to_f32` and `f32_to_f4_unpacked`, but in `torch.int32` instead of `torch.uint8` 2. Do not cast intermediate fake quantized values to original dtype, e.g. bf16 which loses some fidelity from fp32 3. Fake round blockwise scales to float8 **Test Plan:** ``` python test/quantization/test_qat.py -k test_qat_nvfp4 python test/quantization/test_qat.py -k test_quantize_api_nvfp4 ``` End-to-end tests TBD. [ghstack-poisoned] From 22ec72bd1eac018ff5ff5a0ca23cbaa7098d5ff8 Mon Sep 17 00:00:00 2001 From: andrewor14 Date: Thu, 25 Sep 2025 12:01:53 -0700 Subject: [PATCH 08/16] Update base for Update on "Improve QAT nvfp4 numerics" **Summary:** Similar to https://github.com/pytorch/ao/pull/2986, this commit improves the prepare vs convert SQNR of NVFP4 QAT from 12 to 36 with `use_per_tensor_scale`, and 12 to inf without. This is achieved by mimicking the PTQ flow more closely, in particular, in descending order of significance: 1. Simulate `f4_unpacked_to_f32` and `f32_to_f4_unpacked`, but in `torch.int32` instead of `torch.uint8` 2. Do not cast intermediate fake quantized values to original dtype, e.g. bf16 which loses some fidelity from fp32 3. Fake round blockwise scales to float8 **Test Plan:** ``` python test/quantization/test_qat.py -k test_qat_nvfp4 python test/quantization/test_qat.py -k test_quantize_api_nvfp4 ``` End-to-end tests TBD. [ghstack-poisoned] From e446a504b8c34a48805e3e38b36de3c40742e65b Mon Sep 17 00:00:00 2001 From: andrewor14 Date: Thu, 25 Sep 2025 13:06:54 -0700 Subject: [PATCH 09/16] Update base for Update on "Improve QAT nvfp4 numerics" **Summary:** Similar to https://github.com/pytorch/ao/pull/2986, this commit improves the prepare vs convert SQNR of NVFP4 QAT from 12 to 36 with `use_per_tensor_scale`, and 12 to inf without. This is achieved by mimicking the PTQ flow more closely, in particular, in descending order of significance: 1. Simulate `f4_unpacked_to_f32` and `f32_to_f4_unpacked` 2. Do not cast intermediate fake quantized values to original dtype, e.g. bf16 which loses some fidelity from fp32 3. Fake round blockwise scales to float8 **Test Plan:** ``` python test/quantization/test_qat.py -k test_qat_nvfp4 python test/quantization/test_qat.py -k test_quantize_api_nvfp4 ``` End-to-end tests TBD. 
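For illustration, a rough sketch of what step (1) looks like with the `fake_quantize` flag added in this stack; the tensor contents are arbitrary and assumed to already be divided by their blockwise scales:

```python
import torch

from torchao.prototype.mx_formats.kernels import (
    f4_unpacked_to_f32,
    f32_to_f4_unpacked,
)

# Arbitrary fp32 values, assumed already divided by their blockwise scales
x = torch.randn(4, 16, dtype=torch.float32) * 2.0

# With fake_quantize=True the encoded values stay in torch.int32 instead of
# being cast to torch.uint8, and decoding returns fp32
codes = f32_to_f4_unpacked(x, fake_quantize=True)
x_fq = f4_unpacked_to_f32(codes, fake_quantize=True)

# fp4 e2m1 can only represent 0, ±0.5, ±1, ±1.5, ±2, ±3, ±4, ±6
print(torch.unique(x_fq))
```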
[ghstack-poisoned]

From c9470992af31e344583e5c78be4f4a6c099d9651 Mon Sep 17 00:00:00 2001
From: andrewor14
Date: Thu, 25 Sep 2025 13:53:17 -0700
Subject: [PATCH 10/16] Update base for Update on "Improve QAT nvfp4 numerics"

**Summary:** Similar to https://github.com/pytorch/ao/pull/2986, this commit improves the prepare vs convert SQNR of NVFP4 QAT from 12 to 36 with `use_per_tensor_scale`, and 12 to inf without. This is achieved by mimicking the PTQ flow more closely, in particular, in descending order of significance:

1. Simulate `f4_unpacked_to_f32` and `f32_to_f4_unpacked`
2. Do not cast intermediate fake quantized values to original dtype, e.g. bf16 which loses some fidelity from fp32
3. Fake round blockwise scales to float8

**Test Plan:**
```
python test/quantization/test_qat.py -k test_qat_nvfp4
python test/quantization/test_qat.py -k test_quantize_api_nvfp4
```

End-to-end tests TBD.

[ghstack-poisoned]

From 0f0937858c28b63a2d0edaf065597f9f3a352626 Mon Sep 17 00:00:00 2001
From: andrewor14
Date: Thu, 25 Sep 2025 16:49:47 -0700
Subject: [PATCH 11/16] Update base for Update on "Improve QAT nvfp4 numerics"

**Summary:** Similar to https://github.com/pytorch/ao/pull/2986, this commit improves the prepare vs convert SQNR of NVFP4 QAT from 12 to 36 with `use_per_tensor_scale`, and 12 to inf without. This is achieved by mimicking the PTQ flow more closely, in particular, in descending order of significance:

1. Simulate `f4_unpacked_to_f32` and `f32_to_f4_unpacked`
2. Do not cast intermediate fake quantized values to original dtype, e.g. bf16 which loses some fidelity from fp32
3. Fake round blockwise scales to float8

**Test Plan:**
```
python test/quantization/test_qat.py -k test_qat_nvfp4
python test/quantization/test_qat.py -k test_quantize_api_nvfp4
```

End-to-end tests TBD.

[ghstack-poisoned]

From 843cbcfc17b700ff225f3ef1f2bc2318303b31ca Mon Sep 17 00:00:00 2001
From: andrewor14
Date: Thu, 25 Sep 2025 16:55:57 -0700
Subject: [PATCH 12/16] Update base for Update on "Improve QAT nvfp4 numerics"

**Summary:** Similar to https://github.com/pytorch/ao/pull/2986, this commit improves the prepare vs convert SQNR of NVFP4 QAT from 12 to inf. This is achieved by refactoring NVFP4 QAT to mimic the PTQ numerics exactly, using a new linear class to incorporate both the quantization and mm logic.

**Test Plan:**
```
python test/quantization/test_qat.py -k test_qat_nvfp4
python test/quantization/test_qat.py -k test_quantize_api_nvfp4
```

End-to-end tests TBD.

[ghstack-poisoned]

From bf2208c108793ff5703004fb3be4c0496dd47670 Mon Sep 17 00:00:00 2001
From: andrewor14
Date: Fri, 26 Sep 2025 07:57:02 -0700
Subject: [PATCH 13/16] Update base for Update on "Improve QAT nvfp4 numerics"

**Summary:** Similar to https://github.com/pytorch/ao/pull/2986, this commit improves the prepare vs convert SQNR of NVFP4 QAT from 12 to inf. This is achieved by refactoring NVFP4 QAT to mimic the PTQ numerics exactly, using a new linear class to incorporate both the quantization and mm logic.

**Test Plan:**
```
python test/quantization/test_qat.py -k test_qat_nvfp4
python test/quantization/test_qat.py -k test_quantize_api_nvfp4
```

End-to-end tests TBD.
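For reference, a small sketch of the SQNR metric quoted above, using the same `compute_error` helper as these tests; the import path and tensor shapes are assumptions for illustration:

```python
import torch

from torchao.quantization.utils import compute_error  # SQNR in dB

baseline = torch.randn(128, 256)

# Bit-identical outputs give an SQNR of inf, which is what "12 to inf" means
print(compute_error(baseline, baseline.clone()))

# Any mismatch gives a finite value in dB; larger is better
print(compute_error(baseline, baseline + 0.01 * torch.randn_like(baseline)))
```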
[ghstack-poisoned]

From 90bc7d472aa25ee3e0a38dc9eadab753f000ddd6 Mon Sep 17 00:00:00 2001
From: andrewor14
Date: Fri, 26 Sep 2025 09:01:17 -0700
Subject: [PATCH 14/16] Update base for Update on "Improve QAT nvfp4 numerics"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

**Summary:** Similar to https://github.com/pytorch/ao/pull/2986, this commit improves the prepare vs convert SQNR of NVFP4 QAT from 12 to inf. This is achieved by refactoring NVFP4 QAT to mimic the PTQ numerics exactly, using a new linear class to incorporate both the quantization and mm logic.

**Unit tests:**
```
python test/quantization/test_qat.py -k test_qat_nvfp4
python test/quantization/test_qat.py -k test_quantize_api_nvfp4
```

**End-to-end tests:** Fine-tuning Llama3.2-3B with and without this PR in axolotl:
- fine-tune for 1 epoch on yahma/alpaca-cleaned
- batch size 512, learning rate 2e-5, no gradient accumulation

Wikitext:
- With this PR, the QAT nvfp4 quantized model recovered about 15% of the perplexity gap between the quantized baseline and the float baseline
- Without this PR, the QAT nvfp4 quantized model was about the same as the quantized baseline

```
==> Llama3.2-3B_baseline_bs512/eval_float.log <==
| | |none | 0|word_perplexity|↓ |9.418|± | N/A|

==> Llama3.2-3B_baseline_bs512/eval_quantized.log <==
| | |none | 0|word_perplexity|↓ |10.3681|± | N/A|

# QAT with this PR (quantized)
==> unsloth_model_lora_qat_int4_output/lm_eval_quantized.log <==
| | |none | 0|word_perplexity|↓ |10.2281|± | N/A|
```

[ghstack-poisoned]

From a48b7ded21e163d0f63443b6cbea4c6e6af1a569 Mon Sep 17 00:00:00 2001
From: andrewor14
Date: Fri, 26 Sep 2025 09:06:04 -0700
Subject: [PATCH 15/16] Update base for Update on "Improve QAT nvfp4 numerics"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

**Summary:** Similar to https://github.com/pytorch/ao/pull/2986, this commit improves the prepare vs convert SQNR of NVFP4 QAT from 12 to inf. This is achieved by refactoring NVFP4 QAT to mimic the PTQ numerics exactly, using a new linear class to incorporate both the quantization and mm logic.

**Unit tests:**
```
python test/quantization/test_qat.py -k test_qat_nvfp4
python test/quantization/test_qat.py -k test_quantize_api_nvfp4
```

**End-to-end tests:** Fine-tuning Llama3.2-3B with and without this PR in axolotl:
- fine-tune for 1 epoch on yahma/alpaca-cleaned
- batch size 512, learning rate 2e-5, no gradient accumulation

Wikitext:
- With this PR, the QAT nvfp4 quantized model recovered about 15% of the perplexity gap between the quantized baseline and the float baseline
- Without this PR, the QAT nvfp4 quantized model was about the same as the quantized baseline

```
==> Llama3.2-3B_baseline_bs512/eval_float.log <==
| | |none | 0|word_perplexity|↓ |9.418|± | N/A|

==> Llama3.2-3B_baseline_bs512/eval_quantized.log <==
| | |none | 0|word_perplexity|↓ |10.3681|± | N/A|

# QAT with this PR (quantized)
==> Llama3.2-3B_qat_bs512/eval_quantized.log <==
| | |none | 0|word_perplexity|↓ |10.2281|± | N/A|
```

[ghstack-poisoned]

From ef3682b5507f501b00c67a42328b8f2c969d2fb9 Mon Sep 17 00:00:00 2001
From: andrewor14
Date: Fri, 26 Sep 2025 13:17:07 -0700
Subject: [PATCH 16/16] Update base for Update on "Improve QAT nvfp4 numerics"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

**Summary:** Similar to https://github.com/pytorch/ao/pull/2986, this commit improves the prepare vs convert SQNR of NVFP4 QAT from 12 to inf.
This is achieved by refactoring NVFP4 QAT to mimic the PTQ numerics exactly, using a new linear class to incorporate both the quantization and mm logic.

**Unit tests:**
```
python test/quantization/test_qat.py -k test_qat_nvfp4
python test/quantization/test_qat.py -k test_quantize_api_nvfp4
```

**End-to-end tests:** Fine-tuning Llama3.2-3B with and without this PR in axolotl:
- fine-tune for 1 epoch on yahma/alpaca-cleaned
- batch size 512, learning rate 2e-5, no gradient accumulation

Wikitext:
- With this PR, the QAT nvfp4 quantized model recovered about 15% of the perplexity gap between the quantized baseline and the float baseline
- Without this PR, the QAT nvfp4 quantized model was about the same as the quantized baseline

```
==> Llama3.2-3B_baseline_bs512/eval_float.log <==
| | |none | 0|word_perplexity|↓ |9.418|± | N/A|

==> Llama3.2-3B_baseline_bs512/eval_quantized.log <==
| | |none | 0|word_perplexity|↓ |10.3681|± | N/A|

# QAT with this PR (quantized)
==> Llama3.2-3B_qat_bs512/eval_quantized.log <==
| | |none | 0|word_perplexity|↓ |10.2281|± | N/A|
```

[ghstack-poisoned]
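As a usage sketch for the stack as a whole, the prepare/convert flow exercised by `test_quantize_api_nvfp4` might look as follows; the layer shape, the fine-tuning step, and the `QATConfig` import path are illustrative assumptions:

```python
import torch

from torchao.prototype.mx_formats import NVFP4InferenceConfig
from torchao.quantization import quantize_
from torchao.quantization.qat import QATConfig

model = torch.nn.Sequential(
    torch.nn.Linear(64, 256, bias=True, dtype=torch.bfloat16, device="cuda")
)
base_config = NVFP4InferenceConfig(use_dynamic_per_tensor_scale=True)

# Prepare: insert NVFP4 fake quantization so fine-tuning sees quantization error
quantize_(model, QATConfig(base_config, step="prepare"))

# ... fine-tune the prepared model here ...

# Convert: replace fake quantization with real NVFP4 quantization for inference
quantize_(model, QATConfig(base_config, step="convert"))
```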