
(prototype) add support for MXFP8 and MXFP4 QAT#3644

Merged
andrewor14 merged 23 commits into pytorch:main from ved1beta:mvfp4
Feb 3, 2026

Conversation

@ved1beta
Contributor

@ved1beta ved1beta commented Jan 15, 2026

#3547

New classes:
- `MXFakeQuantizedLinear`
- `MXFakeQuantizeConfig`
- `_MXQuantizedForwardFakeQuantizedBackward`

Followed the NVFP4 implementation.

Tests included + e2e test

"""MXFP4 QAT end-to-end training validation."""

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset


class SimpleMLP(nn.Module):
    def __init__(self, input_dim=512, hidden_dim=256, num_classes=32):
        super().__init__()
        self.linear1 = nn.Linear(input_dim, hidden_dim, bias=False)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.linear3 = nn.Linear(hidden_dim, num_classes, bias=False)

    def forward(self, x):
        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        return self.linear3(x)


def create_data(num_samples=1000, input_dim=512, num_classes=32, device="cuda"):
    X = torch.randn(num_samples, input_dim, device=device)
    y = torch.randint(0, num_classes, (num_samples,), device=device)
    return DataLoader(TensorDataset(X, y), batch_size=32, shuffle=True)


def train_epoch(model, loader, optimizer):
    model.train()
    total_loss = 0
    for data, target in loader:
        optimizer.zero_grad()
        loss = F.cross_entropy(model(data), target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)


def run_training(model, loader, epochs=5, lr=0.01):
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    losses = []
    for _ in range(epochs):
        losses.append(train_epoch(model, loader, optimizer))
    return losses


def main():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    train_loader = create_data(device=device)

    torch.manual_seed(42)
    baseline = SimpleMLP().to(device)
    baseline_losses = run_training(baseline, train_loader)

    from torchao.quantization import quantize_
    from torchao.quantization.qat import QATConfig
    from torchao.prototype.mx_formats import MXDynamicActivationMXWeightConfig

    torch.manual_seed(42)
    qat_model = SimpleMLP().to(device)
    base_config = MXDynamicActivationMXWeightConfig(
        activation_dtype=torch.float4_e2m1fn_x2,
        weight_dtype=torch.float4_e2m1fn_x2,
    )
    quantize_(qat_model, QATConfig(base_config, step="prepare"))
    qat_losses = run_training(qat_model, train_loader)

    print("Epoch | Baseline | MXFP4 QAT")
    print("-" * 30)
    for i, (b, q) in enumerate(zip(baseline_losses, qat_losses)):
        print(f"  {i+1}   |  {b:.4f}  |  {q:.4f}")


if __name__ == "__main__":
    main()

@pytorch-bot

pytorch-bot bot commented Jan 15, 2026

🔗 Helpful Links

🧪 See artifacts and rendered test results at hud.pytorch.org/pr/pytorch/ao/3644

Note: Links to docs will display an error until the docs builds have been completed.

✅ No Failures

As of commit 02d271e with merge base 30fcb15:
💚 Looks good so far! There are no failures yet. 💚

This comment was automatically generated by Dr. CI and updates every 15 minutes.

@meta-cla meta-cla bot added the CLA Signed This label is managed by the Facebook bot. Authors need to sign the CLA before a PR can be reviewed. label Jan 15, 2026
@jerryzh168 jerryzh168 requested a review from andrewor14 January 15, 2026 23:22
],
)
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_mxfp4_reconstruction(dtype, shape):
Contributor


this should be already covered by existing tests, remove?

],
)
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_mxfp4_scaling_modes(scaling_mode):
Contributor


this should be already covered by existing tests, remove?



@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_mxfp4_fake_quantize_config():
Contributor


move to test/quantization/test_qat.py

"scaling_mode",
[ScaleCalculationMode.FLOOR, ScaleCalculationMode.RCEIL],
)
def test_mxfp4_fake_quantized_linear_forward(bias, input_shape, scaling_mode):
Contributor


move to test/quantization/test_qat.py


@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@pytest.mark.parametrize("bias", [True, False])
def test_mxfp4_fake_quantized_linear_backward(bias):
Contributor


move to test/quantization/test_qat.py



@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_mxfp4_fake_quantized_linear_to_linear():
Contributor


move to test/quantization/test_qat.py



@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_mxfp4_vs_nvfp4_block_size():
Contributor


this is already covered by existing tests, remove?



@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_mxfp4_config_error_handling():
Contributor


move to qat test file

],
ids=lambda s: f"{s[0]}x{s[1]}x{s[2]}",
)
def test_mxfp4_matmul_sqnr(shapes):
Contributor


move to qat test file



@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_mxfp4_training_simulation():
Contributor


move to qat test file



@dataclass
class MXFP4FakeQuantizeConfig(FakeQuantizeConfigBase):
Contributor


these should handle mxfp8 and mxfp4 (see MXTensor) instead of being hardcoded for mxfp4

return grad_input, grad_weight, None, None, None


class MXFP4FakeQuantizedLinear(torch.nn.Linear):
Contributor


MX instead of MXFP4

@vkuzo
Contributor

vkuzo commented Jan 16, 2026

thanks for working on this! Made some initial comments inline.

@ved1beta ved1beta requested a review from vkuzo January 16, 2026 16:38
"""
MX (Microscaling) Quantization-Aware Training (QAT) support.

This module provides QAT support for the OCP Microscaling MX formats (MXFP4, MXFP8, MXFP6).
Contributor


is there any demand for mxfp6 right now? if not, I'd prefer to not support in in QAT for now and add it later when there is demand.

Contributor Author


our users need it axolotl-ai-cloud/axolotl#3333 🥹

Contributor


the issue you linked mentions mxfp4. I am commenting about mxfp6, which is a different format.

Contributor Author


ahh my bad, I misread 🤕

self.weight_config = weight_config

def forward(self, x: torch.Tensor) -> torch.Tensor:
if x.dim() == 3:
Contributor

@vkuzo vkuzo Jan 20, 2026


I think this should go in _MXQuantizedForwardFakeQuantizedBackward instead of here, and also not hardcode rank 3 but instead handle all ranks

orig_shape = x.shape
x_reshaped = x.view(-1, orig_shape[-1])
...
fq_reshaped = ...
fq_orig_shape = fq_reshaped.view(*orig_shape[:-1], -1)
return fq_orig_shape

@vkuzo
Contributor

vkuzo commented Jan 20, 2026

this looks great so far! I kicked off a CI run, and also made some comments inline.

It would also be great to include a test plan, both for unit testing as well as any e2e QAT run results we can run with this.

@vkuzo vkuzo added the topic: new feature Use this tag if this PR adds a new feature label Jan 20, 2026
Contributor

@andrewor14 andrewor14 left a comment


Thanks @ved1beta, looks great overall. I agree with @vkuzo we can drop fp6 for now to simplify the changes. Just left mostly minor comments on the API/testing.


# Backwards compatibility aliases
MXFP4FakeQuantizeConfig = MXFakeQuantizeConfig
MXFP4FakeQuantizedLinear = MXFakeQuantizedLinear
Contributor


I don't think you need these since these were never released

"MXFakeQuantizeConfig",
"MXFakeQuantizedLinear",
"MXFP4FakeQuantizeConfig",
"MXFP4FakeQuantizedLinear",
Contributor


Remove these?

- MXFP8: torch.float8_e4m3fn, torch.float8_e5m2
- MXFP6: "fp6_e2m3", "fp6_e3m2" (string constants)

Key differences from NVFP4:
Contributor


another key difference that's not mentioned here is NVFP4 does an extra per tensor scaling but MXFP4 doesn't
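To illustrate the point, here is a toy sketch of a pure power-of-two (E8M0-style) block scale with no second per-tensor level; it ignores the element-dtype max normalization the real kernels perform, and the helper name is made up:

```python
import math
import torch

def toy_mx_block_scale(block: torch.Tensor) -> float:
    # E8M0 scales are pure powers of two: snap amax down to 2**floor(log2(amax)).
    # Real MX kernels also normalize by the element dtype's max representable
    # value; this toy version skips that step. NVFP4 would additionally apply
    # one fp32 scale shared across the whole tensor -- MX does not.
    amax = block.abs().max().item()
    return 2.0 ** math.floor(math.log2(amax)) if amax > 0 else 1.0

block = torch.tensor([0.3, -1.7, 0.9, 5.2])
assert toy_mx_block_scale(block) == 4.0  # floor(log2(5.2)) == 2, 2**2 == 4
```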

Contributor


seems like there are two docblocks in this file which detail the differences, can we keep one and delete the other one

Contributor


ping on this

"""

# Use Any type hint since elem_dtype can be torch.dtype or str (for fp6 formats)
elem_dtype: Any = field(default_factory=lambda: torch.float4_e2m1fn_x2)
Contributor


maybe drop fp6 for now since they're not as popular? Then we can just use torch.dtype to express this

Contributor


also why use a field here instead of just assigning to the dtype directly?

Contributor


also nit: just rename this to dtype to be consistent


# Check that weights have been updated
self.assertFalse(torch.allclose(mx_model[0].weight, initial_weight))

Contributor


Can you add a test like this one?

def test_quantize_api_nvfp4(self, use_per_tensor_scale: bool):

Basically we should compare QAT forward against PTQ forward, since they're using the same kernels and so should match exactly
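The parity check being suggested can be illustrated with a toy fake-quantize (a stand-in for the real MX kernels): if QAT's fake-quantize and PTQ's quantize-then-dequantize share the same rounding, the two forwards match bit-exactly.

```python
import torch

def toy_fake_quantize(x: torch.Tensor, scale: float) -> torch.Tensor:
    # Stand-in for the real MX kernel: round onto a uniform grid in one shot.
    return torch.round(x / scale) * scale

torch.manual_seed(0)
x = torch.randn(4, 8)
scale = 0.1
qat_out = toy_fake_quantize(x, scale)  # "QAT": fake-quantize in high precision
q = torch.round(x / scale)             # "PTQ": quantize to grid indices...
ptq_out = q * scale                    # ...then dequantize
torch.testing.assert_close(qat_out, ptq_out, rtol=0, atol=0)  # exact match
```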

"scaling_mode",
["FLOOR", "RCEIL"],
)
def test_mx_fake_quantized_linear_forward(self, bias, input_shape, scaling_mode):
Contributor


This is testing specifically mxfp4 right, can we call that out explicitly in the function name? Also this test looks very similar to test_mx_fake_quantized_linear_forward_fp8. Any chance we can share some code?

@ved1beta ved1beta requested a review from andrewor14 January 20, 2026 20:01
@ved1beta
Contributor Author

here is the e2e test (same script as in the PR description above)


@ved1beta ved1beta requested a review from vkuzo January 21, 2026 15:55
@vkuzo vkuzo changed the title from "mvfp4 support ao" to "add support for MXFP8 and MXFP4 QAT" Jan 22, 2026
@vkuzo vkuzo changed the title from "add support for MXFP8 and MXFP4 QAT" to "(prototype) add support for MXFP8 and MXFP4 QAT" Jan 22, 2026
@vkuzo
Contributor

vkuzo commented Jan 22, 2026

high level this looks good to me. Let's make sure CI is green, and also the PR summary should have a reproducible test plan. It would be ideal to also include some data from an e2e convergence run.

I'll let @andrewor14 accept when he is also good with everything.

Contributor

@andrewor14 andrewor14 left a comment


Looks great, thanks for your work @ved1beta! Really appreciate all the extra unit tests you added.

By the way, have you had a chance to test this in a real training job? Just wondering if you were able to compare these three:

(0) bf16 model -> fine-tune without QAT -> lm_eval
(1) bf16 model -> fine-tune without QAT -> quantize to mxfp4 -> lm_eval
(2) bf16 model -> fine-tune with QAT -> quantize to mxfp4 -> lm_eval

Ideally we'll see (0) > (2) > (1).

# TODO: put this in a common test utils file
_CUDA_IS_AVAILABLE = torch.cuda.is_available()
_DEVICE = get_current_accelerator_device()
_MXFP4_TORCH_AVAILABLE = torch_version_at_least("2.10.0")
Contributor


any reason to guard on pytorch 2.10? Seems like MXTensor only requires 2.8

Contributor Author


yes, changed it to 2.8.0. Perplexity told me it was 2.10.0 🥹


This is the OCP Microscaling MX variant which differs from NVFP4 in:
- Block size: 32 (default) vs 16
- Scale format: E8M0 vs float8_e4m3fn
Contributor


I see this comparison between MX and NVFP4 duplicated in multiple places. Probably don't need it in every doc block? Can we just keep it in the MXFakeQuantizeConfig since that's the most user facing?

- Block size: 32 (default, OCP standard) vs NVFP4's fixed 16
- Scale format: E8M0 (float8_e8m0fnu) vs NVFP4's float8_e4m3fn
- Supports multiple scale calculation modes
- Supports multiple element dtypes (MXFP4, MXFP8)
Contributor


same here, don't need this list again here

kernel_preference: KernelPreference = KernelPreference.EMULATED

def __post_init__(self):
_validate_elem_dtype(self.dtype)
Contributor


should we also _validate_kernel_preference here?

@ved1beta
Contributor Author

ved1beta commented Jan 27, 2026

For this, I was not able to test it with real training due to hardware constraints! I feel the unit tests will be more than enough; similarly for CI, everything should pass now.

Looks great, thanks for your work @ved1beta! Really appreciate all the extra unit tests you added.
By the way have you had a chance to test this in a real training job? Just wondering if you were able to compare these two

@vkuzo
Contributor

vkuzo commented Jan 27, 2026

For this, I was not able to test it with real training due to hardware constraints! I feel the unit tests will be more than enough; similarly for CI, everything should pass now.

that makes sense. @andrewor14 , do you want to do a quick e2e test on this?

self.assertFalse(torch.allclose(mx_model[0].weight, initial_weight))

@unittest.skipIf(not torch_version_at_least("2.10.0"), "Need pytorch 2.10+")
@unittest.skipIf(not _MXFP4_TORCH_AVAILABLE, "Need pytorch 2.10+ for MXFP4")
Contributor


Looks like these two are still referring to 2.10, make these 2.8?

Contributor Author

@ved1beta ved1beta Jan 27, 2026


umm some tests were failing with 2.8 asking for 2.10

Contributor


just use PyTorch 2.10

Contributor Author


done

assert self.embed_tokens.weight.shape == self.linear2.weight.shape
self.tie_weights()
self._tied_weights_keys["linear2.weight"] = "embed_tokens.weight"
self._tied_weights_keys = {"linear2.weight": "embed_tokens.weight"}
Contributor


Can you revert these changes? I think we fixed it in main

@andrewor14
Contributor

For this, I was not able to test it with real training due to hardware constraints! I feel the unit tests will be more than enough; similarly for CI, everything should pass now.

Yeah, no problem, I can test it. The code looks good to me. Once the latest comments are addressed and tests pass, I think we can merge this first. @ved1beta do you mind getting started with an axolotl PR that integrates this config? Thanks for all your hard work so far!

@ved1beta
Contributor Author

do you mind getting started with an axolotl PR that integrates this config?

on it, it was great working with you guys ❤️ will surely apply at PyTorch some day 🫡

@ved1beta ved1beta requested a review from andrewor14 February 3, 2026 09:17
@andrewor14
Contributor

Thanks @ved1beta, merging this

@andrewor14 andrewor14 merged commit 82e58a6 into pytorch:main Feb 3, 2026
19 checks passed

Labels

CLA Signed This label is managed by the Facebook bot. Authors need to sign the CLA before a PR can be reviewed. topic: new feature Use this tag if this PR adds a new feature
