Merged
4 changes: 2 additions & 2 deletions — torchtitan/parallelisms/parallelize_llama.py

```diff
@@ -217,8 +217,8 @@ def apply_tp(
     torch.ops.aten._scaled_dot_product_flash_attention.default,
     torch.ops._c10d_functional.reduce_scatter_tensor.default,
     # for low precision training, it's useful to always save
-    # the result of max(abs(tensor))
-    torch.ops.aten.abs.default,
+    # the result of max, since the absolute maximum is
+    # used to compute the scaling factor for quantization.
     torch.ops.aten.max.default,
```
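For context, the list being edited is a "save list" for selective activation checkpointing: the outputs of ops named in the set are stored during the forward pass, while everything else is recomputed during backward. A minimal sketch of that policy idea, using plain strings as stand-ins for the real `torch.ops` handles (the names and `policy` helper here are illustrative, not torchtitan's actual API):

```python
# Illustrative selective-checkpointing policy: ops in the save set keep
# their forward outputs; all other ops are recomputed in backward.
# Strings are stand-ins for torch.ops.* overload handles.

SAVE_LIST = {
    "aten._scaled_dot_product_flash_attention.default",
    "_c10d_functional.reduce_scatter_tensor.default",
    # the max result is cheap to store and feeds the float8 scale
    "aten.max.default",
}

def policy(op_name: str) -> str:
    """Decide whether an op's output is saved or recomputed."""
    return "SAVE" if op_name in SAVE_LIST else "RECOMPUTE"
```

Under this policy, `aten.max.default` is saved while `aten.abs.default` (after this PR) falls through to the recompute path.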
Contributor:
Since abs now needs to be recomputed, do we still want to keep max? Asking another way: why did we need to keep abs in the first place, given that we already keep the result of max? @vkuzo

Contributor:
We don't need to keep abs; I added it by mistake in my original PR. We just didn't see the cost of that mistake until the rowwise-scaled float8 recipe.
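To see why only the max result matters, here is a hedged sketch (not torchtitan's or torchao's actual code) of how an absolute maximum (amax) turns into a per-row float8 scaling factor. The `FP8_E4M3_MAX` constant and helper names are assumptions for illustration; 448.0 is the largest finite value representable in the float8 e4m3 format:

```python
# Sketch: rowwise float8 scaling from the saved amax.
# Only max(abs(row)) is needed downstream, so saving abs separately
# buys nothing once the max result is saved.

FP8_E4M3_MAX = 448.0  # largest finite float8 e4m3 value

def rowwise_scales(rows):
    """One scale per row, mapping the row's amax onto the fp8 range."""
    scales = []
    for row in rows:
        amax = max(abs(v) for v in row)  # the max(abs(tensor)) result
        scales.append(FP8_E4M3_MAX / amax if amax != 0 else 1.0)
    return scales

def quantize_row(row, scale):
    """Simulated quantization: scale, then clamp to the fp8 range."""
    return [max(-FP8_E4M3_MAX, min(FP8_E4M3_MAX, v * scale)) for v in row]

rows = [[0.5, -2.0, 1.0], [100.0, -448.0, 7.0]]
scales = rowwise_scales(rows)
```

The intermediate `abs` tensor is only a stepping stone to `amax`, which is why saving it added memory cost without avoiding any recomputation that matters.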
