From c6e9d64a36d3f64024e080d82c7aff7636c7d10b Mon Sep 17 00:00:00 2001 From: Daniel Vega-Myhre Date: Tue, 4 Feb 2025 15:09:31 -0800 Subject: [PATCH 1/4] remove abs from save list for per op ac --- torchtitan/parallelisms/parallelize_llama.py | 1 - 1 file changed, 1 deletion(-) diff --git a/torchtitan/parallelisms/parallelize_llama.py b/torchtitan/parallelisms/parallelize_llama.py index 9728569ab1..5647849775 100644 --- a/torchtitan/parallelisms/parallelize_llama.py +++ b/torchtitan/parallelisms/parallelize_llama.py @@ -218,7 +218,6 @@ def apply_tp( torch.ops._c10d_functional.reduce_scatter_tensor.default, # for low precision training, it's useful to always save # the result of max(abs(tensor)) - torch.ops.aten.abs.default, torch.ops.aten.max.default, } From d6072987c653dfb3abde1344a0c0b0977f46689a Mon Sep 17 00:00:00 2001 From: Daniel Vega-Myhre Date: Tue, 4 Feb 2025 20:32:28 -0800 Subject: [PATCH 2/4] update comment --- torchtitan/parallelisms/parallelize_llama.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torchtitan/parallelisms/parallelize_llama.py b/torchtitan/parallelisms/parallelize_llama.py index 5647849775..ecf8332475 100644 --- a/torchtitan/parallelisms/parallelize_llama.py +++ b/torchtitan/parallelisms/parallelize_llama.py @@ -217,7 +217,8 @@ def apply_tp( torch.ops.aten._scaled_dot_product_flash_attention.default, torch.ops._c10d_functional.reduce_scatter_tensor.default, # for low precision training, it's useful to always save - # the result of max(abs(tensor)) + # the result of max, since the absolute maximum is + # used to compute the scaling factor for quantization. torch.ops.aten.max.default, } From 730592aab4b7d98a51325452fd340912aee93a3d Mon Sep 17 00:00:00 2001 From: Daniel Vega-Myhre Date: Wed, 5 Feb 2025 11:08:46 -0800 Subject: [PATCH 3/4] trigger build From 838ebabc2025f2d1d1752fea53e28d3e1fc7d265 Mon Sep 17 00:00:00 2001 From: Daniel Vega-Myhre Date: Wed, 5 Feb 2025 19:29:10 -0800 Subject: [PATCH 4/4] trigger ci