
Commit 7a04053

[not for land yet] example of float8 with rowwise scaling
Summary: This is an example of how to call float8 training with rowwise scaling from torchao.

TODO: finalize the API in torchao, decide how we want to expose it in torchtitan, and optimize performance.

```
// baseline (bf16 + compile)
> with-proxy CONFIG_FILE="./train_configs/llama3_8b.toml" ./run_llama_train.sh --training.compile
...
step: 20  loss:  8.4931  memory: 47.65GiB(50.16%)  tps: 5,760  mfu: 33.73%

// experiment (rowwise float8 + compile)
> with-proxy CONFIG_FILE="./train_configs/llama3_8b.toml" ./run_llama_train.sh --float8.enable_float8_linear --training.compile
...
// torchao main branch
step: 40  loss:  7.3818  memory: 66.81GiB(70.33%)  tps: 6,412  mfu: 37.55%
// torchao with pytorch/ao#1629
step: 20  loss:  8.3823  memory: 58.55GiB(61.63%)  tps: 6,424  mfu: 37.62%

// for comparison, tensorwise float8 with float8 all-gather (on main branch)
with-proxy CONFIG_FILE="./train_configs/llama3_8b.toml" ./run_llama_train.sh --float8.enable_float8_linear --training.compile --float8.enable_fsdp_float8_all_gather --float8.precompute_float8_dynamic_scale_for_fsdp
...
step: 20  loss:  8.4258  memory: 47.32GiB(49.81%)  tps: 7,186  mfu: 42.08%
```

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:
1 parent 6cb13c7 commit 7a04053
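For context, here is a minimal sketch (not part of this commit) of the torchao-level call path that this change wires up. It assumes torchao v0.9.0+, where `Float8LinearConfig.from_recipe_name` is available; the toy model, shapes, and dtype are illustrative only.

```python
# Minimal sketch, not torchtitan code: build a rowwise float8 config from a recipe
# name and swap nn.Linear modules for Float8Linear, roughly what the handler in
# torchtitan/components/float8.py does. Assumes torchao v0.9.0+ and a CUDA device.
import torch
import torch.nn as nn
from torchao.float8 import Float8LinearConfig, convert_to_float8_training

# toy stand-in for the transformer; the real flow swaps the linears inside llama3_8b
model = nn.Sequential(
    nn.Linear(4096, 4096), nn.SiLU(), nn.Linear(4096, 4096)
).to(device="cuda", dtype=torch.bfloat16)

# "rowwise" scales per row; "rowwise_with_gw_hp" additionally keeps the grad_weight
# computation in higher precision
config = Float8LinearConfig.from_recipe_name("rowwise")

# mutates the model in place, replacing nn.Linear instances with Float8Linear
convert_to_float8_training(model, config=config)

# rowwise scaling is intended to be used together with torch.compile for performance
model = torch.compile(model)
```

In torchtitan, this path is driven by the new `--float8.recipe_name` flag added in the diffs below.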

File tree: 3 files changed (+66, -26 lines)

torchtitan/components/float8.py

Lines changed: 36 additions & 15 deletions
```diff
@@ -49,25 +49,46 @@ def __init__(self, job_config: JobConfig, parallel_dims: ParallelDims):
                 "torchao is not installed. Please install it to use float8 linear layers."
             ) from e

-        # Mutates the model inplace replacing instances of torch.nn.Linear with Float8Linear
-        enable_fsdp_float8_all_gather = (
-            parallel_dims.dp_shard_enabled
-            and float8_config.enable_fsdp_float8_all_gather
-        )
-        self.config = Float8LinearConfig(
-            enable_fsdp_float8_all_gather=enable_fsdp_float8_all_gather,
-            force_recompute_fp8_weight_in_bwd=float8_config.force_recompute_fp8_weight_in_bwd,
-        )
+        if float8_config.recipe_name is not None and not hasattr(
+            Float8LinearConfig, "from_recipe_name"
+        ):
+            logger.warning(
+                "Failed to swap to Float8Linear with recipe lookup because the torchao version "
+                + "is too old, please install torchao v0.9.0 or later and try again",
+            )
+            return

         self.enabled = True

-        # for precompute_float8_dynamic_scale_for_fsdp
-        self.precompute_scale = (
-            enable_fsdp_float8_all_gather
-            and float8_config.precompute_float8_dynamic_scale_for_fsdp
-        )
+        if float8_config.recipe_name is not None:
+            assert (
+                not float8_config.enable_fsdp_float8_all_gather
+            ), "using `float8_config.enable_fsdp_float8_all_gather` together with `float8_config.recipe_name` is not supported"
+            assert (
+                not float8_config.force_recompute_fp8_weight_in_bwd
+            ), "using `float8_config.force_recompute_fp8_weight_in_bwd` together with `float8_config.recipe_name` is not supported"
+            self.config = Float8LinearConfig.from_recipe_name(float8_config.recipe_name)
+            self.precompute_scale = False
+            logger.info(
+                f"Float8 training active with recipe {float8_config.recipe_name}"
+            )

-        logger.info("Float8 training active")
+        else:
+            # Mutates the model inplace replacing instances of torch.nn.Linear with Float8Linear
+            enable_fsdp_float8_all_gather = (
+                parallel_dims.dp_shard_enabled
+                and float8_config.enable_fsdp_float8_all_gather
+            )
+            self.config = Float8LinearConfig(
+                enable_fsdp_float8_all_gather=enable_fsdp_float8_all_gather,
+                force_recompute_fp8_weight_in_bwd=float8_config.force_recompute_fp8_weight_in_bwd,
+            )
+            # for precompute_float8_dynamic_scale_for_fsdp
+            self.precompute_scale = (
+                enable_fsdp_float8_all_gather
+                and float8_config.precompute_float8_dynamic_scale_for_fsdp
+            )
+            logger.info("Float8 tensorwise scaled training active")

     def convert(self, model: nn.Module):
         return self.convert_to_float8_training(model)
```

torchtitan/config_manager.py

Lines changed: 14 additions & 4 deletions
```diff
@@ -613,20 +613,30 @@ def __init__(self):
         self.parser.add_argument(
             "--float8.enable_fsdp_float8_all_gather",
             action="store_true",
-            help="Whether enable float8 all-gather in FSDP",
+            help="Whether enable float8 all-gather in FSDP, recommended for tensorwise scaling",
         )
         self.parser.add_argument(
             "--float8.precompute_float8_dynamic_scale_for_fsdp",
             action="store_true",
-            help="Whether precompute float8 scales dynamically for FSDP",
+            help="Whether precompute float8 scales dynamically for FSDP, recommended for tensorwise scaling",
         )
         self.parser.add_argument(
             "--float8.force_recompute_fp8_weight_in_bwd",
             action="store_true",
             help="""
             Whether to force the recomputation of FP8 weights during backward pass.
-            When using FSDP, it is recommended to enable `force_recompute_fp8_weight_in_bwd`
-            to prevent saving unsharded FP8 weights for backward computation.
+            When using FSDP with tensorwise scaling, it is recommended to enable
+            `force_recompute_fp8_weight_in_bwd` to prevent saving unsharded FP8 weights
+            for backward computation.
+            """,
+        )
+        self.parser.add_argument(
+            "--float8.recipe_name",
+            type=str,
+            default=None,
+            help="""
+            If specified, creates float8 config from recipe name, valid choices are
+            `rowwise` and `rowwise_with_gw_hp`.
             """,
         )
```
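To make the interplay between the new `--float8.recipe_name` flag and the existing tensorwise-oriented flags concrete, here is a small standalone sketch; the parser and example values are hypothetical (not torchtitan's actual config plumbing), and the mutual-exclusion checks mirror the asserts added in torchtitan/components/float8.py above.

```python
import argparse

# Hypothetical standalone parser mirroring the flags above; torchtitan's real
# wiring goes through JobConfig in config_manager.py.
parser = argparse.ArgumentParser()
parser.add_argument("--float8.enable_fsdp_float8_all_gather", action="store_true")
parser.add_argument("--float8.force_recompute_fp8_weight_in_bwd", action="store_true")
parser.add_argument("--float8.recipe_name", type=str, default=None)

args = vars(parser.parse_args(["--float8.recipe_name", "rowwise"]))

# recipe-based (rowwise) configs do not compose with the tensorwise-only flags
if args["float8.recipe_name"] is not None:
    assert not args["float8.enable_fsdp_float8_all_gather"]
    assert not args["float8.force_recompute_fp8_weight_in_bwd"]
```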

torchtitan/models/llama/parallelize_llama.py

Lines changed: 16 additions & 7 deletions
```diff
@@ -56,12 +56,23 @@ def parallelize_llama(
             and not job_config.training.compile
         ):
             raise RuntimeError("Async TP requires --training.compile")
+
         enable_float8_linear = "float8" in job_config.model.converters
+        float8_is_rowwise = job_config.float8.recipe_name in (
+            "rowwise",
+            "rowwise_with_gw_hp",
+        )
+
+        # For now, float8 all-gather with TP is only supported for tensorwise
+        # float8 scaling recipes. For rowwise recipes, we use regular TP and
+        # all-gather happens in high precision.
+        enable_float8_tensorwise_tp = enable_float8_linear and not float8_is_rowwise
+
         apply_tp(
             model,
             world_mesh["tp"],
             loss_parallel=parallel_dims.loss_parallel_enabled,
-            enable_float8=enable_float8_linear,
+            enable_float8_tensorwise_tp=enable_float8_tensorwise_tp,
             enable_async_tp=job_config.experimental.enable_async_tensor_parallel,
         )

@@ -115,7 +126,7 @@ def apply_tp(
     model: nn.Module,
     tp_mesh: DeviceMesh,
     loss_parallel: bool,
-    enable_float8: bool,
+    enable_float8_tensorwise_tp: bool,
     enable_async_tp: bool,
 ):
     """Apply tensor parallelism."""
@@ -141,10 +152,8 @@ def apply_tp(
     )

     # Parallel styles used for transformer block linear weights and their
-    # inputs may be different for float8 linears
-    if enable_float8:
-        # TODO(vkuzo): once float8 configuration supports delayed scaling,
-        # add a check here to enforce supported float8 all-gather configurations
+    # inputs may be different for float8 linears with tensorwise scaling.
+    if enable_float8_tensorwise_tp:
         # TODO(vkuzo): add the items below to __init__.py of torchao.float8 and import from there
         from torchao.float8.float8_tensor_parallel import (
             Float8ColwiseParallel,
@@ -202,7 +211,7 @@ def apply_tp(
         enable_symm_mem_for_group(tp_mesh.get_group().group_name)

     logger.info(
-        f"Applied {'Float8 ' if enable_float8 else ''}{'Async ' if enable_async_tp else ''}"
+        f"Applied {'Float8 tensorwise ' if enable_float8_tensorwise_tp else ''}{'Async ' if enable_async_tp else ''}"
         "Tensor Parallelism to the model"
     )
```
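As background for the comment in the hunk above, here is a rough illustration of the two scaling granularities; the helpers below are my own sketch, not torchao's implementation. Tensorwise dynamic scaling derives a single scale from the whole tensor's absmax, while rowwise scaling derives one scale per row, which is presumably what makes a sharded float8 all-gather less straightforward for the rowwise recipes.

```python
import torch

F8_MAX = torch.finfo(torch.float8_e4m3fn).max  # 448.0 for e4m3

def tensorwise_scale(x: torch.Tensor) -> torch.Tensor:
    # one scale for the whole tensor, derived from its global absmax
    return F8_MAX / x.abs().max().clamp(min=1e-12)

def rowwise_scales(x: torch.Tensor) -> torch.Tensor:
    # one scale per row, derived from each row's absmax
    return F8_MAX / x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-12)

w = torch.randn(8, 16)
print(tensorwise_scale(w).shape)  # torch.Size([])   -- a single scalar scale
print(rowwise_scales(w).shape)    # torch.Size([8, 1]) -- one scale per row
```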
