From 7488385b5a13fa8689115556e90a47c9e7d721cc Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 28 Aug 2025 07:56:30 +0000 Subject: [PATCH 001/129] create transformer_backend folder with debug run --- .../debug/configs/debug_fsdp_2_gpu.toml | 65 +++++++++++++++++++ .../transformers_backend/debug/run_train.sh | 33 ++++++++++ 2 files changed, 98 insertions(+) create mode 100644 torchtitan/experiments/transformers_backend/debug/configs/debug_fsdp_2_gpu.toml create mode 100755 torchtitan/experiments/transformers_backend/debug/run_train.sh diff --git a/torchtitan/experiments/transformers_backend/debug/configs/debug_fsdp_2_gpu.toml b/torchtitan/experiments/transformers_backend/debug/configs/debug_fsdp_2_gpu.toml new file mode 100644 index 0000000000..db97c9b339 --- /dev/null +++ b/torchtitan/experiments/transformers_backend/debug/configs/debug_fsdp_2_gpu.toml @@ -0,0 +1,65 @@ +# FSDP-only configuration for a 2-GPU setup. +# Model is sharded across GPUs. + +[job] +dump_folder = "./outputs" +description = "Llama 3 debug training with FSDP on 2 GPUs" +print_args = false +use_for_integration_test = true + +[profiling] +enable_profiling = false +save_traces_folder = "profile_trace" +profile_freq = 10 +enable_memory_snapshot = false +save_memory_snapshot_folder = "memory_snapshot" + +[metrics] +log_freq = 1 +disable_color_printing = false +enable_tensorboard = false +save_tb_folder = "tb" +enable_wandb = false + +[model] +name = "llama3" +flavor = "debugmodel" +tokenizer_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" + +[optimizer] +name = "AdamW" +lr = 8e-4 +eps = 1e-8 + +[lr_scheduler] +warmup_steps = 2 +decay_ratio = 0.8 +decay_type = "linear" +min_lr_factor = 0.0 + +[training] +local_batch_size = 8 +seq_len = 2048 +max_norm = 1.0 +steps = 10 +compile = false +dataset = "c4_test" +dataset_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test" + +[parallelism] +data_parallel_replicate_degree = 1 +data_parallel_shard_degree = 2 +tensor_parallel_degree = 1 +pipeline_parallel_degree = 1 +context_parallel_degree = 1 +expert_parallel_degree = 1 + +[checkpoint] +enable_checkpoint = false + +[activation_checkpoint] +mode = "selective" +selective_ac_option = '2' + +[validation] +enabled = false \ No newline at end of file diff --git a/torchtitan/experiments/transformers_backend/debug/run_train.sh b/torchtitan/experiments/transformers_backend/debug/run_train.sh new file mode 100755 index 0000000000..fc259612bc --- /dev/null +++ b/torchtitan/experiments/transformers_backend/debug/run_train.sh @@ -0,0 +1,33 @@ +#!/usr/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -ex + +# use envs as local overwrites for convenience +# e.g. 
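+# MODE=debug NGPU=2 CONFIG_FILE=configs/debug_fsdp_2_gpu.toml ./run_train.sh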
+# LOG_RANK=0,1 NGPU=4 ./run_train.sh +NGPU=${NGPU:-"8"} +export LOG_RANK=${LOG_RANK:-0} + +# Option to switch between debug and train +MODE=${MODE:-"train"} # Set MODE=debug or MODE=train + +CONFIG_FILE=${CONFIG_FILE:-"configs/debug_fsdp_2_gpu.toml"} + +if [ "$MODE" = "debug" ]; then + PYTHON_CMD="debugpy-run -m torch.distributed.run --" +else + PYTHON_CMD="torchrun" +fi + +TORCHFT_LIGHTHOUSE=${TORCHFT_LIGHTHOUSE:-"http://localhost:29510"} + +PYTORCH_ALLOC_CONF="expandable_segments:True" \ +TORCHFT_LIGHTHOUSE=${TORCHFT_LIGHTHOUSE} \ +$PYTHON_CMD --nproc_per_node=${NGPU} --rdzv_backend c10d --rdzv_endpoint="localhost:0" \ +--local-ranks-filter ${LOG_RANK} --role rank --tee 3 \ +-m torchtitan.train --job.config_file ${CONFIG_FILE} "$@" \ No newline at end of file From 39a3b34907975732d0777eba07aec2111bfd658f Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 28 Aug 2025 08:45:18 +0000 Subject: [PATCH 002/129] add hf config --- .../configs/debug_1_gpu.toml | 63 +++++++++++++++++++ .../configs/debug_1_gpu_hf.toml | 62 ++++++++++++++++++ .../{debug => }/configs/debug_fsdp_2_gpu.toml | 0 .../{debug => }/run_train.sh | 2 +- 4 files changed, 126 insertions(+), 1 deletion(-) create mode 100644 torchtitan/experiments/transformers_backend/configs/debug_1_gpu.toml create mode 100644 torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml rename torchtitan/experiments/transformers_backend/{debug => }/configs/debug_fsdp_2_gpu.toml (100%) rename torchtitan/experiments/transformers_backend/{debug => }/run_train.sh (94%) diff --git a/torchtitan/experiments/transformers_backend/configs/debug_1_gpu.toml b/torchtitan/experiments/transformers_backend/configs/debug_1_gpu.toml new file mode 100644 index 0000000000..c2f4dd7136 --- /dev/null +++ b/torchtitan/experiments/transformers_backend/configs/debug_1_gpu.toml @@ -0,0 +1,63 @@ +[job] +dump_folder = "./outputs" +description = "Llama 3 debug training with FSDP on 2 GPUs" +print_args = false +use_for_integration_test = true + +[profiling] +enable_profiling = false +save_traces_folder = "profile_trace" +profile_freq = 10 +enable_memory_snapshot = false +save_memory_snapshot_folder = "memory_snapshot" + +[metrics] +log_freq = 1 +disable_color_printing = false +enable_tensorboard = false +save_tb_folder = "tb" +enable_wandb = false + +[model] +name = "llama3" +hf_name = "Llama-3.2-3B" +flavor = "debugmodel" +tokenizer_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" + +[optimizer] +name = "AdamW" +lr = 8e-4 +eps = 1e-8 + +[lr_scheduler] +warmup_steps = 2 +decay_ratio = 0.8 +decay_type = "linear" +min_lr_factor = 0.0 + +[training] +local_batch_size = 8 +seq_len = 2048 +max_norm = 1.0 +steps = 10 +compile = false +dataset = "c4_test" +dataset_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test" + +[parallelism] +data_parallel_replicate_degree = 1 +data_parallel_shard_degree = 1 +tensor_parallel_degree = 1 +pipeline_parallel_degree = 1 +context_parallel_degree = 1 +expert_parallel_degree = 1 + +[checkpoint] +enable_checkpoint = false + +[activation_checkpoint] +mode = "selective" +selective_ac_option = '2' + +[validation] +enabled = false \ No newline at end of file diff --git a/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml b/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml new file mode 100644 index 0000000000..a314d1711e --- /dev/null +++ b/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml @@ -0,0 +1,62 @@ 
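+# Single-GPU debug configuration that selects the model by its Hugging Face hub id
+# (see the [model] section below) rather than a built-in torchtitan model name.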
+[job] +dump_folder = "./outputs" +description = "Llama 3 debug training with FSDP on 2 GPUs" +print_args = false +use_for_integration_test = true + +[profiling] +enable_profiling = false +save_traces_folder = "profile_trace" +profile_freq = 10 +enable_memory_snapshot = false +save_memory_snapshot_folder = "memory_snapshot" + +[metrics] +log_freq = 1 +disable_color_printing = false +enable_tensorboard = false +save_tb_folder = "tb" +enable_wandb = false + +[model] +name = "meta-llama/Llama-3.2-3B" +flavor = "debugmodel" +tokenizer_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" + +[optimizer] +name = "AdamW" +lr = 8e-4 +eps = 1e-8 + +[lr_scheduler] +warmup_steps = 2 +decay_ratio = 0.8 +decay_type = "linear" +min_lr_factor = 0.0 + +[training] +local_batch_size = 8 +seq_len = 2048 +max_norm = 1.0 +steps = 10 +compile = false +dataset = "c4_test" +dataset_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test" + +[parallelism] +data_parallel_replicate_degree = 1 +data_parallel_shard_degree = 1 +tensor_parallel_degree = 1 +pipeline_parallel_degree = 1 +context_parallel_degree = 1 +expert_parallel_degree = 1 + +[checkpoint] +enable_checkpoint = false + +[activation_checkpoint] +mode = "selective" +selective_ac_option = '2' + +[validation] +enabled = false \ No newline at end of file diff --git a/torchtitan/experiments/transformers_backend/debug/configs/debug_fsdp_2_gpu.toml b/torchtitan/experiments/transformers_backend/configs/debug_fsdp_2_gpu.toml similarity index 100% rename from torchtitan/experiments/transformers_backend/debug/configs/debug_fsdp_2_gpu.toml rename to torchtitan/experiments/transformers_backend/configs/debug_fsdp_2_gpu.toml diff --git a/torchtitan/experiments/transformers_backend/debug/run_train.sh b/torchtitan/experiments/transformers_backend/run_train.sh similarity index 94% rename from torchtitan/experiments/transformers_backend/debug/run_train.sh rename to torchtitan/experiments/transformers_backend/run_train.sh index fc259612bc..74ef5603b1 100755 --- a/torchtitan/experiments/transformers_backend/debug/run_train.sh +++ b/torchtitan/experiments/transformers_backend/run_train.sh @@ -16,7 +16,7 @@ export LOG_RANK=${LOG_RANK:-0} # Option to switch between debug and train MODE=${MODE:-"train"} # Set MODE=debug or MODE=train -CONFIG_FILE=${CONFIG_FILE:-"configs/debug_fsdp_2_gpu.toml"} +CONFIG_FILE=${CONFIG_FILE:-"configs/debug_1_gpu.toml"} if [ "$MODE" = "debug" ]; then PYTHON_CMD="debugpy-run -m torch.distributed.run --" From ea7c594c263aa856f1e82899427517bdc315bf8e Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 28 Aug 2025 09:22:45 +0000 Subject: [PATCH 003/129] can now register train spec for hf model --- torchtitan/experiments/__init__.py | 1 + .../transformers_backend/__init__.py | 56 ++ .../infra/parallelize_hf_transformers.py | 503 ++++++++++++++++++ .../model/hf_transformers_args.py | 127 +++++ 4 files changed, 687 insertions(+) create mode 100644 torchtitan/experiments/transformers_backend/__init__.py create mode 100644 torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py create mode 100644 torchtitan/experiments/transformers_backend/model/hf_transformers_args.py diff --git a/torchtitan/experiments/__init__.py b/torchtitan/experiments/__init__.py index 9d81f6b885..32a41004a2 100644 --- a/torchtitan/experiments/__init__.py +++ b/torchtitan/experiments/__init__.py @@ -7,3 +7,4 @@ import torchtitan.experiments.llama4 # noqa: F401 import torchtitan.experiments.qwen3 import 
torchtitan.experiments.simple_fsdp # noqa: F401 +import torchtitan.experiments.transformers_backend # noqa: F401 \ No newline at end of file diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py new file mode 100644 index 0000000000..5ec6386a2b --- /dev/null +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -0,0 +1,56 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import dataclasses + +from torchtitan.components.loss import build_cross_entropy_loss +from torchtitan.components.lr_scheduler import build_lr_schedulers +from torchtitan.components.optimizer import build_optimizers +from torchtitan.datasets.hf_datasets import build_hf_dataloader +from torchtitan.components.tokenizer import build_hf_tokenizer + +from torchtitan.models.llama3 import pipeline_llama +from torchtitan.protocols.train_spec import register_train_spec, TrainSpec + +from .infra.parallelize_hf_transformers import parallelize_hf_transformers +from .model.hf_transformers_args import HFTransformerModelArgs + +from transformers.models.llama.modeling_llama import LlamaForCausalLM + + +__all__ = [ + "HFTransformerModelArgs", + "LlamaForCausalLM", #TODO(3outeille): later use AutoModelForCausalLM + "hf_transformers_configs", +] + + +hf_configs = { + "debugmodel": HFTransformerModelArgs( + dim=256, + n_layers=6, + n_heads=16, + rope_theta=500000, + ), +} + +hf_train_spec = TrainSpec( + name="hf_auto_model", + model_cls=LlamaForCausalLM, + model_args=hf_configs, + parallelize_fn=parallelize_hf_transformers, + pipelining_fn=pipeline_llama, + build_optimizers_fn=build_optimizers, + build_lr_schedulers_fn=build_lr_schedulers, + build_dataloader_fn=build_hf_dataloader, + build_tokenizer_fn=build_hf_tokenizer, + build_loss_fn=build_cross_entropy_loss, +) + +# Register multiple train_specs under the same name +register_train_spec(hf_train_spec) +register_train_spec(dataclasses.replace(hf_train_spec, name="meta-llama/Llama-3.2-3B")) +register_train_spec(dataclasses.replace(hf_train_spec, name="meta-llama/Llama-3.2-1B")) \ No newline at end of file diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py new file mode 100644 index 0000000000..3f26036dc8 --- /dev/null +++ b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -0,0 +1,503 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
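+
+# Parallelization entry point for the transformers backend: applies tensor
+# parallelism (including MoE expert/tensor parallelism), activation checkpointing,
+# torch.compile, and FSDP/HSDP/DDP to the model. apply_ac and apply_ddp are
+# reused from the llama3 implementation.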
+ + +import torch +import torch.nn as nn +from torch.distributed.device_mesh import DeviceMesh +from torch.distributed.fsdp import CPUOffloadPolicy, fully_shard, MixedPrecisionPolicy +from torch.distributed.tensor import Partial, Replicate, Shard +from torch.distributed.tensor.parallel import ( + ColwiseParallel, + parallelize_module, + PrepareModuleInput, + PrepareModuleInputOutput, + RowwiseParallel, + SequenceParallel, +) +from torchtitan.config import JobConfig, TORCH_DTYPE_MAP +from torchtitan.distributed import ParallelDims + +from torchtitan.distributed.expert_parallel import ( + ExpertParallel, + ExpertTensorParallel, + NoParallel, + ReordererSequenceParallel, + TensorParallel, +) +from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp + +from torchtitan.models.llama3.infra.parallelize import apply_ac, apply_ddp +from torchtitan.tools.logging import logger + + +def parallelize_hf_transformers( + model: nn.Module, + parallel_dims: ParallelDims, + job_config: JobConfig, +): + """ + Apply tensor parallelism, activation checkpointing, torch.compile, and data + parallelism to the model. + + NOTE: The passed-in model preferably should be on meta device. Otherwise, + the model must fit on GPU or CPU memory. + """ + world_mesh = parallel_dims.world_mesh + # TODO: TP currently cannot handle uneven seq_len because we set + # `use_local_output=True` to use plain Tensors for legacy reasons. + # Need to revisit this. + assert ( + job_config.training.seq_len % parallel_dims.seq_len_divisor == 0 + ), f""" + Sequence length {job_config.training.seq_len} must be divisible by the product of TP degree + ({parallel_dims.tp}) and 2 * CP degree ({parallel_dims.cp}). + """ + + if ( + job_config.parallelism.context_parallel_degree > 1 + and model.model_args.use_flex_attn + ): + raise NotImplementedError("CP support for FlexAttention is still in progress.") + + if parallel_dims.tp_enabled: + enable_float8_linear = "float8" in job_config.model.converters + float8_is_rowwise = job_config.float8.recipe_name in ( + "rowwise", + "rowwise_with_gw_hp", + ) + + # For now, float8 all-gather with TP is only supported for tensorwise + # float8 scaling recipes. For rowwise recipes, we use regular TP and + # all-gather happens in high precision. 
+ enable_float8_tensorwise_tp = enable_float8_linear and not float8_is_rowwise + + apply_non_moe_tp( + model, + world_mesh["tp"], + loss_parallel=not job_config.parallelism.disable_loss_parallel, + enable_float8_tensorwise_tp=enable_float8_tensorwise_tp, + ) + maybe_enable_async_tp(job_config, world_mesh["tp"]) + + if parallel_dims.tp_enabled or parallel_dims.ep_enabled: + apply_moe_ep_tp( + model, + tp_mesh=world_mesh["tp"] if parallel_dims.tp_enabled else None, + ep_mesh=world_mesh["ep"] if parallel_dims.ep_enabled else None, + ep_tp_mesh=( + world_mesh["ep", "tp"] + if parallel_dims.tp_enabled + and parallel_dims.ep_enabled + and parallel_dims.etp_enabled + else None + ), + etp_enabled=parallel_dims.etp_enabled, + ) + + if job_config.activation_checkpoint.mode != "none": + apply_ac(model, job_config.activation_checkpoint) + + model_compile_enabled = ( + job_config.compile.enable and "model" in job_config.compile.components + ) + # turn on per-TransformerBlock compile after AC wrapping and before FSDP + if model_compile_enabled: + # NOTE: needed for torch.compile to work with dynamic shapes in token-choice MoE + torch._dynamo.config.capture_scalar_outputs = True + apply_compile(model) + + dp_mesh: DeviceMesh | None = None + if parallel_dims.fsdp_enabled or parallel_dims.ep_enabled: + # apply FSDP or HSDP, potentially with Context Parallel + if parallel_dims.dp_replicate_enabled: + dp_mesh_dim_names = ("dp_replicate", "dp_shard_cp") + else: + dp_mesh_dim_names = ("dp_shard_cp",) + dp_mesh = world_mesh[tuple(dp_mesh_dim_names)] + + # the mesh dim names of which the MoE params are sharded on via FSDP/HSDP + dp_mod_ep_mesh_dim_names = [] + if parallel_dims.ep_enabled: + if parallel_dims.dp_replicate_enabled: + dp_mod_ep_mesh_dim_names.append("dp_replicate") + dp_mod_ep_mesh_dim_names.append("dp_shard_mod_ep") + + apply_fsdp( + model, + dp_mesh, + param_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_param], + reduce_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_reduce], + pp_enabled=parallel_dims.pp_enabled, + cpu_offload=job_config.training.enable_cpu_offload, + reshard_after_forward_policy=job_config.parallelism.fsdp_reshard_after_forward, + ep_degree=parallel_dims.ep, + dp_mod_ep_mesh=( + world_mesh[tuple(dp_mod_ep_mesh_dim_names)] + if parallel_dims.ep_enabled + else None + ), + gradient_divide_factor=parallel_dims.fsdp_gradient_divide_factor, + ) + + if parallel_dims.dp_replicate_enabled: + logger.info("Applied HSDP to the model") + else: + logger.info("Applied FSDP to the model") + + if parallel_dims.cp_enabled: + logger.info("Applied Context Parallel to the model") + + if job_config.training.enable_cpu_offload: + logger.info("Applied CPU Offloading to the model") + elif parallel_dims.dp_replicate_enabled: + if world_mesh.ndim > 1: + raise RuntimeError("DDP has not supported > 1D parallelism") + dp_mesh = world_mesh + apply_ddp( + model, + dp_mesh, + enable_compile=model_compile_enabled, + enable_compiled_autograd=job_config.parallelism.enable_compiled_autograd, + ) + + return model + + +def apply_non_moe_tp( + model: nn.Module, + tp_mesh: DeviceMesh, + loss_parallel: bool, + enable_float8_tensorwise_tp: bool, +): + """Apply tensor parallelism.""" + # 1. Parallelize the embedding and shard its outputs (which are the first + # transformer block's inputs) + # 2. Parallelize the root norm layer over the sequence dim + # 3. 
Parallelize the final linear output layer + parallelize_module( + model, + tp_mesh, + { + "tok_embeddings": RowwiseParallel( + input_layouts=Replicate(), + output_layouts=Shard(1), + ), + "norm": SequenceParallel(), + "output": ColwiseParallel( + input_layouts=Shard(1), + output_layouts=Shard(-1) if loss_parallel else Replicate(), + use_local_output=not loss_parallel, + ), + }, + ) + + # Parallel styles used for transformer block linear weights and their + # inputs may be different for float8 linears with tensorwise scaling. + if enable_float8_tensorwise_tp: + # TODO(vkuzo): add the items below to __init__.py of torchao.float8 and import from there + from torchao.float8.float8_tensor_parallel import ( + Float8ColwiseParallel, + Float8RowwiseParallel, + PrepareFloat8ModuleInput, + ) + + rowwise_parallel, colwise_parallel, prepare_module_input = ( + Float8RowwiseParallel, + Float8ColwiseParallel, + PrepareFloat8ModuleInput, + ) + else: + rowwise_parallel, colwise_parallel, prepare_module_input = ( + RowwiseParallel, + ColwiseParallel, + PrepareModuleInput, + ) + + # Apply tensor + sequence parallelism to every transformer block + for transformer_block in model.layers.values(): + layer_plan = { + "attention_norm": SequenceParallel(), + "attention": prepare_module_input( + input_layouts=(Shard(1), None), + desired_input_layouts=(Replicate(), None), + ), + "attention.wq": colwise_parallel(), + "attention.wk": colwise_parallel(), + "attention.wv": colwise_parallel(), + "attention.wo": rowwise_parallel(output_layouts=Shard(1)), + "ffn_norm": SequenceParallel(), + } + if not transformer_block.moe_enabled: + layer_plan.update( + { + "feed_forward": prepare_module_input( + input_layouts=(Shard(1),), + desired_input_layouts=(Replicate(),), + ), + "feed_forward.w1": colwise_parallel(), + "feed_forward.w2": rowwise_parallel(output_layouts=Shard(1)), + "feed_forward.w3": colwise_parallel(), + } + ) + + parallelize_module( + module=transformer_block, + device_mesh=tp_mesh, + parallelize_plan=layer_plan, + ) + + logger.info( + f"Applied {'Float8 tensorwise ' if enable_float8_tensorwise_tp else ''}" + "Tensor Parallelism to the model" + ) + + +def apply_fsdp( + model: nn.Module, + dp_mesh: DeviceMesh, + param_dtype: torch.dtype, + reduce_dtype: torch.dtype, + pp_enabled: bool, + cpu_offload: bool = False, + reshard_after_forward_policy: str = "default", + ep_degree: int = 1, + dp_mod_ep_mesh: DeviceMesh | None = None, + gradient_divide_factor: int | None = None, +): + """ + Apply data parallelism (via FSDP2) to the model. + + Args: + model (nn.Module): The model to apply data parallelism to. + dp_mesh (DeviceMesh): The device mesh to use for data parallelism. + param_dtype (torch.dtype): The data type to use for model parameters. + reduce_dtype (torch.dtype): The data type to use for reduction operations. + pp_enabled (bool): Whether pipeline parallelism is enabled. + cpu_offload (bool, optional): Whether to offload model parameters to CPU. Defaults to False. + reshard_after_forward_policy (str, optional): The policy to use for resharding after forward pass. Defaults to "default". + Other options: "never", "always". + - "default" applies default resharding behavior, implementing "smart defaults" for known optimal scenarios. + - "always" will enable `reshard_after_forward` for all forward passes. + - "never" will disable `reshard_after_forward` for all forward passes. 
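+        ep_degree (int, optional): Expert parallel degree. When greater than 1, the routed
+            experts in each MoE layer are wrapped separately on dp_mod_ep_mesh. Defaults to 1.
+        dp_mod_ep_mesh (DeviceMesh, optional): Mesh used to shard the routed experts when
+            expert parallelism is enabled. Defaults to None.
+        gradient_divide_factor (int, optional): Gradient division factor applied to the
+            experts' FSDP group so gradient reduction stays consistent with the data mesh.
+            Defaults to None.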
+ + """ + mp_policy = MixedPrecisionPolicy(param_dtype=param_dtype, reduce_dtype=reduce_dtype) + fsdp_config = {"mesh": dp_mesh, "mp_policy": mp_policy} + if cpu_offload: + fsdp_config["offload_policy"] = CPUOffloadPolicy() + + match reshard_after_forward_policy: + case "always": + reshard_after_forward = True + case "never": + reshard_after_forward = False + case "default": + # For PP, by default do not reshard after forward to avoid per-microbatch + # all-gathers, which can be expensive and non-overlapped + reshard_after_forward = not pp_enabled + case _: + raise ValueError( + f"Invalid reshard_after_forward_policy: {reshard_after_forward_policy}." + ) + + if model.tok_embeddings is not None: + fully_shard( + model.tok_embeddings, + **fsdp_config, + reshard_after_forward=reshard_after_forward, + ) + + for layer_id, transformer_block in model.layers.items(): + # NOTE: When EP is enabled, In an MoE layer, we use the following FSDP wrapping + # - the router and the shared experts are sharded together with the TransformerBlock + # - the routed experts are sharded with the remaining dp_mod_ep_mesh + if transformer_block.moe_enabled and ep_degree > 1: + fsdp_mod_ep_config = fsdp_config.copy() + fsdp_mod_ep_config["mesh"] = dp_mod_ep_mesh + + # NOTE: EP alreadys shards the routed experts on dim 0 (num_experts). + # When dp_mod_ep * ep > num_experts, FSDP default dim-0 sharding + # causes inefficiency, so we choose to do FSDP sharding on dim-1. + # Even when EP is not used, we may still want to shard the experts + # on non-0 dim. For now it may not be worth the complexity to support + # shard_placement_fn on the outer TransformerBlock-level FSDP. + _experts_shard_placement_fn = None + assert dp_mod_ep_mesh is not None + assert hasattr(transformer_block, "moe") + if ( + dp_mod_ep_mesh.size() * ep_degree + > transformer_block.moe.experts.num_experts + ): + _experts_shard_placement_fn = lambda param: Shard(1) + + fully_shard( + transformer_block.moe.experts, + **fsdp_mod_ep_config, + reshard_after_forward=reshard_after_forward, + shard_placement_fn=_experts_shard_placement_fn, + ) + + # NOTE: # Although the FSDP sharding of experts is done on a mesh of + # a different size than other parameters, the gradient division + # factor should be consistent with data. 
+ transformer_block.moe.experts.set_gradient_divide_factor( + gradient_divide_factor, + ) + + fully_shard( + transformer_block, + **fsdp_config, + reshard_after_forward=reshard_after_forward, + ) + + # As an optimization, do not reshard_after_forward the last layers by default + # since FSDP would prefetch them immediately after the forward pass + if model.norm is not None and model.output is not None: + fully_shard( + [model.norm, model.output], + **fsdp_config, + reshard_after_forward=reshard_after_forward_policy == "always", + ) + + fully_shard(model, **fsdp_config) + + # NOTE: set up explicit prefetching when EP is enabled, as D2H syncs + # in EP could interfere with implicit prefetching in FSDP + if ep_degree == 1: + return + + # forward + transformer_blocks = list(model.layers.values()) + next_transformer_blocks = transformer_blocks[1:] + [None] + + if model.tok_embeddings is not None and model.layers is not None: + model.tok_embeddings.set_modules_to_forward_prefetch([transformer_blocks[0]]) + + for transformer_block, next_transformer_block in zip( + transformer_blocks, next_transformer_blocks + ): + if next_transformer_block is not None: + if next_transformer_block.moe_enabled: + transformer_block.set_modules_to_forward_prefetch( + [next_transformer_block, next_transformer_block.moe.experts] + ) + else: + transformer_block.set_modules_to_forward_prefetch( + [next_transformer_block] + ) + elif model.norm is not None and model.output is not None: + transformer_block.set_modules_to_forward_prefetch( + [model.norm, model.output] + ) + + # backward + reversed_transformer_blocks = list(reversed(model.layers.values())) + prev_transformer_blocks = reversed_transformer_blocks[1:] + [None] + + if model.norm is not None and model.output is not None and model.layers is not None: + model.output.set_modules_to_backward_prefetch([reversed_transformer_blocks[0]]) + + for transformer_block, prev_transformer_block in zip( + reversed_transformer_blocks, prev_transformer_blocks + ): + if prev_transformer_block is not None: + if prev_transformer_block.moe_enabled: + transformer_block.set_modules_to_backward_prefetch( + [prev_transformer_block, prev_transformer_block.moe.experts] + ) + else: + transformer_block.set_modules_to_backward_prefetch( + [prev_transformer_block] + ) + elif model.tok_embeddings is not None: + transformer_block.set_modules_to_backward_prefetch([model.tok_embeddings]) + + +def apply_moe_ep_tp( + model: nn.Module, + tp_mesh: DeviceMesh | None, + ep_mesh: DeviceMesh | None, + ep_tp_mesh: DeviceMesh | None, + etp_enabled: bool, +): + for transformer_block in model.layers.values(): + if not transformer_block.moe_enabled: + continue + + if tp_mesh is not None: + moe_layer_plan = { + # input / output sharding on the seqlen dim + # all-gather for input, reduce-scatter for output + "moe": PrepareModuleInputOutput( + input_layouts=(Shard(1),), + desired_input_layouts=(Replicate(),), + use_local_input=True, + output_layouts=(Partial(),), + desired_output_layouts=(Shard(1),), + ), + # replicate computation for the router + "moe.router.gate": NoParallel(), + } + if ep_mesh is not None and not etp_enabled: + # If TP is borrowed for EP, then split the tokens across TP ranks so that + # the reorderer, the all-to-all comms, and routed experts computation + # are effectively running Sequence Parallel (split along the folded bs*slen dim) + moe_layer_plan.update({"moe.reorderer": ReordererSequenceParallel()}) + if transformer_block.moe.shared_experts is not None: + # input Replicate, output Partial 
+ moe_layer_plan.update( + { + "moe.shared_experts.w1": ColwiseParallel(), + "moe.shared_experts.w2": RowwiseParallel( + output_layouts=Partial() + ), + "moe.shared_experts.w3": ColwiseParallel(), + } + ) + parallelize_module( + module=transformer_block, + device_mesh=tp_mesh, + parallelize_plan=moe_layer_plan, + ) + + experts_mesh, experts_plan = None, None + if ep_mesh is None: + experts_mesh = tp_mesh + # input Replicate, output Partial + experts_plan = TensorParallel() + elif tp_mesh is None: + experts_mesh = ep_mesh + # input / output sharding on the batch / tokens dim + experts_plan = ExpertParallel() + elif etp_enabled: + experts_mesh = ep_tp_mesh + experts_plan = ExpertTensorParallel(tp_mesh=tp_mesh, ep_mesh=ep_mesh) + else: + experts_mesh = ep_mesh + experts_plan = ExpertParallel() + + parallelize_module( + module=transformer_block.moe.experts, + device_mesh=experts_mesh, + parallelize_plan=experts_plan, + ) + + +def apply_compile(model: nn.Module): + """ + Apply torch.compile to each TransformerBlock, which makes compilation efficient due to + repeated structure. Alternatively one can compile the whole model (after applying DP). + """ + for layer_id, transformer_block in model.layers.named_children(): + # TODO: remove when torch.compile supports fullgraph=True for MoE + fullgraph = True + if transformer_block.moe_enabled: + fullgraph = False + transformer_block = torch.compile(transformer_block, fullgraph=fullgraph) + model.layers.register_module(layer_id, transformer_block) + + logger.info("Compiling each TransformerBlock with torch.compile") diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py new file mode 100644 index 0000000000..92e149625b --- /dev/null +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -0,0 +1,127 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +from dataclasses import dataclass, field + +from torch import nn + +from torchtitan.config import JobConfig + +from torchtitan.models.moe import MoEArgs +from torchtitan.protocols import BaseModelArgs +from torchtitan.tools.logging import logger +from torchtitan.tools.utils import has_cuda_capability + + +@dataclass +class HFTransformerModelArgs(BaseModelArgs): + dim: int = 4096 + n_layers: int = 32 + n_heads: int = 32 + n_kv_heads: int | None = None + vocab_size: int = 202048 + multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2 + ffn_dim_multiplier: float | None = None + norm_eps: float = 1e-5 + rope_theta: float = 10000 + + max_seq_len: int = 1048576 + # If `True`, then each transformer block init uses its layer ID, and if + # `False`, each uses the total number of transformer blocks + depth_init: bool = True + + use_flex_attn: bool = False + attn_mask_type: str = "causal" + # iRoPE settings + # When ``every_n_layers_nope`` is specified, NoPE (no positional embedding) is + # used every n layers. Other layers uses RoPE (rotary positional embedding) and + # the inner attention of those layer will use the fixed block size specified by + # ``fixed_attn_block_size``. ``fixed_attn_block_size`` means that the query will + # only attend to the tokens within the same block regardless how long is the + # sequence. 
+ every_n_layers_nope: int | None = None + fixed_attn_block_size: int = 8192 + + # MoE + moe_args: MoEArgs = field(default_factory=MoEArgs) + auto_scale_hidden_dim: bool = True + # frequency of using MoE layer instead of feedforward layer in a transformer block + interleave_moe_layer_step: int = 2 + + def update_from_config(self, job_config: JobConfig, **kwargs) -> None: + seq_len = job_config.training.seq_len + if seq_len > self.max_seq_len: + logger.warning( + f"Sequence length {seq_len} exceeds original maximum {self.max_seq_len}." + ) + self.max_seq_len = seq_len + + if self.moe_args.use_grouped_mm and not has_cuda_capability(9, 0): + logger.warning( + "Failed to use grouped mm, which is only supported on SM90 or later", + ) + self.moe_args.use_grouped_mm = False + + if job_config.parallelism.context_parallel_degree > 1 and self.use_flex_attn: + raise NotImplementedError( + "CP support for FlexAttention is still in progress." + ) + + def get_nparams_and_flops( + self, model: nn.Module, seq_len: int + ) -> tuple[int, float]: + return 0, 0 + # nparams_embedding = 0 + # nparams_moe_router = 0 + # nparams_shared_experts = 0 + # nparams_experts = 0 + # nparams_dense = 0 + + # for name, p in model.named_parameters(): + # if "embedding" in name: + # nparams_embedding += p.numel() + # nparams_dense += p.numel() + # elif "moe.shared_experts" in name: + # nparams_shared_experts += p.numel() + # elif "moe.router" in name: + # nparams_moe_router += p.numel() + # elif "moe.experts" in name: + # nparams_experts += p.numel() + # else: + # nparams_dense += p.numel() + + # nparams_sparse = nparams_moe_router + nparams_shared_experts + nparams_experts + # nparams = nparams_dense + nparams_sparse + # nparams_sparse_active = ( + # nparams_moe_router + # + nparams_shared_experts + # + nparams_experts * self.moe_args.top_k // self.moe_args.num_experts + # ) + + # logger.info( + # f"Total parameter count: dense {nparams_dense:,}, " + # f"sparse {nparams_sparse:,}, active {nparams_dense + nparams_sparse_active:,}" + # ) + + # l, h, q, t = ( + # self.n_layers, + # self.n_heads, + # self.dim // self.n_heads, + # seq_len, + # ) + # # Reasoning behind the factor of 12 for the self-attention part of the formula: + # # 1. each self-attention has 2 matmul in the forward and 4 in the backward (6) + # # 2. the flash attention does 1 more matmul recomputation in the backward + # # but recomputation should not be counted in calculating MFU (+0) + # # 3. each matmul performs 1 multiplication and 1 addition (*2) + # # 4. 
we follow the convention and do not account for sparsity in causal attention + # num_flops_per_token = ( + # 6 * (nparams_dense - nparams_embedding + nparams_sparse_active) + # + 12 * l * h * q * t + # ) + + # return nparams, num_flops_per_token From 5f0adf5c226aa9af321cdd27b7d379fd03823e10 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 28 Aug 2025 13:56:04 +0000 Subject: [PATCH 004/129] can now switch with different flavors using HF Llama modeling --- .../transformers_backend/__init__.py | 15 +- .../configs/debug_1_gpu.toml | 1 - .../configs/debug_1_gpu_hf.toml | 4 +- .../model/hf_transformers_args.py | 148 ++++++------------ 4 files changed, 64 insertions(+), 104 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 5ec6386a2b..e416731205 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -28,19 +28,26 @@ ] -hf_configs = { - "debugmodel": HFTransformerModelArgs( - dim=256, +flavors = { + "debug": HFTransformerModelArgs( + dim=1, n_layers=6, n_heads=16, rope_theta=500000, ), + "medium": HFTransformerModelArgs( + dim=40, + n_layers=24, + n_heads=32, + rope_theta=500000, + ), + "full": HFTransformerModelArgs(), } hf_train_spec = TrainSpec( name="hf_auto_model", model_cls=LlamaForCausalLM, - model_args=hf_configs, + model_args=flavors, parallelize_fn=parallelize_hf_transformers, pipelining_fn=pipeline_llama, build_optimizers_fn=build_optimizers, diff --git a/torchtitan/experiments/transformers_backend/configs/debug_1_gpu.toml b/torchtitan/experiments/transformers_backend/configs/debug_1_gpu.toml index c2f4dd7136..34f6953869 100644 --- a/torchtitan/experiments/transformers_backend/configs/debug_1_gpu.toml +++ b/torchtitan/experiments/transformers_backend/configs/debug_1_gpu.toml @@ -20,7 +20,6 @@ enable_wandb = false [model] name = "llama3" -hf_name = "Llama-3.2-3B" flavor = "debugmodel" tokenizer_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" diff --git a/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml b/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml index a314d1711e..30872e903c 100644 --- a/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml +++ b/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml @@ -19,8 +19,8 @@ save_tb_folder = "tb" enable_wandb = false [model] -name = "meta-llama/Llama-3.2-3B" -flavor = "debugmodel" +name = "meta-llama/Llama-3.2-1B" +flavor = "medium" tokenizer_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" [optimizer] diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 92e149625b..956ce5a853 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -4,124 +4,78 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
- -from dataclasses import dataclass, field +from dataclasses import dataclass +from typing import Optional, Union +import os from torch import nn - from torchtitan.config import JobConfig - -from torchtitan.models.moe import MoEArgs from torchtitan.protocols import BaseModelArgs from torchtitan.tools.logging import logger -from torchtitan.tools.utils import has_cuda_capability +from transformers.models.llama.configuration_llama import LlamaConfig @dataclass class HFTransformerModelArgs(BaseModelArgs): + # Torchtitan naming dim: int = 4096 n_layers: int = 32 n_heads: int = 32 - n_kv_heads: int | None = None - vocab_size: int = 202048 - multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2 - ffn_dim_multiplier: float | None = None - norm_eps: float = 1e-5 + n_kv_heads: Optional[int] = None + vocab_size: int = 128256 + multiple_of: int = 256 + ffn_dim_multiplier: Optional[float] = None rope_theta: float = 10000 - - max_seq_len: int = 1048576 - # If `True`, then each transformer block init uses its layer ID, and if - # `False`, each uses the total number of transformer blocks + max_seq_len: int = 2048 + + # HF compatibility + rms_norm_eps: float = 1e-6 + use_cache: bool = True depth_init: bool = True - use_flex_attn: bool = False attn_mask_type: str = "causal" - # iRoPE settings - # When ``every_n_layers_nope`` is specified, NoPE (no positional embedding) is - # used every n layers. Other layers uses RoPE (rotary positional embedding) and - # the inner attention of those layer will use the fixed block size specified by - # ``fixed_attn_block_size``. ``fixed_attn_block_size`` means that the query will - # only attend to the tokens within the same block regardless how long is the - # sequence. - every_n_layers_nope: int | None = None - fixed_attn_block_size: int = 8192 - - # MoE - moe_args: MoEArgs = field(default_factory=MoEArgs) - auto_scale_hidden_dim: bool = True - # frequency of using MoE layer instead of feedforward layer in a transformer block - interleave_moe_layer_step: int = 2 - - def update_from_config(self, job_config: JobConfig, **kwargs) -> None: + eos_id: int = 0 + + def update_from_config(self, job_config: JobConfig): + #TODO(3outeille): what if we dont specify flavor? Should use full as default + flavor = getattr(job_config.model, "flavor", None) + + if flavor == "full": + model_name_or_config: Union[LlamaConfig, str, os.PathLike] = job_config.model.name + hf_model_config = LlamaConfig.from_pretrained(model_name_or_config) + + #TODO(3outeille): use getattr to handle models that don't have all the attributes + self.dim = hf_model_config.hidden_size + self.n_layers = hf_model_config.num_hidden_layers + self.n_heads = hf_model_config.num_attention_heads + self.n_kv_heads = hf_model_config.num_key_value_heads + self.vocab_size = hf_model_config.vocab_size + self.rope_theta = getattr(hf_model_config, "rope_theta", 10000.0) + self.max_seq_len = hf_model_config.max_position_embeddings + self.rms_norm_eps = getattr(hf_model_config, "rms_norm_eps", 1e-6) + + if hasattr(hf_model_config, "intermediate_size") and hf_model_config.intermediate_size: + self.ffn_dim_multiplier = hf_model_config.intermediate_size / hf_model_config.hidden_size + + # Always update max_seq_len to match training seq_len, warn if exceeded seq_len = job_config.training.seq_len if seq_len > self.max_seq_len: - logger.warning( - f"Sequence length {seq_len} exceeds original maximum {self.max_seq_len}." 
- ) + logger.warning(f"Sequence length {seq_len} exceeds original maximum {self.max_seq_len}.") self.max_seq_len = seq_len - if self.moe_args.use_grouped_mm and not has_cuda_capability(9, 0): - logger.warning( - "Failed to use grouped mm, which is only supported on SM90 or later", - ) - self.moe_args.use_grouped_mm = False - if job_config.parallelism.context_parallel_degree > 1 and self.use_flex_attn: - raise NotImplementedError( - "CP support for FlexAttention is still in progress." - ) - - def get_nparams_and_flops( - self, model: nn.Module, seq_len: int - ) -> tuple[int, float]: - return 0, 0 - # nparams_embedding = 0 - # nparams_moe_router = 0 - # nparams_shared_experts = 0 - # nparams_experts = 0 - # nparams_dense = 0 - - # for name, p in model.named_parameters(): - # if "embedding" in name: - # nparams_embedding += p.numel() - # nparams_dense += p.numel() - # elif "moe.shared_experts" in name: - # nparams_shared_experts += p.numel() - # elif "moe.router" in name: - # nparams_moe_router += p.numel() - # elif "moe.experts" in name: - # nparams_experts += p.numel() - # else: - # nparams_dense += p.numel() - - # nparams_sparse = nparams_moe_router + nparams_shared_experts + nparams_experts - # nparams = nparams_dense + nparams_sparse - # nparams_sparse_active = ( - # nparams_moe_router - # + nparams_shared_experts - # + nparams_experts * self.moe_args.top_k // self.moe_args.num_experts - # ) + raise NotImplementedError("CP support for FlexAttention is still in progress.") - # logger.info( - # f"Total parameter count: dense {nparams_dense:,}, " - # f"sparse {nparams_sparse:,}, active {nparams_dense + nparams_sparse_active:,}" - # ) + return self - # l, h, q, t = ( - # self.n_layers, - # self.n_heads, - # self.dim // self.n_heads, - # seq_len, - # ) - # # Reasoning behind the factor of 12 for the self-attention part of the formula: - # # 1. each self-attention has 2 matmul in the forward and 4 in the backward (6) - # # 2. the flash attention does 1 more matmul recomputation in the backward - # # but recomputation should not be counted in calculating MFU (+0) - # # 3. each matmul performs 1 multiplication and 1 addition (*2) - # # 4. 
we follow the convention and do not account for sparsity in causal attention - # num_flops_per_token = ( - # 6 * (nparams_dense - nparams_embedding + nparams_sparse_active) - # + 12 * l * h * q * t - # ) + def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: + nparams = sum(p.numel() for p in model.parameters()) + nparams_embedding = sum( + sum(p.numel() for p in m.parameters()) + for m in model.children() + if isinstance(m, nn.Embedding) + ) - # return nparams, num_flops_per_token + l, h, q, t = self.n_layers, self.n_heads, self.dim // self.n_heads, seq_len + num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t + return nparams, num_flops_per_token From 7c3795cf1b1ada252aefdb9e89a900cd4b10f0f4 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 28 Aug 2025 14:30:24 +0000 Subject: [PATCH 005/129] it is now working up to apply_ac --- .../model/hf_transformers_args.py | 29 ++++++++++++++++++- torchtitan/train.py | 2 +- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 956ce5a853..a9c24dd30d 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Optional, Union import os @@ -36,6 +36,8 @@ class HFTransformerModelArgs(BaseModelArgs): attn_mask_type: str = "causal" eos_id: int = 0 + _torchtitan_args: dict = field(init=False, repr=False, default_factory=dict) + def update_from_config(self, job_config: JobConfig): #TODO(3outeille): what if we dont specify flavor? Should use full as default flavor = getattr(job_config.model, "flavor", None) @@ -45,6 +47,7 @@ def update_from_config(self, job_config: JobConfig): hf_model_config = LlamaConfig.from_pretrained(model_name_or_config) #TODO(3outeille): use getattr to handle models that don't have all the attributes + # Fill torchtitan args with HF ones self.dim = hf_model_config.hidden_size self.n_layers = hf_model_config.num_hidden_layers self.n_heads = hf_model_config.num_attention_heads @@ -66,8 +69,32 @@ def update_from_config(self, job_config: JobConfig): if job_config.parallelism.context_parallel_degree > 1 and self.use_flex_attn: raise NotImplementedError("CP support for FlexAttention is still in progress.") + self._torchtitan_args = { + "dim": self.dim, + "n_layers": self.n_layers, + "n_heads": self.n_heads, + "n_kv_heads": self.n_kv_heads, + "vocab_size": self.vocab_size, + "multiple_of": self.multiple_of, + "ffn_dim_multiplier": self.ffn_dim_multiplier, + "rope_theta": self.rope_theta, + "max_seq_len": self.max_seq_len, + "rms_norm_eps": self.rms_norm_eps, + "use_cache": self.use_cache, + "depth_init": self.depth_init, + "use_flex_attn": self.use_flex_attn, + "attn_mask_type": self.attn_mask_type, + "eos_id": self.eos_id, + } return self + def convert_to_hf_config(self) -> LlamaConfig: + if not self._torchtitan_args: + raise RuntimeError( + "`update_from_config` must be called before `convert_to_hf_config` to prepare the arguments." 
+ ) + return LlamaConfig(**self._torchtitan_args) + def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: nparams = sum(p.numel() for p in model.parameters()) nparams_embedding = sum( diff --git a/torchtitan/train.py b/torchtitan/train.py index 9b69fd6798..76737c5fc7 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -155,7 +155,7 @@ def __init__(self, job_config: JobConfig): f"Building {self.train_spec.name} {job_config.model.flavor} with {model_args}" ) with torch.device("meta"): - model = self.train_spec.model_cls(model_args) + model = self.train_spec.model_cls(model_args.convert_to_hf_config()) # Build the collection of model converters. No-op if `model.converters` empty model_converters = build_model_converters(job_config, parallel_dims) From 3fb2bf825e5224d805ca8845132b76226cb97984 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Sat, 6 Sep 2025 08:03:45 +0000 Subject: [PATCH 006/129] now working up to init_weights --- torchtitan/models/llama3/infra/parallelize.py | 11 +++++++++-- torchtitan/train.py | 7 ++++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/torchtitan/models/llama3/infra/parallelize.py b/torchtitan/models/llama3/infra/parallelize.py index 7d0b5de92b..8165f8e907 100644 --- a/torchtitan/models/llama3/infra/parallelize.py +++ b/torchtitan/models/llama3/infra/parallelize.py @@ -34,6 +34,7 @@ from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp from torchtitan.tools.logging import logger +from transformers.models.llama.modeling_llama import LlamaForCausalLM def parallelize_llama( model: nn.Module, @@ -325,11 +326,17 @@ def selective_checkpointing_context_fn(): def apply_ac(model: nn.Module, ac_config: ACConfig): """Apply activation checkpointing to the model.""" - for layer_id, transformer_block in model.layers.named_children(): + # TODO(3outeille): Make it more generic later + if isinstance(model, LlamaForCausalLM): + layers = model.model.layers + else: + layers = model.layers + + for layer_id, transformer_block in layers.named_children(): transformer_block = _apply_ac_to_transformer_block( transformer_block, ac_config, base_fqn=f"layers.{layer_id}" ) - model.layers.register_module(layer_id, transformer_block) + layers.register_module(layer_id, transformer_block) logger.info(f"Applied {ac_config.mode} activation checkpointing to the model") diff --git a/torchtitan/train.py b/torchtitan/train.py index 76737c5fc7..a21bd7bf9d 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -33,6 +33,8 @@ maybe_enable_profiling, ) +from torchtitan.experiments.transformers_backend.model.hf_transformers_args import HFTransformerModelArgs + class Trainer(torch.distributed.checkpoint.stateful.Stateful): # core configs @@ -155,7 +157,10 @@ def __init__(self, job_config: JobConfig): f"Building {self.train_spec.name} {job_config.model.flavor} with {model_args}" ) with torch.device("meta"): - model = self.train_spec.model_cls(model_args.convert_to_hf_config()) + if isinstance(model_args, HFTransformerModelArgs): + model = self.train_spec.model_cls(model_args.convert_to_hf_config()) + else: + model = self.train_spec.model_cls(model_args) # Build the collection of model converters. 
No-op if `model.converters` empty model_converters = build_model_converters(job_config, parallel_dims) From 25daecaaa1952cef8ade604708544c224e29f454 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Sat, 6 Sep 2025 09:19:03 +0000 Subject: [PATCH 007/129] fix mapping when convert_to_hf_config + add breaking test to ensure proper mapping --- .../model/hf_transformers_args.py | 27 +++++----- .../test_hf_torchtitan_model_args.py | 51 +++++++++++++++++++ torchtitan/train.py | 6 ++- 3 files changed, 68 insertions(+), 16 deletions(-) create mode 100644 torchtitan/experiments/transformers_backend/test_hf_torchtitan_model_args.py diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index a9c24dd30d..94b014dfd7 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -36,7 +36,7 @@ class HFTransformerModelArgs(BaseModelArgs): attn_mask_type: str = "causal" eos_id: int = 0 - _torchtitan_args: dict = field(init=False, repr=False, default_factory=dict) + _hf_args: dict = field(init=False, repr=False, default_factory=dict) def update_from_config(self, job_config: JobConfig): #TODO(3outeille): what if we dont specify flavor? Should use full as default @@ -69,31 +69,28 @@ def update_from_config(self, job_config: JobConfig): if job_config.parallelism.context_parallel_degree > 1 and self.use_flex_attn: raise NotImplementedError("CP support for FlexAttention is still in progress.") - self._torchtitan_args = { - "dim": self.dim, - "n_layers": self.n_layers, - "n_heads": self.n_heads, - "n_kv_heads": self.n_kv_heads, + self._hf_args = { + "hidden_size": self.dim, + "num_hidden_layers": self.n_layers, + "num_attention_heads": self.n_heads, + "num_key_value_heads": self.n_kv_heads, "vocab_size": self.vocab_size, - "multiple_of": self.multiple_of, - "ffn_dim_multiplier": self.ffn_dim_multiplier, + "rope_scaling": {"type": "dynamic", "factor": 2.0}, + "intermediate_size": self.ffn_dim_multiplier, "rope_theta": self.rope_theta, - "max_seq_len": self.max_seq_len, + "max_position_embeddings": self.max_seq_len, "rms_norm_eps": self.rms_norm_eps, "use_cache": self.use_cache, - "depth_init": self.depth_init, - "use_flex_attn": self.use_flex_attn, - "attn_mask_type": self.attn_mask_type, - "eos_id": self.eos_id, + "pad_token_id": self.eos_id, } return self def convert_to_hf_config(self) -> LlamaConfig: - if not self._torchtitan_args: + if not self._hf_args: raise RuntimeError( "`update_from_config` must be called before `convert_to_hf_config` to prepare the arguments." 
) - return LlamaConfig(**self._torchtitan_args) + return LlamaConfig(**self._hf_args) def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: nparams = sum(p.numel() for p in model.parameters()) diff --git a/torchtitan/experiments/transformers_backend/test_hf_torchtitan_model_args.py b/torchtitan/experiments/transformers_backend/test_hf_torchtitan_model_args.py new file mode 100644 index 0000000000..d83f268091 --- /dev/null +++ b/torchtitan/experiments/transformers_backend/test_hf_torchtitan_model_args.py @@ -0,0 +1,51 @@ +from transformers.models.llama.configuration_llama import LlamaConfig +from torchtitan.experiments.transformers_backend.model.hf_transformers_args import ( + HFTransformerModelArgs, +) +from torchtitan.config import JobConfig + + +def print_comparison_keys(ref_dict, tt_dict): + all_keys = sorted(list(set(ref_dict.keys()) | set(tt_dict.keys()))) + print(f"{'Attribute':<30} | {'Original HF':<20} | {'TorchTitan HF':<20}") + print("-" * 75) + for key in all_keys: + ref_val = ref_dict.get(key, "N/A") + tt_val = tt_dict.get(key, "N/A") + if str(ref_val) != str(tt_val): + # Red for different + print(f"\033[91m{key:<30} | {str(ref_val):<20} | {str(tt_val):<20}\033[0m") + else: + print(f"{key:<30} | {str(ref_val):<20} | {str(tt_val):<20}") + +def compare_hf_tt_configs(model_name, flavor): + ref_hf_config = LlamaConfig() + + model_args = HFTransformerModelArgs() + job_config = JobConfig() + job_config.model.name = model_name + job_config.model.flavor = flavor + model_args.update_from_config(job_config) + tt_hf_config = model_args.convert_to_hf_config() + + ref_dict = ref_hf_config.to_dict() + tt_dict = tt_hf_config.to_dict() + + try: + assert ref_dict == tt_dict + print(f"✅ Configs match for model name {model_name} with flavor: {flavor}") + except AssertionError: + print(f"❌ Configs do not match for model name {model_name} with flavor: {flavor}! 
Showing differences:") + print_comparison_keys(ref_dict, tt_dict) + raise + +if __name__ == "__main__": + model_names = [ + "meta-llama/Llama-3.2-1B", + ] + flavors = ["full"] + + for model_name in model_names: + for flavor in flavors: + print(f"\nTesting model name: {model_name} with flavor: {flavor}") + compare_hf_tt_configs(model_name, flavor) \ No newline at end of file diff --git a/torchtitan/train.py b/torchtitan/train.py index a21bd7bf9d..d4a00ad98e 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -34,6 +34,7 @@ ) from torchtitan.experiments.transformers_backend.model.hf_transformers_args import HFTransformerModelArgs +from transformers.models.llama.modeling_llama import LlamaForCausalLM class Trainer(torch.distributed.checkpoint.stateful.Stateful): @@ -266,7 +267,10 @@ def __init__(self, job_config: JobConfig): model.to_empty(device=init_device) with torch.no_grad(): - model.init_weights(buffer_device=buffer_device) + if isinstance(model, LlamaForCausalLM): + model.post_init() + else: + model.init_weights(buffer_device=buffer_device) model.train() self.model_parts = [model] From 3e67f2cccee7c889a3fd1d23e71dc8dc648f5ad8 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 8 Sep 2025 08:42:21 +0000 Subject: [PATCH 008/129] define own apply_ac for transformer backend instead of reusing llama3 --- .../infra/parallelize_hf_transformers.py | 142 +++++++++++++++++- .../model/hf_transformers_args.py | 2 +- torchtitan/models/llama3/infra/parallelize.py | 10 +- 3 files changed, 143 insertions(+), 11 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py index 3f26036dc8..04ffaaeffb 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -4,9 +4,15 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from collections import defaultdict +from typing import Optional import torch import torch.nn as nn +from torch.distributed._composable.replicate import replicate +from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( + checkpoint_wrapper as ptd_checkpoint_wrapper, +) from torch.distributed.device_mesh import DeviceMesh from torch.distributed.fsdp import CPUOffloadPolicy, fully_shard, MixedPrecisionPolicy from torch.distributed.tensor import Partial, Replicate, Shard @@ -29,10 +35,142 @@ TensorParallel, ) from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp - -from torchtitan.models.llama3.infra.parallelize import apply_ac, apply_ddp +from torchtitan.config.job_config import ActivationCheckpoint as ACConfig from torchtitan.tools.logging import logger +from transformers.models.llama.modeling_llama import LlamaForCausalLM + +# for selective op activation checkpointing +_save_list = { + torch.ops.aten.mm.default, + torch.ops.aten._scaled_dot_product_efficient_attention.default, + torch.ops.aten._scaled_dot_product_flash_attention.default, + torch.ops._c10d_functional.reduce_scatter_tensor.default, + # for low precision training, it's useful to always save + # the result of max, since the absolute maximum is + # used to compute the scaling factor for quantization. 
+ torch.ops.aten.max.default, + torch._higher_order_ops.flex_attention, +} + +def _apply_ac_to_transformer_block( + module: nn.Module, ac_config: ACConfig, *, base_fqn: Optional[str] = None +): + valid_ac_modes = ("full", "selective") + if ac_config.mode not in valid_ac_modes: + raise ValueError( + f"Invalid AC mode: {ac_config.mode}. Valid modes: {valid_ac_modes}" + ) + + if ac_config.mode == "full": + return ptd_checkpoint_wrapper(module, preserve_rng_state=False) + + assert ac_config.mode == "selective", f"{ac_config.mode}" + use_op_sac = ac_config.selective_ac_option == "op" + use_layer_sac = ac_config.selective_ac_option.isdigit() + if not use_op_sac and not use_layer_sac: + raise ValueError( + f"Invalid selective AC option: {ac_config.selective_ac_option}. " + f"Valid options: 'op' or a positive int representing layer frequency" + ) + if use_op_sac: + from torch.utils.checkpoint import ( + CheckpointPolicy, + create_selective_checkpoint_contexts, + ) + + mm_recompute_shapes = set() + if len(ac_config.per_op_sac_force_recompute_mm_shapes_by_fqns) > 0: + for module_fqn, submod in module.named_modules(): + fqn = module_fqn + if base_fqn is not None: + fqn = f"{base_fqn}.{module_fqn}" + if not any( + filter_fqn in fqn + for filter_fqn in ac_config.per_op_sac_force_recompute_mm_shapes_by_fqns + ): + continue + if not isinstance(submod, nn.Linear): + raise ValueError( + "per_op_sac_force_recompute_mm_shapes_by_fqns expected to match " + f"a nn.Linear, but got: {submod}" + ) + out_f, in_f = submod.weight.shape + mm_recompute_shapes.add((in_f, out_f)) + logger.debug( + f"Selective op AC force recomputing mms with rhs shapes {mm_recompute_shapes}" + ) + + def _get_custom_policy(meta): + def _custom_policy(ctx, func, *args, **kwargs): + mode = "recompute" if ctx.is_recompute else "forward" + mm_count_key = f"{mode}_mm_count" + if func == torch.ops.aten.mm.default: + if args[1].shape in mm_recompute_shapes: + return CheckpointPolicy.PREFER_RECOMPUTE + meta[mm_count_key] += 1 + # Saves output of all compute ops, except every second mm + to_save = func in _save_list and not ( + func == torch.ops.aten.mm.default and meta[mm_count_key] % 2 == 0 + ) + return ( + CheckpointPolicy.MUST_SAVE + if to_save + else CheckpointPolicy.PREFER_RECOMPUTE + ) + + return _custom_policy + + def selective_checkpointing_context_fn(): + meta = defaultdict(int) + return create_selective_checkpoint_contexts(_get_custom_policy(meta)) + + return ptd_checkpoint_wrapper( + module, + context_fn=selective_checkpointing_context_fn, + preserve_rng_state=False, + ) + elif use_layer_sac: + # Checkpoint every `ac_freq` of the modules passed to this function + ac_freq = int(ac_config.selective_ac_option) + ptd_checkpoint_wrapper.__dict__.setdefault("_count", 0) + ptd_checkpoint_wrapper._count += 1 + if not ac_freq or ptd_checkpoint_wrapper._count % ac_freq == 0: + return ptd_checkpoint_wrapper(module, preserve_rng_state=False) + else: + return module + +def apply_ac(model: nn.Module, ac_config: ACConfig): + """Apply activation checkpointing to the model.""" + # TODO(3outeille): Make it more generic later + layers = model.model.layers + + for layer_id, transformer_block in layers.named_children(): + transformer_block = _apply_ac_to_transformer_block( + transformer_block, ac_config, base_fqn=f"layers.{layer_id}" + ) + layers.register_module(layer_id, transformer_block) + + logger.info(f"Applied {ac_config.mode} activation checkpointing to the model") + +def apply_ddp( + model: nn.Module, + dp_mesh: DeviceMesh, + enable_compile: bool, 
+ enable_compiled_autograd: bool, +): + if enable_compile: + if enable_compiled_autograd: + torch._dynamo.config.optimize_ddp = ( + "python_reducer_without_compiled_forward" + ) + else: + torch._dynamo.config.optimize_ddp = "ddp_optimizer" + + replicate(model, device_mesh=dp_mesh, bucket_cap_mb=100) + + logger.info("Applied DDP to the model") + def parallelize_hf_transformers( model: nn.Module, diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 94b014dfd7..e20da24c5b 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -58,7 +58,7 @@ def update_from_config(self, job_config: JobConfig): self.rms_norm_eps = getattr(hf_model_config, "rms_norm_eps", 1e-6) if hasattr(hf_model_config, "intermediate_size") and hf_model_config.intermediate_size: - self.ffn_dim_multiplier = hf_model_config.intermediate_size / hf_model_config.hidden_size + self.ffn_dim_multiplier = hf_model_config.intermediate_size // hf_model_config.hidden_size # Always update max_seq_len to match training seq_len, warn if exceeded seq_len = job_config.training.seq_len diff --git a/torchtitan/models/llama3/infra/parallelize.py b/torchtitan/models/llama3/infra/parallelize.py index 8165f8e907..6da44a321d 100644 --- a/torchtitan/models/llama3/infra/parallelize.py +++ b/torchtitan/models/llama3/infra/parallelize.py @@ -326,17 +326,11 @@ def selective_checkpointing_context_fn(): def apply_ac(model: nn.Module, ac_config: ACConfig): """Apply activation checkpointing to the model.""" - # TODO(3outeille): Make it more generic later - if isinstance(model, LlamaForCausalLM): - layers = model.model.layers - else: - layers = model.layers - - for layer_id, transformer_block in layers.named_children(): + for layer_id, transformer_block in model.layers.named_children(): transformer_block = _apply_ac_to_transformer_block( transformer_block, ac_config, base_fqn=f"layers.{layer_id}" ) - layers.register_module(layer_id, transformer_block) + model.layers.register_module(layer_id, transformer_block) logger.info(f"Applied {ac_config.mode} activation checkpointing to the model") From 8c5c0ae63b0d784dac3476140ead3089fd62bdc4 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 9 Sep 2025 08:00:32 +0000 Subject: [PATCH 009/129] HF model without any parallelism now train (but grad_norm is high) --- .../transformers_backend/__init__.py | 14 +-- .../infra/parallelize_hf_transformers.py | 3 - .../model/hf_transformers_args.py | 104 +++++++++--------- torchtitan/train.py | 38 ++++++- 4 files changed, 89 insertions(+), 70 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index e416731205..504adfc88e 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -29,17 +29,13 @@ flavors = { - "debug": HFTransformerModelArgs( - dim=1, - n_layers=6, - n_heads=16, - rope_theta=500000, + "debugmodel": HFTransformerModelArgs( + n_layers=2, + vocab_size=2000, ), "medium": HFTransformerModelArgs( - dim=40, - n_layers=24, - n_heads=32, - rope_theta=500000, + dim=1024, + n_layers=12, ), "full": HFTransformerModelArgs(), } diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py 
b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py index 04ffaaeffb..2f0d9167b0 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -38,8 +38,6 @@ from torchtitan.config.job_config import ActivationCheckpoint as ACConfig from torchtitan.tools.logging import logger -from transformers.models.llama.modeling_llama import LlamaForCausalLM - # for selective op activation checkpointing _save_list = { torch.ops.aten.mm.default, @@ -142,7 +140,6 @@ def selective_checkpointing_context_fn(): def apply_ac(model: nn.Module, ac_config: ACConfig): """Apply activation checkpointing to the model.""" - # TODO(3outeille): Make it more generic later layers = model.model.layers for layer_id, transformer_block in layers.named_children(): diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index e20da24c5b..63e252d851 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -16,7 +16,7 @@ @dataclass -class HFTransformerModelArgs(BaseModelArgs): +class HFTransformerModelArgs(LlamaConfig, BaseModelArgs): # Torchtitan naming dim: int = 4096 n_layers: int = 32 @@ -25,72 +25,72 @@ class HFTransformerModelArgs(BaseModelArgs): vocab_size: int = 128256 multiple_of: int = 256 ffn_dim_multiplier: Optional[float] = None + norm_eps: float = 1e-5 rope_theta: float = 10000 - max_seq_len: int = 2048 - # HF compatibility - rms_norm_eps: float = 1e-6 - use_cache: bool = True + max_seq_len: int = 2048 depth_init: bool = True use_flex_attn: bool = False attn_mask_type: str = "causal" eos_id: int = 0 - - _hf_args: dict = field(init=False, repr=False, default_factory=dict) + + # HF args + attn_implementation: str = "eager" def update_from_config(self, job_config: JobConfig): - #TODO(3outeille): what if we dont specify flavor? 
Should use full as default - flavor = getattr(job_config.model, "flavor", None) - if flavor == "full": - model_name_or_config: Union[LlamaConfig, str, os.PathLike] = job_config.model.name - hf_model_config = LlamaConfig.from_pretrained(model_name_or_config) + #TODO(3outeille): clean this mess once grad norm is stabilized + default_args = HFTransformerModelArgs() - #TODO(3outeille): use getattr to handle models that don't have all the attributes - # Fill torchtitan args with HF ones - self.dim = hf_model_config.hidden_size - self.n_layers = hf_model_config.num_hidden_layers - self.n_heads = hf_model_config.num_attention_heads - self.n_kv_heads = hf_model_config.num_key_value_heads - self.vocab_size = hf_model_config.vocab_size - self.rope_theta = getattr(hf_model_config, "rope_theta", 10000.0) - self.max_seq_len = hf_model_config.max_position_embeddings - self.rms_norm_eps = getattr(hf_model_config, "rms_norm_eps", 1e-6) + args_to_override = {} + for key in default_args.__dict__: + if hasattr(self, key): + current_value = getattr(self, key) + default_value = getattr(default_args, key) + if current_value != default_value: + args_to_override[key] = current_value - if hasattr(hf_model_config, "intermediate_size") and hf_model_config.intermediate_size: - self.ffn_dim_multiplier = hf_model_config.intermediate_size // hf_model_config.hidden_size + hf_model_config = LlamaConfig.from_pretrained( + job_config.model.name, + attn_implementation=self.attn_implementation, + ) + # n_layers = 32 + self.__dict__.update(hf_model_config.__dict__) - # Always update max_seq_len to match training seq_len, warn if exceeded - seq_len = job_config.training.seq_len - if seq_len > self.max_seq_len: - logger.warning(f"Sequence length {seq_len} exceeds original maximum {self.max_seq_len}.") - self.max_seq_len = seq_len + # num_hidden_layers = 16 - if job_config.parallelism.context_parallel_degree > 1 and self.use_flex_attn: - raise NotImplementedError("CP support for FlexAttention is still in progress.") + # Update TT args with HF args (for keys that exist in both but differ in namings) + self.dim = self.hidden_size + self.n_layers = self.num_hidden_layers + self.n_heads = self.num_attention_heads + self.n_kv_heads = self.num_key_value_heads + self.norm_eps = self.rms_norm_eps + self.max_seq_len = self.max_position_embeddings + self.eos_id = self.eos_token_id - self._hf_args = { - "hidden_size": self.dim, - "num_hidden_layers": self.n_layers, - "num_attention_heads": self.n_heads, - "num_key_value_heads": self.n_kv_heads, - "vocab_size": self.vocab_size, - "rope_scaling": {"type": "dynamic", "factor": 2.0}, - "intermediate_size": self.ffn_dim_multiplier, - "rope_theta": self.rope_theta, - "max_position_embeddings": self.max_seq_len, - "rms_norm_eps": self.rms_norm_eps, - "use_cache": self.use_cache, - "pad_token_id": self.eos_id, - } - return self + # n_layers = 16 + + self.__dict__.update(args_to_override) + + # n_layers = 2 + # num_hidden_layers = 16 - def convert_to_hf_config(self) -> LlamaConfig: - if not self._hf_args: - raise RuntimeError( - "`update_from_config` must be called before `convert_to_hf_config` to prepare the arguments." 
- ) - return LlamaConfig(**self._hf_args) + # Update HF args with TT override args because HF modeling uses HF args and not TT args + # TODO(3outeille): find a cleaner way to handle the mapping + self.hidden_size = self.dim + self.num_hidden_layers = self.n_layers + self.num_attention_heads = self.n_heads + self.num_key_value_heads = self.n_kv_heads + self.rms_norm_eps = self.norm_eps + self.max_position_embeddings = self.max_seq_len + self.eos_token_id = self.eos_id + + # n_layers = 2 + # num_hidden_layers = 2 + + print(self) + self.use_cache = False + return self def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: nparams = sum(p.numel() for p in model.parameters()) diff --git a/torchtitan/train.py b/torchtitan/train.py index d4a00ad98e..bc8128d0fa 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -33,8 +33,34 @@ maybe_enable_profiling, ) -from torchtitan.experiments.transformers_backend.model.hf_transformers_args import HFTransformerModelArgs -from transformers.models.llama.modeling_llama import LlamaForCausalLM +from transformers.models.llama.modeling_llama import LlamaForCausalLM, CausalLMOutputWithPast +from transformers.modeling_utils import PreTrainedModel + + +# NOTE(3outeille): monkey-patch PreTrainedModel to handle meta device initialization correctly +# The default _initialize_weights sets _is_hf_initialized = True even on a meta device, +# which prevents subsequent proper initialization. +def _initialize_weights_patched(self, module): + """ + Patched version of _initialize_weights that skips initialization and setting + the _is_hf_initialized flag if the module is on a meta device. + """ + if getattr(module, "_is_hf_initialized", False): + return + + # Check if any parameter is on the meta device + for param in module.parameters(recurse=False): + if param.device.type == "meta": + return + + #TODO(3outeille): check if register bufffer is init + + # If not on a meta device, call the original weight initialization + self._init_weights(module) + module._is_hf_initialized = True + + +PreTrainedModel._initialize_weights = _initialize_weights_patched class Trainer(torch.distributed.checkpoint.stateful.Stateful): @@ -158,10 +184,7 @@ def __init__(self, job_config: JobConfig): f"Building {self.train_spec.name} {job_config.model.flavor} with {model_args}" ) with torch.device("meta"): - if isinstance(model_args, HFTransformerModelArgs): - model = self.train_spec.model_cls(model_args.convert_to_hf_config()) - else: - model = self.train_spec.model_cls(model_args) + model = self.train_spec.model_cls(model_args) # Build the collection of model converters. No-op if `model.converters` empty model_converters = build_model_converters(job_config, parallel_dims) @@ -468,6 +491,9 @@ def forward_backward_step( assert len(model_parts) == 1 with self.maybe_enable_amp: pred = model_parts[0](inputs) + #NOTE(3outeille): just trying to make it work for now. Will refactor later. 
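+            # HF causal-LM models return a ModelOutput (CausalLMOutputWithPast); torchtitan's loss_fn expects the raw logits tensor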
+ if isinstance(pred, CausalLMOutputWithPast): + pred = pred.logits loss = self.loss_fn(pred, labels) # need to free to before bwd to avoid peaking memory del pred From 4ae9560258936d3096052d4858ad5d79d1c857fe Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 10 Sep 2025 07:41:35 +0000 Subject: [PATCH 010/129] a bit cleaner way to get passed args --- .../transformers_backend/__init__.py | 19 +++-- .../transformers_backend/compare_tt_hf_run.sh | 76 +++++++++++++++++++ .../model/hf_transformers_args.py | 15 +--- 3 files changed, 93 insertions(+), 17 deletions(-) create mode 100755 torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 504adfc88e..876e7ae8fa 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -28,16 +28,25 @@ ] +def hf_transformer_model_args_builder(**kwargs): + # Capture the kwargs in the passed_args field + args = HFTransformerModelArgs(**kwargs) + args.passed_args = kwargs + return args + + flavors = { - "debugmodel": HFTransformerModelArgs( - n_layers=2, - vocab_size=2000, + "debugmodel": hf_transformer_model_args_builder( + # n_layers=2, + # vocab_size=2000, + max_seq_len=2048, + dim=256, n_layers=6, n_heads=16, vocab_size=2000, rope_theta=500000 ), - "medium": HFTransformerModelArgs( + "medium": hf_transformer_model_args_builder( dim=1024, n_layers=12, ), - "full": HFTransformerModelArgs(), + "full": hf_transformer_model_args_builder(), } hf_train_spec = TrainSpec( diff --git a/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh b/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh new file mode 100755 index 0000000000..4085461e3a --- /dev/null +++ b/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh @@ -0,0 +1,76 @@ +#!/usr/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
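+# Runs the native TorchTitan config and the HF-backend config back to back, filters
+# volatile details (timestamps, ports, PIDs) out of both logs, and diffs the results.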
+ +set -ex +set -o pipefail + +# Common settings +NGPU=${NGPU:-"1"} +export LOG_RANK=${LOG_RANK:-0} + + +run_tt() { + echo "##############################################" + echo "### Running TorchTitan (native) training ###" + echo "##############################################" + TT_CONFIG="/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/models/llama3/train_configs/my_debug_model.toml" + + # Use CUDA_VISIBLE_DEVICES=0 for TT run + CUDA_VISIBLE_DEVICES=0 \ + torchrun --nproc_per_node=${NGPU} --master_port 1234 --rdzv_backend c10d --rdzv_endpoint="localhost:0" \ + --local-ranks-filter ${LOG_RANK} --role rank --tee 3 \ + -m torchtitan.train --job.config_file ${TT_CONFIG} "$@" +} + +run_hf() { + echo "#######################################################" + echo "### Running TorchTitan with HF backend training ###" + echo "#######################################################" + HF_CONFIG="/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml" + + # Use CUDA_VISIBLE_DEVICES=1 for HF run + CUDA_VISIBLE_DEVICES=1 \ + torchrun --nproc_per_node=${NGPU} --master_port 1235 --rdzv_backend c10d --rdzv_endpoint="localhost:0" \ + --local-ranks-filter ${LOG_RANK} --role rank --tee 3 \ + -m torchtitan.train --job.config_file ${HF_CONFIG} "$@" +} + + +TT_LOG="tt_run.log" +HF_LOG="hf_run.log" +DIFF_LOG="run_diff.log" + +run_tt "$@" 2>&1 | tee ${TT_LOG} +run_hf "$@" 2>&1 | tee ${HF_LOG} + +# Filter logs to remove noisy differences +TT_LOG_FILTERED="${TT_LOG}.filtered" +HF_LOG_FILTERED="${HF_LOG}.filtered" + +# This sed command removes timestamps, PIDs, master ports, and other +# volatile details that change between runs. +# Feel free to adjust the regex patterns to better suit your log format. +sed -E \ + -e 's/([0-9]{4}-[0-9]{2}-[0-9]{2} )?[0-9]{2}:[0-9]{2}:[0-9]{2}(,[0-9]+)?/TIMESTAMP/g' \ + -e 's/torchrun.*--master_port[= ]([0-9]+)/torchrun ... --master_port=XXXX/g' \ + -e 's/PID [0-9]+/PID XXXX/g' \ + -e 's/localhost:[0-9]+/localhost:XXXX/g' \ + < "${TT_LOG}" > "${TT_LOG_FILTERED}" + +sed -E \ + -e 's/([0-9]{4}-[0-9]{2}-[0-9]{2} )?[0-9]{2}:[0-9]{2}:[0-9]{2}(,[0-9]+)?/TIMESTAMP/g' \ + -e 's/torchrun.*--master_port[= ]([0-9]+)/torchrun ... 
--master_port=XXXX/g' \ + -e 's/PID [0-9]+/PID XXXX/g' \ + -e 's/localhost:[0-9]+/localhost:XXXX/g' \ + < "${HF_LOG}" > "${HF_LOG_FILTERED}" + +echo "############################################" +echo "### Diff between TT and HF run logs ###" +echo "############################################" +echo "### Log diff is being saved to ${DIFF_LOG}" +echo "############################################" +git diff --no-index --color=always --word-diff=color "${TT_LOG_FILTERED}" "${HF_LOG_FILTERED}" | tee "${DIFF_LOG}" || true diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 63e252d851..bb9d1b814d 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -37,19 +37,10 @@ class HFTransformerModelArgs(LlamaConfig, BaseModelArgs): # HF args attn_implementation: str = "eager" + passed_args: dict = field(init=False, repr=False, default_factory=dict) + def update_from_config(self, job_config: JobConfig): - #TODO(3outeille): clean this mess once grad norm is stabilized - default_args = HFTransformerModelArgs() - - args_to_override = {} - for key in default_args.__dict__: - if hasattr(self, key): - current_value = getattr(self, key) - default_value = getattr(default_args, key) - if current_value != default_value: - args_to_override[key] = current_value - hf_model_config = LlamaConfig.from_pretrained( job_config.model.name, attn_implementation=self.attn_implementation, @@ -70,7 +61,7 @@ def update_from_config(self, job_config: JobConfig): # n_layers = 16 - self.__dict__.update(args_to_override) + self.__dict__.update(self.passed_args) # n_layers = 2 # num_hidden_layers = 16 From 9be95f98f518efbdd25c3e90e5edd5a60971d8d0 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 10 Sep 2025 09:01:12 +0000 Subject: [PATCH 011/129] now same number of params + same attention backend but noticed higher gradnorm and less tps with HF model --- .../transformers_backend/__init__.py | 3 +- .../configs/debug_1_gpu_hf.toml | 4 +- .../model/hf_transformers_args.py | 63 ++++++++++++++++++- .../transformers_backend/run_train.sh | 17 ++++- 4 files changed, 79 insertions(+), 8 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 876e7ae8fa..b8fc47b9e7 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -40,7 +40,8 @@ def hf_transformer_model_args_builder(**kwargs): # n_layers=2, # vocab_size=2000, max_seq_len=2048, - dim=256, n_layers=6, n_heads=16, vocab_size=2000, rope_theta=500000 + #TODO(3outeille): n_kv_heads=n_heads may be handle somewhere else + dim=256, n_layers=6, n_heads=16, vocab_size=2000, rope_theta=500000, n_kv_heads=16 ), "medium": hf_transformer_model_args_builder( dim=1024, diff --git a/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml b/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml index 30872e903c..3144011b62 100644 --- a/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml +++ b/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml @@ -1,6 +1,6 @@ [job] dump_folder = "./outputs" -description = "Llama 3 debug training with FSDP on 2 GPUs" +description = "HF Llama 3 debug training" print_args = false use_for_integration_test = 
true @@ -20,7 +20,7 @@ enable_wandb = false [model] name = "meta-llama/Llama-3.2-1B" -flavor = "medium" +flavor = "debugmodel" tokenizer_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" [optimizer] diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index bb9d1b814d..b21a0604a2 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -35,7 +35,7 @@ class HFTransformerModelArgs(LlamaConfig, BaseModelArgs): eos_id: int = 0 # HF args - attn_implementation: str = "eager" + attn_implementation: str = "sdpa" passed_args: dict = field(init=False, repr=False, default_factory=dict) @@ -76,15 +76,74 @@ def update_from_config(self, job_config: JobConfig): self.max_position_embeddings = self.max_seq_len self.eos_token_id = self.eos_id + # Match torchtitan parameter counts + self.tie_word_embeddings = False + self.attention_bias = False + self.mlp_bias = False + + # Match torchtitan intermediate size calculation + ffn_hidden_size = 4 * self.hidden_size + ffn_hidden_size = int(2 * ffn_hidden_size / 3) + if self.ffn_dim_multiplier is not None: + ffn_hidden_size = int(self.ffn_dim_multiplier * ffn_hidden_size) + self.intermediate_size = self.multiple_of * ( + (ffn_hidden_size + self.multiple_of - 1) // self.multiple_of + ) + # Forced it as HF has config.head_dim and the modeling retrieves it instead of doing config.hidden_size // config.num_attention_heads + self.head_dim = self.dim // self.num_attention_heads + # n_layers = 2 # num_hidden_layers = 2 - print(self) self.use_cache = False return self def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: nparams = sum(p.numel() for p in model.parameters()) + + layer_params = {} # int -> int + embedding_params = 0 + norm_params = 0 + lm_head_params = 0 + misc_params = {} + + for name, p in model.named_parameters(): + if "model.embed_tokens" in name: + embedding_params += p.numel() + elif "model.layers." 
in name: + try: + layer_num = int(name.split("layers.")[1].split(".")[0]) + if layer_num not in layer_params: + layer_params[layer_num] = 0 + layer_params[layer_num] += p.numel() + except (ValueError, IndexError): + # Should not happen with standard HF llama names + component = "misc_layer_parts" + if component not in misc_params: + misc_params[component] = 0 + misc_params[component] += p.numel() + elif "model.norm" in name: + norm_params += p.numel() + elif "lm_head" in name: + lm_head_params += p.numel() + else: + # Catch anything else + component = name.split(".")[0] + if component not in misc_params: + misc_params[component] = 0 + misc_params[component] += p.numel() + + logger.info("Parameter breakdown:") + logger.info(f" - embedding: {embedding_params:,} parameters") + for layer_num in sorted(layer_params.keys()): + params = layer_params[layer_num] + logger.info(f" - layer_{layer_num}: {params:,} parameters") + logger.info(f" - final_norm: {norm_params:,} parameters") + logger.info(f" - lm_head: {lm_head_params:,} parameters") + if misc_params: + for name, params in misc_params.items(): + logger.info(f" - {name} (misc): {params:,} parameters") + nparams_embedding = sum( sum(p.numel() for p in m.parameters()) for m in model.children() diff --git a/torchtitan/experiments/transformers_backend/run_train.sh b/torchtitan/experiments/transformers_backend/run_train.sh index 74ef5603b1..6151fcda64 100755 --- a/torchtitan/experiments/transformers_backend/run_train.sh +++ b/torchtitan/experiments/transformers_backend/run_train.sh @@ -9,17 +9,28 @@ set -ex # use envs as local overwrites for convenience # e.g. -# LOG_RANK=0,1 NGPU=4 ./run_train.sh +# BACKEND=tt LOG_RANK=0,1 NGPU=4 ./run_train.sh NGPU=${NGPU:-"8"} export LOG_RANK=${LOG_RANK:-0} +DEBUG_PORT=${DEBUG_PORT:-5678} # Option to switch between debug and train MODE=${MODE:-"train"} # Set MODE=debug or MODE=train -CONFIG_FILE=${CONFIG_FILE:-"configs/debug_1_gpu.toml"} +# Option to switch between hf and tt backend +BACKEND=${BACKEND:-"hf"} + +if [ "$BACKEND" = "tt" ]; then + CONFIG_FILE=${CONFIG_FILE:-"/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/models/llama3/train_configs/my_debug_model.toml"} +elif [ "$BACKEND" = "hf" ]; then + CONFIG_FILE=${CONFIG_FILE:-"configs/debug_1_gpu_hf.toml"} +else + echo "Invalid BACKEND set: ${BACKEND}" + exit 1 +fi if [ "$MODE" = "debug" ]; then - PYTHON_CMD="debugpy-run -m torch.distributed.run --" + PYTHON_CMD="debugpy-run -p ${DEBUG_PORT} -m torch.distributed.run --" else PYTHON_CMD="torchrun" fi From bf9144779ca28fed110aa010e5eaece0ae0278bc Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 11 Sep 2025 08:35:37 +0000 Subject: [PATCH 012/129] fix seed and deterministic --- .../experiments/transformers_backend/compare_tt_hf_run.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh b/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh index 4085461e3a..81b33091fb 100755 --- a/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh +++ b/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh @@ -23,7 +23,7 @@ run_tt() { CUDA_VISIBLE_DEVICES=0 \ torchrun --nproc_per_node=${NGPU} --master_port 1234 --rdzv_backend c10d --rdzv_endpoint="localhost:0" \ --local-ranks-filter ${LOG_RANK} --role rank --tee 3 \ - -m torchtitan.train --job.config_file ${TT_CONFIG} "$@" + -m torchtitan.train --job.config_file ${TT_CONFIG} --training.seed 42 --training.deterministic "$@" } run_hf() { 
@@ -36,7 +36,7 @@ run_hf() { CUDA_VISIBLE_DEVICES=1 \ torchrun --nproc_per_node=${NGPU} --master_port 1235 --rdzv_backend c10d --rdzv_endpoint="localhost:0" \ --local-ranks-filter ${LOG_RANK} --role rank --tee 3 \ - -m torchtitan.train --job.config_file ${HF_CONFIG} "$@" + -m torchtitan.train --job.config_file ${HF_CONFIG} --training.seed 42 --training.deterministic "$@" } @@ -45,7 +45,9 @@ HF_LOG="hf_run.log" DIFF_LOG="run_diff.log" run_tt "$@" 2>&1 | tee ${TT_LOG} -run_hf "$@" 2>&1 | tee ${HF_LOG} +# run_hf "$@" 2>&1 | tee ${HF_LOG} +run_tt "$@" 2>&1 | tee ${HF_LOG} + # Filter logs to remove noisy differences TT_LOG_FILTERED="${TT_LOG}.filtered" From 4c2fc0bbd04aa1667296ccd124f26c3cb8cf15fb Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 11 Sep 2025 13:41:45 +0000 Subject: [PATCH 013/129] fix torch deterministic for HF modeling that was producing Nans --- torchtitan/train.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/torchtitan/train.py b/torchtitan/train.py index bc8128d0fa..3c9718df1b 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -8,6 +8,7 @@ import os import time from datetime import timedelta +from transformers.utils import is_torch_deterministic from typing import Any, Generator, Iterable, Optional import torch @@ -287,7 +288,9 @@ def __init__(self, job_config: JobConfig): else: # apply PT-D Tensor Parallel, activation checkpointing, torch.compile, Data Parallel model = self.train_spec.parallelize_fn(model, parallel_dims, job_config) - + if is_torch_deterministic(): + # Otherwise, HF register buffer for ROPE (inv_freq) and this will be by default be initialized to Nan + torch.utils.deterministic.fill_uninitialized_memory = False model.to_empty(device=init_device) with torch.no_grad(): if isinstance(model, LlamaForCausalLM): From 9bffa386f7a97454ff580ee11c5ba39a5a1b51fe Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 15 Sep 2025 13:10:57 +0000 Subject: [PATCH 014/129] HF model now numerically stable compared to TT (given a fixed attention backend) --- .../model/hf_transformers_args.py | 99 +++++++++++++++++++ torchtitan/models/attention.py | 4 +- torchtitan/train.py | 31 +----- 3 files changed, 105 insertions(+), 29 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index b21a0604a2..61282d9fb0 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -14,6 +14,102 @@ from torchtitan.tools.logging import logger from transformers.models.llama.configuration_llama import LlamaConfig +from transformers.modeling_utils import PreTrainedModel +from transformers.models.llama.modeling_llama import LlamaAttention, LlamaMLP + +# NOTE(3outeille): monkey-patch PreTrainedModel to handle meta device initialization correctly +# The default _initialize_weights sets _is_hf_initialized = True even on a meta device, +# which prevents subsequent proper initialization. +def _initialize_weights_patched(self, module): + """ + Patched version of _initialize_weights that skips initialization and setting + the _is_hf_initialized flag if the module is on a meta device. 
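+    Parameters left on the meta device are materialized later via model.to_empty() and only then initialized.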
+ """ + if getattr(module, "_is_hf_initialized", False): + return + + # Check if any parameter is on the meta device + for param in module.parameters(recurse=False): + if param.device.type == "meta": + return + + #TODO(3outeille): check if register bufffer is init + + # If not on a meta device, call the original weight initialization + self._init_weights(module) + module._is_hf_initialized = True + + +#TODO(3outeille): find a better way to do this +from transformers.models.llama.modeling_llama import LlamaDecoderLayer + +_original_llama_decoder_layer_init = LlamaDecoderLayer.__init__ + +def _llama_decoder_layer_init_patched(self, config: LlamaConfig, layer_idx: int): + _original_llama_decoder_layer_init(self, config, layer_idx) + self.mlp.layer_idx = layer_idx + +LlamaDecoderLayer.__init__ = _llama_decoder_layer_init_patched + + +def _init_weights_patched(self, module): + """ + Patched version of _init_weights to match TorchTitan's initialization for Llama. + `self` is a LlamaPreTrainedModel instance. + """ + config = self.config + + if isinstance(module, (LlamaAttention, LlamaMLP)): + layer_idx = module.layer_idx + + if config.depth_init: + init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 + else: + init_std = 0.02 / (2 * config.num_hidden_layers) ** 0.5 + + if isinstance(module, LlamaAttention): + nn.init.trunc_normal_(module.q_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(module.k_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(module.v_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(module.o_proj.weight, mean=0.0, std=init_std) + + elif isinstance(module, LlamaMLP): + nn.init.trunc_normal_(module.gate_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(module.up_proj.weight, mean=0.0, std=init_std) + nn.init.trunc_normal_(module.down_proj.weight, mean=0.0, std=init_std) + + elif module is getattr(self, "lm_head", None): #TODO(3outeille): find a better way to detect lm_head + final_out_std = config.hidden_size**-0.5 + cutoff_factor = 3 + nn.init.trunc_normal_( + module.weight, + mean=0.0, + std=final_out_std, + a=-cutoff_factor * final_out_std, + b=cutoff_factor * final_out_std, + ) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + std = config.initializer_range + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + elif ( + isinstance(module, (nn.GroupNorm, nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)) + or "LayerNorm" in module.__class__.__name__ + or "RMSNorm" in module.__class__.__name__ + ): + # Norms can exist without weights (in which case they are None from torch primitives) + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(1.0) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.zero_() + + +PreTrainedModel._init_weights = _init_weights_patched +PreTrainedModel._initialize_weights = _initialize_weights_patched @dataclass class HFTransformerModelArgs(LlamaConfig, BaseModelArgs): @@ -96,6 +192,9 @@ def update_from_config(self, job_config: JobConfig): # num_hidden_layers = 2 self.use_cache = False + + # HF numerical stability matching + self.initializer_range = 1.0 # use as std for normal init in embedding return self def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: diff --git a/torchtitan/models/attention.py b/torchtitan/models/attention.py index f66361a6d2..9d99622cc1 100644 --- a/torchtitan/models/attention.py +++ 
b/torchtitan/models/attention.py @@ -205,9 +205,9 @@ def _init_backend(cls) -> None: # Add CuDNN on B200 w/ highest priority cls.backends = [ - SDPBackend.FLASH_ATTENTION, + # SDPBackend.FLASH_ATTENTION, SDPBackend.EFFICIENT_ATTENTION, - SDPBackend.MATH, + # SDPBackend.MATH, ] if has_cuda_capability(10, 0): cls.backends.insert(0, SDPBackend.CUDNN_ATTENTION) diff --git a/torchtitan/train.py b/torchtitan/train.py index 3c9718df1b..7b43e6b866 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -35,33 +35,6 @@ ) from transformers.models.llama.modeling_llama import LlamaForCausalLM, CausalLMOutputWithPast -from transformers.modeling_utils import PreTrainedModel - - -# NOTE(3outeille): monkey-patch PreTrainedModel to handle meta device initialization correctly -# The default _initialize_weights sets _is_hf_initialized = True even on a meta device, -# which prevents subsequent proper initialization. -def _initialize_weights_patched(self, module): - """ - Patched version of _initialize_weights that skips initialization and setting - the _is_hf_initialized flag if the module is on a meta device. - """ - if getattr(module, "_is_hf_initialized", False): - return - - # Check if any parameter is on the meta device - for param in module.parameters(recurse=False): - if param.device.type == "meta": - return - - #TODO(3outeille): check if register bufffer is init - - # If not on a meta device, call the original weight initialization - self._init_weights(module) - module._is_hf_initialized = True - - -PreTrainedModel._initialize_weights = _initialize_weights_patched class Trainer(torch.distributed.checkpoint.stateful.Stateful): @@ -294,6 +267,10 @@ def __init__(self, job_config: JobConfig): model.to_empty(device=init_device) with torch.no_grad(): if isinstance(model, LlamaForCausalLM): + print("Now done with meta device, calling post_init") + for m in model.modules(): + if hasattr(m, "_is_hf_initialized"): + m._is_hf_initialized = False model.post_init() else: model.init_weights(buffer_device=buffer_device) From 40d84cc4098c51ceac8e30fc966dc787a0905a43 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 15 Sep 2025 14:02:03 +0000 Subject: [PATCH 015/129] handling the is_hf_initialized flag in patch --- .../transformers_backend/model/hf_transformers_args.py | 3 +-- torchtitan/train.py | 4 ---- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 61282d9fb0..5a64dd0dc6 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -28,8 +28,7 @@ def _initialize_weights_patched(self, module): if getattr(module, "_is_hf_initialized", False): return - # Check if any parameter is on the meta device - for param in module.parameters(recurse=False): + for param in module.parameters(recurse=True): if param.device.type == "meta": return diff --git a/torchtitan/train.py b/torchtitan/train.py index 7b43e6b866..f3617eb415 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -267,10 +267,6 @@ def __init__(self, job_config: JobConfig): model.to_empty(device=init_device) with torch.no_grad(): if isinstance(model, LlamaForCausalLM): - print("Now done with meta device, calling post_init") - for m in model.modules(): - if hasattr(m, "_is_hf_initialized"): - m._is_hf_initialized = False model.post_init() else: 
model.init_weights(buffer_device=buffer_device) From bd3f3327060b1ef56583f27a8f01c8b7d8390e74 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 16 Sep 2025 09:12:46 +0000 Subject: [PATCH 016/129] refactor HF transformer model args --- .../model/hf_transformers_args.py | 209 ++++++++++++------ 1 file changed, 147 insertions(+), 62 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 5a64dd0dc6..d558ec1550 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -89,6 +89,7 @@ def _init_weights_patched(self, module): ) if module.bias is not None: module.bias.data.zero_() + elif isinstance(module, nn.Embedding): std = config.initializer_range module.weight.data.normal_(mean=0.0, std=std) @@ -112,88 +113,172 @@ def _init_weights_patched(self, module): @dataclass class HFTransformerModelArgs(LlamaConfig, BaseModelArgs): - # Torchtitan naming - dim: int = 4096 - n_layers: int = 32 - n_heads: int = 32 - n_kv_heads: Optional[int] = None - vocab_size: int = 128256 - multiple_of: int = 256 - ffn_dim_multiplier: Optional[float] = None - norm_eps: float = 1e-5 - rope_theta: float = 10000 - - max_seq_len: int = 2048 - depth_init: bool = True - use_flex_attn: bool = False - attn_mask_type: str = "causal" - eos_id: int = 0 - - # HF args - attn_implementation: str = "sdpa" - - passed_args: dict = field(init=False, repr=False, default_factory=dict) + """ + Configuration class that bridges TorchTitan and HuggingFace Transformers naming conventions. + + Uses properties to provide TorchTitan-style access while maintaining HuggingFace compatibility. 
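+    Minimal usage sketch (illustrative values only):
+        args = HFTransformerModelArgs(dim=256, n_layers=6)
+        assert args.hidden_size == 256  # `dim` aliases the HF `hidden_size` field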
+ """ + + def __init__( + self, + # TorchTitan args + dim: int = 4096, + n_layers: int = 32, + n_heads: int = 32, + n_kv_heads: Optional[int] = None, + vocab_size: int = 128256, + multiple_of: int = 256, + ffn_dim_multiplier: Optional[float] = None, + norm_eps: float = 1e-5, + rope_theta: float = 10000, + max_seq_len: int = 2048, + depth_init: bool = True, + use_flex_attn: bool = False, + attn_mask_type: str = "causal", + eos_id: int = 0, + # HuggingFace specific args + attn_implementation: str = "sdpa", + **kwargs + ): + # Map TorchTitan arguments to HuggingFace arguments for parent class initialization + hf_config_dict = dict( + hidden_size=dim, + num_hidden_layers=n_layers, + num_attention_heads=n_heads, + num_key_value_heads=n_kv_heads, + vocab_size=vocab_size, + rms_norm_eps=norm_eps, + rope_theta=rope_theta, + max_position_embeddings=max_seq_len, + eos_token_id=eos_id, + **kwargs + ) + + super().__init__(**hf_config_dict) + + # Store TorchTitan-specific args (no HF equivalent) + self.multiple_of = multiple_of + self.ffn_dim_multiplier = ffn_dim_multiplier + self.depth_init = depth_init + self.use_flex_attn = use_flex_attn + self.attn_mask_type = attn_mask_type + + # HuggingFace specific args + self.attn_implementation = attn_implementation + + self._passed_args = dict( + dim=dim, + n_layers=n_layers, + n_heads=n_heads, + n_kv_heads=n_kv_heads, + vocab_size=vocab_size, + multiple_of=multiple_of, + ffn_dim_multiplier=ffn_dim_multiplier, + norm_eps=norm_eps, + rope_theta=rope_theta, + max_seq_len=max_seq_len, + depth_init=depth_init, + use_flex_attn=use_flex_attn, + attn_mask_type=attn_mask_type, + eos_id=eos_id, + attn_implementation=attn_implementation, + **kwargs + ) + + @property + def dim(self) -> int: + """TorchTitan: Model dimension (alias for HF hidden_size)""" + return self.hidden_size + + @dim.setter + def dim(self, value: int): + self.hidden_size = value + + @property + def n_layers(self) -> int: + """TorchTitan: Number of layers (alias for HF num_hidden_layers)""" + return self.num_hidden_layers + + @n_layers.setter + def n_layers(self, value: int): + self.num_hidden_layers = value + + @property + def n_heads(self) -> int: + """TorchTitan: Number of attention heads (alias for HF num_attention_heads)""" + return self.num_attention_heads + + @n_heads.setter + def n_heads(self, value: int): + self.num_attention_heads = value + + @property + def n_kv_heads(self) -> Optional[int]: + """TorchTitan: Number of key-value heads (alias for HF num_key_value_heads)""" + return self.num_key_value_heads + + @n_kv_heads.setter + def n_kv_heads(self, value: Optional[int]): + self.num_key_value_heads = value + + @property + def norm_eps(self) -> float: + """TorchTitan: Layer norm epsilon (alias for HF rms_norm_eps)""" + return self.rms_norm_eps + + @norm_eps.setter + def norm_eps(self, value: float): + self.rms_norm_eps = value + + @property + def max_seq_len(self) -> int: + """TorchTitan: Maximum sequence length (alias for HF max_position_embeddings)""" + return self.max_position_embeddings + + @max_seq_len.setter + def max_seq_len(self, value: int): + self.max_position_embeddings = value + + @property + def eos_id(self) -> int: + """TorchTitan: End of sequence token ID (alias for HF eos_token_id)""" + return self.eos_token_id + + @eos_id.setter + def eos_id(self, value: int): + self.eos_token_id = value def update_from_config(self, job_config: JobConfig): - + # Load HF config (overwrites our HF attributes) hf_model_config = LlamaConfig.from_pretrained( job_config.model.name, 
attn_implementation=self.attn_implementation, ) - # n_layers = 32 - self.__dict__.update(hf_model_config.__dict__) - - # num_hidden_layers = 16 - - # Update TT args with HF args (for keys that exist in both but differ in namings) - self.dim = self.hidden_size - self.n_layers = self.num_hidden_layers - self.n_heads = self.num_attention_heads - self.n_kv_heads = self.num_key_value_heads - self.norm_eps = self.rms_norm_eps - self.max_seq_len = self.max_position_embeddings - self.eos_id = self.eos_token_id - # n_layers = 16 - - self.__dict__.update(self.passed_args) + self.__dict__.update(hf_model_config.__dict__) - # n_layers = 2 - # num_hidden_layers = 16 - - # Update HF args with TT override args because HF modeling uses HF args and not TT args - # TODO(3outeille): find a cleaner way to handle the mapping - self.hidden_size = self.dim - self.num_hidden_layers = self.n_layers - self.num_attention_heads = self.n_heads - self.num_key_value_heads = self.n_kv_heads - self.rms_norm_eps = self.norm_eps - self.max_position_embeddings = self.max_seq_len - self.eos_token_id = self.eos_id + # Update our attributes with the passed args from flavors + for key, value in self._passed_args.items(): + if hasattr(self, key): + setattr(self, key, value) - # Match torchtitan parameter counts + # Configure HF-specific settings to match TorchTitan settings self.tie_word_embeddings = False self.attention_bias = False self.mlp_bias = False - - # Match torchtitan intermediate size calculation - ffn_hidden_size = 4 * self.hidden_size + self.use_cache = False + self.initializer_range = 1.0 # use as std for normal init in embedding + + ffn_hidden_size = 4 * self.dim ffn_hidden_size = int(2 * ffn_hidden_size / 3) if self.ffn_dim_multiplier is not None: ffn_hidden_size = int(self.ffn_dim_multiplier * ffn_hidden_size) self.intermediate_size = self.multiple_of * ( (ffn_hidden_size + self.multiple_of - 1) // self.multiple_of ) - # Forced it as HF has config.head_dim and the modeling retrieves it instead of doing config.hidden_size // config.num_attention_heads + self.head_dim = self.dim // self.num_attention_heads - # n_layers = 2 - # num_hidden_layers = 2 - - self.use_cache = False - - # HF numerical stability matching - self.initializer_range = 1.0 # use as std for normal init in embedding return self def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: From 249be928393bb82534dcbbf34986c2386bb7332a Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 16 Sep 2025 09:33:07 +0000 Subject: [PATCH 017/129] wrapper model class to avoid transformers to be explicit in train.py --- .../transformers_backend/__init__.py | 6 +-- .../model/hf_transformers_args.py | 39 ++++++++++++++++++- torchtitan/train.py | 5 +-- 3 files changed, 41 insertions(+), 9 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index b8fc47b9e7..9f7ee13484 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -16,14 +16,14 @@ from torchtitan.protocols.train_spec import register_train_spec, TrainSpec from .infra.parallelize_hf_transformers import parallelize_hf_transformers -from .model.hf_transformers_args import HFTransformerModelArgs +from .model.hf_transformers_args import HFTransformerModelArgs, HFTransformerModel from transformers.models.llama.modeling_llama import LlamaForCausalLM __all__ = [ "HFTransformerModelArgs", - "LlamaForCausalLM", #TODO(3outeille): 
later use AutoModelForCausalLM + "HFTransformerModel", "hf_transformers_configs", ] @@ -52,7 +52,7 @@ def hf_transformer_model_args_builder(**kwargs): hf_train_spec = TrainSpec( name="hf_auto_model", - model_cls=LlamaForCausalLM, + model_cls=HFTransformerModel, model_args=flavors, parallelize_fn=parallelize_hf_transformers, pipelining_fn=pipeline_llama, diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index d558ec1550..fad9e35f28 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -16,6 +16,7 @@ from transformers.modeling_utils import PreTrainedModel from transformers.models.llama.modeling_llama import LlamaAttention, LlamaMLP +from transformers.models.llama.modeling_llama import LlamaForCausalLM # NOTE(3outeille): monkey-patch PreTrainedModel to handle meta device initialization correctly # The default _initialize_weights sets _is_hf_initialized = True even on a meta device, @@ -32,8 +33,6 @@ def _initialize_weights_patched(self, module): if param.device.type == "meta": return - #TODO(3outeille): check if register bufffer is init - # If not on a meta device, call the original weight initialization self._init_weights(module) module._is_hf_initialized = True @@ -336,3 +335,39 @@ def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, in l, h, q, t = self.n_layers, self.n_heads, self.dim // self.n_heads, seq_len num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t return nparams, num_flops_per_token + + +class HFTransformerModel(LlamaForCausalLM): + def __init__(self, model_args: HFTransformerModelArgs): + super().__init__(model_args) + + def init_weights(self, *args, **kwargs): + # Taken from transformers.modeling_utils.PreTrainedModel.init_weights + super().init_weights() + self._backward_compatibility_gradient_checkpointing() + + # Make sure the modules correctly exist if the flag is active + if self._keep_in_fp32_modules is not None or self._keep_in_fp32_modules_strict is not None: + all_parameters = {name for name, _ in self.named_parameters() if len(name) > 0} + unique_module_names = set() + # Get all unique module names in the module graph, without the prefixes + for param in all_parameters: + unique_module_names.update( + [name for name in param.split(".") if not name.isnumeric() and name not in ["weight", "bias"]] + ) + # Check that every module in the keep_in_fp32 list is part of the module graph + if self._keep_in_fp32_modules is not None: + for module in self._keep_in_fp32_modules: + if module not in unique_module_names: + raise ValueError( + f"{module} was specified in the `_keep_in_fp32_modules` list, but is not part of the modules in" + f" {self.__class__.__name__}" + ) + + if self._keep_in_fp32_modules_strict is not None: + for module in self._keep_in_fp32_modules_strict: + if module not in unique_module_names: + raise ValueError( + f"{module} was specified in the `_keep_in_fp32_modules_strict` list, but is not part of the modules in" + f" {self.__class__.__name__}" + ) \ No newline at end of file diff --git a/torchtitan/train.py b/torchtitan/train.py index f3617eb415..7ae5881f2a 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -266,10 +266,7 @@ def __init__(self, job_config: JobConfig): torch.utils.deterministic.fill_uninitialized_memory = False model.to_empty(device=init_device) with 
torch.no_grad(): - if isinstance(model, LlamaForCausalLM): - model.post_init() - else: - model.init_weights(buffer_device=buffer_device) + model.init_weights(buffer_device=buffer_device) model.train() self.model_parts = [model] From e2d4adaca2bd00bce2d069b9e968a6dcc8e51c1d Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 16 Sep 2025 10:02:07 +0000 Subject: [PATCH 018/129] add better testing script with reference log for later sanity check --- .../transformers_backend/compare_tt_hf_run.sh | 10 ++- .../configs/debug_1_gpu.toml | 62 -------------- .../configs/debug_1_gpu_hf.toml | 6 +- .../configs/debug_1_gpu_tt.toml | 83 +++++++++++++++++++ .../reference_diff_llama3_1gpu.log | 61 ++++++++++++++ .../test_hf_torchtitan_model_args.py | 51 ------------ torchtitan/models/llama3/model/args.py | 75 +++++++++++++---- 7 files changed, 212 insertions(+), 136 deletions(-) delete mode 100644 torchtitan/experiments/transformers_backend/configs/debug_1_gpu.toml create mode 100644 torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.toml create mode 100644 torchtitan/experiments/transformers_backend/reference_diff_llama3_1gpu.log delete mode 100644 torchtitan/experiments/transformers_backend/test_hf_torchtitan_model_args.py diff --git a/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh b/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh index 81b33091fb..0461ebfb7b 100755 --- a/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh +++ b/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh @@ -17,7 +17,7 @@ run_tt() { echo "##############################################" echo "### Running TorchTitan (native) training ###" echo "##############################################" - TT_CONFIG="/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/models/llama3/train_configs/my_debug_model.toml" + TT_CONFIG="/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.toml" # Use CUDA_VISIBLE_DEVICES=0 for TT run CUDA_VISIBLE_DEVICES=0 \ @@ -44,9 +44,11 @@ TT_LOG="tt_run.log" HF_LOG="hf_run.log" DIFF_LOG="run_diff.log" -run_tt "$@" 2>&1 | tee ${TT_LOG} -# run_hf "$@" 2>&1 | tee ${HF_LOG} -run_tt "$@" 2>&1 | tee ${HF_LOG} +export DEBUG_JSON_PATH="/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/debug_mode_hf" +run_hf "$@" 2>&1 | tee ${HF_LOG} || true +export DEBUG_JSON_PATH="/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/debug_mode_tt" +run_tt "$@" 2>&1 | tee ${TT_LOG} || true +# run_tt "$@" 2>&1 | tee ${HF_LOG} # Filter logs to remove noisy differences diff --git a/torchtitan/experiments/transformers_backend/configs/debug_1_gpu.toml b/torchtitan/experiments/transformers_backend/configs/debug_1_gpu.toml deleted file mode 100644 index 34f6953869..0000000000 --- a/torchtitan/experiments/transformers_backend/configs/debug_1_gpu.toml +++ /dev/null @@ -1,62 +0,0 @@ -[job] -dump_folder = "./outputs" -description = "Llama 3 debug training with FSDP on 2 GPUs" -print_args = false -use_for_integration_test = true - -[profiling] -enable_profiling = false -save_traces_folder = "profile_trace" -profile_freq = 10 -enable_memory_snapshot = false -save_memory_snapshot_folder = "memory_snapshot" - -[metrics] -log_freq = 1 -disable_color_printing = false -enable_tensorboard = false -save_tb_folder = "tb" -enable_wandb = false - -[model] -name = "llama3" -flavor = "debugmodel" -tokenizer_path = 
"/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" - -[optimizer] -name = "AdamW" -lr = 8e-4 -eps = 1e-8 - -[lr_scheduler] -warmup_steps = 2 -decay_ratio = 0.8 -decay_type = "linear" -min_lr_factor = 0.0 - -[training] -local_batch_size = 8 -seq_len = 2048 -max_norm = 1.0 -steps = 10 -compile = false -dataset = "c4_test" -dataset_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test" - -[parallelism] -data_parallel_replicate_degree = 1 -data_parallel_shard_degree = 1 -tensor_parallel_degree = 1 -pipeline_parallel_degree = 1 -context_parallel_degree = 1 -expert_parallel_degree = 1 - -[checkpoint] -enable_checkpoint = false - -[activation_checkpoint] -mode = "selective" -selective_ac_option = '2' - -[validation] -enabled = false \ No newline at end of file diff --git a/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml b/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml index 3144011b62..95aa9599b2 100644 --- a/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml +++ b/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml @@ -5,9 +5,9 @@ print_args = false use_for_integration_test = true [profiling] -enable_profiling = false -save_traces_folder = "profile_trace" -profile_freq = 10 +enable_profiling = true +save_traces_folder = "profile_trace_hf" +profile_freq = 5 enable_memory_snapshot = false save_memory_snapshot_folder = "memory_snapshot" diff --git a/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.toml b/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.toml new file mode 100644 index 0000000000..b153a98f21 --- /dev/null +++ b/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.toml @@ -0,0 +1,83 @@ +# torchtitan Config.toml + +[job] +dump_folder = "./outputs" +description = "Llama 3 debug training" +print_args = false +use_for_integration_test = true + +[profiling] +enable_profiling = true +save_traces_folder = "profile_trace" +profile_freq = 5 +enable_memory_snapshot = false +save_memory_snapshot_folder = "memory_snapshot" + +[metrics] +log_freq = 1 +disable_color_printing = false +enable_tensorboard = false +save_tb_folder = "tb" +enable_wandb = false + +[model] +name = "llama3" +flavor = "debugmodel" +# test folder with tokenizer.json, for debug purpose only +hf_assets_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" +# converters = ["float8"] + +[optimizer] +name = "AdamW" +lr = 8e-4 +eps = 1e-8 + +[lr_scheduler] +warmup_steps = 2 # lr scheduler warm up, normally 20% of the train steps +decay_ratio = 0.8 # lr scheduler decay ratio, 80% of the train steps +decay_type = "linear" +min_lr_factor = 0.0 + +[training] +local_batch_size = 8 +seq_len = 2048 +max_norm = 1.0 # grad norm clipping +steps = 10 +dataset = "c4_test" # supported datasets: c4_test (2K), c4 (177M) +dataset_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test" + +[parallelism] +data_parallel_replicate_degree = 1 +data_parallel_shard_degree = 1 +fsdp_reshard_after_forward = "default" # default / never / always +tensor_parallel_degree = 1 +enable_async_tensor_parallel = false +pipeline_parallel_degree = 1 +context_parallel_degree = 1 + +[checkpoint] +enable = false +folder = "checkpoint" +interval = 10 +last_save_model_only = false +export_dtype = "float32" +async_mode = "disabled" # ["disabled", "async", "async_with_pinned_mem"] + +[activation_checkpoint] +mode = 
"selective" # ["none", "selective", "full"] +selective_ac_option = '2' # 'int' = ac every positive int layer or 'op', ac based on ops policy + +[compile] +enable=false +components = ["model", "loss"] + +[float8] +enable_fsdp_float8_all_gather = false +precompute_float8_dynamic_scale_for_fsdp = false +filter_fqns = ["output"] + +[validation] +enable = false +dataset = "c4_validation" +freq = 5 +steps = 10 diff --git a/torchtitan/experiments/transformers_backend/reference_diff_llama3_1gpu.log b/torchtitan/experiments/transformers_backend/reference_diff_llama3_1gpu.log new file mode 100644 index 0000000000..e134f15115 --- /dev/null +++ b/torchtitan/experiments/transformers_backend/reference_diff_llama3_1gpu.log @@ -0,0 +1,61 @@ +diff --git a/tt_run.log.filtered b/hf_run.log.filtered +index d3be70f..0f9a180 100644 +--- a/tt_run.log.filtered ++++ b/hf_run.log.filtered +@@ -1,22 +1,23 @@ ++ echo '##############################################' +##############################################'#######################################################' +####################################################### ++ echo '### Running TorchTitan (native)with HF backend training ###' +### Running TorchTitan (native)with HF backend training ### ++ echo '##############################################' +##############################################'#######################################################' +####################################################### ++ TT_CONFIG=/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.tomlHF_CONFIG=/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml ++ CUDA_VISIBLE_DEVICES=0CUDA_VISIBLE_DEVICES=1 ++ torchrun ... --master_port=XXXX --rdzv_backend c10d --rdzv_endpoint=localhost:XXXX --local-ranks-filter 0 --role rank --tee 3 -m torchtitan.train --job.config_file /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.toml/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml --training.seed 42 --training.deterministic +[rank0]:/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/transformers/src/transformers/utils/hub.py:111: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. +[rank0]: warnings.warn( +[rank0]:[titan] TIMESTAMP - root - WARNING - tokenizer_path is deprecated, use model.hf_assets_path instead. Setting hf_assets_path to tokenizer_path temporarily. +[rank0]:[titan] TIMESTAMP - root - INFO - Starting job: HF Llama 3 debug training +[rank0]:[titan] TIMESTAMP - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config +[rank0]:[titan] TIMESTAMP - root - INFO - Building 0-D device mesh with [], [] +[rank0]:[titan] TIMESTAMP - root - INFO - [GC] Initial GC collection 0.00 seconds +[rank0]:[titan] TIMESTAMP - root - INFO - Deterministic algorithm enabled (expect perf degradation). 
+[rank0]:[titan] TIMESTAMP - root - INFO - Loading tokenizer from tokenizer.json +[rank0]:[titan] TIMESTAMP - root - INFO - Preparing c4_test dataset from /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test +[rank0]:[titan] TIMESTAMP - root - INFO - Building llama3meta-llama/Llama-3.2-1B debugmodel with TransformerModelArgs(_enforced='ThisHFTransformerModelArgs(_enforced='This field is used to enforce all fields have defaults.', dim=256, n_layers=6, n_heads=16, n_kv_heads=None, vocab_size=2000, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, rope_theta=500000, max_seq_len=2048, depth_init=True, use_flex_attn=False, attn_mask_type='causal', eos_id=0)defaults.') +[rank0]:[titan] TIMESTAMP - root - INFO - CUDA capacity: NVIDIA H100 80GB HBM3 with 79.44GiB memory +[rank0]:[titan] TIMESTAMP - root - INFO - Parameter breakdown: +[rank0]:[titan] TIMESTAMP - root - INFO - - embedding: 512,000 parameters +@@ -28,30 +29,29 @@ +[rank0]:[titan] TIMESTAMP - root - INFO - - layer_5: 852,480 parameters +[rank0]:[titan] TIMESTAMP - root - INFO - - final_norm: 256 parameters +[rank0]:[titan] TIMESTAMP - root - INFO - - lm_head: 512,000 parameters +[rank0]:[titan] TIMESTAMP - root - INFO - Model llama3meta-llama/Llama-3.2-1B debugmodel size: 6,139,136 total parameters +[rank0]:[titan] TIMESTAMP - root - INFO - Applied selective activation checkpointing to the model +[rank0]:[titan] TIMESTAMP - root - INFO - Peak FLOPS used for computing MFU: 9.890e+14 +[rank0]:[titan] TIMESTAMP - root - INFO - CUDA memory usage for model: 0.04GiB(0.05%) +[rank0]:[titan] TIMESTAMP - root - WARNING - model.safetensors.index.json not found at hf_assets_path: /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer/model.safetensors.index.json. Defaulting to saving a single safetensors file if checkpoint is saved in HF format +[rank0]:[titan] TIMESTAMP - root - INFO - Mixed precision training is handled by AMP +[rank0]:[titan] TIMESTAMP - root - INFO - Trainer is initialized with local batch size 8, global batch size 8, gradient accumulation steps 1, sequence length 2048, total steps 10 (warmup 2) +[rank0]:[titan] TIMESTAMP - root - INFO - Training starts at step 1 +[rank0]:[titan] TIMESTAMP - root - INFO - Profiling active. 
Traces will be saved at ./outputs/profile_trace./outputs/profile_trace_hf +[rank0]:[titan] TIMESTAMP - root - INFO - step: 1 loss: 7.87237.8704 grad_norm: 1.51671.5185 memory: 1.39GiB(1.75%)1.67GiB(2.10%) tps: 44,58534,083 tflops: 3.192.54 mfu: 0.32%0.26% +[rank0]:[titan] TIMESTAMP - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40 +[rank0]:[titan] TIMESTAMP - root - INFO - step: 2 loss: 7.52467.5209 grad_norm: 1.63591.6373 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 21,05219,870 tflops: 1.511.48 mfu: 0.15% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 3 loss: 6.79006.7789 grad_norm: 2.03452.0390 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 334,947199,616 tflops: 23.9514.89 mfu: 2.42%1.51% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 4 loss: 5.98295.9673 grad_norm: 2.41292.4176 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 357,001207,967 tflops: 25.5315.51 mfu: 2.58%1.57% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 5 loss: 5.05365.0388 grad_norm: 2.53052.5275 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 279,263188,745 tflops: 19.9714.08 mfu: 2.02%1.42% +[rank0]:[titan] TIMESTAMP - root - INFO - Dumping profiler traces at step 5 +[rank0]:[titan] TIMESTAMP - root - INFO - Finished dumping profiler traces in 0.020.04 seconds +[rank0]:[titan] TIMESTAMP - root - INFO - step: 6 loss: 4.63704.6283 grad_norm: 2.28262.2818 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 129,46483,088 tflops: 9.266.20 mfu: 0.94%0.63% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 7 loss: 4.31334.3077 grad_norm: 2.10192.1023 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 298,394175,561 tflops: 21.3413.09 mfu: 2.16%1.32% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 8 loss: 4.13984.1349 grad_norm: 1.93421.9334 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 352,929206,086 tflops: 25.2415.37 mfu: 2.55%1.55% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 9 loss: 4.53264.5289 grad_norm: 1.51111.5103 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 357,192208,947 tflops: 25.5415.58 mfu: 2.58%1.58% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 10 loss: 3.98593.9828 grad_norm: 1.77991.7849 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 287,408189,593 tflops: 20.5514.14 mfu: 2.08%1.43% +[rank0]:[titan] TIMESTAMP - root - INFO - Dumping profiler traces at step 10 +[rank0]:[titan] TIMESTAMP - root - INFO - Finished dumping profiler traces in 0.030.04 seconds +[rank0]:[titan] TIMESTAMP - root - INFO - Sleeping 2 seconds for other ranks to complete +[rank0]:[titan] TIMESTAMP - root - INFO - Training completed +[rank0]:[titan] TIMESTAMP - root - INFO - Process group destroyed diff --git a/torchtitan/experiments/transformers_backend/test_hf_torchtitan_model_args.py b/torchtitan/experiments/transformers_backend/test_hf_torchtitan_model_args.py deleted file mode 100644 index d83f268091..0000000000 --- a/torchtitan/experiments/transformers_backend/test_hf_torchtitan_model_args.py +++ /dev/null @@ -1,51 +0,0 @@ -from transformers.models.llama.configuration_llama import LlamaConfig -from torchtitan.experiments.transformers_backend.model.hf_transformers_args import ( - HFTransformerModelArgs, -) -from torchtitan.config import JobConfig - - -def print_comparison_keys(ref_dict, tt_dict): - all_keys = sorted(list(set(ref_dict.keys()) | set(tt_dict.keys()))) - print(f"{'Attribute':<30} | {'Original HF':<20} | {'TorchTitan HF':<20}") - print("-" * 75) - for key in all_keys: - ref_val = ref_dict.get(key, "N/A") - tt_val = tt_dict.get(key, "N/A") - if str(ref_val) != str(tt_val): - # Red for different - print(f"\033[91m{key:<30} 
| {str(ref_val):<20} | {str(tt_val):<20}\033[0m") - else: - print(f"{key:<30} | {str(ref_val):<20} | {str(tt_val):<20}") - -def compare_hf_tt_configs(model_name, flavor): - ref_hf_config = LlamaConfig() - - model_args = HFTransformerModelArgs() - job_config = JobConfig() - job_config.model.name = model_name - job_config.model.flavor = flavor - model_args.update_from_config(job_config) - tt_hf_config = model_args.convert_to_hf_config() - - ref_dict = ref_hf_config.to_dict() - tt_dict = tt_hf_config.to_dict() - - try: - assert ref_dict == tt_dict - print(f"✅ Configs match for model name {model_name} with flavor: {flavor}") - except AssertionError: - print(f"❌ Configs do not match for model name {model_name} with flavor: {flavor}! Showing differences:") - print_comparison_keys(ref_dict, tt_dict) - raise - -if __name__ == "__main__": - model_names = [ - "meta-llama/Llama-3.2-1B", - ] - flavors = ["full"] - - for model_name in model_names: - for flavor in flavors: - print(f"\nTesting model name: {model_name} with flavor: {flavor}") - compare_hf_tt_configs(model_name, flavor) \ No newline at end of file diff --git a/torchtitan/models/llama3/model/args.py b/torchtitan/models/llama3/model/args.py index e2f698f8b1..1728d9b93e 100644 --- a/torchtitan/models/llama3/model/args.py +++ b/torchtitan/models/llama3/model/args.py @@ -53,25 +53,68 @@ def update_from_config(self, job_config: JobConfig, **kwargs) -> None: self.max_seq_len = seq_len def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: + """ + Count parameters and estimate flops for a TT (TorchTitan) model. + + Args: + model (nn.Module): The TT model (not HF). + seq_len (int): Sequence length. + + Returns: + tuple[int, int]: (nparams, num_flops_per_token) + """ nparams = sum(p.numel() for p in model.parameters()) + + layer_params = {} # layer_id -> int + embedding_params = 0 + norm_params = 0 + lm_head_params = 0 + misc_params = {} + + # TT model: top-level modules are tok_embeddings, layers (ModuleDict), norm, output + for name, p in model.named_parameters(): + if name.startswith("tok_embeddings."): + embedding_params += p.numel() + elif name.startswith("layers."): + try: + # layers.. 
+ layer_id = int(name.split(".")[1]) + if layer_id not in layer_params: + layer_params[layer_id] = 0 + layer_params[layer_id] += p.numel() + except (ValueError, IndexError): + # Should not happen, but catch any oddities + component = "misc_layer_parts" + if component not in misc_params: + misc_params[component] = 0 + misc_params[component] += p.numel() + elif name.startswith("norm."): + norm_params += p.numel() + elif name.startswith("output."): + lm_head_params += p.numel() + else: + # Catch anything else + component = name.split(".")[0] + if component not in misc_params: + misc_params[component] = 0 + misc_params[component] += p.numel() + + logger.info("Parameter breakdown:") + logger.info(f" - embedding: {embedding_params:,} parameters") + for layer_num in sorted(layer_params.keys()): + params = layer_params[layer_num] + logger.info(f" - layer_{layer_num}: {params:,} parameters") + logger.info(f" - final_norm: {norm_params:,} parameters") + logger.info(f" - lm_head: {lm_head_params:,} parameters") + if misc_params: + for name, params in misc_params.items(): + logger.info(f" - {name} (misc): {params:,} parameters") + + # For TT, embedding is always model.tok_embeddings nparams_embedding = sum( - sum(p.numel() for p in m.parameters()) - for m in model.children() - if isinstance(m, nn.Embedding) + p.numel() for p in getattr(model, "tok_embeddings", nn.Module()).parameters() ) - l, h, q, t = ( - self.n_layers, - self.n_heads, - self.dim // self.n_heads, - seq_len, - ) - # Reasoning behind the factor of 12 for the self-attention part of the formula: - # 1. each self-attention has 2 matmul in the forward and 4 in the backward (6) - # 2. the flash attention does 1 more matmul recomputation in the backward - # but recomputation should not be counted in calculating MFU (+0) - # 3. each matmul performs 1 multiplication and 1 addition (*2) - # 4. 
we follow the convention and do not account for sparsity in causal attention + l, h, q, t = self.n_layers, self.n_heads, self.dim // self.n_heads, seq_len num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t - return nparams, num_flops_per_token From 4b498a94fa9a02679005147cc6ae23460e11af45 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 16 Sep 2025 12:05:49 +0000 Subject: [PATCH 019/129] no need to fill passed args --- .../transformers_backend/__init__.py | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 9f7ee13484..c4b5256d83 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -18,36 +18,25 @@ from .infra.parallelize_hf_transformers import parallelize_hf_transformers from .model.hf_transformers_args import HFTransformerModelArgs, HFTransformerModel -from transformers.models.llama.modeling_llama import LlamaForCausalLM - - __all__ = [ "HFTransformerModelArgs", "HFTransformerModel", "hf_transformers_configs", ] - -def hf_transformer_model_args_builder(**kwargs): - # Capture the kwargs in the passed_args field - args = HFTransformerModelArgs(**kwargs) - args.passed_args = kwargs - return args - - flavors = { - "debugmodel": hf_transformer_model_args_builder( + "debugmodel": HFTransformerModelArgs( # n_layers=2, # vocab_size=2000, max_seq_len=2048, #TODO(3outeille): n_kv_heads=n_heads may be handle somewhere else dim=256, n_layers=6, n_heads=16, vocab_size=2000, rope_theta=500000, n_kv_heads=16 ), - "medium": hf_transformer_model_args_builder( + "medium": HFTransformerModelArgs( dim=1024, n_layers=12, ), - "full": hf_transformer_model_args_builder(), + "full": HFTransformerModelArgs(), } hf_train_spec = TrainSpec( From eb403d5e0a45b7da7586e8d384562c6f22214e86 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 16 Sep 2025 15:11:04 +0000 Subject: [PATCH 020/129] can now handle multiple HF modeling --- .../infra/parallelize_hf_transformers.py | 6 +- .../model/hf_transformers_args.py | 94 ++++++++----------- torchtitan/models/llama3/infra/parallelize.py | 2 - torchtitan/train.py | 2 +- 4 files changed, 41 insertions(+), 63 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py index 2f0d9167b0..76d2d8adb4 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -140,13 +140,11 @@ def selective_checkpointing_context_fn(): def apply_ac(model: nn.Module, ac_config: ACConfig): """Apply activation checkpointing to the model.""" - layers = model.model.layers - - for layer_id, transformer_block in layers.named_children(): + for layer_id, transformer_block in model.layers.named_children(): transformer_block = _apply_ac_to_transformer_block( transformer_block, ac_config, base_fqn=f"layers.{layer_id}" ) - layers.register_module(layer_id, transformer_block) + model.layers.register_module(layer_id, transformer_block) logger.info(f"Applied {ac_config.mode} activation checkpointing to the model") diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index fad9e35f28..b0db5ba36b 100644 
--- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -4,19 +4,18 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -from dataclasses import dataclass, field -from typing import Optional, Union -import os +from dataclasses import dataclass +from typing import Optional from torch import nn from torchtitan.config import JobConfig from torchtitan.protocols import BaseModelArgs from torchtitan.tools.logging import logger from transformers.models.llama.configuration_llama import LlamaConfig - -from transformers.modeling_utils import PreTrainedModel from transformers.models.llama.modeling_llama import LlamaAttention, LlamaMLP -from transformers.models.llama.modeling_llama import LlamaForCausalLM +from transformers.modeling_utils import PreTrainedModel +from transformers import AutoConfig +from transformers.configuration_utils import PretrainedConfig # NOTE(3outeille): monkey-patch PreTrainedModel to handle meta device initialization correctly # The default _initialize_weights sets _is_hf_initialized = True even on a meta device, @@ -111,7 +110,7 @@ def _init_weights_patched(self, module): PreTrainedModel._initialize_weights = _initialize_weights_patched @dataclass -class HFTransformerModelArgs(LlamaConfig, BaseModelArgs): +class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): """ Configuration class that bridges TorchTitan and HuggingFace Transformers naming conventions. @@ -138,23 +137,7 @@ def __init__( # HuggingFace specific args attn_implementation: str = "sdpa", **kwargs - ): - # Map TorchTitan arguments to HuggingFace arguments for parent class initialization - hf_config_dict = dict( - hidden_size=dim, - num_hidden_layers=n_layers, - num_attention_heads=n_heads, - num_key_value_heads=n_kv_heads, - vocab_size=vocab_size, - rms_norm_eps=norm_eps, - rope_theta=rope_theta, - max_position_embeddings=max_seq_len, - eos_token_id=eos_id, - **kwargs - ) - - super().__init__(**hf_config_dict) - + ): # Store TorchTitan-specific args (no HF equivalent) self.multiple_of = multiple_of self.ffn_dim_multiplier = ffn_dim_multiplier @@ -249,7 +232,7 @@ def eos_id(self, value: int): def update_from_config(self, job_config: JobConfig): # Load HF config (overwrites our HF attributes) - hf_model_config = LlamaConfig.from_pretrained( + hf_model_config = AutoConfig.from_pretrained( job_config.model.name, attn_implementation=self.attn_implementation, ) @@ -337,37 +320,36 @@ def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, in return nparams, num_flops_per_token -class HFTransformerModel(LlamaForCausalLM): +class HFTransformerModel(nn.Module): def __init__(self, model_args: HFTransformerModelArgs): - super().__init__(model_args) + super().__init__() + + # Try to import the model class dynamically from the transformers library if not found in globals + model_class_name = model_args.architectures[0] + model_cls = globals().get(model_class_name, None) + if model_cls is None: + try: + import importlib + transformers_mod = importlib.import_module("transformers") + model_cls = getattr(transformers_mod, model_class_name) + except (ImportError, AttributeError) as e: + raise ImportError( + f"Could not find model class '{model_class_name}' in globals or transformers. " + f"Make sure the class is available. 
Original error: {e}" + ) + self.model = model_cls(config=model_args) + + @property + def layers(self): + """Returns the model's layers, handling different Hugging Face model structures.""" + if hasattr(self.model, "model") and hasattr(self.model.model, "layers"): # Llama-like + return self.model.model.layers + else: + # Add more cases here if needed for other model architectures + raise AttributeError("Could not find layers in the model. Please check the model structure.") + + def forward(self, *args, **kwargs): + return self.model(*args, **kwargs) def init_weights(self, *args, **kwargs): - # Taken from transformers.modeling_utils.PreTrainedModel.init_weights - super().init_weights() - self._backward_compatibility_gradient_checkpointing() - - # Make sure the modules correctly exist if the flag is active - if self._keep_in_fp32_modules is not None or self._keep_in_fp32_modules_strict is not None: - all_parameters = {name for name, _ in self.named_parameters() if len(name) > 0} - unique_module_names = set() - # Get all unique module names in the module graph, without the prefixes - for param in all_parameters: - unique_module_names.update( - [name for name in param.split(".") if not name.isnumeric() and name not in ["weight", "bias"]] - ) - # Check that every module in the keep_in_fp32 list is part of the module graph - if self._keep_in_fp32_modules is not None: - for module in self._keep_in_fp32_modules: - if module not in unique_module_names: - raise ValueError( - f"{module} was specified in the `_keep_in_fp32_modules` list, but is not part of the modules in" - f" {self.__class__.__name__}" - ) - - if self._keep_in_fp32_modules_strict is not None: - for module in self._keep_in_fp32_modules_strict: - if module not in unique_module_names: - raise ValueError( - f"{module} was specified in the `_keep_in_fp32_modules_strict` list, but is not part of the modules in" - f" {self.__class__.__name__}" - ) \ No newline at end of file + self.model.post_init() \ No newline at end of file diff --git a/torchtitan/models/llama3/infra/parallelize.py b/torchtitan/models/llama3/infra/parallelize.py index 6da44a321d..1a2528be6d 100644 --- a/torchtitan/models/llama3/infra/parallelize.py +++ b/torchtitan/models/llama3/infra/parallelize.py @@ -34,8 +34,6 @@ from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp from torchtitan.tools.logging import logger -from transformers.models.llama.modeling_llama import LlamaForCausalLM - def parallelize_llama( model: nn.Module, parallel_dims: ParallelDims, diff --git a/torchtitan/train.py b/torchtitan/train.py index 7ae5881f2a..179f455e98 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -34,7 +34,7 @@ maybe_enable_profiling, ) -from transformers.models.llama.modeling_llama import LlamaForCausalLM, CausalLMOutputWithPast +from transformers.models.llama.modeling_llama import CausalLMOutputWithPast class Trainer(torch.distributed.checkpoint.stateful.Stateful): From a0d67a78ab45aba67e594d59b54116dd4c06742d Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 16 Sep 2025 15:14:10 +0000 Subject: [PATCH 021/129] handle pref logits accessing inside HF model wrapper --- .../transformers_backend/model/hf_transformers_args.py | 9 +++++++-- torchtitan/train.py | 3 --- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index b0db5ba36b..c257cbfcfd 100644 --- 
a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import importlib from dataclasses import dataclass from typing import Optional @@ -16,6 +17,8 @@ from transformers.modeling_utils import PreTrainedModel from transformers import AutoConfig from transformers.configuration_utils import PretrainedConfig +from transformers.modeling_outputs import CausalLMOutputWithPast + # NOTE(3outeille): monkey-patch PreTrainedModel to handle meta device initialization correctly # The default _initialize_weights sets _is_hf_initialized = True even on a meta device, @@ -329,7 +332,6 @@ def __init__(self, model_args: HFTransformerModelArgs): model_cls = globals().get(model_class_name, None) if model_cls is None: try: - import importlib transformers_mod = importlib.import_module("transformers") model_cls = getattr(transformers_mod, model_class_name) except (ImportError, AttributeError) as e: @@ -349,7 +351,10 @@ def layers(self): raise AttributeError("Could not find layers in the model. Please check the model structure.") def forward(self, *args, **kwargs): - return self.model(*args, **kwargs) + output = self.model(*args, **kwargs) + if isinstance(output, CausalLMOutputWithPast): + return output.logits + return output def init_weights(self, *args, **kwargs): self.model.post_init() \ No newline at end of file diff --git a/torchtitan/train.py b/torchtitan/train.py index 179f455e98..d7a399a1ce 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -464,9 +464,6 @@ def forward_backward_step( assert len(model_parts) == 1 with self.maybe_enable_amp: pred = model_parts[0](inputs) - #NOTE(3outeille): just trying to make it work for now. Will refactor later. 
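            # NOTE: the CausalLMOutputWithPast unwrapping removed below is not lost; it now
            # lives in HFTransformerModel.forward (see the hf_transformers_args.py hunk above
            # in this same patch), which returns output.logits when the HF model hands back a
            # CausalLMOutputWithPast, so the trainer keeps a single code path for native
            # TorchTitan models and HF-backed models.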
- if isinstance(pred, CausalLMOutputWithPast): - pred = pred.logits loss = self.loss_fn(pred, labels) # need to free to before bwd to avoid peaking memory del pred From ea05552507082936e4d7b92a71691fe4d37bac01 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 16 Sep 2025 15:21:17 +0000 Subject: [PATCH 022/129] isolate HF patch for llama in another file --- .../model/hf_llama_patch.py | 90 +++++++++++++++++ .../model/hf_transformers_args.py | 97 +------------------ 2 files changed, 92 insertions(+), 95 deletions(-) create mode 100644 torchtitan/experiments/transformers_backend/model/hf_llama_patch.py diff --git a/torchtitan/experiments/transformers_backend/model/hf_llama_patch.py b/torchtitan/experiments/transformers_backend/model/hf_llama_patch.py new file mode 100644 index 0000000000..28888f61a6 --- /dev/null +++ b/torchtitan/experiments/transformers_backend/model/hf_llama_patch.py @@ -0,0 +1,90 @@ + + +import torch.nn as nn + +from transformers.models.llama.configuration_llama import LlamaConfig +from transformers.models.llama.modeling_llama import LlamaAttention, LlamaMLP, LlamaDecoderLayer +from transformers.modeling_utils import PreTrainedModel + +_original_llama_decoder_layer_init = LlamaDecoderLayer.__init__ + +def _llama_decoder_layer_init_patched(self, config: LlamaConfig, layer_idx: int): + _original_llama_decoder_layer_init(self, config, layer_idx) + self.mlp.layer_idx = layer_idx + +def _initialize_weights_patched(self, module): + # NOTE(3outeille): monkey-patch PreTrainedModel to handle meta device initialization correctly + # The default _initialize_weights sets _is_hf_initialized = True even on a meta device, + # which prevents subsequent proper initialization. + if getattr(module, "_is_hf_initialized", False): + return + + for param in module.parameters(recurse=True): + if param.device.type == "meta": + return + + # If not on a meta device, call the original weight initialization + self._init_weights(module) + module._is_hf_initialized = True + +def _init_weights_patched(self, module): + """ + Patched version of _init_weights to match TorchTitan's initialization for Llama. + `self` is a LlamaPreTrainedModel instance. 
+ """ + config = self.config + + if isinstance(module, (LlamaAttention, LlamaMLP)): + layer_idx = module.layer_idx + + if config.depth_init: + init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 + else: + init_std = 0.02 / (2 * config.num_hidden_layers) ** 0.5 + + if isinstance(module, LlamaAttention): + nn.init.trunc_normal_(module.q_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(module.k_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(module.v_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(module.o_proj.weight, mean=0.0, std=init_std) + + elif isinstance(module, LlamaMLP): + nn.init.trunc_normal_(module.gate_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(module.up_proj.weight, mean=0.0, std=init_std) + nn.init.trunc_normal_(module.down_proj.weight, mean=0.0, std=init_std) + + elif module is getattr(self, "lm_head", None): #TODO(3outeille): find a better way to detect lm_head + final_out_std = config.hidden_size**-0.5 + cutoff_factor = 3 + nn.init.trunc_normal_( + module.weight, + mean=0.0, + std=final_out_std, + a=-cutoff_factor * final_out_std, + b=cutoff_factor * final_out_std, + ) + if module.bias is not None: + module.bias.data.zero_() + + elif isinstance(module, nn.Embedding): + std = config.initializer_range + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + elif ( + isinstance(module, (nn.GroupNorm, nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)) + or "LayerNorm" in module.__class__.__name__ + or "RMSNorm" in module.__class__.__name__ + ): + # Norms can exist without weights (in which case they are None from torch primitives) + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(1.0) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.zero_() + + +def patch_hf_llama(): + LlamaDecoderLayer.__init__ = _llama_decoder_layer_init_patched + PreTrainedModel._init_weights = _init_weights_patched + PreTrainedModel._initialize_weights = _initialize_weights_patched \ No newline at end of file diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index c257cbfcfd..5a8b724397 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -12,105 +12,12 @@ from torchtitan.config import JobConfig from torchtitan.protocols import BaseModelArgs from torchtitan.tools.logging import logger -from transformers.models.llama.configuration_llama import LlamaConfig -from transformers.models.llama.modeling_llama import LlamaAttention, LlamaMLP -from transformers.modeling_utils import PreTrainedModel from transformers import AutoConfig from transformers.configuration_utils import PretrainedConfig from transformers.modeling_outputs import CausalLMOutputWithPast - -# NOTE(3outeille): monkey-patch PreTrainedModel to handle meta device initialization correctly -# The default _initialize_weights sets _is_hf_initialized = True even on a meta device, -# which prevents subsequent proper initialization. -def _initialize_weights_patched(self, module): - """ - Patched version of _initialize_weights that skips initialization and setting - the _is_hf_initialized flag if the module is on a meta device. 
- """ - if getattr(module, "_is_hf_initialized", False): - return - - for param in module.parameters(recurse=True): - if param.device.type == "meta": - return - - # If not on a meta device, call the original weight initialization - self._init_weights(module) - module._is_hf_initialized = True - - -#TODO(3outeille): find a better way to do this -from transformers.models.llama.modeling_llama import LlamaDecoderLayer - -_original_llama_decoder_layer_init = LlamaDecoderLayer.__init__ - -def _llama_decoder_layer_init_patched(self, config: LlamaConfig, layer_idx: int): - _original_llama_decoder_layer_init(self, config, layer_idx) - self.mlp.layer_idx = layer_idx - -LlamaDecoderLayer.__init__ = _llama_decoder_layer_init_patched - - -def _init_weights_patched(self, module): - """ - Patched version of _init_weights to match TorchTitan's initialization for Llama. - `self` is a LlamaPreTrainedModel instance. - """ - config = self.config - - if isinstance(module, (LlamaAttention, LlamaMLP)): - layer_idx = module.layer_idx - - if config.depth_init: - init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 - else: - init_std = 0.02 / (2 * config.num_hidden_layers) ** 0.5 - - if isinstance(module, LlamaAttention): - nn.init.trunc_normal_(module.q_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(module.k_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(module.v_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(module.o_proj.weight, mean=0.0, std=init_std) - - elif isinstance(module, LlamaMLP): - nn.init.trunc_normal_(module.gate_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(module.up_proj.weight, mean=0.0, std=init_std) - nn.init.trunc_normal_(module.down_proj.weight, mean=0.0, std=init_std) - - elif module is getattr(self, "lm_head", None): #TODO(3outeille): find a better way to detect lm_head - final_out_std = config.hidden_size**-0.5 - cutoff_factor = 3 - nn.init.trunc_normal_( - module.weight, - mean=0.0, - std=final_out_std, - a=-cutoff_factor * final_out_std, - b=cutoff_factor * final_out_std, - ) - if module.bias is not None: - module.bias.data.zero_() - - elif isinstance(module, nn.Embedding): - std = config.initializer_range - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - elif ( - isinstance(module, (nn.GroupNorm, nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)) - or "LayerNorm" in module.__class__.__name__ - or "RMSNorm" in module.__class__.__name__ - ): - # Norms can exist without weights (in which case they are None from torch primitives) - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(1.0) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.zero_() - - -PreTrainedModel._init_weights = _init_weights_patched -PreTrainedModel._initialize_weights = _initialize_weights_patched +from .hf_llama_patch import patch_hf_llama +patch_hf_llama() @dataclass class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): From adefa2cd616cd848956e5bea252a9bcd63515942 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 16 Sep 2025 18:42:23 +0000 Subject: [PATCH 023/129] find hacky way to pass HF model.name through CLI --- torchtitan/experiments/transformers_backend/__init__.py | 5 +---- torchtitan/protocols/train_spec.py | 9 ++++++++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 
c4b5256d83..7ac18a1752 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -52,7 +52,4 @@ build_loss_fn=build_cross_entropy_loss, ) -# Register multiple train_specs under the same name -register_train_spec(hf_train_spec) -register_train_spec(dataclasses.replace(hf_train_spec, name="meta-llama/Llama-3.2-3B")) -register_train_spec(dataclasses.replace(hf_train_spec, name="meta-llama/Llama-3.2-1B")) \ No newline at end of file +register_train_spec(hf_train_spec) \ No newline at end of file diff --git a/torchtitan/protocols/train_spec.py b/torchtitan/protocols/train_spec.py index 06fa3a1bc6..0feaaa38cc 100644 --- a/torchtitan/protocols/train_spec.py +++ b/torchtitan/protocols/train_spec.py @@ -5,6 +5,7 @@ # LICENSE file in the root directory of this source tree. from collections.abc import Callable +import dataclasses from dataclasses import dataclass from typing import Mapping, TypeAlias @@ -69,8 +70,14 @@ def register_train_spec(train_spec: TrainSpec) -> None: def get_train_spec(name: str) -> TrainSpec: global _train_specs - if name not in _train_specs: + + if "/" in name: # HF model (dynamic loading) + hf_spec = _train_specs["hf_auto_model"] + new_spec = dataclasses.replace(hf_spec, name=name) + _train_specs[name] = new_spec + elif name not in _train_specs: # Torchtitan raise ValueError(f"Model {name} is not registered.") + return _train_specs[name] From a2358631c2b430c9bbbc061db2055eb1c81f8abf Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 17 Sep 2025 08:12:29 +0000 Subject: [PATCH 024/129] more granularity of logging when doing parameter breakdown --- .../model/hf_transformers_args.py | 82 ++++++------- .../reference_diff_llama3_1gpu.log | 112 ++++++++++++++---- torchtitan/models/deepseek_v3/model/args.py | 16 +++ torchtitan/models/llama3/model/args.py | 91 +++++--------- torchtitan/train.py | 4 +- 5 files changed, 180 insertions(+), 125 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 5a8b724397..4b2f38ffa1 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -175,60 +175,52 @@ def update_from_config(self, job_config: JobConfig): def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: nparams = sum(p.numel() for p in model.parameters()) - - layer_params = {} # int -> int - embedding_params = 0 - norm_params = 0 - lm_head_params = 0 - misc_params = {} - - for name, p in model.named_parameters(): - if "model.embed_tokens" in name: - embedding_params += p.numel() - elif "model.layers." 
in name: - try: - layer_num = int(name.split("layers.")[1].split(".")[0]) - if layer_num not in layer_params: - layer_params[layer_num] = 0 - layer_params[layer_num] += p.numel() - except (ValueError, IndexError): - # Should not happen with standard HF llama names - component = "misc_layer_parts" - if component not in misc_params: - misc_params[component] = 0 - misc_params[component] += p.numel() - elif "model.norm" in name: - norm_params += p.numel() - elif "lm_head" in name: - lm_head_params += p.numel() - else: - # Catch anything else - component = name.split(".")[0] - if component not in misc_params: - misc_params[component] = 0 - misc_params[component] += p.numel() - - logger.info("Parameter breakdown:") - logger.info(f" - embedding: {embedding_params:,} parameters") - for layer_num in sorted(layer_params.keys()): - params = layer_params[layer_num] - logger.info(f" - layer_{layer_num}: {params:,} parameters") - logger.info(f" - final_norm: {norm_params:,} parameters") - logger.info(f" - lm_head: {lm_head_params:,} parameters") - if misc_params: - for name, params in misc_params.items(): - logger.info(f" - {name} (misc): {params:,} parameters") - nparams_embedding = sum( sum(p.numel() for p in m.parameters()) for m in model.children() if isinstance(m, nn.Embedding) ) - l, h, q, t = self.n_layers, self.n_heads, self.dim // self.n_heads, seq_len + l, h, q, t = ( + self.n_layers, + self.n_heads, + self.dim // self.n_heads, + seq_len, + ) + # Reasoning behind the factor of 12 for the self-attention part of the formula: + # 1. each self-attention has 2 matmul in the forward and 4 in the backward (6) + # 2. the flash attention does 1 more matmul recomputation in the backward + # but recomputation should not be counted in calculating MFU (+0) + # 3. each matmul performs 1 multiplication and 1 addition (*2) + # 4. we follow the convention and do not account for sparsity in causal attention num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t + return nparams, num_flops_per_token + def debug_structure_param(self, model: nn.Module): + logger.info("Model Structure Parameter Breakdown:") + + def _format_module(module: nn.Module, prefix: str = ""): + for name, sub_module in module.named_children(): + sub_module_params = sum(p.numel() for p in sub_module.parameters()) + if sub_module_params == 0: + continue + + # For HF models, we want to "unwrap" the ".model" attribute + # to get a view comparable to the native TorchTitan models. 
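                # e.g. for the debugmodel flavor, the wrapped LlamaForCausalLM is then reported as
                #   (embed_tokens): Embedding - 512,000 params
                #   (layers): ModuleList - 5,114,880 params
                #   (norm): LlamaRMSNorm - 256 params
                #   (lm_head): Linear - 512,000 params
                # which lines up one-to-one with the native TorchTitan breakdown
                # ((tok_embeddings), (layers), (norm), (output)) shown in reference_diff_llama3_1gpu.log.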
+ if name == "model": + _format_module(sub_module, prefix) + else: + logger.info( + f"{prefix}({name}): {sub_module.__class__.__name__} - {sub_module_params:,} params" + ) + _format_module(sub_module, prefix + " ") + + total_params = sum(p.numel() for p in model.parameters()) + logger.info(f"{model.__class__.__name__} - {total_params:,} params") + _format_module(model, " ") + + class HFTransformerModel(nn.Module): def __init__(self, model_args: HFTransformerModelArgs): diff --git a/torchtitan/experiments/transformers_backend/reference_diff_llama3_1gpu.log b/torchtitan/experiments/transformers_backend/reference_diff_llama3_1gpu.log index e134f15115..44bbbae2d1 100644 --- a/torchtitan/experiments/transformers_backend/reference_diff_llama3_1gpu.log +++ b/torchtitan/experiments/transformers_backend/reference_diff_llama3_1gpu.log @@ -1,8 +1,8 @@ diff --git a/tt_run.log.filtered b/hf_run.log.filtered -index d3be70f..0f9a180 100644 +index 28327e0..abbe4d7 100644 --- a/tt_run.log.filtered +++ b/hf_run.log.filtered -@@ -1,22 +1,23 @@ +@@ -1,125 +1,125 @@ + echo '##############################################' ##############################################'#######################################################' ####################################################### @@ -13,7 +13,7 @@ ####################################################### + TT_CONFIG=/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.tomlHF_CONFIG=/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml + CUDA_VISIBLE_DEVICES=0CUDA_VISIBLE_DEVICES=1 -+ torchrun ... --master_port=XXXX --rdzv_backend c10d --rdzv_endpoint=localhost:XXXX --local-ranks-filter 0 --role rank --tee 3 -m torchtitan.train --job.config_file /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.toml/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml --training.seed 42 --training.deterministic ++ torchrun ... --master_port=XXXX --rdzv_backend c10d --rdzv_endpoint=localhost:XXXX --local-ranks-filter 0 --role rank --tee 3 -m torchtitan.train --job.config_file /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.toml/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml --training.seed 42 --training.deterministic --model.name llama3meta-llama/Llama-3.2-1B [rank0]:/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/transformers/src/transformers/utils/hub.py:111: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. [rank0]: warnings.warn( [rank0]:[titan] TIMESTAMP - root - WARNING - tokenizer_path is deprecated, use model.hf_assets_path instead. Setting hf_assets_path to tokenizer_path temporarily. 
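(Reading note for this reference diff: tt_run.log.filtered and hf_run.log.filtered are diffed word by word, so interleaved runs such as "7.87237.8704" read as the TorchTitan value, 7.8723, immediately followed by the HF-backend value, 7.8704; the same pattern applies to "llama3meta-llama/Llama-3.2-1B" for --model.name and to module names such as "(tok_embeddings):(embed_tokens)".)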
@@ -26,12 +26,84 @@ [rank0]:[titan] TIMESTAMP - root - INFO - Preparing c4_test dataset from /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test [rank0]:[titan] TIMESTAMP - root - INFO - Building llama3meta-llama/Llama-3.2-1B debugmodel with TransformerModelArgs(_enforced='ThisHFTransformerModelArgs(_enforced='This field is used to enforce all fields have defaults.', dim=256, n_layers=6, n_heads=16, n_kv_heads=None, vocab_size=2000, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, rope_theta=500000, max_seq_len=2048, depth_init=True, use_flex_attn=False, attn_mask_type='causal', eos_id=0)defaults.') [rank0]:[titan] TIMESTAMP - root - INFO - CUDA capacity: NVIDIA H100 80GB HBM3 with 79.44GiB memory -[rank0]:[titan] TIMESTAMP - root - INFO - Parameter breakdown: -[rank0]:[titan] TIMESTAMP - root - INFO - - embedding: 512,000 parameters -@@ -28,30 +29,29 @@ -[rank0]:[titan] TIMESTAMP - root - INFO - - layer_5: 852,480 parameters -[rank0]:[titan] TIMESTAMP - root - INFO - - final_norm: 256 parameters -[rank0]:[titan] TIMESTAMP - root - INFO - - lm_head: 512,000 parameters +[rank0]:[titan] TIMESTAMP - root - INFO - Model Structure Parameter Breakdown: +[rank0]:[titan] TIMESTAMP - root - INFO - TransformerHFTransformerModel - 6,139,136 params +[rank0]:[titan] TIMESTAMP - root - INFO - (tok_embeddings):(embed_tokens): Embedding - 512,000 params +[rank0]:[titan] TIMESTAMP - root - INFO - (layers): ModuleDictModuleList - 5,114,880 params +[rank0]:[titan] TIMESTAMP - root - INFO - (0): TransformerBlockLlamaDecoderLayer - 852,480 params +[rank0]:[titan] TIMESTAMP - root - INFO - (attention): Attention(self_attn): LlamaAttention - 262,144 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wq):(q_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wk):(k_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wv):(v_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wo):(o_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (feed_forward): FeedForward(mlp): LlamaMLP - 589,824 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w1):(gate_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w2):(up_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w3):(down_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (attention_norm): RMSNorm(input_layernorm): LlamaRMSNorm - 256 params +[rank0]:[titan] TIMESTAMP - root - INFO - (ffn_norm): RMSNorm(post_attention_layernorm): LlamaRMSNorm - 256 params +[rank0]:[titan] TIMESTAMP - root - INFO - (1): TransformerBlockLlamaDecoderLayer - 852,480 params +[rank0]:[titan] TIMESTAMP - root - INFO - (attention): Attention(self_attn): LlamaAttention - 262,144 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wq):(q_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wk):(k_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wv):(v_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wo):(o_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (feed_forward): FeedForward(mlp): LlamaMLP - 589,824 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w1):(gate_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w2):(up_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w3):(down_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (attention_norm): 
RMSNorm(input_layernorm): LlamaRMSNorm - 256 params +[rank0]:[titan] TIMESTAMP - root - INFO - (ffn_norm): RMSNorm(post_attention_layernorm): LlamaRMSNorm - 256 params +[rank0]:[titan] TIMESTAMP - root - INFO - (2): TransformerBlockLlamaDecoderLayer - 852,480 params +[rank0]:[titan] TIMESTAMP - root - INFO - (attention): Attention(self_attn): LlamaAttention - 262,144 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wq):(q_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wk):(k_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wv):(v_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wo):(o_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (feed_forward): FeedForward(mlp): LlamaMLP - 589,824 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w1):(gate_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w2):(up_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w3):(down_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (attention_norm): RMSNorm(input_layernorm): LlamaRMSNorm - 256 params +[rank0]:[titan] TIMESTAMP - root - INFO - (ffn_norm): RMSNorm(post_attention_layernorm): LlamaRMSNorm - 256 params +[rank0]:[titan] TIMESTAMP - root - INFO - (3): TransformerBlockLlamaDecoderLayer - 852,480 params +[rank0]:[titan] TIMESTAMP - root - INFO - (attention): Attention(self_attn): LlamaAttention - 262,144 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wq):(q_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wk):(k_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wv):(v_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wo):(o_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (feed_forward): FeedForward(mlp): LlamaMLP - 589,824 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w1):(gate_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w2):(up_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w3):(down_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (attention_norm): RMSNorm(input_layernorm): LlamaRMSNorm - 256 params +[rank0]:[titan] TIMESTAMP - root - INFO - (ffn_norm): RMSNorm(post_attention_layernorm): LlamaRMSNorm - 256 params +[rank0]:[titan] TIMESTAMP - root - INFO - (4): TransformerBlockLlamaDecoderLayer - 852,480 params +[rank0]:[titan] TIMESTAMP - root - INFO - (attention): Attention(self_attn): LlamaAttention - 262,144 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wq):(q_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wk):(k_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wv):(v_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wo):(o_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (feed_forward): FeedForward(mlp): LlamaMLP - 589,824 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w1):(gate_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w2):(up_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w3):(down_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (attention_norm): RMSNorm(input_layernorm): LlamaRMSNorm - 256 params +[rank0]:[titan] TIMESTAMP - root - INFO - (ffn_norm): RMSNorm(post_attention_layernorm): LlamaRMSNorm - 256 params +[rank0]:[titan] TIMESTAMP - 
root - INFO - (5): TransformerBlockLlamaDecoderLayer - 852,480 params +[rank0]:[titan] TIMESTAMP - root - INFO - (attention): Attention(self_attn): LlamaAttention - 262,144 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wq):(q_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wk):(k_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wv):(v_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wo):(o_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (feed_forward): FeedForward(mlp): LlamaMLP - 589,824 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w1):(gate_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w2):(up_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w3):(down_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (attention_norm): RMSNorm(input_layernorm): LlamaRMSNorm - 256 params +[rank0]:[titan] TIMESTAMP - root - INFO - (ffn_norm): RMSNorm(post_attention_layernorm): LlamaRMSNorm - 256 params +[rank0]:[titan] TIMESTAMP - root - INFO - (norm): RMSNormLlamaRMSNorm - 256 params +[rank0]:[titan] TIMESTAMP - root - INFO - (output):(lm_head): Linear - 512,000 params [rank0]:[titan] TIMESTAMP - root - INFO - Model llama3meta-llama/Llama-3.2-1B debugmodel size: 6,139,136 total parameters [rank0]:[titan] TIMESTAMP - root - INFO - Applied selective activation checkpointing to the model [rank0]:[titan] TIMESTAMP - root - INFO - Peak FLOPS used for computing MFU: 9.890e+14 @@ -41,19 +113,19 @@ [rank0]:[titan] TIMESTAMP - root - INFO - Trainer is initialized with local batch size 8, global batch size 8, gradient accumulation steps 1, sequence length 2048, total steps 10 (warmup 2) [rank0]:[titan] TIMESTAMP - root - INFO - Training starts at step 1 [rank0]:[titan] TIMESTAMP - root - INFO - Profiling active. 
Traces will be saved at ./outputs/profile_trace./outputs/profile_trace_hf -[rank0]:[titan] TIMESTAMP - root - INFO - step: 1 loss: 7.87237.8704 grad_norm: 1.51671.5185 memory: 1.39GiB(1.75%)1.67GiB(2.10%) tps: 44,58534,083 tflops: 3.192.54 mfu: 0.32%0.26% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 1 loss: 7.87237.8704 grad_norm: 1.51671.5185 memory: 1.39GiB(1.75%)1.67GiB(2.10%) tps: 43,79234,528 tflops: 3.132.58 mfu: 0.32%0.26% [rank0]:[titan] TIMESTAMP - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40 -[rank0]:[titan] TIMESTAMP - root - INFO - step: 2 loss: 7.52467.5209 grad_norm: 1.63591.6373 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 21,05219,870 tflops: 1.511.48 mfu: 0.15% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 3 loss: 6.79006.7789 grad_norm: 2.03452.0390 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 334,947199,616 tflops: 23.9514.89 mfu: 2.42%1.51% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 4 loss: 5.98295.9673 grad_norm: 2.41292.4176 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 357,001207,967 tflops: 25.5315.51 mfu: 2.58%1.57% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 5 loss: 5.05365.0388 grad_norm: 2.53052.5275 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 279,263188,745 tflops: 19.9714.08 mfu: 2.02%1.42% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 2 loss: 7.52467.5209 grad_norm: 1.63591.6373 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 21,38419,712 tflops: 1.531.47 mfu: 0.15% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 3 loss: 6.79006.7789 grad_norm: 2.03452.0390 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 336,714197,260 tflops: 24.0814.71 mfu: 2.43%1.49% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 4 loss: 5.98295.9673 grad_norm: 2.41292.4176 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 360,388206,932 tflops: 25.7715.43 mfu: 2.61%1.56% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 5 loss: 5.05365.0388 grad_norm: 2.53052.5275 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 286,298186,563 tflops: 20.4713.91 mfu: 2.07%1.41% [rank0]:[titan] TIMESTAMP - root - INFO - Dumping profiler traces at step 5 -[rank0]:[titan] TIMESTAMP - root - INFO - Finished dumping profiler traces in 0.020.04 seconds -[rank0]:[titan] TIMESTAMP - root - INFO - step: 6 loss: 4.63704.6283 grad_norm: 2.28262.2818 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 129,46483,088 tflops: 9.266.20 mfu: 0.94%0.63% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 7 loss: 4.31334.3077 grad_norm: 2.10192.1023 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 298,394175,561 tflops: 21.3413.09 mfu: 2.16%1.32% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 8 loss: 4.13984.1349 grad_norm: 1.93421.9334 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 352,929206,086 tflops: 25.2415.37 mfu: 2.55%1.55% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 9 loss: 4.53264.5289 grad_norm: 1.51111.5103 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 357,192208,947 tflops: 25.5415.58 mfu: 2.58%1.58% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 10 loss: 3.98593.9828 grad_norm: 1.77991.7849 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 287,408189,593 tflops: 20.5514.14 mfu: 2.08%1.43% +[rank0]:[titan] TIMESTAMP - root - INFO - Finished dumping profiler traces in 0.030.05 seconds +[rank0]:[titan] TIMESTAMP - root - INFO - step: 6 loss: 4.63704.6283 grad_norm: 2.28262.2818 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 129,44780,608 tflops: 9.266.01 mfu: 0.94%0.61% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 7 loss: 4.31334.3077 grad_norm: 2.10192.1023 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 300,058177,619 
tflops: 21.4613.25 mfu: 2.17%1.34% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 8 loss: 4.13984.1349 grad_norm: 1.93421.9334 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 361,523205,777 tflops: 25.8515.35 mfu: 2.61%1.55% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 9 loss: 4.53264.5289 grad_norm: 1.51111.5103 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 363,412207,933 tflops: 25.9915.51 mfu: 2.63%1.57% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 10 loss: 3.98593.9828 grad_norm: 1.77991.7849 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 295,360188,228 tflops: 21.1214.04 mfu: 2.14%1.42% [rank0]:[titan] TIMESTAMP - root - INFO - Dumping profiler traces at step 10 [rank0]:[titan] TIMESTAMP - root - INFO - Finished dumping profiler traces in 0.030.04 seconds [rank0]:[titan] TIMESTAMP - root - INFO - Sleeping 2 seconds for other ranks to complete diff --git a/torchtitan/models/deepseek_v3/model/args.py b/torchtitan/models/deepseek_v3/model/args.py index d6afedfa34..9451f01b01 100644 --- a/torchtitan/models/deepseek_v3/model/args.py +++ b/torchtitan/models/deepseek_v3/model/args.py @@ -159,3 +159,19 @@ def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, in ) return nparams, num_flops_per_token + + def debug_structure_param(self, model: nn.Module): + logger.info("Model Structure Parameter Breakdown:") + + def _format_module(module: nn.Module, prefix: str = ""): + for name, sub_module in module.named_children(): + sub_module_params = sum(p.numel() for p in sub_module.parameters()) + if sub_module_params > 0: + logger.info( + f"{prefix}({name}): {sub_module.__class__.__name__} - {sub_module_params:,} params" + ) + _format_module(sub_module, prefix + " ") + + total_params = sum(p.numel() for p in model.parameters()) + logger.info(f"{model.__class__.__name__} - {total_params:,} params") + _format_module(model, " ") diff --git a/torchtitan/models/llama3/model/args.py b/torchtitan/models/llama3/model/args.py index 1728d9b93e..5aaf3839ed 100644 --- a/torchtitan/models/llama3/model/args.py +++ b/torchtitan/models/llama3/model/args.py @@ -53,68 +53,41 @@ def update_from_config(self, job_config: JobConfig, **kwargs) -> None: self.max_seq_len = seq_len def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: - """ - Count parameters and estimate flops for a TT (TorchTitan) model. - - Args: - model (nn.Module): The TT model (not HF). - seq_len (int): Sequence length. - - Returns: - tuple[int, int]: (nparams, num_flops_per_token) - """ nparams = sum(p.numel() for p in model.parameters()) - - layer_params = {} # layer_id -> int - embedding_params = 0 - norm_params = 0 - lm_head_params = 0 - misc_params = {} - - # TT model: top-level modules are tok_embeddings, layers (ModuleDict), norm, output - for name, p in model.named_parameters(): - if name.startswith("tok_embeddings."): - embedding_params += p.numel() - elif name.startswith("layers."): - try: - # layers.. 
- layer_id = int(name.split(".")[1]) - if layer_id not in layer_params: - layer_params[layer_id] = 0 - layer_params[layer_id] += p.numel() - except (ValueError, IndexError): - # Should not happen, but catch any oddities - component = "misc_layer_parts" - if component not in misc_params: - misc_params[component] = 0 - misc_params[component] += p.numel() - elif name.startswith("norm."): - norm_params += p.numel() - elif name.startswith("output."): - lm_head_params += p.numel() - else: - # Catch anything else - component = name.split(".")[0] - if component not in misc_params: - misc_params[component] = 0 - misc_params[component] += p.numel() - - logger.info("Parameter breakdown:") - logger.info(f" - embedding: {embedding_params:,} parameters") - for layer_num in sorted(layer_params.keys()): - params = layer_params[layer_num] - logger.info(f" - layer_{layer_num}: {params:,} parameters") - logger.info(f" - final_norm: {norm_params:,} parameters") - logger.info(f" - lm_head: {lm_head_params:,} parameters") - if misc_params: - for name, params in misc_params.items(): - logger.info(f" - {name} (misc): {params:,} parameters") - - # For TT, embedding is always model.tok_embeddings nparams_embedding = sum( - p.numel() for p in getattr(model, "tok_embeddings", nn.Module()).parameters() + sum(p.numel() for p in m.parameters()) + for m in model.children() + if isinstance(m, nn.Embedding) ) - l, h, q, t = self.n_layers, self.n_heads, self.dim // self.n_heads, seq_len + l, h, q, t = ( + self.n_layers, + self.n_heads, + self.dim // self.n_heads, + seq_len, + ) + # Reasoning behind the factor of 12 for the self-attention part of the formula: + # 1. each self-attention has 2 matmul in the forward and 4 in the backward (6) + # 2. the flash attention does 1 more matmul recomputation in the backward + # but recomputation should not be counted in calculating MFU (+0) + # 3. each matmul performs 1 multiplication and 1 addition (*2) + # 4. 
we follow the convention and do not account for sparsity in causal attention num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t + return nparams, num_flops_per_token + + def debug_structure_param(self, model: nn.Module): + logger.info("Model Structure Parameter Breakdown:") + + def _format_module(module: nn.Module, prefix: str = ""): + for name, sub_module in module.named_children(): + sub_module_params = sum(p.numel() for p in sub_module.parameters()) + if sub_module_params > 0: + logger.info( + f"{prefix}({name}): {sub_module.__class__.__name__} - {sub_module_params:,} params" + ) + _format_module(sub_module, prefix + " ") + + total_params = sum(p.numel() for p in model.parameters()) + logger.info(f"{model.__class__.__name__} - {total_params:,} params") + _format_module(model, " ") \ No newline at end of file diff --git a/torchtitan/train.py b/torchtitan/train.py index d7a399a1ce..b15cd73e2c 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -180,7 +180,9 @@ def __init__(self, job_config: JobConfig): model_param_count, self.metrics_processor.num_flops_per_token, ) = model_args.get_nparams_and_flops(model, job_config.training.seq_len) - + + model_args.debug_structure_param(model) + logger.info( f"{color.blue}Model {self.train_spec.name} {job_config.model.flavor} " f"{color.red}size: {model_param_count:,} total parameters{color.reset}" From fc43dc84adcc482f21b3c163e229109f859b965e Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 17 Sep 2025 08:20:05 +0000 Subject: [PATCH 025/129] add __repr__ to HFTransformerModelArgs for better debugging logs --- .../model/hf_transformers_args.py | 9 +++++++ .../reference_diff_llama3_1gpu.log | 26 +++++++++---------- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 4b2f38ffa1..75610d8203 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -77,6 +77,15 @@ def __init__( **kwargs ) + def __repr__(self) -> str: + # HFTransformerModelArgs is a dataclass that also inherits from PretrainedConfig. + # PretrainedConfig has a __repr__ that serializes the object to JSON, but it + # doesn't work well with how HFTransformerModelArgs is initialized. + # This custom __repr__ provides a dataclass-like representation that correctly + # displays the arguments passed during initialization. + args_str = ", ".join(f"{k}={v!r}" for k, v in self._passed_args.items()) + return f"{self.__class__.__name__}({args_str})" + @property def dim(self) -> int: """TorchTitan: Model dimension (alias for HF hidden_size)""" diff --git a/torchtitan/experiments/transformers_backend/reference_diff_llama3_1gpu.log b/torchtitan/experiments/transformers_backend/reference_diff_llama3_1gpu.log index 44bbbae2d1..84eff10ff8 100644 --- a/torchtitan/experiments/transformers_backend/reference_diff_llama3_1gpu.log +++ b/torchtitan/experiments/transformers_backend/reference_diff_llama3_1gpu.log @@ -1,5 +1,5 @@ diff --git a/tt_run.log.filtered b/hf_run.log.filtered -index 28327e0..abbe4d7 100644 +index 1f72d39..c1856a6 100644 --- a/tt_run.log.filtered +++ b/hf_run.log.filtered @@ -1,125 +1,125 @@ @@ -24,7 +24,7 @@ [rank0]:[titan] TIMESTAMP - root - INFO - Deterministic algorithm enabled (expect perf degradation). 
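(A quick sanity check of the num_flops_per_token formula in the llama3 args.py hunk above, plugging in the debugmodel settings used throughout these runs (dim=256, n_layers=6, n_heads=16, seq_len=2048) and the 6,139,136-parameter count reported in this reference log; the embedding size is an assumption, vocab_size * dim. A standalone sketch, not part of the patch:)

# Sanity check of num_flops_per_token for the llama3 debugmodel settings:
# dim=256, n_layers=6, n_heads=16, seq_len=2048, 6,139,136 total params,
# embedding assumed to be vocab_size * dim = 2000 * 256.
dim, n_layers, n_heads, seq_len = 256, 6, 16, 2048
nparams = 6_139_136
nparams_embedding = 2000 * dim

l, h, q, t = n_layers, n_heads, dim // n_heads, seq_len
num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t
print(num_flops_per_token)  # 71,511,552 -> roughly 7.2e7 FLOPs per token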
[rank0]:[titan] TIMESTAMP - root - INFO - Loading tokenizer from tokenizer.json [rank0]:[titan] TIMESTAMP - root - INFO - Preparing c4_test dataset from /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test -[rank0]:[titan] TIMESTAMP - root - INFO - Building llama3meta-llama/Llama-3.2-1B debugmodel with TransformerModelArgs(_enforced='ThisHFTransformerModelArgs(_enforced='This field is used to enforce all fields have defaults.', dim=256, n_layers=6, n_heads=16, n_kv_heads=None, vocab_size=2000, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, rope_theta=500000, max_seq_len=2048, depth_init=True, use_flex_attn=False, attn_mask_type='causal', eos_id=0)defaults.') +[rank0]:[titan] TIMESTAMP - root - INFO - Building llama3meta-llama/Llama-3.2-1B debugmodel with TransformerModelArgs(_enforced='This field is used to enforce all fields have defaults.', dim=256,HFTransformerModelArgs(dim=256, n_layers=6, n_heads=16, n_kv_heads=None,n_kv_heads=16, vocab_size=2000, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, rope_theta=500000, max_seq_len=2048, depth_init=True, use_flex_attn=False, attn_mask_type='causal', eos_id=0)eos_id=0, attn_implementation='sdpa') [rank0]:[titan] TIMESTAMP - root - INFO - CUDA capacity: NVIDIA H100 80GB HBM3 with 79.44GiB memory [rank0]:[titan] TIMESTAMP - root - INFO - Model Structure Parameter Breakdown: [rank0]:[titan] TIMESTAMP - root - INFO - TransformerHFTransformerModel - 6,139,136 params @@ -113,19 +113,19 @@ [rank0]:[titan] TIMESTAMP - root - INFO - Trainer is initialized with local batch size 8, global batch size 8, gradient accumulation steps 1, sequence length 2048, total steps 10 (warmup 2) [rank0]:[titan] TIMESTAMP - root - INFO - Training starts at step 1 [rank0]:[titan] TIMESTAMP - root - INFO - Profiling active. 
Traces will be saved at ./outputs/profile_trace./outputs/profile_trace_hf -[rank0]:[titan] TIMESTAMP - root - INFO - step: 1 loss: 7.87237.8704 grad_norm: 1.51671.5185 memory: 1.39GiB(1.75%)1.67GiB(2.10%) tps: 43,79234,528 tflops: 3.132.58 mfu: 0.32%0.26% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 1 loss: 7.87237.8704 grad_norm: 1.51671.5185 memory: 1.39GiB(1.75%)1.67GiB(2.10%) tps: 43,37532,685 tflops: 3.102.44 mfu: 0.31%0.25% [rank0]:[titan] TIMESTAMP - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40 -[rank0]:[titan] TIMESTAMP - root - INFO - step: 2 loss: 7.52467.5209 grad_norm: 1.63591.6373 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 21,38419,712 tflops: 1.531.47 mfu: 0.15% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 3 loss: 6.79006.7789 grad_norm: 2.03452.0390 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 336,714197,260 tflops: 24.0814.71 mfu: 2.43%1.49% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 4 loss: 5.98295.9673 grad_norm: 2.41292.4176 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 360,388206,932 tflops: 25.7715.43 mfu: 2.61%1.56% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 5 loss: 5.05365.0388 grad_norm: 2.53052.5275 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 286,298186,563 tflops: 20.4713.91 mfu: 2.07%1.41% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 2 loss: 7.52467.5209 grad_norm: 1.63591.6373 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 20,83419,798 tflops: 1.491.48 mfu: 0.15% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 3 loss: 6.79006.7789 grad_norm: 2.03452.0390 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 338,323199,161 tflops: 24.1914.85 mfu: 2.45%1.50% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 4 loss: 5.98295.9673 grad_norm: 2.41292.4176 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 362,741207,198 tflops: 25.9415.45 mfu: 2.62%1.56% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 5 loss: 5.05365.0388 grad_norm: 2.53052.5275 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 287,315187,882 tflops: 20.5514.01 mfu: 2.08%1.42% [rank0]:[titan] TIMESTAMP - root - INFO - Dumping profiler traces at step 5 -[rank0]:[titan] TIMESTAMP - root - INFO - Finished dumping profiler traces in 0.030.05 seconds -[rank0]:[titan] TIMESTAMP - root - INFO - step: 6 loss: 4.63704.6283 grad_norm: 2.28262.2818 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 129,44780,608 tflops: 9.266.01 mfu: 0.94%0.61% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 7 loss: 4.31334.3077 grad_norm: 2.10192.1023 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 300,058177,619 tflops: 21.4613.25 mfu: 2.17%1.34% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 8 loss: 4.13984.1349 grad_norm: 1.93421.9334 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 361,523205,777 tflops: 25.8515.35 mfu: 2.61%1.55% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 9 loss: 4.53264.5289 grad_norm: 1.51111.5103 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 363,412207,933 tflops: 25.9915.51 mfu: 2.63%1.57% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 10 loss: 3.98593.9828 grad_norm: 1.77991.7849 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 295,360188,228 tflops: 21.1214.04 mfu: 2.14%1.42% +[rank0]:[titan] TIMESTAMP - root - INFO - Finished dumping profiler traces in 0.030.04 seconds +[rank0]:[titan] TIMESTAMP - root - INFO - step: 6 loss: 4.63704.6283 grad_norm: 2.28262.2818 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 130,12183,115 tflops: 9.316.20 mfu: 0.94%0.63% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 7 loss: 4.31334.3077 grad_norm: 2.10192.1023 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 295,546174,068 
tflops: 21.1312.98 mfu: 2.14%1.31% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 8 loss: 4.13984.1349 grad_norm: 1.93421.9334 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 361,129206,837 tflops: 25.8215.43 mfu: 2.61%1.56% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 9 loss: 4.53264.5289 grad_norm: 1.51111.5103 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 363,728208,233 tflops: 26.0115.53 mfu: 2.63%1.57% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 10 loss: 3.98593.9828 grad_norm: 1.77991.7849 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 294,013188,295 tflops: 21.0314.04 mfu: 2.13%1.42% [rank0]:[titan] TIMESTAMP - root - INFO - Dumping profiler traces at step 10 [rank0]:[titan] TIMESTAMP - root - INFO - Finished dumping profiler traces in 0.030.04 seconds [rank0]:[titan] TIMESTAMP - root - INFO - Sleeping 2 seconds for other ranks to complete From 23ae3785e7718d42b3d76bdb54c955c7da2fb9c8 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 17 Sep 2025 13:33:34 +0000 Subject: [PATCH 026/129] HF deepseek v3 is now training --- .../transformers_backend/__init__.py | 53 +++++++-- .../model/hf_transformers_args.py | 112 ++++++++++++++++-- 2 files changed, 142 insertions(+), 23 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 7ac18a1752..6273dd2dd3 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -18,25 +18,58 @@ from .infra.parallelize_hf_transformers import parallelize_hf_transformers from .model.hf_transformers_args import HFTransformerModelArgs, HFTransformerModel +from torchtitan.models.moe import MoEArgs + __all__ = [ "HFTransformerModelArgs", "HFTransformerModel", "hf_transformers_configs", ] +#TODO(3outeille): identify that if MoE model is used, we add a moe_args field +# flavors = { +# "debugmodel": HFTransformerModelArgs( +# # n_layers=2, +# # vocab_size=2000, +# max_seq_len=2048, +# #TODO(3outeille): n_kv_heads=n_heads may be handle somewhere else +# dim=256, n_layers=6, n_heads=16, vocab_size=2000, rope_theta=500000, n_kv_heads=16 +# ), +# "medium": HFTransformerModelArgs( +# dim=1024, +# n_layers=12, +# ), +# "full": HFTransformerModelArgs(), +# } + flavors = { "debugmodel": HFTransformerModelArgs( - # n_layers=2, - # vocab_size=2000, - max_seq_len=2048, - #TODO(3outeille): n_kv_heads=n_heads may be handle somewhere else - dim=256, n_layers=6, n_heads=16, vocab_size=2000, rope_theta=500000, n_kv_heads=16 - ), - "medium": HFTransformerModelArgs( - dim=1024, - n_layers=12, + n_layers=3, + vocab_size=2000, + dim=256, + inter_dim=1024, + moe_inter_dim=256, + n_dense_layers=1, + n_heads=16, + n_group=2, + topk_group=1, + moe_args=MoEArgs( + num_experts=8, + num_shared_experts=2, + top_k=3, + score_func="softmax", + route_norm=True, + score_before_experts=False, + ), + kv_lora_rank=16, + q_lora_rank=0, + qk_rope_head_dim=16, + qk_nope_head_dim=32, + v_head_dim=32, + mscale=0.70, + # TO REMOVE: + n_kv_heads=16 ), - "full": HFTransformerModelArgs(), } hf_train_spec = TrainSpec( diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 75610d8203..821a20f61f 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -16,6 +16,8 @@ from transformers.configuration_utils import PretrainedConfig from 
transformers.modeling_outputs import CausalLMOutputWithPast +from torchtitan.models.moe import MoEArgs + from .hf_llama_patch import patch_hf_llama patch_hf_llama() @@ -44,20 +46,44 @@ def __init__( use_flex_attn: bool = False, attn_mask_type: str = "causal", eos_id: int = 0, + moe_args: Optional[MoEArgs] = None, + # DeepSeekV3 specific args + n_group: Optional[int] = None, + topk_group: Optional[int] = None, + inter_dim: Optional[int] = None, + moe_inter_dim: Optional[int] = None, + n_dense_layers: Optional[int] = None, + n_expert_groups: Optional[int] = None, + n_limited_groups: Optional[int] = None, + q_lora_rank: Optional[int] = None, + kv_lora_rank: Optional[int] = None, + qk_nope_head_dim: Optional[int] = None, + qk_rope_head_dim: Optional[int] = None, + v_head_dim: Optional[int] = None, + original_seq_len: Optional[int] = None, + rope_factor: Optional[float] = None, + beta_fast: Optional[int] = None, + beta_slow: Optional[int] = None, + mscale: Optional[float] = None, # HuggingFace specific args attn_implementation: str = "sdpa", - **kwargs - ): + **kwargs, + ): # Store TorchTitan-specific args (no HF equivalent) self.multiple_of = multiple_of self.ffn_dim_multiplier = ffn_dim_multiplier self.depth_init = depth_init self.use_flex_attn = use_flex_attn self.attn_mask_type = attn_mask_type - + # HuggingFace specific args self.attn_implementation = attn_implementation + # For DeepSeekV3, setting q_lora_rank to 0 in TorchTitan is equivalent to + # setting it to None in HuggingFace. + if q_lora_rank == 0: + q_lora_rank = None + self._passed_args = dict( dim=dim, n_layers=n_layers, @@ -74,17 +100,53 @@ def __init__( attn_mask_type=attn_mask_type, eos_id=eos_id, attn_implementation=attn_implementation, - **kwargs + # DeepSeekV3 specific args + n_group=n_group, + topk_group=topk_group, + inter_dim=inter_dim, + moe_inter_dim=moe_inter_dim, + n_dense_layers=n_dense_layers, + n_expert_groups=n_expert_groups, + n_limited_groups=n_limited_groups, + q_lora_rank=q_lora_rank, + kv_lora_rank=kv_lora_rank, + qk_nope_head_dim=qk_nope_head_dim, + qk_rope_head_dim=qk_rope_head_dim, + v_head_dim=v_head_dim, + original_seq_len=original_seq_len, + rope_factor=rope_factor, + beta_fast=beta_fast, + beta_slow=beta_slow, + mscale=mscale, + **kwargs, ) + if moe_args is not None: + # MoE args for HF config + # HF uses different names for these + self.num_experts_per_tok = moe_args.top_k + self.n_routed_experts = moe_args.num_experts + self.n_shared_experts = moe_args.num_shared_experts + self.moe_intermediate_size = moe_inter_dim + self._passed_args.update( + dict( + num_experts_per_tok=moe_args.top_k, + n_routed_experts=moe_args.num_experts, + n_shared_experts=moe_args.num_shared_experts, + moe_intermediate_size=moe_inter_dim, + ) + ) + + def __repr__(self) -> str: # HFTransformerModelArgs is a dataclass that also inherits from PretrainedConfig. # PretrainedConfig has a __repr__ that serializes the object to JSON, but it # doesn't work well with how HFTransformerModelArgs is initialized. # This custom __repr__ provides a dataclass-like representation that correctly # displays the arguments passed during initialization. 
- args_str = ", ".join(f"{k}={v!r}" for k, v in self._passed_args.items()) - return f"{self.__class__.__name__}({args_str})" + args_lines = [f"{k}={v!r}" for k, v in sorted(self._passed_args.items())] + args_str = "\n".join(args_lines) + return f"{self.__class__.__name__}(\n{args_str}\n)" @property def dim(self) -> int: @@ -149,6 +211,25 @@ def eos_id(self) -> int: def eos_id(self, value: int): self.eos_token_id = value + # === DeepSeekV3 specific properties === + @property + def inter_dim(self) -> int: + """TorchTitan: Intermediate dimension (alias for HF intermediate_size)""" + return self.intermediate_size + + @inter_dim.setter + def inter_dim(self, value: int): + self.intermediate_size = value + + @property + def n_dense_layers(self) -> int: + """TorchTitan: Number of dense layers (alias for HF first_k_dense_replace)""" + return self.first_k_dense_replace + + @n_dense_layers.setter + def n_dense_layers(self, value: int): + self.first_k_dense_replace = value + def update_from_config(self, job_config: JobConfig): # Load HF config (overwrites our HF attributes) hf_model_config = AutoConfig.from_pretrained( @@ -163,6 +244,10 @@ def update_from_config(self, job_config: JobConfig): if hasattr(self, key): setattr(self, key, value) + # MoE + if hasattr(self, "qk_nope_head_dim") and hasattr(self, "qk_rope_head_dim"): + self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim + # Configure HF-specific settings to match TorchTitan settings self.tie_word_embeddings = False self.attention_bias = False @@ -170,13 +255,14 @@ def update_from_config(self, job_config: JobConfig): self.use_cache = False self.initializer_range = 1.0 # use as std for normal init in embedding - ffn_hidden_size = 4 * self.dim - ffn_hidden_size = int(2 * ffn_hidden_size / 3) - if self.ffn_dim_multiplier is not None: - ffn_hidden_size = int(self.ffn_dim_multiplier * ffn_hidden_size) - self.intermediate_size = self.multiple_of * ( - (ffn_hidden_size + self.multiple_of - 1) // self.multiple_of - ) + if self.inter_dim is None: # Only for llama model + ffn_hidden_size = 4 * self.dim + ffn_hidden_size = int(2 * ffn_hidden_size / 3) + if self.ffn_dim_multiplier is not None: + ffn_hidden_size = int(self.ffn_dim_multiplier * ffn_hidden_size) + self.intermediate_size = self.multiple_of * ( + (ffn_hidden_size + self.multiple_of - 1) // self.multiple_of + ) self.head_dim = self.dim // self.num_attention_heads From 2573be482c7d1467c9d947a526c2952d0535a4ce Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 17 Sep 2025 13:54:34 +0000 Subject: [PATCH 027/129] refactor to make it clear which args comes from which parts --- .../transformers_backend/__init__.py | 80 +++--- .../model/hf_transformers_args.py | 264 +++++++----------- 2 files changed, 141 insertions(+), 203 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 6273dd2dd3..422df5621c 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -27,51 +27,51 @@ ] #TODO(3outeille): identify that if MoE model is used, we add a moe_args field -# flavors = { -# "debugmodel": HFTransformerModelArgs( -# # n_layers=2, -# # vocab_size=2000, -# max_seq_len=2048, -# #TODO(3outeille): n_kv_heads=n_heads may be handle somewhere else -# dim=256, n_layers=6, n_heads=16, vocab_size=2000, rope_theta=500000, n_kv_heads=16 -# ), -# "medium": HFTransformerModelArgs( -# dim=1024, -# n_layers=12, -# ), -# "full": 
HFTransformerModelArgs(), -# } - flavors = { "debugmodel": HFTransformerModelArgs( - n_layers=3, - vocab_size=2000, - dim=256, - inter_dim=1024, - moe_inter_dim=256, - n_dense_layers=1, - n_heads=16, - n_group=2, - topk_group=1, - moe_args=MoEArgs( - num_experts=8, - num_shared_experts=2, - top_k=3, - score_func="softmax", - route_norm=True, - score_before_experts=False, - ), - kv_lora_rank=16, - q_lora_rank=0, - qk_rope_head_dim=16, - qk_nope_head_dim=32, - v_head_dim=32, - mscale=0.70, - # TO REMOVE: - n_kv_heads=16 + # n_layers=2, + # vocab_size=2000, + max_seq_len=2048, + #TODO(3outeille): n_kv_heads=n_heads may be handle somewhere else + dim=256, n_layers=6, n_heads=16, vocab_size=2000, rope_theta=500000, n_kv_heads=16 ), + "medium": HFTransformerModelArgs( + dim=1024, + n_layers=12, + ), + "full": HFTransformerModelArgs(), } +# flavors = { +# "debugmodel": HFTransformerModelArgs( +# n_layers=3, +# vocab_size=2000, +# dim=256, +# inter_dim=1024, +# moe_inter_dim=256, +# n_dense_layers=1, +# n_heads=16, +# n_group=2, +# topk_group=1, +# moe_args=MoEArgs( +# num_experts=8, +# num_shared_experts=2, +# top_k=3, +# score_func="softmax", +# route_norm=True, +# score_before_experts=False, +# ), +# kv_lora_rank=16, +# q_lora_rank=0, +# qk_rope_head_dim=16, +# qk_nope_head_dim=32, +# v_head_dim=32, +# mscale=0.70, +# # TO REMOVE: +# n_kv_heads=16 +# ), +# } + hf_train_spec = TrainSpec( name="hf_auto_model", model_cls=HFTransformerModel, diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 821a20f61f..f5f04ce77b 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -21,123 +21,144 @@ from .hf_llama_patch import patch_hf_llama patch_hf_llama() +class AliasedPropertiesMeta(type): + """ + This metaclass automatically creates aliased properties on a class. + It looks for a `_TITAN_TO_HF_MAPPING` dictionary in the class + namespace and generates properties based on its contents. 
+ """ + + def __new__(cls, name, bases, dct): + def _create_aliased_property(hf_name: str) -> property: + def getter(self): + return getattr(self, hf_name) + def setter(self, value): + setattr(self, hf_name, value) + return property(getter, setter) + + mapping = dct.get('_TITAN_TO_HF_MAPPING', {}) + for titan_name, hf_name in mapping.items(): + dct[titan_name] = _create_aliased_property(hf_name) + return super().__new__(cls, name, bases, dct) + +@dataclass +class TitanModelArgs: + """Arguments for the base TorchTitan model.""" + + dim: int = 4096 + n_layers: int = 32 + n_heads: int = 32 + n_kv_heads: Optional[int] = None + vocab_size: int = 128256 + multiple_of: int = 256 + ffn_dim_multiplier: Optional[float] = None + norm_eps: float = 1e-5 + rope_theta: float = 10000 + max_seq_len: int = 2048 + depth_init: bool = True + use_flex_attn: bool = False + attn_mask_type: str = "causal" + eos_id: int = 0 + moe_args: Optional[MoEArgs] = None + + +@dataclass +class DeepSeekV3Args: + """Arguments specific to DeepSeekV3 models.""" + + n_group: Optional[int] = None + topk_group: Optional[int] = None + inter_dim: Optional[int] = None + moe_inter_dim: Optional[int] = None + n_dense_layers: Optional[int] = None + n_expert_groups: Optional[int] = None + n_limited_groups: Optional[int] = None + q_lora_rank: Optional[int] = None + kv_lora_rank: Optional[int] = None + qk_nope_head_dim: Optional[int] = None + qk_rope_head_dim: Optional[int] = None + v_head_dim: Optional[int] = None + original_seq_len: Optional[int] = None + rope_factor: Optional[float] = None + beta_fast: Optional[int] = None + beta_slow: Optional[int] = None + mscale: Optional[float] = None + + @dataclass -class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): +class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs, metaclass=AliasedPropertiesMeta): """ Configuration class that bridges TorchTitan and HuggingFace Transformers naming conventions. Uses properties to provide TorchTitan-style access while maintaining HuggingFace compatibility. 
""" + _TITAN_TO_HF_MAPPING = { + # TorchTitan Name: HuggingFace Name + "dim": "hidden_size", + "n_layers": "num_hidden_layers", + "n_heads": "num_attention_heads", + "n_kv_heads": "num_key_value_heads", + "norm_eps": "rms_norm_eps", + "max_seq_len": "max_position_embeddings", + "eos_id": "eos_token_id", + # DeepSeekV3 specific aliases + "inter_dim": "intermediate_size", + "n_dense_layers": "first_k_dense_replace", + } + def __init__( self, - # TorchTitan args - dim: int = 4096, - n_layers: int = 32, - n_heads: int = 32, - n_kv_heads: Optional[int] = None, - vocab_size: int = 128256, - multiple_of: int = 256, - ffn_dim_multiplier: Optional[float] = None, - norm_eps: float = 1e-5, - rope_theta: float = 10000, - max_seq_len: int = 2048, - depth_init: bool = True, - use_flex_attn: bool = False, - attn_mask_type: str = "causal", - eos_id: int = 0, - moe_args: Optional[MoEArgs] = None, - # DeepSeekV3 specific args - n_group: Optional[int] = None, - topk_group: Optional[int] = None, - inter_dim: Optional[int] = None, - moe_inter_dim: Optional[int] = None, - n_dense_layers: Optional[int] = None, - n_expert_groups: Optional[int] = None, - n_limited_groups: Optional[int] = None, - q_lora_rank: Optional[int] = None, - kv_lora_rank: Optional[int] = None, - qk_nope_head_dim: Optional[int] = None, - qk_rope_head_dim: Optional[int] = None, - v_head_dim: Optional[int] = None, - original_seq_len: Optional[int] = None, - rope_factor: Optional[float] = None, - beta_fast: Optional[int] = None, - beta_slow: Optional[int] = None, - mscale: Optional[float] = None, + titan_args: Optional[TitanModelArgs] = None, + deepseek_v3_args: Optional[DeepSeekV3Args] = None, # HuggingFace specific args attn_implementation: str = "sdpa", **kwargs, ): + titan_args = titan_args or TitanModelArgs() + deepseek_v3_args = deepseek_v3_args or DeepSeekV3Args() + # Store TorchTitan-specific args (no HF equivalent) - self.multiple_of = multiple_of - self.ffn_dim_multiplier = ffn_dim_multiplier - self.depth_init = depth_init - self.use_flex_attn = use_flex_attn - self.attn_mask_type = attn_mask_type + self.multiple_of = titan_args.multiple_of + self.ffn_dim_multiplier = titan_args.ffn_dim_multiplier + self.depth_init = titan_args.depth_init + self.use_flex_attn = titan_args.use_flex_attn + self.attn_mask_type = titan_args.attn_mask_type # HuggingFace specific args self.attn_implementation = attn_implementation # For DeepSeekV3, setting q_lora_rank to 0 in TorchTitan is equivalent to # setting it to None in HuggingFace. 
+ q_lora_rank = deepseek_v3_args.q_lora_rank if q_lora_rank == 0: q_lora_rank = None + deepseek_v3_args.q_lora_rank = q_lora_rank - self._passed_args = dict( - dim=dim, - n_layers=n_layers, - n_heads=n_heads, - n_kv_heads=n_kv_heads, - vocab_size=vocab_size, - multiple_of=multiple_of, - ffn_dim_multiplier=ffn_dim_multiplier, - norm_eps=norm_eps, - rope_theta=rope_theta, - max_seq_len=max_seq_len, - depth_init=depth_init, - use_flex_attn=use_flex_attn, - attn_mask_type=attn_mask_type, - eos_id=eos_id, - attn_implementation=attn_implementation, - # DeepSeekV3 specific args - n_group=n_group, - topk_group=topk_group, - inter_dim=inter_dim, - moe_inter_dim=moe_inter_dim, - n_dense_layers=n_dense_layers, - n_expert_groups=n_expert_groups, - n_limited_groups=n_limited_groups, - q_lora_rank=q_lora_rank, - kv_lora_rank=kv_lora_rank, - qk_nope_head_dim=qk_nope_head_dim, - qk_rope_head_dim=qk_rope_head_dim, - v_head_dim=v_head_dim, - original_seq_len=original_seq_len, - rope_factor=rope_factor, - beta_fast=beta_fast, - beta_slow=beta_slow, - mscale=mscale, - **kwargs, - ) + self._passed_args = { + **titan_args.__dict__, + **deepseek_v3_args.__dict__, + "attn_implementation": attn_implementation, + } + self._passed_args.update(kwargs) - if moe_args is not None: + if titan_args.moe_args is not None: # MoE args for HF config # HF uses different names for these + moe_args = titan_args.moe_args self.num_experts_per_tok = moe_args.top_k self.n_routed_experts = moe_args.num_experts self.n_shared_experts = moe_args.num_shared_experts - self.moe_intermediate_size = moe_inter_dim + self.moe_intermediate_size = deepseek_v3_args.moe_inter_dim self._passed_args.update( dict( num_experts_per_tok=moe_args.top_k, n_routed_experts=moe_args.num_experts, n_shared_experts=moe_args.num_shared_experts, - moe_intermediate_size=moe_inter_dim, + moe_intermediate_size=deepseek_v3_args.moe_inter_dim, ) ) - def __repr__(self) -> str: # HFTransformerModelArgs is a dataclass that also inherits from PretrainedConfig. 
# PretrainedConfig has a __repr__ that serializes the object to JSON, but it @@ -148,88 +169,6 @@ def __repr__(self) -> str: args_str = "\n".join(args_lines) return f"{self.__class__.__name__}(\n{args_str}\n)" - @property - def dim(self) -> int: - """TorchTitan: Model dimension (alias for HF hidden_size)""" - return self.hidden_size - - @dim.setter - def dim(self, value: int): - self.hidden_size = value - - @property - def n_layers(self) -> int: - """TorchTitan: Number of layers (alias for HF num_hidden_layers)""" - return self.num_hidden_layers - - @n_layers.setter - def n_layers(self, value: int): - self.num_hidden_layers = value - - @property - def n_heads(self) -> int: - """TorchTitan: Number of attention heads (alias for HF num_attention_heads)""" - return self.num_attention_heads - - @n_heads.setter - def n_heads(self, value: int): - self.num_attention_heads = value - - @property - def n_kv_heads(self) -> Optional[int]: - """TorchTitan: Number of key-value heads (alias for HF num_key_value_heads)""" - return self.num_key_value_heads - - @n_kv_heads.setter - def n_kv_heads(self, value: Optional[int]): - self.num_key_value_heads = value - - @property - def norm_eps(self) -> float: - """TorchTitan: Layer norm epsilon (alias for HF rms_norm_eps)""" - return self.rms_norm_eps - - @norm_eps.setter - def norm_eps(self, value: float): - self.rms_norm_eps = value - - @property - def max_seq_len(self) -> int: - """TorchTitan: Maximum sequence length (alias for HF max_position_embeddings)""" - return self.max_position_embeddings - - @max_seq_len.setter - def max_seq_len(self, value: int): - self.max_position_embeddings = value - - @property - def eos_id(self) -> int: - """TorchTitan: End of sequence token ID (alias for HF eos_token_id)""" - return self.eos_token_id - - @eos_id.setter - def eos_id(self, value: int): - self.eos_token_id = value - - # === DeepSeekV3 specific properties === - @property - def inter_dim(self) -> int: - """TorchTitan: Intermediate dimension (alias for HF intermediate_size)""" - return self.intermediate_size - - @inter_dim.setter - def inter_dim(self, value: int): - self.intermediate_size = value - - @property - def n_dense_layers(self) -> int: - """TorchTitan: Number of dense layers (alias for HF first_k_dense_replace)""" - return self.first_k_dense_replace - - @n_dense_layers.setter - def n_dense_layers(self, value: int): - self.first_k_dense_replace = value - def update_from_config(self, job_config: JobConfig): # Load HF config (overwrites our HF attributes) hf_model_config = AutoConfig.from_pretrained( @@ -316,7 +255,6 @@ def _format_module(module: nn.Module, prefix: str = ""): _format_module(model, " ") - class HFTransformerModel(nn.Module): def __init__(self, model_args: HFTransformerModelArgs): super().__init__() From 46ae0a3caa1dfa02b1a00b426e0bfad74eba7be8 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 18 Sep 2025 08:05:27 +0000 Subject: [PATCH 028/129] fix refactor and simplify things --- .../transformers_backend/__init__.py | 136 ++++++++---- .../model/hf_transformers_args.py | 201 +++++++----------- 2 files changed, 178 insertions(+), 159 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 422df5621c..59900b0408 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -4,7 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root 
directory of this source tree. -import dataclasses +from dataclasses import dataclass +from typing import Optional from torchtitan.components.loss import build_cross_entropy_loss from torchtitan.components.lr_scheduler import build_lr_schedulers @@ -20,58 +21,113 @@ from torchtitan.models.moe import MoEArgs + __all__ = [ "HFTransformerModelArgs", "HFTransformerModel", "hf_transformers_configs", ] -#TODO(3outeille): identify that if MoE model is used, we add a moe_args field -flavors = { - "debugmodel": HFTransformerModelArgs( - # n_layers=2, - # vocab_size=2000, - max_seq_len=2048, - #TODO(3outeille): n_kv_heads=n_heads may be handle somewhere else - dim=256, n_layers=6, n_heads=16, vocab_size=2000, rope_theta=500000, n_kv_heads=16 - ), - "medium": HFTransformerModelArgs( - dim=1024, - n_layers=12, - ), - "full": HFTransformerModelArgs(), -} +@dataclass +class TitanModelArgs: + """Arguments for the base TorchTitan model.""" + + dim: int = 4096 + n_layers: int = 32 + n_heads: int = 32 + n_kv_heads: Optional[int] = None + vocab_size: int = 128256 + multiple_of: int = 256 + ffn_dim_multiplier: Optional[float] = None + norm_eps: float = 1e-5 + rope_theta: float = 10000 + max_seq_len: int = 2048 + depth_init: bool = True + use_flex_attn: bool = False + attn_mask_type: str = "causal" + eos_id: int = 0 + + +@dataclass +class DeepSeekV3Args: + """Arguments specific to DeepSeekV3 models.""" + moe_args: Optional[MoEArgs] = None + n_group: Optional[int] = None + topk_group: Optional[int] = None + inter_dim: Optional[int] = None + moe_inter_dim: Optional[int] = None + n_dense_layers: Optional[int] = None + n_expert_groups: Optional[int] = None + n_limited_groups: Optional[int] = None + q_lora_rank: Optional[int] = None + kv_lora_rank: Optional[int] = None + qk_nope_head_dim: Optional[int] = None + qk_rope_head_dim: Optional[int] = None + v_head_dim: Optional[int] = None + original_seq_len: Optional[int] = None + rope_factor: Optional[float] = None + beta_fast: Optional[int] = None + beta_slow: Optional[int] = None + mscale: Optional[float] = None +# #TODO(3outeille): identify that if MoE model is used, we add a moe_args field # flavors = { # "debugmodel": HFTransformerModelArgs( -# n_layers=3, -# vocab_size=2000, -# dim=256, -# inter_dim=1024, -# moe_inter_dim=256, -# n_dense_layers=1, -# n_heads=16, -# n_group=2, -# topk_group=1, -# moe_args=MoEArgs( -# num_experts=8, -# num_shared_experts=2, -# top_k=3, -# score_func="softmax", -# route_norm=True, -# score_before_experts=False, +# titan_args=TitanModelArgs( +# max_seq_len=2048, +# dim=256, +# n_layers=6, +# n_heads=16, +# n_kv_heads=16, +# vocab_size=2000, +# rope_theta=500000 # ), -# kv_lora_rank=16, -# q_lora_rank=0, -# qk_rope_head_dim=16, -# qk_nope_head_dim=32, -# v_head_dim=32, -# mscale=0.70, -# # TO REMOVE: -# n_kv_heads=16 +# ), +# "medium": HFTransformerModelArgs( +# titan_args=TitanModelArgs( +# dim=1024, +# n_layers=12, +# ), +# ), +# "full": HFTransformerModelArgs( +# titan_args=TitanModelArgs(), # ), # } +# DeepSeekV3 flavors +flavors = { + "debugmodel": HFTransformerModelArgs( + titan_args=TitanModelArgs( + dim=256, + n_layers=3, + n_heads=16, + n_kv_heads=16, + vocab_size=2000, + ), + deepseek_v3_args=DeepSeekV3Args( + inter_dim=1024, + moe_inter_dim=256, + n_dense_layers=1, + n_group=2, + topk_group=1, + kv_lora_rank=16, + q_lora_rank=0, + qk_nope_head_dim=32, + qk_rope_head_dim=16, + v_head_dim=32, + mscale=0.70, + moe_args=MoEArgs( + num_experts=8, + num_shared_experts=2, + top_k=3, + score_func="softmax", + route_norm=True, + 
score_before_experts=False, + ), + ) + ), +} + hf_train_spec = TrainSpec( name="hf_auto_model", model_cls=HFTransformerModel, diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index f5f04ce77b..2fa18e9abb 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -16,109 +16,59 @@ from transformers.configuration_utils import PretrainedConfig from transformers.modeling_outputs import CausalLMOutputWithPast -from torchtitan.models.moe import MoEArgs - from .hf_llama_patch import patch_hf_llama patch_hf_llama() -class AliasedPropertiesMeta(type): - """ - This metaclass automatically creates aliased properties on a class. - It looks for a `_TITAN_TO_HF_MAPPING` dictionary in the class - namespace and generates properties based on its contents. - """ - - def __new__(cls, name, bases, dct): - def _create_aliased_property(hf_name: str) -> property: - def getter(self): - return getattr(self, hf_name) - def setter(self, value): - setattr(self, hf_name, value) - return property(getter, setter) - - mapping = dct.get('_TITAN_TO_HF_MAPPING', {}) - for titan_name, hf_name in mapping.items(): - dct[titan_name] = _create_aliased_property(hf_name) - return super().__new__(cls, name, bases, dct) - -@dataclass -class TitanModelArgs: - """Arguments for the base TorchTitan model.""" - - dim: int = 4096 - n_layers: int = 32 - n_heads: int = 32 - n_kv_heads: Optional[int] = None - vocab_size: int = 128256 - multiple_of: int = 256 - ffn_dim_multiplier: Optional[float] = None - norm_eps: float = 1e-5 - rope_theta: float = 10000 - max_seq_len: int = 2048 - depth_init: bool = True - use_flex_attn: bool = False - attn_mask_type: str = "causal" - eos_id: int = 0 - moe_args: Optional[MoEArgs] = None - - @dataclass -class DeepSeekV3Args: - """Arguments specific to DeepSeekV3 models.""" - - n_group: Optional[int] = None - topk_group: Optional[int] = None - inter_dim: Optional[int] = None - moe_inter_dim: Optional[int] = None - n_dense_layers: Optional[int] = None - n_expert_groups: Optional[int] = None - n_limited_groups: Optional[int] = None - q_lora_rank: Optional[int] = None - kv_lora_rank: Optional[int] = None - qk_nope_head_dim: Optional[int] = None - qk_rope_head_dim: Optional[int] = None - v_head_dim: Optional[int] = None - original_seq_len: Optional[int] = None - rope_factor: Optional[float] = None - beta_fast: Optional[int] = None - beta_slow: Optional[int] = None - mscale: Optional[float] = None - - -@dataclass -class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs, metaclass=AliasedPropertiesMeta): +class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): """ Configuration class that bridges TorchTitan and HuggingFace Transformers naming conventions. Uses properties to provide TorchTitan-style access while maintaining HuggingFace compatibility. + Properties are created dynamically based on which arguments are provided. 
""" - _TITAN_TO_HF_MAPPING = { - # TorchTitan Name: HuggingFace Name - "dim": "hidden_size", - "n_layers": "num_hidden_layers", - "n_heads": "num_attention_heads", - "n_kv_heads": "num_key_value_heads", - "norm_eps": "rms_norm_eps", - "max_seq_len": "max_position_embeddings", - "eos_id": "eos_token_id", - # DeepSeekV3 specific aliases - "inter_dim": "intermediate_size", - "n_dense_layers": "first_k_dense_replace", + # Define all possible mappings organized by argument type + _ALL_MAPPINGS = { + "base": { + # Core TorchTitan mappings (always available) + "dim": "hidden_size", + "n_layers": "num_hidden_layers", + "n_heads": "num_attention_heads", + "n_kv_heads": "num_key_value_heads", + "norm_eps": "rms_norm_eps", + "max_seq_len": "max_position_embeddings", + "eos_id": "eos_token_id", + }, + "deepseek_v3": { + # DeepSeekV3 specific mappings (only when deepseek_v3_args provided) + "inter_dim": "intermediate_size", + "n_dense_layers": "first_k_dense_replace", + }, } def __init__( self, - titan_args: Optional[TitanModelArgs] = None, - deepseek_v3_args: Optional[DeepSeekV3Args] = None, + titan_args, + deepseek_v3_args=None, # HuggingFace specific args attn_implementation: str = "sdpa", **kwargs, ): - titan_args = titan_args or TitanModelArgs() - deepseek_v3_args = deepseek_v3_args or DeepSeekV3Args() + assert titan_args is not None, "titan_args is required" + + active_mappings = {} + + active_mappings.update(self._ALL_MAPPINGS["base"]) + + if deepseek_v3_args is not None: + active_mappings.update(self._ALL_MAPPINGS["deepseek_v3"]) + + self._active_mappings = active_mappings + + self._create_dynamic_properties() - # Store TorchTitan-specific args (no HF equivalent) + # Fill all TorchTitan-specific args (no HF equivalent) self.multiple_of = titan_args.multiple_of self.ffn_dim_multiplier = titan_args.ffn_dim_multiplier self.depth_init = titan_args.depth_init @@ -128,36 +78,49 @@ def __init__( # HuggingFace specific args self.attn_implementation = attn_implementation - # For DeepSeekV3, setting q_lora_rank to 0 in TorchTitan is equivalent to - # setting it to None in HuggingFace. - q_lora_rank = deepseek_v3_args.q_lora_rank - if q_lora_rank == 0: - q_lora_rank = None - deepseek_v3_args.q_lora_rank = q_lora_rank - - self._passed_args = { - **titan_args.__dict__, - **deepseek_v3_args.__dict__, - "attn_implementation": attn_implementation, - } + # Start with passed_args as just titan_args + self._passed_args = {**titan_args.__dict__, "attn_implementation": attn_implementation} self._passed_args.update(kwargs) - if titan_args.moe_args is not None: - # MoE args for HF config - # HF uses different names for these - moe_args = titan_args.moe_args - self.num_experts_per_tok = moe_args.top_k - self.n_routed_experts = moe_args.num_experts - self.n_shared_experts = moe_args.num_shared_experts - self.moe_intermediate_size = deepseek_v3_args.moe_inter_dim - self._passed_args.update( - dict( - num_experts_per_tok=moe_args.top_k, - n_routed_experts=moe_args.num_experts, - n_shared_experts=moe_args.num_shared_experts, - moe_intermediate_size=deepseek_v3_args.moe_inter_dim, + # If DeepSeekV3 args are provided, fill the rest + if deepseek_v3_args is not None: + # For DeepSeekV3, setting q_lora_rank to 0 in TorchTitan is equivalent to + # setting it to None in HuggingFace. 
+ q_lora_rank = deepseek_v3_args.q_lora_rank + if q_lora_rank == 0: + q_lora_rank = None + deepseek_v3_args.q_lora_rank = q_lora_rank + + self._passed_args.update(**deepseek_v3_args.__dict__) + + if deepseek_v3_args.moe_args is not None: + moe_args = deepseek_v3_args.moe_args + self.num_experts_per_tok = moe_args.top_k + self.n_routed_experts = moe_args.num_experts + self.n_shared_experts = moe_args.num_shared_experts + self.moe_intermediate_size = deepseek_v3_args.moe_inter_dim + self._passed_args.update( + dict( + num_experts_per_tok=moe_args.top_k, + n_routed_experts=moe_args.num_experts, + n_shared_experts=moe_args.num_shared_experts, + moe_intermediate_size=deepseek_v3_args.moe_inter_dim, + ) ) - ) + + def _create_dynamic_properties(self): + """Create properties dynamically based on active mappings.""" + def _create_property(hf_name: str) -> property: + def getter(self): + return getattr(self, hf_name) + def setter(self, value): + setattr(self, hf_name, value) + return property(getter, setter) + + for titan_name, hf_name in self._active_mappings.items(): + # Create getter/setter for attribute that don't already exist + if not hasattr(self.__class__, titan_name): + setattr(self.__class__, titan_name, _create_property(hf_name)) def __repr__(self) -> str: # HFTransformerModelArgs is a dataclass that also inherits from PretrainedConfig. @@ -194,14 +157,14 @@ def update_from_config(self, job_config: JobConfig): self.use_cache = False self.initializer_range = 1.0 # use as std for normal init in embedding - if self.inter_dim is None: # Only for llama model - ffn_hidden_size = 4 * self.dim - ffn_hidden_size = int(2 * ffn_hidden_size / 3) - if self.ffn_dim_multiplier is not None: - ffn_hidden_size = int(self.ffn_dim_multiplier * ffn_hidden_size) - self.intermediate_size = self.multiple_of * ( - (ffn_hidden_size + self.multiple_of - 1) // self.multiple_of - ) + # if self.inter_dim is None: # Only for llama model + ffn_hidden_size = 4 * self.dim + ffn_hidden_size = int(2 * ffn_hidden_size / 3) + if self.ffn_dim_multiplier is not None: + ffn_hidden_size = int(self.ffn_dim_multiplier * ffn_hidden_size) + self.intermediate_size = self.multiple_of * ( + (ffn_hidden_size + self.multiple_of - 1) // self.multiple_of + ) self.head_dim = self.dim // self.num_attention_heads From b33d5758763963786d5c2fedfa30134d87f1bfb9 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 18 Sep 2025 08:21:35 +0000 Subject: [PATCH 029/129] hacky way to switch flavors for now --- .../transformers_backend/__init__.py | 111 +++++++++--------- .../transformers_backend/compare_tt_hf_run.sh | 34 +++++- .../model/hf_transformers_args.py | 17 +-- 3 files changed, 97 insertions(+), 65 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 59900b0408..de81b18794 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -3,7 +3,7 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
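(For the LLaMA-style intermediate_size computation kept in update_from_config a few hunks above, the debugmodel numbers work out as follows — a standalone sketch, not part of the patch, assuming dim=256, multiple_of=256 and no ffn_dim_multiplier:)

dim, multiple_of, ffn_dim_multiplier = 256, 256, None

ffn_hidden_size = 4 * dim                       # 1024
ffn_hidden_size = int(2 * ffn_hidden_size / 3)  # 682
if ffn_dim_multiplier is not None:
    ffn_hidden_size = int(ffn_dim_multiplier * ffn_hidden_size)
intermediate_size = multiple_of * ((ffn_hidden_size + multiple_of - 1) // multiple_of)
print(intermediate_size)  # 768, i.e. 682 rounded up to the next multiple of 256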
- +import os from dataclasses import dataclass from typing import Optional @@ -71,62 +71,65 @@ class DeepSeekV3Args: mscale: Optional[float] = None # #TODO(3outeille): identify that if MoE model is used, we add a moe_args field -# flavors = { -# "debugmodel": HFTransformerModelArgs( -# titan_args=TitanModelArgs( -# max_seq_len=2048, -# dim=256, -# n_layers=6, -# n_heads=16, -# n_kv_heads=16, -# vocab_size=2000, -# rope_theta=500000 -# ), -# ), -# "medium": HFTransformerModelArgs( -# titan_args=TitanModelArgs( -# dim=1024, -# n_layers=12, -# ), -# ), -# "full": HFTransformerModelArgs( -# titan_args=TitanModelArgs(), -# ), -# } -# DeepSeekV3 flavors -flavors = { - "debugmodel": HFTransformerModelArgs( - titan_args=TitanModelArgs( - dim=256, - n_layers=3, - n_heads=16, - n_kv_heads=16, - vocab_size=2000, +if os.environ.get("MODEL_TYPE") == "llama": + print("Using llama model") + flavors = { + "debugmodel": HFTransformerModelArgs( + titan_args=TitanModelArgs( + max_seq_len=2048, + dim=256, + n_layers=6, + n_heads=16, + n_kv_heads=16, + vocab_size=2000, + rope_theta=500000 + ), + ), + "medium": HFTransformerModelArgs( + titan_args=TitanModelArgs( + dim=1024, + n_layers=12, + ), + ), + "full": HFTransformerModelArgs( + titan_args=TitanModelArgs(), ), - deepseek_v3_args=DeepSeekV3Args( - inter_dim=1024, - moe_inter_dim=256, - n_dense_layers=1, - n_group=2, - topk_group=1, - kv_lora_rank=16, - q_lora_rank=0, - qk_nope_head_dim=32, - qk_rope_head_dim=16, - v_head_dim=32, - mscale=0.70, - moe_args=MoEArgs( - num_experts=8, - num_shared_experts=2, - top_k=3, - score_func="softmax", - route_norm=True, - score_before_experts=False, + } +else: + print("Using deepseek model") + flavors = { + "debugmodel": HFTransformerModelArgs( + titan_args=TitanModelArgs( + vocab_size=2000, + dim=256, + n_layers=3, + n_heads=16, + n_kv_heads=16, ), - ) - ), -} + deepseek_v3_args=DeepSeekV3Args( + inter_dim=1024, + moe_inter_dim=256, + n_dense_layers=1, + n_group=2, + topk_group=1, + kv_lora_rank=16, + q_lora_rank=0, + qk_nope_head_dim=32, + qk_rope_head_dim=16, + v_head_dim=32, + mscale=0.70, + moe_args=MoEArgs( + num_experts=8, + num_shared_experts=2, + top_k=3, + score_func="softmax", + route_norm=True, + score_before_experts=False, + ), + ) + ), + } hf_train_spec = TrainSpec( name="hf_auto_model", diff --git a/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh b/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh index 0461ebfb7b..e49a2a5803 100755 --- a/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh +++ b/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh @@ -12,6 +12,35 @@ set -o pipefail NGPU=${NGPU:-"1"} export LOG_RANK=${LOG_RANK:-0} +# Parse command line arguments for model selection +MODEL_TYPE=${1:-"llama"} +export MODEL_TYPE + +# Set model names based on argument +case $MODEL_TYPE in + "llama") + TT_MODEL_NAME="llama3" + HF_MODEL_NAME="meta-llama/Llama-3.2-1B" + ;; + "deepseek") + TT_MODEL_NAME="deepseek_v3" + HF_MODEL_NAME="deepseek-ai/DeepSeek-V3" + ;; + *) + echo "Error: Unsupported model type '$MODEL_TYPE'" + echo "Usage: $0 [llama|deepseek] [additional_args...]" + echo " llama - Uses llama3 for TT and meta-llama/Llama-3.2-1B for HF" + echo " deepseek - Uses deepseek_v3 for TT and deepseek-ai/DeepSeek-V3 for HF" + exit 1 + ;; +esac + +echo "Using model type: $MODEL_TYPE" +echo " TT model: $TT_MODEL_NAME" +echo " HF model: $HF_MODEL_NAME" + +# Shift to remove the model type argument, pass remaining args to training +shift run_tt() { echo 
"##############################################" @@ -23,7 +52,7 @@ run_tt() { CUDA_VISIBLE_DEVICES=0 \ torchrun --nproc_per_node=${NGPU} --master_port 1234 --rdzv_backend c10d --rdzv_endpoint="localhost:0" \ --local-ranks-filter ${LOG_RANK} --role rank --tee 3 \ - -m torchtitan.train --job.config_file ${TT_CONFIG} --training.seed 42 --training.deterministic "$@" + -m torchtitan.train --job.config_file ${TT_CONFIG} --training.seed 42 --training.deterministic --model.name ${TT_MODEL_NAME} "$@" } run_hf() { @@ -36,10 +65,9 @@ run_hf() { CUDA_VISIBLE_DEVICES=1 \ torchrun --nproc_per_node=${NGPU} --master_port 1235 --rdzv_backend c10d --rdzv_endpoint="localhost:0" \ --local-ranks-filter ${LOG_RANK} --role rank --tee 3 \ - -m torchtitan.train --job.config_file ${HF_CONFIG} --training.seed 42 --training.deterministic "$@" + -m torchtitan.train --job.config_file ${HF_CONFIG} --training.seed 42 --training.deterministic --model.name ${HF_MODEL_NAME} "$@" } - TT_LOG="tt_run.log" HF_LOG="hf_run.log" DIFF_LOG="run_diff.log" diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 2fa18e9abb..4366467129 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -157,20 +157,21 @@ def update_from_config(self, job_config: JobConfig): self.use_cache = False self.initializer_range = 1.0 # use as std for normal init in embedding - # if self.inter_dim is None: # Only for llama model - ffn_hidden_size = 4 * self.dim - ffn_hidden_size = int(2 * ffn_hidden_size / 3) - if self.ffn_dim_multiplier is not None: - ffn_hidden_size = int(self.ffn_dim_multiplier * ffn_hidden_size) - self.intermediate_size = self.multiple_of * ( - (ffn_hidden_size + self.multiple_of - 1) // self.multiple_of - ) + if not hasattr(self, "inter_dim"): # Only for llama model + ffn_hidden_size = 4 * self.dim + ffn_hidden_size = int(2 * ffn_hidden_size / 3) + if self.ffn_dim_multiplier is not None: + ffn_hidden_size = int(self.ffn_dim_multiplier * ffn_hidden_size) + self.intermediate_size = self.multiple_of * ( + (ffn_hidden_size + self.multiple_of - 1) // self.multiple_of + ) self.head_dim = self.dim // self.num_attention_heads return self def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: + #TODO(3outeille): adapt to handle MoE nparams = sum(p.numel() for p in model.parameters()) nparams_embedding = sum( sum(p.numel() for p in m.parameters()) From 007f00555724cf6e59957cef3cdf1322b6e57178 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 18 Sep 2025 13:48:53 +0000 Subject: [PATCH 030/129] hf deepseek train while matching same param counts as tt deepseek --- .../experiments/transformers_backend/__init__.py | 10 ++++++---- .../transformers_backend/model/hf_transformers_args.py | 8 +++++--- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index de81b18794..06d8524c14 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -69,6 +69,7 @@ class DeepSeekV3Args: beta_fast: Optional[int] = None beta_slow: Optional[int] = None mscale: Optional[float] = None + partial_rotary_factor: Optional[float] = None # #TODO(3outeille): identify that if MoE model is used, we add a moe_args field @@ -108,16 
+109,17 @@ class DeepSeekV3Args: n_kv_heads=16, ), deepseek_v3_args=DeepSeekV3Args( + partial_rotary_factor=4.0, inter_dim=1024, moe_inter_dim=256, n_dense_layers=1, n_group=2, topk_group=1, - kv_lora_rank=16, + kv_lora_rank=512, q_lora_rank=0, - qk_nope_head_dim=32, - qk_rope_head_dim=16, - v_head_dim=32, + qk_nope_head_dim=128, + qk_rope_head_dim=64, + v_head_dim=128, mscale=0.70, moe_args=MoEArgs( num_experts=8, diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 4366467129..2e3b3e93f0 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -29,7 +29,7 @@ class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): """ # Define all possible mappings organized by argument type - _ALL_MAPPINGS = { + _TT_TO_HF_MAPPINGS = { "base": { # Core TorchTitan mappings (always available) "dim": "hidden_size", @@ -59,10 +59,10 @@ def __init__( active_mappings = {} - active_mappings.update(self._ALL_MAPPINGS["base"]) + active_mappings.update(self._TT_TO_HF_MAPPINGS["base"]) if deepseek_v3_args is not None: - active_mappings.update(self._ALL_MAPPINGS["deepseek_v3"]) + active_mappings.update(self._TT_TO_HF_MAPPINGS["deepseek_v3"]) self._active_mappings = active_mappings @@ -93,6 +93,8 @@ def __init__( self._passed_args.update(**deepseek_v3_args.__dict__) + self.partial_rotary_factor = deepseek_v3_args.partial_rotary_factor + if deepseek_v3_args.moe_args is not None: moe_args = deepseek_v3_args.moe_args self.num_experts_per_tok = moe_args.top_k From dd2b04cf947f97c0735b2f8e1b8f188440ee1c29 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 22 Sep 2025 09:31:00 +0000 Subject: [PATCH 031/129] wtf deepseek q_proj weight init differ ??? 
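(One way to localize this kind of init mismatch is to dump per-parameter statistics for each model and diff the two outputs, in the same spirit as compare_tt_hf_run.sh diffs the training logs. A rough sketch, not part of the patch; `model` is assumed to be an already-built nn.Module, run once for the TorchTitan model and once for the HF one:)

import torch

def dump_param_stats(model: torch.nn.Module) -> None:
    # Print deterministic, per-tensor statistics so two runs can be diffed line by line.
    for name, p in sorted(model.named_parameters(), key=lambda kv: kv[0]):
        p = p.detach().float()
        print(f"{name} shape={tuple(p.shape)} mean={p.mean().item():+.6e} std={p.std().item():+.6e}")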
--- .../transformers_backend/__init__.py | 7 +- .../model/hf_deepseek_v3_patch.py | 115 ++++++++++++++++++ .../model/hf_transformers_args.py | 3 - 3 files changed, 121 insertions(+), 4 deletions(-) create mode 100644 torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 06d8524c14..0cecbfb199 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -20,6 +20,9 @@ from .model.hf_transformers_args import HFTransformerModelArgs, HFTransformerModel from torchtitan.models.moe import MoEArgs +from .model.hf_llama_patch import patch_hf_llama +from .model.hf_deepseek_v3_patch import patch_hf_deepseek_v3 + __all__ = [ @@ -75,6 +78,7 @@ class DeepSeekV3Args: if os.environ.get("MODEL_TYPE") == "llama": print("Using llama model") + patch_hf_llama() flavors = { "debugmodel": HFTransformerModelArgs( titan_args=TitanModelArgs( @@ -99,12 +103,13 @@ class DeepSeekV3Args: } else: print("Using deepseek model") + patch_hf_deepseek_v3() flavors = { "debugmodel": HFTransformerModelArgs( titan_args=TitanModelArgs( vocab_size=2000, dim=256, - n_layers=3, + n_layers=2, n_heads=16, n_kv_heads=16, ), diff --git a/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py b/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py new file mode 100644 index 0000000000..53769b9cc9 --- /dev/null +++ b/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py @@ -0,0 +1,115 @@ + + +import torch.nn as nn + +from transformers.models.deepseek_v3.configuration_deepseek_v3 import DeepseekV3Config +from transformers.models.deepseek_v3.modeling_deepseek_v3 import DeepseekV3Attention, DeepseekV3MLP, DeepseekV3MoE, DeepseekV3DecoderLayer +from transformers.modeling_utils import PreTrainedModel + +_original_deepseek_v3_decoder_layer_init = DeepseekV3DecoderLayer.__init__ + +def _deepseek_v3_decoder_layer_init_patched(self, config: DeepseekV3Config, layer_idx: int): + _original_deepseek_v3_decoder_layer_init(self, config, layer_idx) + + self.mlp.layer_idx = layer_idx + + if hasattr(self.mlp, 'experts'): + for expert in self.mlp.experts: + expert.layer_idx = layer_idx + self.mlp.shared_experts.layer_idx = layer_idx + +def _initialize_weights_patched(self, module): + # NOTE(3outeille): monkey-patch PreTrainedModel to handle meta device initialization correctly + # The default _initialize_weights sets _is_hf_initialized = True even on a meta device, + # which prevents subsequent proper initialization. + if getattr(module, "_is_hf_initialized", False): + return + + for param in module.parameters(recurse=True): + if param.device.type == "meta": + return + + # If not on a meta device, call the original weight initialization + self._init_weights(module) + module._is_hf_initialized = True + +def _init_weights_patched(self, module): + """ + Patched version of _init_weights to match TorchTitan's initialization for Llama. + `self` is a LlamaPreTrainedModel instance. + """ + config = self.config + + #TODO(3outeille): only out_proj/down_proj needs std=init_std. 
so we can refactor to loop over module and only init last layer with std=init_std + if isinstance(module, (DeepseekV3Attention, DeepseekV3MLP, DeepseekV3MoE)): + layer_idx = module.layer_idx + init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 + + if isinstance(module, DeepseekV3Attention): + print("DeepseekV3Attention", module.layer_idx) + if hasattr(module, 'q_proj'): + nn.init.trunc_normal_(module.q_proj.weight, mean=0.0, std=0.02) + # NOTE(3outeille): module.smart_apply is called on parent class, we have 3 child so init will be called 3 times + # That's why we need to set _is_hf_initialized to True to avoid triple initialization + print(f"module.q_proj.weight: {module.q_proj.weight}") + else: + nn.init.trunc_normal_(module.q_a_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(module.q_b_proj.weight, mean=0.0, std=0.02) + + nn.init.trunc_normal_(module.kv_a_proj_with_mqa.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(module.kv_b_proj.weight, mean=0.0, std=0.02) + + nn.init.trunc_normal_(module.o_proj.weight, mean=0.0, std=init_std) + print("=====") + + elif isinstance(module, DeepseekV3MLP): + nn.init.trunc_normal_(module.gate_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(module.up_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(module.down_proj.weight, mean=0.0, std=init_std) + + elif isinstance(module, DeepseekV3MoE): + nn.init.trunc_normal_(module.gate.weight, mean=0.0, std=init_std) + for expert in module.experts: + nn.init.trunc_normal_(expert.gate_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(expert.up_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(expert.down_proj.weight, mean=0.0, std=init_std) + + nn.init.trunc_normal_(module.shared_experts.gate_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(module.shared_experts.up_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(module.shared_experts.down_proj.weight, mean=0.0, std=init_std) + + elif module is getattr(self, "lm_head", None): #TODO(3outeille): find a better way to detect lm_head + final_out_std = config.hidden_size**-0.5 + cutoff_factor = 3 + nn.init.trunc_normal_( + module.weight, + mean=0.0, + std=final_out_std, + a=-cutoff_factor * final_out_std, + b=cutoff_factor * final_out_std, + ) + if module.bias is not None: + module.bias.data.zero_() + + elif isinstance(module, nn.Embedding): + std = config.initializer_range + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + elif ( + isinstance(module, (nn.GroupNorm, nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)) + or "LayerNorm" in module.__class__.__name__ + or "RMSNorm" in module.__class__.__name__ + ): + # Norms can exist without weights (in which case they are None from torch primitives) + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(1.0) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.zero_() + + +def patch_hf_deepseek_v3(): + DeepseekV3DecoderLayer.__init__ = _deepseek_v3_decoder_layer_init_patched + PreTrainedModel._init_weights = _init_weights_patched + PreTrainedModel._initialize_weights = _initialize_weights_patched diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 2e3b3e93f0..64fb64d72f 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ 
b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -16,9 +16,6 @@ from transformers.configuration_utils import PretrainedConfig from transformers.modeling_outputs import CausalLMOutputWithPast -from .hf_llama_patch import patch_hf_llama -patch_hf_llama() - @dataclass class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): """ From 9abdae34fdb4a2e6a22a654f78babf3163df725e Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 22 Sep 2025 11:19:50 +0000 Subject: [PATCH 032/129] deepseek now has same weight init in HF & TT. Reasons was rng_state was not same as we call weight init at different time --- .../transformers_backend/compare_tt_hf_run.sh | 7 +- .../model/hf_deepseek_v3_patch.py | 36 +++- .../reference_diff_deepseekv3_1gpu.log | 163 ++++++++++++++++++ torchtitan/models/deepseek_v3/__init__.py | 2 +- torchtitan/models/deepseek_v3/model/model.py | 34 ++++ torchtitan/models/moe.py | 34 ++++ 6 files changed, 265 insertions(+), 11 deletions(-) create mode 100644 torchtitan/experiments/transformers_backend/reference_diff_deepseekv3_1gpu.log diff --git a/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh b/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh index e49a2a5803..be7243f81b 100755 --- a/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh +++ b/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh @@ -15,7 +15,8 @@ export LOG_RANK=${LOG_RANK:-0} # Parse command line arguments for model selection MODEL_TYPE=${1:-"llama"} export MODEL_TYPE - +SEED=${SEED:-42} +export SEED # Set model names based on argument case $MODEL_TYPE in "llama") @@ -52,7 +53,7 @@ run_tt() { CUDA_VISIBLE_DEVICES=0 \ torchrun --nproc_per_node=${NGPU} --master_port 1234 --rdzv_backend c10d --rdzv_endpoint="localhost:0" \ --local-ranks-filter ${LOG_RANK} --role rank --tee 3 \ - -m torchtitan.train --job.config_file ${TT_CONFIG} --training.seed 42 --training.deterministic --model.name ${TT_MODEL_NAME} "$@" + -m torchtitan.train --job.config_file ${TT_CONFIG} --training.seed ${SEED} --training.deterministic --model.name ${TT_MODEL_NAME} "$@" } run_hf() { @@ -65,7 +66,7 @@ run_hf() { CUDA_VISIBLE_DEVICES=1 \ torchrun --nproc_per_node=${NGPU} --master_port 1235 --rdzv_backend c10d --rdzv_endpoint="localhost:0" \ --local-ranks-filter ${LOG_RANK} --role rank --tee 3 \ - -m torchtitan.train --job.config_file ${HF_CONFIG} --training.seed 42 --training.deterministic --model.name ${HF_MODEL_NAME} "$@" + -m torchtitan.train --job.config_file ${HF_CONFIG} --training.seed ${SEED} --training.deterministic --model.name ${HF_MODEL_NAME} "$@" } TT_LOG="tt_run.log" diff --git a/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py b/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py index 53769b9cc9..346a400260 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py +++ b/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py @@ -1,6 +1,7 @@ - - +import os +import torch import torch.nn as nn +import functools from transformers.models.deepseek_v3.configuration_deepseek_v3 import DeepseekV3Config from transformers.models.deepseek_v3.modeling_deepseek_v3 import DeepseekV3Attention, DeepseekV3MLP, DeepseekV3MoE, DeepseekV3DecoderLayer @@ -8,6 +9,31 @@ _original_deepseek_v3_decoder_layer_init = DeepseekV3DecoderLayer.__init__ +def seeded_init_decorator_for_test(seed): + """ + Decorator that adds torch.manual_seed before every nn.init.trunc_normal_ call + and 
prints layer weights after initialization. + """ + import lovely_tensors as lt; lt.monkey_patch() + def decorator(func): + @functools.wraps(func) + def wrapper(self, module): + original_trunc_normal = nn.init.trunc_normal_ + + def seeded_trunc_normal(*args, **kwargs): + torch.manual_seed(seed) + tensor = args[0] # First argument is always the tensor + result = original_trunc_normal(*args, **kwargs) + # module_name = getattr(module, "__class__", type(module)).__name__ + # print(f"Module: {module_name}, Tensor value: {tensor}") + return result + + nn.init.trunc_normal_ = seeded_trunc_normal + return func(self, module) + + return wrapper + return decorator + def _deepseek_v3_decoder_layer_init_patched(self, config: DeepseekV3Config, layer_idx: int): _original_deepseek_v3_decoder_layer_init(self, config, layer_idx) @@ -33,6 +59,7 @@ def _initialize_weights_patched(self, module): self._init_weights(module) module._is_hf_initialized = True +@seeded_init_decorator_for_test(seed=os.environ.get("SEED")) def _init_weights_patched(self, module): """ Patched version of _init_weights to match TorchTitan's initialization for Llama. @@ -46,12 +73,8 @@ def _init_weights_patched(self, module): init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 if isinstance(module, DeepseekV3Attention): - print("DeepseekV3Attention", module.layer_idx) if hasattr(module, 'q_proj'): nn.init.trunc_normal_(module.q_proj.weight, mean=0.0, std=0.02) - # NOTE(3outeille): module.smart_apply is called on parent class, we have 3 child so init will be called 3 times - # That's why we need to set _is_hf_initialized to True to avoid triple initialization - print(f"module.q_proj.weight: {module.q_proj.weight}") else: nn.init.trunc_normal_(module.q_a_proj.weight, mean=0.0, std=0.02) nn.init.trunc_normal_(module.q_b_proj.weight, mean=0.0, std=0.02) @@ -60,7 +83,6 @@ def _init_weights_patched(self, module): nn.init.trunc_normal_(module.kv_b_proj.weight, mean=0.0, std=0.02) nn.init.trunc_normal_(module.o_proj.weight, mean=0.0, std=init_std) - print("=====") elif isinstance(module, DeepseekV3MLP): nn.init.trunc_normal_(module.gate_proj.weight, mean=0.0, std=0.02) diff --git a/torchtitan/experiments/transformers_backend/reference_diff_deepseekv3_1gpu.log b/torchtitan/experiments/transformers_backend/reference_diff_deepseekv3_1gpu.log new file mode 100644 index 0000000000..1155c9a5db --- /dev/null +++ b/torchtitan/experiments/transformers_backend/reference_diff_deepseekv3_1gpu.log @@ -0,0 +1,163 @@ +diff --git a/tt_run.log.filtered b/hf_run.log.filtered +index 9726db6..84b6138 100644 +--- a/tt_run.log.filtered ++++ b/hf_run.log.filtered +@@ -1,85 +1,153 @@ ++ echo '##############################################' +##############################################'#######################################################' +####################################################### ++ echo '### Running TorchTitan (native)with HF backend training ###' +### Running TorchTitan (native)with HF backend training ### ++ echo '##############################################' +##############################################'#######################################################' +####################################################### ++ TT_CONFIG=/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.tomlHF_CONFIG=/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml ++ CUDA_VISIBLE_DEVICES=0CUDA_VISIBLE_DEVICES=1 ++ torchrun ... 
--master_port=XXXX --rdzv_backend c10d --rdzv_endpoint=localhost:XXXX --local-ranks-filter 0 --role rank --tee 3 -m torchtitan.train --job.config_file /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.toml/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml --training.seed 42 --training.deterministic --model.name deepseek_v3deepseek-ai/DeepSeek-V3 +[rank0]:/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/transformers/src/transformers/utils/hub.py:111: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. +[rank0]: warnings.warn( +[rank0]:[titan] TIMESTAMP - root - WARNING - tokenizer_path is deprecated, use model.hf_assets_path instead. Setting hf_assets_path to tokenizer_path temporarily. +[rank0]:[titan] TIMESTAMP - root - INFO - Starting job: HF Llama 3 debug training +[rank0]:[titan] TIMESTAMP - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config +[rank0]:[titan] TIMESTAMP - root - INFO - Building 0-D device mesh with [], [] +[rank0]:[titan] TIMESTAMP - root - INFO - [GC] Initial GC collection 0.00 seconds +[rank0]:[titan] TIMESTAMP - root - INFO - Deterministic algorithm enabled (expect perf degradation). +[rank0]:[titan] TIMESTAMP - root - INFO - Loading tokenizer from tokenizer.json +[rank0]:[titan] TIMESTAMP - root - INFO - Preparing c4_test dataset from /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test +[rank0]:[titan] TIMESTAMP - root - INFO - Building deepseek_v3deepseek-ai/DeepSeek-V3 debugmodel with DeepSeekV3ModelArgs(_enforced='This field is used to enforce all fields have defaults.', max_batch_size=8, max_seq_len=2048, vocab_size=2000, dim=256, inter_dim=1024, moe_inter_dim=256, n_layers=2, n_dense_layers=1, n_heads=16, norm_eps=1e-05, moe_args=MoEArgs(num_experts=8,HFTransformerModelArgs( +[rank0]:attn_implementation='sdpa' +[rank0]:attn_mask_type='causal' +[rank0]:beta_fast=None +[rank0]:beta_slow=None +[rank0]:depth_init=True +[rank0]:dim=256 +[rank0]:eos_id=0 +[rank0]:ffn_dim_multiplier=None +[rank0]:inter_dim=1024 +[rank0]:kv_lora_rank=512 +[rank0]:max_seq_len=2048 +[rank0]:moe_args=MoEArgs(num_experts=8, num_shared_experts=2, score_func='softmax', route_norm=True, route_scale=1.0, score_before_experts=False, top_k=3, use_grouped_mm=True, load_balance_coeff=0.001), n_expert_groups=1, n_limited_groups=1, q_lora_rank=0, kv_lora_rank=512, qk_nope_head_dim=128, qk_rope_head_dim=64, v_head_dim=128, use_flex_attn=False, attn_mask_type='causal', original_seq_len=4096, rope_theta=10000.0, rope_factor=40, beta_fast=32, beta_slow=1, mscale=0.7)load_balance_coeff=0.001) +[rank0]:moe_inter_dim=256 +[rank0]:moe_intermediate_size=256 +[rank0]:mscale=0.7 +[rank0]:multiple_of=256 +[rank0]:n_dense_layers=1 +[rank0]:n_expert_groups=None +[rank0]:n_group=2 +[rank0]:n_heads=16 +[rank0]:n_kv_heads=16 +[rank0]:n_layers=2 +[rank0]:n_limited_groups=None +[rank0]:n_routed_experts=8 +[rank0]:n_shared_experts=2 +[rank0]:norm_eps=1e-05 +[rank0]:num_experts_per_tok=3 +[rank0]:original_seq_len=None +[rank0]:partial_rotary_factor=4.0 +[rank0]:q_lora_rank=None +[rank0]:qk_nope_head_dim=128 +[rank0]:qk_rope_head_dim=64 +[rank0]:rope_factor=None +[rank0]:rope_theta=10000 +[rank0]:topk_group=1 +[rank0]:use_flex_attn=False +[rank0]:v_head_dim=128 +[rank0]:vocab_size=2000 +[rank0]:) +[rank0]:[titan] TIMESTAMP - root - 
INFO - CUDA capacity: NVIDIA H100 80GB HBM3 with 79.44GiB memory +[rank0]:[titan] TIMESTAMP - root - INFO - Total parameter count: dense 8,923,392, sparse 1,968,128, active 9,908,480 +[rank0]:[titan] TIMESTAMP - root - INFO - Model Structure Parameter Breakdown: +[rank0]:[titan] TIMESTAMP - root - INFO - DeepSeekV3ModelHFTransformerModel - 10,891,520 params +[rank0]:[titan] TIMESTAMP - root - INFO - (tok_embeddings):(embed_tokens): Embedding - 512,000 params +[rank0]:[titan] TIMESTAMP - root - INFO - (layers): ModuleDictModuleList - 9,867,264 params +[rank0]:[titan] TIMESTAMP - root - INFO - (0): TransformerBlockDeepseekV3DecoderLayer - 4,342,784 params +[rank0]:[titan] TIMESTAMP - root - INFO - (attention): Attention(self_attn): DeepseekV3Attention - 3,555,840 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wq):(q_proj): Linear - 786,432 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wkv_a):(kv_a_proj_with_mqa): Linear - 147,456 params +[rank0]:[titan] TIMESTAMP - root - INFO - (kv_norm): RMSNorm(kv_a_layernorm): DeepseekV3RMSNorm - 512 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wkv_b):(kv_b_proj): Linear - 2,097,152 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wo):(o_proj): Linear - 524,288 params +[rank0]:[titan] TIMESTAMP - root - INFO - (attention_norm): RMSNorm(mlp): DeepseekV3MLP - 256786,432 params +[rank0]:[titan] TIMESTAMP - root - INFO - (ffn_norm): RMSNorm(gate_proj): Linear - 256262,144 params +[rank0]:[titan] TIMESTAMP - root - INFO - (feed_forward): FeedForward(up_proj): Linear - 786,432262,144 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w1):(down_proj): Linear - 262,144 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w2): Linear(input_layernorm): DeepseekV3RMSNorm - 262,144256 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w3): Linear(post_attention_layernorm): DeepseekV3RMSNorm - 262,144256 params +[rank0]:[titan] TIMESTAMP - root - INFO - (1): TransformerBlockDeepseekV3DecoderLayer - 5,524,480 params +[rank0]:[titan] TIMESTAMP - root - INFO - (attention): Attention(self_attn): DeepseekV3Attention - 3,555,840 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wq):(q_proj): Linear - 786,432 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wkv_a):(kv_a_proj_with_mqa): Linear - 147,456 params +[rank0]:[titan] TIMESTAMP - root - INFO - (kv_norm): RMSNorm(kv_a_layernorm): DeepseekV3RMSNorm - 512 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wkv_b):(kv_b_proj): Linear - 2,097,152 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wo):(o_proj): Linear - 524,288 params +[rank0]:[titan] TIMESTAMP - root - INFO - (attention_norm): RMSNorm(mlp): DeepseekV3MoE - 2561,968,128 params +[rank0]:[titan] TIMESTAMP - root - INFO - (ffn_norm): RMSNorm(experts): ModuleList - 2561,572,864 params +[rank0]:[titan] TIMESTAMP - root - INFO - (moe): MoE(0): DeepseekV3MLP - 1,968,128196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (experts): GroupedExperts(gate_proj): Linear - 1,572,86465,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (router): TokenChoiceTopKRouter(up_proj): Linear - 2,04865,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (gate):(down_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (1): DeepseekV3MLP - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (gate_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (up_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (down_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (2): 
DeepseekV3MLP - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (gate_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (up_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (down_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (3): DeepseekV3MLP - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (gate_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (up_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (down_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (4): DeepseekV3MLP - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (gate_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (up_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (down_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (5): DeepseekV3MLP - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (gate_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (up_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (down_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (6): DeepseekV3MLP - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (gate_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (up_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (down_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (7): DeepseekV3MLP - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (gate_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (up_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (down_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (gate): DeepseekV3TopkRouter - 2,048 params +[rank0]:[titan] TIMESTAMP - root - INFO - (shared_experts): FeedForwardDeepseekV3MLP - 393,216 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w1):(gate_proj): Linear - 131,072 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w2):(up_proj): Linear - 131,072 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w3):(down_proj): Linear - 131,072 params +[rank0]:[titan] TIMESTAMP - root - INFO - (input_layernorm): DeepseekV3RMSNorm - 256 params +[rank0]:[titan] TIMESTAMP - root - INFO - (post_attention_layernorm): DeepseekV3RMSNorm - 256 params +[rank0]:[titan] TIMESTAMP - root - INFO - (norm): RMSNormDeepseekV3RMSNorm - 256 params +[rank0]:[titan] TIMESTAMP - root - INFO - (output):(lm_head): Linear - 512,000 params +[rank0]:[titan] TIMESTAMP - root - INFO - Model deepseek_v3deepseek-ai/DeepSeek-V3 debugmodel size: 10,891,520 total parameters +[rank0]:[titan] TIMESTAMP - root - INFO - Applied selective activation checkpointing to the model +[rank0]:[titan] TIMESTAMP - root - INFO - Peak FLOPS used for computing MFU: 9.890e+14 +[rank0]:[titan] TIMESTAMP - root - INFO - CUDA memory usage for model: 0.05GiB(0.06%) +[rank0]:[titan] TIMESTAMP - root - INFO - Mixed precision training is handled by AMP +[rank0]:[titan] TIMESTAMP - root - INFO - Trainer is initialized with local batch size 8, global batch size 8, gradient accumulation steps 1, sequence length 2048, total steps 10 (warmup 2) +[rank0]:[titan] TIMESTAMP - root - INFO - Training starts at step 1 +[rank0]:[titan] TIMESTAMP - root - INFO - Profiling active. 
Traces will be saved at ./outputs/profile_trace +[rank0]:/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/env_torchtitan_official/lib/python3.12/site-packages/torch/nn/functional.py:2920: UserWarning: Mismatch dtype between input and weight: input dtype = c10::BFloat16, weight dtype = float, Cannot dispatch to fused implementation. (Triggered internally at /pytorch/aten/src/ATen/native/layer_norm.cpp:344.) +[rank0]: return torch.rms_norm(input, normalized_shape, weight, eps)./outputs/profile_trace_hf +[rank0]:[titan] TIMESTAMP - root - INFO - step: 1 loss: 8.13818.1218 grad_norm: 2.73742.7807 memory: 2.14GiB(2.70%)2.48GiB(3.13%) tps: 18,02411,445 tflops: 1.240.89 mfu: 0.13%0.09% +[rank0]:[titan] TIMESTAMP - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40 +[rank0]:[titan] TIMESTAMP - root - INFO - step: 2 loss: 7.02086.8905 grad_norm: 3.26153.2709 memory: 2.15GiB(2.71%)2.49GiB(3.13%) tps: 20,23217,755 tflops: 1.401.38 mfu: 0.14% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 3 loss: 5.26425.1682 grad_norm: 2.87352.8229 memory: 2.15GiB(2.71%)2.49GiB(3.13%) tps: 325,066119,606 tflops: 22.429.32 mfu: 2.27%0.94% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 4 loss: 4.82864.7719 grad_norm: 2.18852.2433 memory: 2.15GiB(2.71%)2.51GiB(3.15%) tps: 345,536135,937 tflops: 23.8310.59 mfu: 2.41%1.07% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 5 loss: 4.43704.3827 grad_norm: 2.30532.3779 memory: 2.15GiB(2.71%)2.51GiB(3.15%) tps: 296,009133,266 tflops: 20.4110.39 mfu: 2.06%1.05% +[rank0]:[titan] TIMESTAMP - root - INFO - Dumping profiler traces at step 5 +[rank0]:[titan] TIMESTAMP - root - INFO - Finished dumping profiler traces in 0.030.05 seconds +[rank0]:[titan] TIMESTAMP - root - INFO - step: 6 loss: 4.30634.2368 grad_norm: 2.24452.2557 memory: 2.15GiB(2.71%)2.71GiB(3.41%) tps: 136,06566,465 tflops: 9.385.18 mfu: 0.95%0.52% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 7 loss: 4.12534.0403 grad_norm: 1.96261.9132 memory: 2.15GiB(2.71%)2.71GiB(3.41%) tps: 299,863131,077 tflops: 20.6810.22 mfu: 2.09%1.03% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 8 loss: 4.06453.9796 grad_norm: 1.82991.8154 memory: 2.15GiB(2.71%)2.71GiB(3.41%) tps: 343,855147,955 tflops: 23.7111.53 mfu: 2.40%1.17% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 9 loss: 4.47584.4010 grad_norm: 1.47431.4965 memory: 2.15GiB(2.71%)2.71GiB(3.41%) tps: 346,707139,416 tflops: 23.9110.87 mfu: 2.42%1.10% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 10 loss: 3.94833.8448 grad_norm: 1.62401.6185 memory: 2.15GiB(2.71%)2.71GiB(3.41%) tps: 303,029139,581 tflops: 20.9010.88 mfu: 2.11%1.10% +[rank0]:[titan] TIMESTAMP - root - INFO - Dumping profiler traces at step 10 +[rank0]:[titan] TIMESTAMP - root - INFO - Finished dumping profiler traces in 0.020.04 seconds +[rank0]:[titan] TIMESTAMP - root - INFO - Sleeping 2 seconds for other ranks to complete +[rank0]:[titan] TIMESTAMP - root - INFO - Training completed +[rank0]:[titan] TIMESTAMP - root - INFO - Process group destroyed diff --git a/torchtitan/models/deepseek_v3/__init__.py b/torchtitan/models/deepseek_v3/__init__.py index 1c3d2b19d2..3322ad0a83 100644 --- a/torchtitan/models/deepseek_v3/__init__.py +++ b/torchtitan/models/deepseek_v3/__init__.py @@ -35,7 +35,7 @@ dim=256, inter_dim=1024, moe_inter_dim=256, - n_layers=3, + n_layers=2, n_dense_layers=1, n_heads=16, moe_args=MoEArgs( diff --git a/torchtitan/models/deepseek_v3/model/model.py b/torchtitan/models/deepseek_v3/model/model.py index e2c4bbeda9..5547840e27 100644 --- 
a/torchtitan/models/deepseek_v3/model/model.py +++ b/torchtitan/models/deepseek_v3/model/model.py @@ -5,6 +5,8 @@ # LICENSE file in the root directory of this source tree. import math +import os +import functools from typing import Tuple import torch @@ -17,6 +19,35 @@ from .args import DeepSeekV3ModelArgs +def seeded_init_decorator_for_test(seed): + """ + Decorator that adds torch.manual_seed before every nn.init.trunc_normal_ call + and prints layer weights after initialization. + """ + import lovely_tensors as lt; lt.monkey_patch() + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + original_trunc_normal = nn.init.trunc_normal_ + + def seeded_trunc_normal(*trunc_args, **trunc_kwargs): + torch.manual_seed(seed) + tensor = trunc_args[0] # First argument is always the tensor + result = original_trunc_normal(*trunc_args, **trunc_kwargs) + # # Try to get module info from the calling context + # module_name = "Unknown" + # if len(args) > 0 and hasattr(args[0], "__class__"): + # module_name = args[0].__class__.__name__ + # print(f"Module: {module_name}, Tensor value: {tensor}") + return result + + nn.init.trunc_normal_ = seeded_trunc_normal + return func(*args, **kwargs) + + return wrapper + return decorator + + # Adapted from https://github.com/DeepSeek-ai/DeepSeek-V3/blob/main/inference/model.py#L294 def precompute_freqs_cis(args: DeepSeekV3ModelArgs) -> torch.Tensor: """ @@ -240,6 +271,7 @@ def forward( output = output.view(bsz, seqlen, -1) # (bsz, seqlen, n_heads * v_head_dim) return self.wo(output) # (bsz, seqlen, dim) + @seeded_init_decorator_for_test(seed=os.environ.get("SEED")) def init_weights(self, init_std: float): linear_list = [ self.wkv_a, @@ -302,6 +334,7 @@ def forward(self, x: torch.Tensor, freqs_cis: torch.Tensor): x = x + self.feed_forward(self.ffn_norm(x)) return x + @seeded_init_decorator_for_test(seed=os.environ.get("SEED")) def init_weights(self, buffer_device: torch.device): for norm in (self.attention_norm, self.ffn_norm): norm.reset_parameters() @@ -339,6 +372,7 @@ def __init__(self, model_args: DeepSeekV3ModelArgs): self.model_args = model_args self.init_weights() + @seeded_init_decorator_for_test(seed=os.environ.get("SEED")) def init_weights(self, buffer_device: torch.device | None = None) -> None: buffer_device = buffer_device or self.freqs_cis.device with torch.device(buffer_device): diff --git a/torchtitan/models/moe.py b/torchtitan/models/moe.py index 8be14ecbf0..5ba63b9157 100644 --- a/torchtitan/models/moe.py +++ b/torchtitan/models/moe.py @@ -12,8 +12,38 @@ from torch import nn from torchtitan.distributed.expert_parallel import expert_parallel +import os +import functools +def seeded_init_decorator_for_test(seed): + """ + Decorator that adds torch.manual_seed before every nn.init.trunc_normal_ call + and prints layer weights after initialization. 
+ """ + import lovely_tensors as lt; lt.monkey_patch() + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + original_trunc_normal = nn.init.trunc_normal_ + + def seeded_trunc_normal(*trunc_args, **trunc_kwargs): + torch.manual_seed(seed) + tensor = trunc_args[0] # First argument is always the tensor + result = original_trunc_normal(*trunc_args, **trunc_kwargs) + # # Try to get module info from the calling context + # module_name = "Unknown" + # if len(args) > 0 and hasattr(args[0], "__class__"): + # module_name = args[0].__class__.__name__ + # print(f"Module: {module_name}, Tensor value: {tensor}") + return result + + nn.init.trunc_normal_ = seeded_trunc_normal + return func(*args, **kwargs) + + return wrapper + return decorator + @dataclass class MoEArgs: num_experts: int = 8 @@ -57,6 +87,7 @@ def __init__( def forward(self, x: torch.Tensor) -> torch.Tensor: return self.w2(F.silu(self.w1(x)) * self.w3(x)) + @seeded_init_decorator_for_test(seed=os.environ.get("SEED")) def init_weights(self, init_std: float = 0.02): nn.init.trunc_normal_(self.w1.weight, mean=0.0, std=0.02) for linear in (self.w2, self.w3): @@ -153,6 +184,7 @@ def forward( self.w1, self.w2, self.w3, x, num_tokens_per_expert ) + @seeded_init_decorator_for_test(seed=os.environ.get("SEED")) def init_weights(self, init_std: float): nn.init.trunc_normal_(self.w1, mean=0.0, std=0.02) nn.init.trunc_normal_(self.w2, mean=0.0, std=init_std) @@ -246,6 +278,7 @@ def forward( return top_scores, selected_experts_indices, num_tokens_per_expert + @seeded_init_decorator_for_test(seed=os.environ.get("SEED")) def init_weights(self, init_std: float): nn.init.trunc_normal_(self.gate.weight, mean=0.0, std=init_std) @@ -435,6 +468,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: out = out.reshape(bs, slen, dim) return out + @seeded_init_decorator_for_test(seed=os.environ.get("SEED")) def init_weights( self, init_std: float, From f9e90bc03aba179edb1cb6c488ac2b0e9a002de4 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 22 Sep 2025 11:41:27 +0000 Subject: [PATCH 033/129] adapt mfu to handle moe --- .../model/hf_transformers_args.py | 89 ++++++++++++++----- 1 file changed, 69 insertions(+), 20 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 64fb64d72f..704f83a534 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -170,27 +170,76 @@ def update_from_config(self, job_config: JobConfig): return self def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: - #TODO(3outeille): adapt to handle MoE - nparams = sum(p.numel() for p in model.parameters()) - nparams_embedding = sum( - sum(p.numel() for p in m.parameters()) - for m in model.children() - if isinstance(m, nn.Embedding) - ) + # Check if this is a MoE model by looking for MoE attributes + is_moe = hasattr(self, 'n_routed_experts') and hasattr(self, 'num_experts_per_tok') + + if is_moe: + # MoE parameter counting (adapted from DeepSeek V3 implementation) + nparams_embedding = 0 + nparams_moe_router = 0 + nparams_shared_experts = 0 + nparams_experts = 0 + nparams_dense = 0 - l, h, q, t = ( - self.n_layers, - self.n_heads, - self.dim // self.n_heads, - seq_len, - ) - # Reasoning behind the factor of 12 for the self-attention part of the formula: - # 1. 
each self-attention has 2 matmul in the forward and 4 in the backward (6) - # 2. the flash attention does 1 more matmul recomputation in the backward - # but recomputation should not be counted in calculating MFU (+0) - # 3. each matmul performs 1 multiplication and 1 addition (*2) - # 4. we follow the convention and do not account for sparsity in causal attention - num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t + for name, p in model.named_parameters(): + if "embedding" in name: + nparams_embedding += p.numel() + nparams_dense += p.numel() + elif "moe.shared_experts" in name: + nparams_shared_experts += p.numel() + elif "moe.router" in name: + nparams_moe_router += p.numel() + elif "moe.experts" in name: + nparams_experts += p.numel() + else: + nparams_dense += p.numel() + + nparams_sparse = nparams_moe_router + nparams_shared_experts + nparams_experts + nparams = nparams_dense + nparams_sparse + nparams_sparse_active = ( + nparams_moe_router + + nparams_shared_experts + + nparams_experts * self.num_experts_per_tok // self.n_routed_experts + ) + + logger.info( + f"Total parameter count: dense {nparams_dense:,}, " + f"sparse {nparams_sparse:,}, active {nparams_dense + nparams_sparse_active:,}" + ) + + l, h, q, t = ( + self.n_layers, + self.n_heads, + self.dim // self.n_heads, + seq_len, + ) + # Use active parameters for FLOPS calculation in MoE + num_flops_per_token = ( + 6 * (nparams_dense - nparams_embedding + nparams_sparse_active) + + 12 * l * h * q * t + ) + else: + # Dense model parameter counting (original implementation) + nparams = sum(p.numel() for p in model.parameters()) + nparams_embedding = sum( + sum(p.numel() for p in m.parameters()) + for m in model.children() + if isinstance(m, nn.Embedding) + ) + + l, h, q, t = ( + self.n_layers, + self.n_heads, + self.dim // self.n_heads, + seq_len, + ) + # Reasoning behind the factor of 12 for the self-attention part of the formula: + # 1. each self-attention has 2 matmul in the forward and 4 in the backward (6) + # 2. the flash attention does 1 more matmul recomputation in the backward + # but recomputation should not be counted in calculating MFU (+0) + # 3. each matmul performs 1 multiplication and 1 addition (*2) + # 4. we follow the convention and do not account for sparsity in causal attention + num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t return nparams, num_flops_per_token From ba5d6d1e1d2aa7b1168efac27f0d0859db7e4976 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 23 Sep 2025 12:06:18 +0000 Subject: [PATCH 034/129] beginning parallelism by setting tests --- .../compare_distributed_run.py | 564 ++++++++++++++++++ .../compare_distributed_run.sh | 6 + .../transformers_backend/compare_tt_hf_run.sh | 5 - 3 files changed, 570 insertions(+), 5 deletions(-) create mode 100644 torchtitan/experiments/transformers_backend/compare_distributed_run.py create mode 100755 torchtitan/experiments/transformers_backend/compare_distributed_run.sh diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.py b/torchtitan/experiments/transformers_backend/compare_distributed_run.py new file mode 100644 index 0000000000..08e8057c90 --- /dev/null +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.py @@ -0,0 +1,564 @@ +#!/usr/bin/env python3 +""" +compare_distributed_run.py - Test different parallelism configurations against baseline +Based on TorchTitan convergence guidelines + +Copyright (c) Meta Platforms, Inc. and affiliates. 
+All rights reserved. + +This source code is licensed under the BSD-style license found in the +LICENSE file in the root directory of this source tree. +""" + +import argparse +import os +import re +import shutil +import subprocess +import sys +from pathlib import Path +from typing import Dict, List, Tuple, Optional, NamedTuple +import tempfile +import json +from dataclasses import dataclass +from enum import Enum +import logging + +# Configure logging with colors +class Colors: + RED = '\033[0;31m' + GREEN = '\033[0;32m' + YELLOW = '\033[1;33m' + BLUE = '\033[0;34m' + MAGENTA = '\033[0;35m' + CYAN = '\033[0;36m' + NC = '\033[0m' # No Color + +class LogLevel(Enum): + INFO = "INFO" + SUCCESS = "SUCCESS" + WARNING = "WARNING" + ERROR = "ERROR" + TEST_PASS = "TEST_PASS" + TEST_FAIL = "TEST_FAIL" + +def log_message(level: LogLevel, message: str) -> None: + """Log a message with appropriate color coding.""" + color_map = { + LogLevel.INFO: Colors.BLUE, + LogLevel.SUCCESS: Colors.GREEN, + LogLevel.WARNING: Colors.YELLOW, + LogLevel.ERROR: Colors.RED, + LogLevel.TEST_PASS: Colors.GREEN, + LogLevel.TEST_FAIL: Colors.RED, + } + + prefix_map = { + LogLevel.INFO: "[INFO]", + LogLevel.SUCCESS: "[SUCCESS]", + LogLevel.WARNING: "[WARNING]", + LogLevel.ERROR: "[ERROR]", + LogLevel.TEST_PASS: "✅ TEST PASS", + LogLevel.TEST_FAIL: "❌ TEST FAIL", + } + + color = color_map[level] + prefix = prefix_map[level] + print(f"{color}{prefix}{Colors.NC} {message}") + +@dataclass +class ParallelismConfig: + """Configuration for a parallelism setup.""" + name: str + dp_replicate: int + dp_shard: int + tp: int + pp: int + pp_schedule: str + cp: int + ep: int + eptp: int + +@dataclass +class TrainingMetrics: + """Training metrics extracted from logs.""" + loss: Optional[float] = None + grad_norm: Optional[float] = None + +class CompareDistributedRun: + """Main class for running distributed parallelism comparison tests.""" + + # Default values + DEFAULT_THRESHOLD_LOSS = 1e-4 + DEFAULT_THRESHOLD_GRAD_NORM = 1e-3 + DEFAULT_STEPS = 10 + DEFAULT_SEED = 42 + DEFAULT_FLAVOR = "debugmodel" + + # HF Model lists - extendable for different model families + HF_MODEL_LISTS = { + "llama": "meta-llama/Llama-3.2-1B", + "deepseek": "deepseek-ai/DeepSeek-V3", + } + + # Available flavors per model type + MODEL_FLAVORS = { + "llama": ["debugmodel", "medium", "full"], + "deepseek": ["debugmodel"], + } + + # Available ND parallelisms <-> number of GPUs + ND_PARALLEL_TO_NB_GPUS = { + "1d": 2, + "2d": 4, + "3d": 8, + "4d": 16, + } + + def __init__(self): + self.script_dir = Path(__file__).parent.absolute() + self.torchtitan_root = self.script_dir.parent.parent + self.results_dir = self.script_dir / "comparison_results" + self.config_dir = self.script_dir / "generated_configs" + + # Configuration parameters + self.loss_threshold = self.DEFAULT_THRESHOLD_LOSS + self.grad_norm_threshold = self.DEFAULT_THRESHOLD_GRAD_NORM + self.nd_parallel_to_nb_gpus = self.ND_PARALLEL_TO_NB_GPUS + self.steps = self.DEFAULT_STEPS + self.seed = self.DEFAULT_SEED + self.model_filter = "" + self.flavor = self.DEFAULT_FLAVOR + self.verbose = False + self.parallelism_configs: List[ParallelismConfig] = [] + + def generate_parallelism_configs(self) -> None: + """Generate parallelism configurations based on the number of GPUs.""" + ngpu = self.nd_parallel_to_nb_gpus[self.nd_parallel] + configs = [] + + def _get_factors(n: int) -> List[int]: + factors = set() + for i in range(1, int(n**0.5) + 1): + if n % i == 0: + factors.add(i) + factors.add(n // i) + return 
sorted(list(factors)) + + # Baseline FSDP + configs.append(ParallelismConfig(name="fsdp", dp_replicate=1, dp_shard=ngpu, tp=1, pp=1, pp_schedule="Interleaved1F1B", cp=1, ep=1, eptp=1)) + + possible_tp = _get_factors(ngpu) + possible_pp = _get_factors(ngpu) + possible_ep = _get_factors(ngpu) + #TODO(3outeille): is CP borrowing degree from DP ? + #TODO(3outeille): is EP borrowing degree from DP ? + + # Is that correct ? + for tp in possible_tp: + for pp in possible_pp: + for ep in possible_ep: + if tp * pp * ep > ngpu: + continue + + if ngpu % (tp * pp * ep) == 0: + dp = ngpu // (tp * pp * ep) + if dp > 0 and (tp > 1 or pp > 1 or ep > 1 or dp > 1): + # DDP style + if dp > 1: + configs.append( + ParallelismConfig( + name=f"tp{tp}_pp{pp}_ep{ep}_ddp{dp}", + dp_replicate=dp, + dp_shard=1, + tp=tp, + pp=pp, + pp_schedule="Interleaved1F1B", + cp=1, + ep=ep, + eptp=1 + ) + ) + # FSDP with other parallelisms + if tp > 1 or pp > 1 or ep > 1: + configs.append( + ParallelismConfig( + name=f"tp{tp}_pp{pp}_ep{ep}_fsdp", + dp_replicate=1, + dp_shard=-1, + tp=tp, + pp=pp, + pp_schedule="Interleaved1F1B", + cp=1, + ep=ep, + eptp=1 + ) + ) + + # HSDP requires a DP degree that can be split + for dp in _get_factors(ngpu): + if dp > 1: + dp_factors = _get_factors(dp) + for replicate in dp_factors: + if replicate > 1: + shard = dp // replicate + if shard > 1: + configs.append( + ParallelismConfig( + name=f"hsdp_r{replicate}_s{shard}", + dp_replicate=replicate, + dp_shard=shard, + tp=1, + pp=1, + pp_schedule="Interleaved1F1B", + cp=1, + ep=1, + eptp=1 + ) + ) + + # Remove duplicates and assign to instance + unique_configs = [] + seen_configs = set() + for config in configs: + # Create a tuple of the config values to check for duplicates + config_tuple = (config.dp_replicate, config.dp_shard, config.tp, config.pp, config.ep) + if config_tuple not in seen_configs: + unique_configs.append(config) + seen_configs.add(config_tuple) + + self.parallelism_configs = unique_configs + + log_message(LogLevel.INFO, f"Generated {len(self.parallelism_configs)} parallelism configurations for {ngpu} GPUs.") + if self.verbose: + for config in self.parallelism_configs: + log_message(LogLevel.INFO, f" - {config.name}: dp_replicate={config.dp_replicate}, dp_shard={config.dp_shard}, tp={config.tp}, pp={config.pp}, ep={config.ep}") + def generate_config(self, config: ParallelismConfig, model_name: str, model_type: str) -> Path: + """Generate configuration file for a parallelism setup.""" + config_file = self.config_dir / f"{config.name}_{model_type}_{self.flavor}_{self.nd_parallel_to_nb_gpus[self.nd_parallel]}gpu.toml" + + #TODO(3outeille): create template instead + if model_type == "llama": + base_config = self.script_dir / "configs" / "debug_1_gpu_tt.toml" + else: + base_config = self.script_dir / "configs" / "debug_1_gpu_hf.toml" + + shutil.copy2(base_config, config_file) + + with open(config_file, 'r') as f: + content = f.read() + + # Update model name if it's HF backend + if model_type != "llama": + content = re.sub(r'name = ".*"', f'name = "{model_name}"', content) + + # Update model flavor + content = re.sub(r'flavor = ".*"', f'flavor = "{self.flavor}"', content) + + # Validate flavor for model type + if model_type in self.MODEL_FLAVORS: + if self.flavor not in self.MODEL_FLAVORS[model_type]: + log_message(LogLevel.WARNING, + f"Flavor '{self.flavor}' not available for {model_type}. 
" + f"Available: {self.MODEL_FLAVORS[model_type]}") + + # Update training steps and seed + content = re.sub(r'steps = .*', f'steps = {self.steps}', content) + if 'seed = ' in content: + content = re.sub(r'seed = .*', f'seed = {self.seed}', content) + else: + content = re.sub(r'(steps = .*)', f'\\1\nseed = {self.seed}', content) + + #TODO(3outeille): is this correct ? + # Ensure deterministic training + if 'deterministic = true' not in content: + content = re.sub(r'(seed = .*)', '\\1\ndeterministic = true', content) + + # Update parallelism configuration + content = re.sub(r'data_parallel_replicate_degree = .*', + f'data_parallel_replicate_degree = {config.dp_replicate}', content) + content = re.sub(r'data_parallel_shard_degree = .*', + f'data_parallel_shard_degree = {config.dp_shard}', content) + content = re.sub(r'tensor_parallel_degree = .*', + f'tensor_parallel_degree = {config.tp}', content) + content = re.sub(r'pipeline_parallel_degree = .*', + f'pipeline_parallel_degree = {config.pp}', content) + content = re.sub(r'pipeline_parallel_schedule = .*', + f'pipeline_parallel_schedule = "{config.pp_schedule}"', content) + content = re.sub(r'context_parallel_degree = .*', + f'context_parallel_degree = {config.cp}', content) + content = re.sub(r'expert_parallel_degree = .*', + f'expert_parallel_degree = {config.ep}', content) + + content = re.sub(r'expert_tensor_parallel_degree = .*', + f'expert_tensor_parallel_degree = {config.eptp}', content) + + # Write modified config + with open(config_file, 'w') as f: + f.write(content) + + log_message(LogLevel.INFO, f"Created config file: {config_file} for config '{config.name}' (model: {model_name}, type: {model_type})") + return config_file + + def extract_metrics(self, log_file: Path) -> TrainingMetrics: + """Extract metrics from log file.""" + metrics = TrainingMetrics() + + try: + with open(log_file, 'r') as f: + content = f.read() + + # Extract final loss and grad_norm from the last step + loss_matches = re.findall(r'loss:\s*([0-9]+\.?[0-9]*)', content) + grad_norm_matches = re.findall(r'grad_norm:\s*([0-9]+\.?[0-9]*)', content) + + if loss_matches: + metrics.loss = float(loss_matches[-1]) + if grad_norm_matches: + metrics.grad_norm = float(grad_norm_matches[-1]) + + except Exception as e: + log_message(LogLevel.WARNING, f"Could not extract metrics from {log_file}: {e}") + + if metrics.loss is None or metrics.grad_norm is None: + log_message(LogLevel.WARNING, f"Could not extract metrics from {log_file}") + + return metrics + + def compare_metrics(self, baseline_metrics: TrainingMetrics, test_metrics: TrainingMetrics, + config_name: str) -> bool: + """Compare metrics between baseline and test configuration.""" + if (baseline_metrics.loss is None or baseline_metrics.grad_norm is None or + test_metrics.loss is None or test_metrics.grad_norm is None): + log_message(LogLevel.TEST_FAIL, f"{config_name} - Unable to extract metrics") + return False + + # Calculate absolute differences + loss_diff = abs(baseline_metrics.loss - test_metrics.loss) + grad_norm_diff = abs(baseline_metrics.grad_norm - test_metrics.grad_norm) + + # Check if differences are within thresholds + loss_pass = loss_diff < self.loss_threshold + grad_pass = grad_norm_diff < self.grad_norm_threshold + + if loss_pass and grad_pass: + log_message(LogLevel.TEST_PASS, + f"{config_name} - Loss diff: {loss_diff:.2e} (< {self.loss_threshold:.2e}), " + f"Grad norm diff: {grad_norm_diff:.2e} (< {self.grad_norm_threshold:.2e})") + return True + else: + log_message(LogLevel.TEST_FAIL, + 
f"{config_name} - Loss diff: {loss_diff:.2e} (threshold: {self.loss_threshold:.2e}), " + f"Grad norm diff: {grad_norm_diff:.2e} (threshold: {self.grad_norm_threshold:.2e})") + return False + + def generate_diff(self, baseline_log: Path, log_path: Path, diff_file: Path) -> None: + """Generate diff between baseline and test logs.""" + + def _filter_log(log_file: Path) -> Path: + """Filter log file to normalize volatile information.""" + filtered_file = log_file.with_suffix(log_file.suffix + '.filtered') + + with open(log_file, 'r') as infile, open(filtered_file, 'w') as outfile: + for line in infile: + # Apply filtering patterns + line = re.sub(r'([0-9]{4}-[0-9]{2}-[0-9]{2} )?[0-9]{2}:[0-9]{2}:[0-9]{2}(,[0-9]+)?', + 'TIMESTAMP', line) + line = re.sub(r'torchrun.*--master_port[= ]([0-9]+)', + 'torchrun ... --master_port=XXXX', line) + line = re.sub(r'PID [0-9]+', 'PID XXXX', line) + line = re.sub(r'localhost:[0-9]+', 'localhost:XXXX', line) + line = re.sub(r'memory: [0-9]+\.[0-9]+GiB', 'memory: XX.XXGiB', line) + line = re.sub(r'tps: [0-9,]+', 'tps: XXXXX', line) + line = re.sub(r'tflops: [0-9]+\.[0-9]+', 'tflops: XX.XX', line) + line = re.sub(r'mfu: [0-9]+\.[0-9]+%', 'mfu: XX.XX%', line) + outfile.write(line) + + return filtered_file + try: + # Filter logs to remove timestamps and volatile information + baseline_filtered = _filter_log(baseline_log) + test_filtered = _filter_log(log_path) + + # Generate colored diff using git diff + cmd = ["git", "diff", "--no-index", "--color=always", "--word-diff=color", + str(baseline_filtered), str(test_filtered)] + + with open(diff_file, 'w') as f: + subprocess.run(cmd, stdout=f, stderr=subprocess.DEVNULL) + + # Clean up filtered files + baseline_filtered.unlink() + test_filtered.unlink() + + except Exception as e: + log_message(LogLevel.WARNING, f"Could not generate diff: {e}") + + def run_training(self, config_file: Path, log_file: Path, config_name: str, model_name: str) -> bool: + """Run training with given configuration.""" + log_message(LogLevel.INFO, f"Running training: {config_name} with model {model_name}") + cmd = [ + "torchrun", + f"--nproc_per_node={self.nd_parallel_to_nb_gpus[self.nd_parallel]}", + "--rdzv_backend", "c10d", + "--rdzv_endpoint=localhost:0", + "--local-ranks-filter", "0", + "--role", "rank", + "--tee", "3", + "-m", "torchtitan.train", + "--job.config_file", str(config_file) + ] + + env = os.environ.copy() + + if self.verbose: + log_message(LogLevel.INFO, f"Command: {' '.join(cmd)}") + + try: + with open(log_file, 'w') as f: + result = subprocess.run( + cmd, + cwd=self.torchtitan_root, + stdout=f, + stderr=subprocess.STDOUT, + env=env, + check=True + ) + + if self.verbose: + log_message(LogLevel.SUCCESS, f"Training completed: {config_name}") + return True + + except subprocess.CalledProcessError as e: + log_message(LogLevel.ERROR, f"Training failed: {config_name}") + return False + + def run(self) -> int: + """Main execution function. 
Runs all test suites for all models.""" + parser = argparse.ArgumentParser( + description="Test different parallelism configurations against a baseline FSDP model.", + ) + parser.add_argument("-m", "--model-filter", default="", + help="Filter models by name pattern (e.g., 'llama')") + parser.add_argument("-t", "--loss-threshold", type=float, default=self.DEFAULT_THRESHOLD_LOSS, + help=f"Loss difference threshold (default: {self.DEFAULT_THRESHOLD_LOSS})") + parser.add_argument("-g", "--grad-threshold", type=float, default=self.DEFAULT_THRESHOLD_GRAD_NORM, + help=f"Grad norm difference threshold (default: {self.DEFAULT_THRESHOLD_GRAD_NORM})") + parser.add_argument("-nd", "--nd_parallel", type=str, default="2d", + help=f"Parallelism to use (default: {self.ND_PARALLEL_TO_NB_GPUS.keys()})") + parser.add_argument("-s", "--steps", type=int, default=self.DEFAULT_STEPS, + help=f"Training steps (default: {self.DEFAULT_STEPS})") + parser.add_argument("--seed", type=int, default=self.DEFAULT_SEED, + help=f"Random seed (default: {self.DEFAULT_SEED})") + parser.add_argument("--flavor", default=self.DEFAULT_FLAVOR, + help=f"Model flavor/size (default: {self.DEFAULT_FLAVOR}). " + f"Available: llama=[debugmodel, medium, full], deepseek=[debugmodel]") + parser.add_argument("-v", "--verbose", action="store_true", + help="Verbose output") + + args = parser.parse_args() + + self.loss_threshold = args.loss_threshold + self.grad_norm_threshold = args.grad_threshold + self.nd_parallel = args.nd_parallel + self.steps = args.steps + self.seed = args.seed + self.model_filter = args.model_filter + self.flavor = args.flavor + self.verbose = args.verbose + + log_message(LogLevel.INFO, "=== TorchTitan Distributed Parallelism Comparison ===") + log_message(LogLevel.INFO, f"Loss threshold: {self.loss_threshold}") + log_message(LogLevel.INFO, f"Grad norm threshold: {self.grad_norm_threshold}") + log_message(LogLevel.INFO, f"GPUs: {self.nd_parallel_to_nb_gpus[self.nd_parallel]}") + log_message(LogLevel.INFO, f"Steps: {self.steps}") + log_message(LogLevel.INFO, f"Seed: {self.seed}") + log_message(LogLevel.INFO, f"Model filter: {self.model_filter or 'all'}") + log_message(LogLevel.INFO, f"Model flavor: {self.flavor}") + print() + + self.results_dir.mkdir(exist_ok=True) + self.config_dir.mkdir(exist_ok=True) + + if self.verbose: + log_message(LogLevel.INFO, f"Results directory: {self.results_dir}") + log_message(LogLevel.INFO, f"Config directory: {self.config_dir}") + + self.generate_parallelism_configs() + + total_model_failures = 0 + + for model_type, model_name in self.HF_MODEL_LISTS.items(): + # Apply model filter if specified + if self.model_filter and self.model_filter not in model_type: + continue + + log_message(LogLevel.INFO, f"Testing model: {model_type} ({model_name})") + total_tests = 0 + passed_tests = 0 + failed_tests = 0 + configs_to_run = [] + + for config in self.parallelism_configs: + # Skip configurations that require more GPUs than available + required_gpus = config.dp_replicate * config.tp * config.pp + if config.dp_shard != -1: + required_gpus *= config.dp_shard + + if required_gpus > self.nd_parallel_to_nb_gpus[self.nd_parallel]: + log_message(LogLevel.WARNING, + f"Skipping {config.name}: requires {required_gpus} GPUs but only {self.ngpu} available") + continue + + config_file = self.generate_config(config, model_name, model_type) + configs_to_run.append((config, config_file)) + + # # Test each parallelism configuration + # for config, config_file in configs_to_run: + # log_path = self.results_dir / 
f"{config.name}_{model_type}_{self.flavor}_{self.ngpu}gpu.log" + # if not self.run_training(config_file, log_path, config.name, model_name): + # log_message(LogLevel.TEST_FAIL, f"{config.name} - Training failed") + # failed_tests += 1 + # continue + # test_metrics = self.extract_metrics(log_path) + # if self.compare_metrics(baseline_metrics, test_metrics, config.name): + # passed_tests += 1 + # else: + # failed_tests += 1 + # diff_file = self.results_dir / f"diff_{config.name}_vs_baseline_{model_type}_{self.flavor}_{self.ngpu}gpu.log" + # self.generate_diff(baseline_log, log_path, diff_file) + # log_message(LogLevel.INFO, f"Diff saved to: {diff_file}") + # total_tests += 1 + + # Print summary for this model + print() + log_message(LogLevel.INFO, f"=== TEST SUMMARY for {model_type} ===") + log_message(LogLevel.INFO, f"Total tests: {total_tests}") + log_message(LogLevel.SUCCESS, f"Passed: {passed_tests}") + if failed_tests > 0: + log_message(LogLevel.TEST_FAIL, f"Failed: {failed_tests}") + else: + log_message(LogLevel.INFO, f"Failed: {failed_tests}") + print() + + if failed_tests > 0: + total_model_failures += 1 + + # Final summary + print() + log_message(LogLevel.INFO, "=== FINAL SUMMARY ===") + if total_model_failures == 0: + log_message(LogLevel.SUCCESS, "All model tests passed! 🎉") + return 0 + else: + log_message(LogLevel.TEST_FAIL, f"{total_model_failures} model(s) had test failures") + log_message(LogLevel.INFO, f"Check the diff files in {self.results_dir} for details") + return 1 + +def main(): + """Entry point for the script.""" + runner = CompareDistributedRun() + return runner.run() + +if __name__ == "__main__": + sys.exit(main()) diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.sh b/torchtitan/experiments/transformers_backend/compare_distributed_run.sh new file mode 100755 index 0000000000..80bb2d04ca --- /dev/null +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.sh @@ -0,0 +1,6 @@ +#!/usr/bin/bash + +# python compare_distributed_run.py --steps 5 --model-filter llama --flavor debugmodel + +# python compare_distributed_run.py --steps 5 --model-filter llama --flavor debugmodel --nd_parallel 2d +debugpy-run compare_distributed_run.py --steps 5 --model-filter llama --flavor debugmodel --nd_parallel 2d diff --git a/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh b/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh index be7243f81b..703a9b55c9 100755 --- a/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh +++ b/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh @@ -1,9 +1,4 @@ #!/usr/bin/bash -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. - -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
set -ex set -o pipefail From 338a25006716ed6a9ef23631ca54421a9b00779e Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 24 Sep 2025 12:05:24 +0000 Subject: [PATCH 035/129] better compare_distributed_run test --- .../transformers_backend/__init__.py | 2 +- .../compare_distributed_run.py | 466 ++++++++++-------- .../compare_distributed_run.sh | 5 +- 3 files changed, 258 insertions(+), 215 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 0cecbfb199..fa8cc4c119 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -76,7 +76,7 @@ class DeepSeekV3Args: # #TODO(3outeille): identify that if MoE model is used, we add a moe_args field -if os.environ.get("MODEL_TYPE") == "llama": +if os.environ.get("MODEL_TYPE") == "llama3" or os.environ.get("MODEL_TYPE") == "meta-llama/Llama-3.2-1B": print("Using llama model") patch_hf_llama() flavors = { diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.py b/torchtitan/experiments/transformers_backend/compare_distributed_run.py index 08e8057c90..9be6b52acf 100644 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.py +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.py @@ -1,15 +1,46 @@ -#!/usr/bin/env python3 """ -compare_distributed_run.py - Test different parallelism configurations against baseline -Based on TorchTitan convergence guidelines +python compare_distributed_run.py --steps 5 --model-filter llama3 --flavor debugmodel --nd_parallel 2d --verbose +python compare_distributed_run.py --steps 5 --model-filter llama3 --flavor flavor --nd_parallel 2d --verbose + +Methodology: + - train on FSDP with TT (baseline) + - train on FSDP with HF (baseline) + - For all parallelism, train with nd-// with HF + - If one train fails: + - generated diff between HF FSDP (baseline) HF nd-// + - train the nd-// TT counterpart + - diff between TT nd-// and HF nd-// + - diff between TT FSDP (baseline) and HF nd-// +results/ +|_ meta-llama + |_ Llama-3.2-1B + |_ 2D + |_ debugmodel + |_ baseline_hf_fsdp_4gpu.log + |_ baseline_tt_fsdp_4gpu.log + |_ baseline_fsdp_debugmodel_4gpu_huggingface.toml + |_ baseline_fsdp_debugmodel_4gpu_torchtitan.toml + |_ fsdp1_cp1_tp2_pp2_debugmodel_4gpu_huggingface/ + |_ fsdp1_cp1_tp2_pp2_debugmodel_4gpu_huggingface.toml + |_ fsdp1_cp1_tp2_pp2_debugmodel_4gpu_torchtitan.toml + |_ fsdp1_cp1_tp2_pp2_debugmodel_4gpu_huggingface.log + |_ diff_hf_baseline_vs_hf_nd_parallelism.log + |_ diff_tt_nd_parallelism_vs_hf_nd_parallelism.log + |_ diff_tt_baseline_vs_hf_nd_parallelism.log + |_ full + |_ baseline_hf_fsdp_4gpu.log + |_ baseline_tt_fsdp_4gpu.log + |_ baseline_fsdp_full_4gpu_huggingface.toml + |_ baseline_fsdp_full_4gpu_torchtitan.toml + |_ fsdp1_cp1_tp2_pp2_full_4gpu_huggingface/ + |_ fsdp1_cp1_tp2_pp2_full_4gpu_huggingface.toml + |_ fsdp1_cp1_tp2_pp2_full_4gpu_torchtitan.toml + |_ fsdp1_cp1_tp2_pp2_full_4gpu_huggingface.log + |_ diff_hf_baseline_vs_hf_nd_parallelism.log + |_ diff_tt_nd_parallelism_vs_hf_nd_parallelism.log + |_ diff_tt_baseline_vs_hf_nd_parallelism.log -Copyright (c) Meta Platforms, Inc. and affiliates. -All rights reserved. - -This source code is licensed under the BSD-style license found in the -LICENSE file in the root directory of this source tree. 
""" - import argparse import os import re @@ -17,12 +48,9 @@ import subprocess import sys from pathlib import Path -from typing import Dict, List, Tuple, Optional, NamedTuple -import tempfile -import json +from typing import List, Optional from dataclasses import dataclass from enum import Enum -import logging # Configure logging with colors class Colors: @@ -95,31 +123,32 @@ class CompareDistributedRun: DEFAULT_SEED = 42 DEFAULT_FLAVOR = "debugmodel" - # HF Model lists - extendable for different model families - HF_MODEL_LISTS = { - "llama": "meta-llama/Llama-3.2-1B", - "deepseek": "deepseek-ai/DeepSeek-V3", + MODEL_LISTS = { + "torchtitan": ["llama3", "deepseek_v3"], + "huggingface": ["meta-llama/Llama-3.2-1B", "deepseek-ai/DeepSeek-V3"] } - # Available flavors per model type MODEL_FLAVORS = { - "llama": ["debugmodel", "medium", "full"], - "deepseek": ["debugmodel"], + "llama3": ["debugmodel", "medium", "full"], + "deepseek_v3": ["debugmodel"], + "meta-llama/Llama-3.2-1B": ["debugmodel", "medium", "full"], + "deepseek-ai/DeepSeek-V3": ["debugmodel"], } # Available ND parallelisms <-> number of GPUs ND_PARALLEL_TO_NB_GPUS = { + "0d": 1, "1d": 2, "2d": 4, "3d": 8, "4d": 16, + "5d": 32, } def __init__(self): self.script_dir = Path(__file__).parent.absolute() self.torchtitan_root = self.script_dir.parent.parent - self.results_dir = self.script_dir / "comparison_results" - self.config_dir = self.script_dir / "generated_configs" + self.base_results_dir = self.script_dir / "results" # Configuration parameters self.loss_threshold = self.DEFAULT_THRESHOLD_LOSS @@ -131,6 +160,7 @@ def __init__(self): self.flavor = self.DEFAULT_FLAVOR self.verbose = False self.parallelism_configs: List[ParallelismConfig] = [] + self.results_dir: Optional[Path] = None def generate_parallelism_configs(self) -> None: """Generate parallelism configurations based on the number of GPUs.""" @@ -148,81 +178,65 @@ def _get_factors(n: int) -> List[int]: # Baseline FSDP configs.append(ParallelismConfig(name="fsdp", dp_replicate=1, dp_shard=ngpu, tp=1, pp=1, pp_schedule="Interleaved1F1B", cp=1, ep=1, eptp=1)) + #NOTE(3outeille): No need to handle DDP (dp_replicate) as DDP is not supported > 1D parallelism" + #(cf https://github.com/pytorch/torchtitan/blob/main/torchtitan/models/llama3/infra/parallelize.py#L139) + possible_fsdp = _get_factors(ngpu) # dp_shard + possible_cp = _get_factors(ngpu) possible_tp = _get_factors(ngpu) possible_pp = _get_factors(ngpu) - possible_ep = _get_factors(ngpu) - #TODO(3outeille): is CP borrowing degree from DP ? - #TODO(3outeille): is EP borrowing degree from DP ? - - # Is that correct ? 
- for tp in possible_tp: - for pp in possible_pp: - for ep in possible_ep: - if tp * pp * ep > ngpu: - continue - - if ngpu % (tp * pp * ep) == 0: - dp = ngpu // (tp * pp * ep) - if dp > 0 and (tp > 1 or pp > 1 or ep > 1 or dp > 1): - # DDP style - if dp > 1: - configs.append( - ParallelismConfig( - name=f"tp{tp}_pp{pp}_ep{ep}_ddp{dp}", - dp_replicate=dp, - dp_shard=1, - tp=tp, - pp=pp, - pp_schedule="Interleaved1F1B", - cp=1, - ep=ep, - eptp=1 - ) - ) - # FSDP with other parallelisms - if tp > 1 or pp > 1 or ep > 1: - configs.append( - ParallelismConfig( - name=f"tp{tp}_pp{pp}_ep{ep}_fsdp", - dp_replicate=1, - dp_shard=-1, - tp=tp, - pp=pp, - pp_schedule="Interleaved1F1B", - cp=1, - ep=ep, - eptp=1 - ) - ) - - # HSDP requires a DP degree that can be split - for dp in _get_factors(ngpu): - if dp > 1: - dp_factors = _get_factors(dp) - for replicate in dp_factors: - if replicate > 1: - shard = dp // replicate - if shard > 1: - configs.append( - ParallelismConfig( - name=f"hsdp_r{replicate}_s{shard}", - dp_replicate=replicate, - dp_shard=shard, - tp=1, - pp=1, - pp_schedule="Interleaved1F1B", - cp=1, - ep=1, - eptp=1 - ) + + #TODO(3outeille): handle HSDP later + + for dp_shard in possible_fsdp: + for cp in possible_cp: + for tp in possible_tp: + for pp in possible_pp: + + if dp_shard * cp * tp * pp != ngpu: + continue + + num_parallelisms_used = sum(parallel_degree > 1 for parallel_degree in [dp_shard, cp, tp, pp]) + ndims_required = int(self.nd_parallel[0]) + #NOTE(3outeille): if 2D//, we need at least 2 parallelisms to be active (> 1). For 3D //, least 3 parallelisms > 1 etc. + if ndims_required > 1 and num_parallelisms_used < ndims_required: + continue + + configs.append( + ParallelismConfig( + name=f"fsdp{dp_shard}_cp{cp}_tp{tp}_pp{pp}", + dp_replicate=1, + dp_shard=dp_shard, + tp=tp, + pp=pp, + pp_schedule="Interleaved1F1B", + cp=cp, + ep=1, + eptp=1 ) + ) + + # NOTE(3outeille): EP borrowing degree from dp_shard + configs.append( + ParallelismConfig( + name=f"fsdp{dp_shard}_cp{cp}_tp{tp}_pp{pp}_ep{dp_shard}", + dp_replicate=1, + dp_shard=dp_shard, + tp=tp, + pp=pp, + pp_schedule="Interleaved1F1B", + cp=cp, + ep=dp_shard, + eptp=1 + ) + ) + # Remove duplicates and assign to instance unique_configs = [] seen_configs = set() for config in configs: # Create a tuple of the config values to check for duplicates - config_tuple = (config.dp_replicate, config.dp_shard, config.tp, config.pp, config.ep) + config_tuple = (config.dp_replicate, config.dp_shard, config.tp, config.pp, config.cp, config.ep, config.eptp) if config_tuple not in seen_configs: unique_configs.append(config) seen_configs.add(config_tuple) @@ -232,72 +246,66 @@ def _get_factors(n: int) -> List[int]: log_message(LogLevel.INFO, f"Generated {len(self.parallelism_configs)} parallelism configurations for {ngpu} GPUs.") if self.verbose: for config in self.parallelism_configs: - log_message(LogLevel.INFO, f" - {config.name}: dp_replicate={config.dp_replicate}, dp_shard={config.dp_shard}, tp={config.tp}, pp={config.pp}, ep={config.ep}") - def generate_config(self, config: ParallelismConfig, model_name: str, model_type: str) -> Path: + log_message(LogLevel.INFO, f" - {config.name}: dp_replicate={config.dp_replicate}, dp_shard={config.dp_shard}, tp={config.tp}, pp={config.pp}, cp={config.cp}, ep={config.ep}, eptp={config.eptp}") + + def generate_config(self, config_dir: Path, config: ParallelismConfig, model_name: str, backend: str, filename: Optional[str] = None) -> Path: """Generate configuration file for a parallelism setup.""" - 
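# Worked example of the enumeration loop above, assuming --nd_parallel 2d (ngpu = 4):
# a (dp_shard, cp, tp, pp) tuple is kept only if the degrees multiply to 4 and at least
# two of them are > 1; each kept tuple is additionally re-emitted as an ep=dp_shard
# variant before deduplication.
ngpu = 4
factors = [d for d in range(1, ngpu + 1) if ngpu % d == 0]  # [1, 2, 4]
kept = [
    (dp_shard, cp, tp, pp)
    for dp_shard in factors
    for cp in factors
    for tp in factors
    for pp in factors
    if dp_shard * cp * tp * pp == ngpu
    and sum(degree > 1 for degree in (dp_shard, cp, tp, pp)) >= 2
]
# 6 permutations of (2, 2, 1, 1), named fsdp2_cp2_tp1_pp1, fsdp2_cp1_tp2_pp1, ...
print(kept)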
config_file = self.config_dir / f"{config.name}_{model_type}_{self.flavor}_{self.nd_parallel_to_nb_gpus[self.nd_parallel]}gpu.toml" - - #TODO(3outeille): create template instead - if model_type == "llama": - base_config = self.script_dir / "configs" / "debug_1_gpu_tt.toml" + import toml + + if filename: + config_file = config_dir / filename else: - base_config = self.script_dir / "configs" / "debug_1_gpu_hf.toml" - + config_file = config_dir / f"{config.name}_{self.flavor}_{self.nd_parallel_to_nb_gpus[self.nd_parallel]}gpu_{backend}.toml" + + base_config = self.script_dir / "configs" / "test_template.toml" shutil.copy2(base_config, config_file) + # Load the TOML file as a dict with open(config_file, 'r') as f: - content = f.read() - - # Update model name if it's HF backend - if model_type != "llama": - content = re.sub(r'name = ".*"', f'name = "{model_name}"', content) - - # Update model flavor - content = re.sub(r'flavor = ".*"', f'flavor = "{self.flavor}"', content) - + config_data = toml.load(f) + + # Update [model] section + if "model" not in config_data: + config_data["model"] = {} + config_data["model"]["name"] = model_name + config_data["model"]["flavor"] = self.flavor + # Validate flavor for model type - if model_type in self.MODEL_FLAVORS: - if self.flavor not in self.MODEL_FLAVORS[model_type]: + if model_name in self.MODEL_FLAVORS: + if self.flavor not in self.MODEL_FLAVORS[model_name]: log_message(LogLevel.WARNING, - f"Flavor '{self.flavor}' not available for {model_type}. " - f"Available: {self.MODEL_FLAVORS[model_type]}") - - # Update training steps and seed - content = re.sub(r'steps = .*', f'steps = {self.steps}', content) - if 'seed = ' in content: - content = re.sub(r'seed = .*', f'seed = {self.seed}', content) - else: - content = re.sub(r'(steps = .*)', f'\\1\nseed = {self.seed}', content) - - #TODO(3outeille): is this correct ? - # Ensure deterministic training - if 'deterministic = true' not in content: - content = re.sub(r'(seed = .*)', '\\1\ndeterministic = true', content) - - # Update parallelism configuration - content = re.sub(r'data_parallel_replicate_degree = .*', - f'data_parallel_replicate_degree = {config.dp_replicate}', content) - content = re.sub(r'data_parallel_shard_degree = .*', - f'data_parallel_shard_degree = {config.dp_shard}', content) - content = re.sub(r'tensor_parallel_degree = .*', - f'tensor_parallel_degree = {config.tp}', content) - content = re.sub(r'pipeline_parallel_degree = .*', - f'pipeline_parallel_degree = {config.pp}', content) - content = re.sub(r'pipeline_parallel_schedule = .*', - f'pipeline_parallel_schedule = "{config.pp_schedule}"', content) - content = re.sub(r'context_parallel_degree = .*', - f'context_parallel_degree = {config.cp}', content) - content = re.sub(r'expert_parallel_degree = .*', - f'expert_parallel_degree = {config.ep}', content) - - content = re.sub(r'expert_tensor_parallel_degree = .*', - f'expert_tensor_parallel_degree = {config.eptp}', content) + f"Flavor '{self.flavor}' not available for {model_name}. 
" + f"Available: {self.MODEL_FLAVORS[model_name]}") + + # Update [training] section + if "training" not in config_data: + config_data["training"] = {} + config_data["training"]["steps"] = self.steps + config_data["training"]["seed"] = self.seed + + # Update [parallelism] section + if "parallelism" not in config_data: + config_data["parallelism"] = {} + config_data["parallelism"]["data_parallel_replicate_degree"] = config.dp_replicate + config_data["parallelism"]["data_parallel_shard_degree"] = config.dp_shard + config_data["parallelism"]["tensor_parallel_degree"] = config.tp + config_data["parallelism"]["pipeline_parallel_degree"] = config.pp + config_data["parallelism"]["pipeline_parallel_schedule"] = config.pp_schedule + config_data["parallelism"]["context_parallel_degree"] = config.cp + config_data["parallelism"]["expert_parallel_degree"] = config.ep + config_data["parallelism"]["expert_tensor_parallel_degree"] = config.eptp + + # Write back the modified TOML + with open(config_file, 'w') as f: + toml.dump(config_data, f) + + log_message(LogLevel.INFO, f"Created config file: {config_file} for config '{config.name}' (model: {model_name})") + return config_file - # Write modified config with open(config_file, 'w') as f: f.write(content) - log_message(LogLevel.INFO, f"Created config file: {config_file} for config '{config.name}' (model: {model_name}, type: {model_type})") + log_message(LogLevel.INFO, f"Created config file: {config_file} for config '{config.name}' (model: {model_name})") return config_file def extract_metrics(self, log_file: Path) -> TrainingMetrics: @@ -352,7 +360,7 @@ def compare_metrics(self, baseline_metrics: TrainingMetrics, test_metrics: Train f"Grad norm diff: {grad_norm_diff:.2e} (threshold: {self.grad_norm_threshold:.2e})") return False - def generate_diff(self, baseline_log: Path, log_path: Path, diff_file: Path) -> None: + def generate_diff(self, baseline_log: Path, test_log: Path, diff_file: Path) -> None: """Generate diff between baseline and test logs.""" def _filter_log(log_file: Path) -> Path: @@ -378,7 +386,7 @@ def _filter_log(log_file: Path) -> Path: try: # Filter logs to remove timestamps and volatile information baseline_filtered = _filter_log(baseline_log) - test_filtered = _filter_log(log_path) + test_filtered = _filter_log(test_log) # Generate colored diff using git diff cmd = ["git", "diff", "--no-index", "--color=always", "--word-diff=color", @@ -410,6 +418,8 @@ def run_training(self, config_file: Path, log_file: Path, config_name: str, mode ] env = os.environ.copy() + env["SEED"] = str(self.seed) + env["MODEL_TYPE"] = model_name if self.verbose: log_message(LogLevel.INFO, f"Command: {' '.join(cmd)}") @@ -439,7 +449,7 @@ def run(self) -> int: description="Test different parallelism configurations against a baseline FSDP model.", ) parser.add_argument("-m", "--model-filter", default="", - help="Filter models by name pattern (e.g., 'llama')") + help="Filter models by name pattern (e.g., 'llama3')") parser.add_argument("-t", "--loss-threshold", type=float, default=self.DEFAULT_THRESHOLD_LOSS, help=f"Loss difference threshold (default: {self.DEFAULT_THRESHOLD_LOSS})") parser.add_argument("-g", "--grad-threshold", type=float, default=self.DEFAULT_THRESHOLD_GRAD_NORM, @@ -448,11 +458,9 @@ def run(self) -> int: help=f"Parallelism to use (default: {self.ND_PARALLEL_TO_NB_GPUS.keys()})") parser.add_argument("-s", "--steps", type=int, default=self.DEFAULT_STEPS, help=f"Training steps (default: {self.DEFAULT_STEPS})") - parser.add_argument("--seed", type=int, 
default=self.DEFAULT_SEED, - help=f"Random seed (default: {self.DEFAULT_SEED})") parser.add_argument("--flavor", default=self.DEFAULT_FLAVOR, help=f"Model flavor/size (default: {self.DEFAULT_FLAVOR}). " - f"Available: llama=[debugmodel, medium, full], deepseek=[debugmodel]") + f"Available: llama3=[debugmodel, medium, full], deepseek_v3=[debugmodel]") parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output") @@ -461,97 +469,133 @@ def run(self) -> int: self.loss_threshold = args.loss_threshold self.grad_norm_threshold = args.grad_threshold self.nd_parallel = args.nd_parallel + self.ngpu = self.nd_parallel_to_nb_gpus[self.nd_parallel] self.steps = args.steps - self.seed = args.seed self.model_filter = args.model_filter self.flavor = args.flavor self.verbose = args.verbose - log_message(LogLevel.INFO, "=== TorchTitan Distributed Parallelism Comparison ===") + log_message(LogLevel.INFO, "=== Distributed Parallelism Comparison ===") log_message(LogLevel.INFO, f"Loss threshold: {self.loss_threshold}") log_message(LogLevel.INFO, f"Grad norm threshold: {self.grad_norm_threshold}") - log_message(LogLevel.INFO, f"GPUs: {self.nd_parallel_to_nb_gpus[self.nd_parallel]}") + log_message(LogLevel.INFO, f"GPUs: {self.ngpu}") log_message(LogLevel.INFO, f"Steps: {self.steps}") log_message(LogLevel.INFO, f"Seed: {self.seed}") log_message(LogLevel.INFO, f"Model filter: {self.model_filter or 'all'}") log_message(LogLevel.INFO, f"Model flavor: {self.flavor}") print() - self.results_dir.mkdir(exist_ok=True) - self.config_dir.mkdir(exist_ok=True) + self.base_results_dir.mkdir(exist_ok=True) + + self.generate_parallelism_configs() + #TODO(3outeille): make it more generic later + if self.model_filter == "llama3": + hf_model_name = "meta-llama/Llama-3.2-1B" + tt_model_name = "llama3" + elif self.model_filter == "deepseek_v3": + hf_model_name = "deepseek-ai/DeepSeek-V3" + tt_model_name = "deepseek_v3" + else: + raise ValueError(f"Model filter {self.model_filter} not supported") + + model_owner, model_repo = hf_model_name.split("/", 1) + nd_parallel_upper = self.nd_parallel.upper() + self.results_dir = self.base_results_dir / model_owner / model_repo / nd_parallel_upper / self.flavor + self.results_dir.mkdir(parents=True, exist_ok=True) + if self.verbose: log_message(LogLevel.INFO, f"Results directory: {self.results_dir}") - log_message(LogLevel.INFO, f"Config directory: {self.config_dir}") - self.generate_parallelism_configs() + log_message(LogLevel.INFO, "--- Running baseline (FSDP) for huggingface backend ---") + + log_message(LogLevel.INFO, f"Testing model {hf_model_name} (HF) for {self.nd_parallel} parallelism") + + baseline_config = next((c for c in self.parallelism_configs if c.name == "fsdp"), None) + + baseline_config_filename_hf = f"baseline_{baseline_config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.toml" + baseline_config_file_hf = self.generate_config(config_dir=self.results_dir, config=baseline_config, model_name=hf_model_name, backend="huggingface", filename=baseline_config_filename_hf) + baseline_log_hf = self.results_dir / f"baseline_hf_{baseline_config.name}_{self.ngpu}gpu.log" + if not self.run_training(config_file=baseline_config_file_hf, log_file=baseline_log_hf, config_name=baseline_config.name, model_name=hf_model_name): + log_message(LogLevel.ERROR, f"Huggingface baseline (FSDP) training failed for {hf_model_name}") + # raise ValueError(f"Huggingface baseline (FSDP) training failed for {hf_model_name}") + + hf_baseline_metrics = self.extract_metrics(baseline_log_hf) 
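# A small illustration of what extract_metrics() pulls out of the baseline log written
# just above. The regexes are the ones from the method; the sample line is fabricated but
# shaped like the "step: ... loss: ... grad_norm: ..." lines the trainer emits.
import re

sample = "step:  5  loss:  7.1234  grad_norm:  1.0321"
loss = float(re.findall(r"loss:\s*([0-9]+\.?[0-9]*)", sample)[-1])
grad_norm = float(re.findall(r"grad_norm:\s*([0-9]+\.?[0-9]*)", sample)[-1])
assert (loss, grad_norm) == (7.1234, 1.0321)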
+ if hf_baseline_metrics.loss is None or hf_baseline_metrics.grad_norm is None: + log_message(LogLevel.ERROR, f"Could not extract huggingface baseline metrics for {hf_model_name}") + # raise ValueError(f"Could not extract huggingface baseline metrics for {hf_model_name}") - total_model_failures = 0 + log_message(LogLevel.INFO, "--- Running baseline (FSDP) for torchtitan backend ---") - for model_type, model_name in self.HF_MODEL_LISTS.items(): - # Apply model filter if specified - if self.model_filter and self.model_filter not in model_type: - continue + log_message(LogLevel.INFO, f"Testing model {hf_model_name} (TT) for {self.nd_parallel} parallelism") - log_message(LogLevel.INFO, f"Testing model: {model_type} ({model_name})") - total_tests = 0 - passed_tests = 0 - failed_tests = 0 - configs_to_run = [] + baseline_config_filename_tt = f"baseline_{baseline_config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" + baseline_config_file_tt = self.generate_config(config_dir=self.results_dir, config=baseline_config, model_name=tt_model_name, backend="torchtitan", filename=baseline_config_filename_tt) + baseline_log_tt = self.results_dir / f"baseline_tt_{baseline_config.name}_{self.ngpu}gpu.log" + if not self.run_training(config_file=baseline_config_file_tt, log_file=baseline_log_tt, config_name=baseline_config.name, model_name=tt_model_name): + raise ValueError(f"TorchTitan baseline (FSDP) training failed for {tt_model_name}") - for config in self.parallelism_configs: - # Skip configurations that require more GPUs than available - required_gpus = config.dp_replicate * config.tp * config.pp - if config.dp_shard != -1: - required_gpus *= config.dp_shard - - if required_gpus > self.nd_parallel_to_nb_gpus[self.nd_parallel]: - log_message(LogLevel.WARNING, - f"Skipping {config.name}: requires {required_gpus} GPUs but only {self.ngpu} available") - continue - - config_file = self.generate_config(config, model_name, model_type) - configs_to_run.append((config, config_file)) - - # # Test each parallelism configuration - # for config, config_file in configs_to_run: - # log_path = self.results_dir / f"{config.name}_{model_type}_{self.flavor}_{self.ngpu}gpu.log" - # if not self.run_training(config_file, log_path, config.name, model_name): - # log_message(LogLevel.TEST_FAIL, f"{config.name} - Training failed") - # failed_tests += 1 - # continue - # test_metrics = self.extract_metrics(log_path) - # if self.compare_metrics(baseline_metrics, test_metrics, config.name): - # passed_tests += 1 - # else: - # failed_tests += 1 - # diff_file = self.results_dir / f"diff_{config.name}_vs_baseline_{model_type}_{self.flavor}_{self.ngpu}gpu.log" - # self.generate_diff(baseline_log, log_path, diff_file) - # log_message(LogLevel.INFO, f"Diff saved to: {diff_file}") - # total_tests += 1 - - # Print summary for this model - print() - log_message(LogLevel.INFO, f"=== TEST SUMMARY for {model_type} ===") - log_message(LogLevel.INFO, f"Total tests: {total_tests}") - log_message(LogLevel.SUCCESS, f"Passed: {passed_tests}") - if failed_tests > 0: - log_message(LogLevel.TEST_FAIL, f"Failed: {failed_tests}") + tt_baseline_metrics = self.extract_metrics(baseline_log_tt) + if tt_baseline_metrics.loss is None or tt_baseline_metrics.grad_norm is None: + raise ValueError(f"Could not extract TorchTitan baseline metrics for {tt_model_name}") + + log_message(LogLevel.INFO, "--- Comparing other parallelism configurations (huggingface) ---") + + passed_tests = 0 + failed_tests = 0 + test_configs = [c for c in self.parallelism_configs if 
c.name != "fsdp"] + total_tests = len(test_configs) + + for config in test_configs: + # Create a subdirectory for each test configuration + test_dir_name = f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface" + test_dir = self.results_dir / test_dir_name + test_dir.mkdir(exist_ok=True) + + config_filename_hf = f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.toml" + config_file_hf = self.generate_config(config_dir=test_dir, config=config, model_name=hf_model_name, backend="huggingface", filename=config_filename_hf) + log_path_hf = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.log" + + successful_hf_run = self.run_training(config_file=config_file_hf, log_file=log_path_hf, config_name=config.name, model_name=hf_model_name) + + # Compare metrics between baseline (HF) and current (HF) nd-parallelism run + hf_metrics = self.extract_metrics(log_path_hf) + successful_hf_extract = self.compare_metrics(hf_baseline_metrics, hf_metrics, f"{config.name} (huggingface)") + + if successful_hf_run and successful_hf_extract: + passed_tests += 1 else: - log_message(LogLevel.INFO, f"Failed: {failed_tests}") - print() + failed_tests += 1 + # Generate diff with baseline (HF) + diff_file_hf_vs_baseline = test_dir / "diff_hf_baseline_vs_hf_nd_parallelism.log" + self.generate_diff(baseline_log_hf, log_path_hf, diff_file_hf_vs_baseline) + log_message(LogLevel.INFO, f"Diff between baseline (HF) and current (HF) nd-parallelism run saved to: {diff_file_hf_vs_baseline}") + + # Run TT counterpart and generated diff between nd-paralellism TT and current hf nd-parallelism run + config_filename_tt = f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" + config_file_tt = self.generate_config(config_dir=test_dir, config=config, model_name=tt_model_name, backend="torchtitan", filename=config_filename_tt) + log_path_tt = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.log" + if not self.run_training(config_file=config_file_tt, log_file=log_path_tt, config_name=config.name, model_name=tt_model_name): + raise ValueError(f"TorchTitan training failed for {tt_model_name}") + + # generated diff between nd-paralellism TT and current hf nd-parallelism run + diff_file_tt_vs_hf = test_dir / "diff_tt_nd_parallelism_vs_hf_nd_parallelism.log" + self.generate_diff(log_path_tt, log_path_hf, diff_file_tt_vs_hf) + log_message(LogLevel.INFO, f"Diff between nd-paralellism TT and current (HF) nd-parallelism run saved to: {diff_file_tt_vs_hf}") - if failed_tests > 0: - total_model_failures += 1 + # generated diff between baseline TT and current hf nd-parallelism run + diff_file_tt_baseline_vs_hf = test_dir / "diff_tt_baseline_vs_hf_nd_parallelism.log" + self.generate_diff(baseline_log_tt, log_path_hf, diff_file_tt_baseline_vs_hf) + log_message(LogLevel.INFO, f"Diff between baseline TT and current (HF) nd-parallelism run saved to: {diff_file_tt_baseline_vs_hf}") - # Final summary print() + log_message(LogLevel.INFO, "=== FINAL SUMMARY ===") - if total_model_failures == 0: + if passed_tests == total_tests: log_message(LogLevel.SUCCESS, "All model tests passed! 
🎉") return 0 else: - log_message(LogLevel.TEST_FAIL, f"{total_model_failures} model(s) had test failures") + log_message(LogLevel.TEST_FAIL, f"{failed_tests} model(s) had test failures") log_message(LogLevel.INFO, f"Check the diff files in {self.results_dir} for details") return 1 diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.sh b/torchtitan/experiments/transformers_backend/compare_distributed_run.sh index 80bb2d04ca..4d0319a03f 100755 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.sh +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.sh @@ -1,6 +1,5 @@ #!/usr/bin/bash -# python compare_distributed_run.py --steps 5 --model-filter llama --flavor debugmodel +python compare_distributed_run.py --steps 5 --model-filter llama3 --flavor debugmodel --nd_parallel 1d --verbose -# python compare_distributed_run.py --steps 5 --model-filter llama --flavor debugmodel --nd_parallel 2d -debugpy-run compare_distributed_run.py --steps 5 --model-filter llama --flavor debugmodel --nd_parallel 2d +# debugpy-run compare_distributed_run.py --steps 5 --model-filter llama3 --flavor debugmodel --nd_parallel 0d From 36a5673476b6866e4de97e76dda6292f077432a5 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 24 Sep 2025 12:12:11 +0000 Subject: [PATCH 036/129] add seed + deterministic to compare_distributed_run --- .../experiments/transformers_backend/compare_distributed_run.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.py b/torchtitan/experiments/transformers_backend/compare_distributed_run.py index 9be6b52acf..6f7b539b98 100644 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.py +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.py @@ -414,6 +414,8 @@ def run_training(self, config_file: Path, log_file: Path, config_name: str, mode "--role", "rank", "--tee", "3", "-m", "torchtitan.train", + "--training.seed", str(self.seed), + "--training.deterministic", "--job.config_file", str(config_file) ] From ed892a2cfeb6a060d645a8d032bf24ecd7c2847b Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 24 Sep 2025 12:40:58 +0000 Subject: [PATCH 037/129] better extract and compare metrics --- .../compare_distributed_run.py | 129 ++++++++++-------- .../compare_distributed_run.sh | 2 +- 2 files changed, 76 insertions(+), 55 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.py b/torchtitan/experiments/transformers_backend/compare_distributed_run.py index 6f7b539b98..a933f9ed56 100644 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.py +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.py @@ -49,8 +49,9 @@ import sys from pathlib import Path from typing import List, Optional -from dataclasses import dataclass +from dataclasses import dataclass, field from enum import Enum +import torch # Configure logging with colors class Colors: @@ -110,18 +111,26 @@ class ParallelismConfig: @dataclass class TrainingMetrics: """Training metrics extracted from logs.""" - loss: Optional[float] = None - grad_norm: Optional[float] = None + steps: List[int] = field(default_factory=list) + loss: List[float] = field(default_factory=list) + grad_norm: List[float] = field(default_factory=list) + memory: List[float] = field(default_factory=list) + tps: List[int] = field(default_factory=list) + tflops: List[float] = field(default_factory=list) + mfu: List[float] = 
field(default_factory=list) class CompareDistributedRun: """Main class for running distributed parallelism comparison tests.""" # Default values - DEFAULT_THRESHOLD_LOSS = 1e-4 - DEFAULT_THRESHOLD_GRAD_NORM = 1e-3 DEFAULT_STEPS = 10 DEFAULT_SEED = 42 DEFAULT_FLAVOR = "debugmodel" + # value chosen based on diff of llama3 1GPU + DEFAULT_LOSS_ATOL = 0.02 + DEFAULT_LOSS_RTOL = 1e-5 + DEFAULT_GRAD_NORM_ATOL = 0.005 + DEFAULT_GRAD_NORM_RTOL = 1e-5 MODEL_LISTS = { "torchtitan": ["llama3", "deepseek_v3"], @@ -151,14 +160,16 @@ def __init__(self): self.base_results_dir = self.script_dir / "results" # Configuration parameters - self.loss_threshold = self.DEFAULT_THRESHOLD_LOSS - self.grad_norm_threshold = self.DEFAULT_THRESHOLD_GRAD_NORM self.nd_parallel_to_nb_gpus = self.ND_PARALLEL_TO_NB_GPUS self.steps = self.DEFAULT_STEPS self.seed = self.DEFAULT_SEED self.model_filter = "" self.flavor = self.DEFAULT_FLAVOR self.verbose = False + self.loss_atol = self.DEFAULT_LOSS_ATOL + self.loss_rtol = self.DEFAULT_LOSS_RTOL + self.grad_norm_atol = self.DEFAULT_GRAD_NORM_ATOL + self.grad_norm_rtol = self.DEFAULT_GRAD_NORM_RTOL self.parallelism_configs: List[ParallelismConfig] = [] self.results_dir: Optional[Path] = None @@ -301,12 +312,6 @@ def generate_config(self, config_dir: Path, config: ParallelismConfig, model_nam log_message(LogLevel.INFO, f"Created config file: {config_file} for config '{config.name}' (model: {model_name})") return config_file - - with open(config_file, 'w') as f: - f.write(content) - - log_message(LogLevel.INFO, f"Created config file: {config_file} for config '{config.name}' (model: {model_name})") - return config_file def extract_metrics(self, log_file: Path) -> TrainingMetrics: """Extract metrics from log file.""" @@ -315,20 +320,23 @@ def extract_metrics(self, log_file: Path) -> TrainingMetrics: try: with open(log_file, 'r') as f: content = f.read() - - # Extract final loss and grad_norm from the last step - loss_matches = re.findall(r'loss:\s*([0-9]+\.?[0-9]*)', content) - grad_norm_matches = re.findall(r'grad_norm:\s*([0-9]+\.?[0-9]*)', content) - - if loss_matches: - metrics.loss = float(loss_matches[-1]) - if grad_norm_matches: - metrics.grad_norm = float(grad_norm_matches[-1]) + + # Regex to capture all metrics from a log line, ignoring ANSI color codes + pattern = re.compile( + r"step:\s*(\d+)\s*" + r".*?loss:\s*([0-9]+\.?[0-9]*)\s*" + r".*?grad_norm:\s*([0-9]+\.?[0-9]*)\s*" + ) + + for match in pattern.finditer(content): + metrics.steps.append(int(match.group(1))) + metrics.loss.append(float(match.group(2))) + metrics.grad_norm.append(float(match.group(3))) except Exception as e: log_message(LogLevel.WARNING, f"Could not extract metrics from {log_file}: {e}") - if metrics.loss is None or metrics.grad_norm is None: + if not metrics.loss or not metrics.grad_norm: log_message(LogLevel.WARNING, f"Could not extract metrics from {log_file}") return metrics @@ -336,28 +344,33 @@ def extract_metrics(self, log_file: Path) -> TrainingMetrics: def compare_metrics(self, baseline_metrics: TrainingMetrics, test_metrics: TrainingMetrics, config_name: str) -> bool: """Compare metrics between baseline and test configuration.""" - if (baseline_metrics.loss is None or baseline_metrics.grad_norm is None or - test_metrics.loss is None or test_metrics.grad_norm is None): + if not baseline_metrics.loss or not test_metrics.loss: log_message(LogLevel.TEST_FAIL, f"{config_name} - Unable to extract metrics") return False - # Calculate absolute differences - loss_diff = abs(baseline_metrics.loss - 
test_metrics.loss) - grad_norm_diff = abs(baseline_metrics.grad_norm - test_metrics.grad_norm) + # Convert to tensors + baseline_loss = torch.tensor(baseline_metrics.loss) + test_loss = torch.tensor(test_metrics.loss) + baseline_grad_norm = torch.tensor(baseline_metrics.grad_norm) + test_grad_norm = torch.tensor(test_metrics.grad_norm) - # Check if differences are within thresholds - loss_pass = loss_diff < self.loss_threshold - grad_pass = grad_norm_diff < self.grad_norm_threshold + # Check if tensors are close + loss_pass = torch.allclose(baseline_loss, test_loss, atol=self.loss_atol, rtol=self.loss_rtol) + grad_pass = torch.allclose(baseline_grad_norm, test_grad_norm, atol=self.grad_norm_atol, rtol=self.grad_norm_rtol) + + # Calculate max absolute differences for logging + loss_diff = torch.max(torch.abs(baseline_loss - test_loss)).item() if baseline_loss.numel() > 0 and test_loss.numel() > 0 else 0.0 + grad_norm_diff = torch.max(torch.abs(baseline_grad_norm - test_grad_norm)).item() if baseline_grad_norm.numel() > 0 and test_grad_norm.numel() > 0 else 0.0 if loss_pass and grad_pass: log_message(LogLevel.TEST_PASS, - f"{config_name} - Loss diff: {loss_diff:.2e} (< {self.loss_threshold:.2e}), " - f"Grad norm diff: {grad_norm_diff:.2e} (< {self.grad_norm_threshold:.2e})") + f"{config_name} - Max loss diff: {loss_diff:.2e}, " + f"Max grad norm diff: {grad_norm_diff:.2e}") return True else: log_message(LogLevel.TEST_FAIL, - f"{config_name} - Loss diff: {loss_diff:.2e} (threshold: {self.loss_threshold:.2e}), " - f"Grad norm diff: {grad_norm_diff:.2e} (threshold: {self.grad_norm_threshold:.2e})") + f"{config_name} - Max loss diff: {loss_diff:.2e}, " + f"Max grad norm diff: {grad_norm_diff:.2e}") return False def generate_diff(self, baseline_log: Path, test_log: Path, diff_file: Path) -> None: @@ -452,10 +465,6 @@ def run(self) -> int: ) parser.add_argument("-m", "--model-filter", default="", help="Filter models by name pattern (e.g., 'llama3')") - parser.add_argument("-t", "--loss-threshold", type=float, default=self.DEFAULT_THRESHOLD_LOSS, - help=f"Loss difference threshold (default: {self.DEFAULT_THRESHOLD_LOSS})") - parser.add_argument("-g", "--grad-threshold", type=float, default=self.DEFAULT_THRESHOLD_GRAD_NORM, - help=f"Grad norm difference threshold (default: {self.DEFAULT_THRESHOLD_GRAD_NORM})") parser.add_argument("-nd", "--nd_parallel", type=str, default="2d", help=f"Parallelism to use (default: {self.ND_PARALLEL_TO_NB_GPUS.keys()})") parser.add_argument("-s", "--steps", type=int, default=self.DEFAULT_STEPS, @@ -465,21 +474,29 @@ def run(self) -> int: f"Available: llama3=[debugmodel, medium, full], deepseek_v3=[debugmodel]") parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output") + parser.add_argument("--loss-atol", type=float, default=self.DEFAULT_LOSS_ATOL, + help=f"Absolute tolerance for loss comparison (default: {self.DEFAULT_LOSS_ATOL})") + parser.add_argument("--loss-rtol", type=float, default=self.DEFAULT_LOSS_RTOL, + help=f"Relative tolerance for loss comparison (default: {self.DEFAULT_LOSS_RTOL})") + parser.add_argument("--grad-norm-atol", type=float, default=self.DEFAULT_GRAD_NORM_ATOL, + help=f"Absolute tolerance for grad norm comparison (default: {self.DEFAULT_GRAD_NORM_ATOL})") + parser.add_argument("--grad-norm-rtol", type=float, default=self.DEFAULT_GRAD_NORM_RTOL, + help=f"Relative tolerance for grad norm comparison (default: {self.DEFAULT_GRAD_NORM_RTOL})") args = parser.parse_args() - self.loss_threshold = args.loss_threshold - 
self.grad_norm_threshold = args.grad_threshold self.nd_parallel = args.nd_parallel self.ngpu = self.nd_parallel_to_nb_gpus[self.nd_parallel] self.steps = args.steps self.model_filter = args.model_filter self.flavor = args.flavor self.verbose = args.verbose + self.loss_atol = args.loss_atol + self.loss_rtol = args.loss_rtol + self.grad_norm_atol = args.grad_norm_atol + self.grad_norm_rtol = args.grad_norm_rtol log_message(LogLevel.INFO, "=== Distributed Parallelism Comparison ===") - log_message(LogLevel.INFO, f"Loss threshold: {self.loss_threshold}") - log_message(LogLevel.INFO, f"Grad norm threshold: {self.grad_norm_threshold}") log_message(LogLevel.INFO, f"GPUs: {self.ngpu}") log_message(LogLevel.INFO, f"Steps: {self.steps}") log_message(LogLevel.INFO, f"Seed: {self.seed}") @@ -523,7 +540,7 @@ def run(self) -> int: # raise ValueError(f"Huggingface baseline (FSDP) training failed for {hf_model_name}") hf_baseline_metrics = self.extract_metrics(baseline_log_hf) - if hf_baseline_metrics.loss is None or hf_baseline_metrics.grad_norm is None: + if not hf_baseline_metrics.loss or not hf_baseline_metrics.grad_norm: log_message(LogLevel.ERROR, f"Could not extract huggingface baseline metrics for {hf_model_name}") # raise ValueError(f"Could not extract huggingface baseline metrics for {hf_model_name}") @@ -538,9 +555,13 @@ def run(self) -> int: raise ValueError(f"TorchTitan baseline (FSDP) training failed for {tt_model_name}") tt_baseline_metrics = self.extract_metrics(baseline_log_tt) - if tt_baseline_metrics.loss is None or tt_baseline_metrics.grad_norm is None: + if not tt_baseline_metrics.loss or not tt_baseline_metrics.grad_norm: raise ValueError(f"Could not extract TorchTitan baseline metrics for {tt_model_name}") + + if not self.compare_metrics(tt_baseline_metrics, hf_baseline_metrics, "baseline (TT) vs baseline (HF)"): + raise ValueError(f"Baseline (TT) vs baseline (HF) metrics comparison failed for {tt_model_name}") + log_message(LogLevel.INFO, "--- Comparing other parallelism configurations (huggingface) ---") passed_tests = 0 @@ -569,9 +590,9 @@ def run(self) -> int: else: failed_tests += 1 # Generate diff with baseline (HF) - diff_file_hf_vs_baseline = test_dir / "diff_hf_baseline_vs_hf_nd_parallelism.log" - self.generate_diff(baseline_log_hf, log_path_hf, diff_file_hf_vs_baseline) - log_message(LogLevel.INFO, f"Diff between baseline (HF) and current (HF) nd-parallelism run saved to: {diff_file_hf_vs_baseline}") + diff_hf_baseline_vs_hf_nd_parallelism = test_dir / "diff_hf_baseline_vs_hf_nd_parallelism.log" + self.generate_diff(baseline_log_hf, log_path_hf, diff_hf_baseline_vs_hf_nd_parallelism) + log_message(LogLevel.INFO, f"Diff between baseline (HF) and current (HF) nd-parallelism run saved to: {diff_hf_baseline_vs_hf_nd_parallelism}") # Run TT counterpart and generated diff between nd-paralellism TT and current hf nd-parallelism run config_filename_tt = f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" @@ -581,14 +602,14 @@ def run(self) -> int: raise ValueError(f"TorchTitan training failed for {tt_model_name}") # generated diff between nd-paralellism TT and current hf nd-parallelism run - diff_file_tt_vs_hf = test_dir / "diff_tt_nd_parallelism_vs_hf_nd_parallelism.log" - self.generate_diff(log_path_tt, log_path_hf, diff_file_tt_vs_hf) - log_message(LogLevel.INFO, f"Diff between nd-paralellism TT and current (HF) nd-parallelism run saved to: {diff_file_tt_vs_hf}") + diff_file_tt_nd_parallelism_vs_hf_nd_parallelism = test_dir / 
"diff_tt_nd_parallelism_vs_hf_nd_parallelism.log" + self.generate_diff(log_path_tt, log_path_hf, diff_file_tt_nd_parallelism_vs_hf_nd_parallelism) + log_message(LogLevel.INFO, f"Diff between nd-paralellism TT and current (HF) nd-parallelism run saved to: {diff_file_tt_nd_parallelism_vs_hf_nd_parallelism}") # generated diff between baseline TT and current hf nd-parallelism run - diff_file_tt_baseline_vs_hf = test_dir / "diff_tt_baseline_vs_hf_nd_parallelism.log" - self.generate_diff(baseline_log_tt, log_path_hf, diff_file_tt_baseline_vs_hf) - log_message(LogLevel.INFO, f"Diff between baseline TT and current (HF) nd-parallelism run saved to: {diff_file_tt_baseline_vs_hf}") + diff_file_tt_baseline_vs_hf_nd_parallelism = test_dir / "diff_tt_baseline_vs_hf_nd_parallelism.log" + self.generate_diff(baseline_log_tt, log_path_hf, diff_file_tt_baseline_vs_hf_nd_parallelism) + log_message(LogLevel.INFO, f"Diff between baseline TT and current (HF) nd-parallelism run saved to: {diff_file_tt_baseline_vs_hf_nd_parallelism}") print() diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.sh b/torchtitan/experiments/transformers_backend/compare_distributed_run.sh index 4d0319a03f..d7e5b77bcb 100755 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.sh +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.sh @@ -1,5 +1,5 @@ #!/usr/bin/bash -python compare_distributed_run.py --steps 5 --model-filter llama3 --flavor debugmodel --nd_parallel 1d --verbose +python compare_distributed_run.py --steps 5 --model-filter llama3 --flavor debugmodel --nd_parallel 0d --verbose # debugpy-run compare_distributed_run.py --steps 5 --model-filter llama3 --flavor debugmodel --nd_parallel 0d From 1c1452fa29922403f16b6a74fc9abe9c6f645eb9 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 24 Sep 2025 13:07:51 +0000 Subject: [PATCH 038/129] refactor to introduce slurm --- .../compare_distributed_run.py | 109 +++++++++++------- 1 file changed, 66 insertions(+), 43 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.py b/torchtitan/experiments/transformers_backend/compare_distributed_run.py index a933f9ed56..a72e2abf7f 100644 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.py +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.py @@ -166,6 +166,8 @@ def __init__(self): self.model_filter = "" self.flavor = self.DEFAULT_FLAVOR self.verbose = False + self.use_slurm = False + self.slurm_options = [] self.loss_atol = self.DEFAULT_LOSS_ATOL self.loss_rtol = self.DEFAULT_LOSS_RTOL self.grad_norm_atol = self.DEFAULT_GRAD_NORM_ATOL @@ -420,7 +422,7 @@ def run_training(self, config_file: Path, log_file: Path, config_name: str, mode log_message(LogLevel.INFO, f"Running training: {config_name} with model {model_name}") cmd = [ "torchrun", - f"--nproc_per_node={self.nd_parallel_to_nb_gpus[self.nd_parallel]}", + f"--nproc_per_node={self.ngpu}", "--rdzv_backend", "c10d", "--rdzv_endpoint=localhost:0", "--local-ranks-filter", "0", @@ -431,7 +433,6 @@ def run_training(self, config_file: Path, log_file: Path, config_name: str, mode "--training.deterministic", "--job.config_file", str(config_file) ] - env = os.environ.copy() env["SEED"] = str(self.seed) env["MODEL_TYPE"] = model_name @@ -458,11 +459,62 @@ def run_training(self, config_file: Path, log_file: Path, config_name: str, mode log_message(LogLevel.ERROR, f"Training failed: {config_name}") return False + def 
_compare_one_parallelism_config( + self, + config: "ParallelismConfig", + hf_model_name: str, + tt_model_name: str, + hf_baseline_metrics: "TrainingMetrics", + baseline_log_hf: Path, + baseline_log_tt: Path, + ) -> bool: + """Compares a single parallelism configuration against the baseline.""" + # Create a subdirectory for each test configuration + test_dir_name = f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface" + test_dir = self.results_dir / test_dir_name + test_dir.mkdir(exist_ok=True) + + config_filename_hf = f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.toml" + config_file_hf = self.generate_config(config_dir=test_dir, config=config, model_name=hf_model_name, backend="huggingface", filename=config_filename_hf) + log_path_hf = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.log" + + successful_hf_run = self.run_training(config_file=config_file_hf, log_file=log_path_hf, config_name=config.name, model_name=hf_model_name) + + # Compare metrics between baseline (HF) and current (HF) nd-parallelism run + hf_metrics = self.extract_metrics(log_path_hf) + successful_hf_extract = self.compare_metrics(hf_baseline_metrics, hf_metrics, f"{config.name} (huggingface)") + + if successful_hf_run and successful_hf_extract: + return True + else: + # Generate diff with baseline (HF) + diff_hf_baseline_vs_hf_nd_parallelism = test_dir / "diff_hf_baseline_vs_hf_nd_parallelism.log" + self.generate_diff(baseline_log_hf, log_path_hf, diff_hf_baseline_vs_hf_nd_parallelism) + log_message(LogLevel.INFO, f"Diff between baseline (HF) and current (HF) nd-parallelism run saved to: {diff_hf_baseline_vs_hf_nd_parallelism}") + + # Run TT counterpart and generated diff between nd-paralellism TT and current hf nd-parallelism run + config_filename_tt = f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" + config_file_tt = self.generate_config(config_dir=test_dir, config=config, model_name=tt_model_name, backend="torchtitan", filename=config_filename_tt) + log_path_tt = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.log" + if not self.run_training(config_file=config_file_tt, log_file=log_path_tt, config_name=config.name, model_name=tt_model_name): + raise ValueError(f"TorchTitan training failed for {tt_model_name}") + + # generated diff between nd-paralellism TT and current hf nd-parallelism run + diff_file_tt_nd_parallelism_vs_hf_nd_parallelism = test_dir / "diff_tt_nd_parallelism_vs_hf_nd_parallelism.log" + self.generate_diff(log_path_tt, log_path_hf, diff_file_tt_nd_parallelism_vs_hf_nd_parallelism) + log_message(LogLevel.INFO, f"Diff between nd-paralellism TT and current (HF) nd-parallelism run saved to: {diff_file_tt_nd_parallelism_vs_hf_nd_parallelism}") + + # generated diff between baseline TT and current hf nd-parallelism run + diff_file_tt_baseline_vs_hf_nd_parallelism = test_dir / "diff_tt_baseline_vs_hf_nd_parallelism.log" + self.generate_diff(baseline_log_tt, log_path_hf, diff_file_tt_baseline_vs_hf_nd_parallelism) + log_message(LogLevel.INFO, f"Diff between baseline TT and current (HF) nd-parallelism run saved to: {diff_file_tt_baseline_vs_hf_nd_parallelism}") + return False + def run(self) -> int: """Main execution function. 
Runs all test suites for all models.""" parser = argparse.ArgumentParser( description="Test different parallelism configurations against a baseline FSDP model.", - ) + ) parser.add_argument("-m", "--model-filter", default="", help="Filter models by name pattern (e.g., 'llama3')") parser.add_argument("-nd", "--nd_parallel", type=str, default="2d", @@ -558,58 +610,29 @@ def run(self) -> int: if not tt_baseline_metrics.loss or not tt_baseline_metrics.grad_norm: raise ValueError(f"Could not extract TorchTitan baseline metrics for {tt_model_name}") - if not self.compare_metrics(tt_baseline_metrics, hf_baseline_metrics, "baseline (TT) vs baseline (HF)"): - raise ValueError(f"Baseline (TT) vs baseline (HF) metrics comparison failed for {tt_model_name}") + log_message(LogLevel.ERROR, f"Baseline (TT) vs baseline (HF) metrics comparison failed for {tt_model_name}") + # raise ValueError(f"Baseline (TT) vs baseline (HF) metrics comparison failed for {tt_model_name}") log_message(LogLevel.INFO, "--- Comparing other parallelism configurations (huggingface) ---") - passed_tests = 0 failed_tests = 0 test_configs = [c for c in self.parallelism_configs if c.name != "fsdp"] total_tests = len(test_configs) for config in test_configs: - # Create a subdirectory for each test configuration - test_dir_name = f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface" - test_dir = self.results_dir / test_dir_name - test_dir.mkdir(exist_ok=True) - - config_filename_hf = f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.toml" - config_file_hf = self.generate_config(config_dir=test_dir, config=config, model_name=hf_model_name, backend="huggingface", filename=config_filename_hf) - log_path_hf = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.log" - - successful_hf_run = self.run_training(config_file=config_file_hf, log_file=log_path_hf, config_name=config.name, model_name=hf_model_name) - - # Compare metrics between baseline (HF) and current (HF) nd-parallelism run - hf_metrics = self.extract_metrics(log_path_hf) - successful_hf_extract = self.compare_metrics(hf_baseline_metrics, hf_metrics, f"{config.name} (huggingface)") - - if successful_hf_run and successful_hf_extract: + passed = self._compare_one_parallelism_config( + config, + hf_model_name, + tt_model_name, + hf_baseline_metrics, + baseline_log_hf, + baseline_log_tt, + ) + if passed: passed_tests += 1 else: failed_tests += 1 - # Generate diff with baseline (HF) - diff_hf_baseline_vs_hf_nd_parallelism = test_dir / "diff_hf_baseline_vs_hf_nd_parallelism.log" - self.generate_diff(baseline_log_hf, log_path_hf, diff_hf_baseline_vs_hf_nd_parallelism) - log_message(LogLevel.INFO, f"Diff between baseline (HF) and current (HF) nd-parallelism run saved to: {diff_hf_baseline_vs_hf_nd_parallelism}") - - # Run TT counterpart and generated diff between nd-paralellism TT and current hf nd-parallelism run - config_filename_tt = f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" - config_file_tt = self.generate_config(config_dir=test_dir, config=config, model_name=tt_model_name, backend="torchtitan", filename=config_filename_tt) - log_path_tt = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.log" - if not self.run_training(config_file=config_file_tt, log_file=log_path_tt, config_name=config.name, model_name=tt_model_name): - raise ValueError(f"TorchTitan training failed for {tt_model_name}") - - # generated diff between nd-paralellism TT and current hf nd-parallelism run - diff_file_tt_nd_parallelism_vs_hf_nd_parallelism = 
test_dir / "diff_tt_nd_parallelism_vs_hf_nd_parallelism.log" - self.generate_diff(log_path_tt, log_path_hf, diff_file_tt_nd_parallelism_vs_hf_nd_parallelism) - log_message(LogLevel.INFO, f"Diff between nd-paralellism TT and current (HF) nd-parallelism run saved to: {diff_file_tt_nd_parallelism_vs_hf_nd_parallelism}") - - # generated diff between baseline TT and current hf nd-parallelism run - diff_file_tt_baseline_vs_hf_nd_parallelism = test_dir / "diff_tt_baseline_vs_hf_nd_parallelism.log" - self.generate_diff(baseline_log_tt, log_path_hf, diff_file_tt_baseline_vs_hf_nd_parallelism) - log_message(LogLevel.INFO, f"Diff between baseline TT and current (HF) nd-parallelism run saved to: {diff_file_tt_baseline_vs_hf_nd_parallelism}") print() From 5e4911fbfd64b93f2803d5f89506a7f20103f602 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 24 Sep 2025 13:27:38 +0000 Subject: [PATCH 039/129] error handling with subprocess --- .../compare_distributed_run.py | 58 +++++++++++-------- 1 file changed, 35 insertions(+), 23 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.py b/torchtitan/experiments/transformers_backend/compare_distributed_run.py index a72e2abf7f..ec58b2e729 100644 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.py +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.py @@ -144,6 +144,7 @@ class CompareDistributedRun: "deepseek-ai/DeepSeek-V3": ["debugmodel"], } + #TODO(3outeille): handle slurm later for 4D/5D. Might need to rethink the whole script for that # Available ND parallelisms <-> number of GPUs ND_PARALLEL_TO_NB_GPUS = { "0d": 1, @@ -417,7 +418,7 @@ def _filter_log(log_file: Path) -> Path: except Exception as e: log_message(LogLevel.WARNING, f"Could not generate diff: {e}") - def run_training(self, config_file: Path, log_file: Path, config_name: str, model_name: str) -> bool: + def run_training(self, config_file: Path, log_file: Path, config_name: str, model_name: str) -> Optional[subprocess.CalledProcessError]: """Run training with given configuration.""" log_message(LogLevel.INFO, f"Running training: {config_name} with model {model_name}") cmd = [ @@ -441,23 +442,33 @@ def run_training(self, config_file: Path, log_file: Path, config_name: str, mode log_message(LogLevel.INFO, f"Command: {' '.join(cmd)}") try: + # Capture output to include it in the exception, while still writing to log file + result = subprocess.run( + cmd, + cwd=self.torchtitan_root, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, # decodes stdout/stderr as text + env=env, + check=True + ) with open(log_file, 'w') as f: - result = subprocess.run( - cmd, - cwd=self.torchtitan_root, - stdout=f, - stderr=subprocess.STDOUT, - env=env, - check=True - ) + f.write(result.stdout) if self.verbose: log_message(LogLevel.SUCCESS, f"Training completed: {config_name}") - return True + return None except subprocess.CalledProcessError as e: log_message(LogLevel.ERROR, f"Training failed: {config_name}") - return False + + # Write the failed output to the log file + with open(log_file, 'w') as f: + if e.stdout: + f.write(e.stdout) + + e.add_note(f"\n--- Full output from failed process ---\n{e.stdout or ''}") + return e def _compare_one_parallelism_config( self, @@ -478,7 +489,8 @@ def _compare_one_parallelism_config( config_file_hf = self.generate_config(config_dir=test_dir, config=config, model_name=hf_model_name, backend="huggingface", filename=config_filename_hf) log_path_hf = test_dir / 
f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.log" - successful_hf_run = self.run_training(config_file=config_file_hf, log_file=log_path_hf, config_name=config.name, model_name=hf_model_name) + hf_run_error = self.run_training(config_file=config_file_hf, log_file=log_path_hf, config_name=config.name, model_name=hf_model_name) + successful_hf_run = hf_run_error is None # Compare metrics between baseline (HF) and current (HF) nd-parallelism run hf_metrics = self.extract_metrics(log_path_hf) @@ -496,8 +508,9 @@ def _compare_one_parallelism_config( config_filename_tt = f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" config_file_tt = self.generate_config(config_dir=test_dir, config=config, model_name=tt_model_name, backend="torchtitan", filename=config_filename_tt) log_path_tt = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.log" - if not self.run_training(config_file=config_file_tt, log_file=log_path_tt, config_name=config.name, model_name=tt_model_name): - raise ValueError(f"TorchTitan training failed for {tt_model_name}") + tt_run_error = self.run_training(config_file=config_file_tt, log_file=log_path_tt, config_name=config.name, model_name=tt_model_name) + if tt_run_error: + raise ValueError(f"TorchTitan training failed for {tt_model_name}") from tt_run_error # generated diff between nd-paralellism TT and current hf nd-parallelism run diff_file_tt_nd_parallelism_vs_hf_nd_parallelism = test_dir / "diff_tt_nd_parallelism_vs_hf_nd_parallelism.log" @@ -587,14 +600,13 @@ def run(self) -> int: baseline_config_filename_hf = f"baseline_{baseline_config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.toml" baseline_config_file_hf = self.generate_config(config_dir=self.results_dir, config=baseline_config, model_name=hf_model_name, backend="huggingface", filename=baseline_config_filename_hf) baseline_log_hf = self.results_dir / f"baseline_hf_{baseline_config.name}_{self.ngpu}gpu.log" - if not self.run_training(config_file=baseline_config_file_hf, log_file=baseline_log_hf, config_name=baseline_config.name, model_name=hf_model_name): - log_message(LogLevel.ERROR, f"Huggingface baseline (FSDP) training failed for {hf_model_name}") - # raise ValueError(f"Huggingface baseline (FSDP) training failed for {hf_model_name}") + hf_baseline_run_error = self.run_training(config_file=baseline_config_file_hf, log_file=baseline_log_hf, config_name=baseline_config.name, model_name=hf_model_name) + if hf_baseline_run_error: + raise ValueError(f"Huggingface baseline (FSDP) training failed for {hf_model_name}") from hf_baseline_run_error hf_baseline_metrics = self.extract_metrics(baseline_log_hf) if not hf_baseline_metrics.loss or not hf_baseline_metrics.grad_norm: - log_message(LogLevel.ERROR, f"Could not extract huggingface baseline metrics for {hf_model_name}") - # raise ValueError(f"Could not extract huggingface baseline metrics for {hf_model_name}") + raise ValueError(f"Could not extract huggingface baseline metrics for {hf_model_name}") log_message(LogLevel.INFO, "--- Running baseline (FSDP) for torchtitan backend ---") @@ -603,16 +615,16 @@ def run(self) -> int: baseline_config_filename_tt = f"baseline_{baseline_config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" baseline_config_file_tt = self.generate_config(config_dir=self.results_dir, config=baseline_config, model_name=tt_model_name, backend="torchtitan", filename=baseline_config_filename_tt) baseline_log_tt = self.results_dir / f"baseline_tt_{baseline_config.name}_{self.ngpu}gpu.log" - if not 
self.run_training(config_file=baseline_config_file_tt, log_file=baseline_log_tt, config_name=baseline_config.name, model_name=tt_model_name): - raise ValueError(f"TorchTitan baseline (FSDP) training failed for {tt_model_name}") + tt_baseline_run_error = self.run_training(config_file=baseline_config_file_tt, log_file=baseline_log_tt, config_name=baseline_config.name, model_name=tt_model_name) + if tt_baseline_run_error: + raise ValueError(f"TorchTitan baseline (FSDP) training failed for {tt_model_name}") from tt_baseline_run_error tt_baseline_metrics = self.extract_metrics(baseline_log_tt) if not tt_baseline_metrics.loss or not tt_baseline_metrics.grad_norm: raise ValueError(f"Could not extract TorchTitan baseline metrics for {tt_model_name}") if not self.compare_metrics(tt_baseline_metrics, hf_baseline_metrics, "baseline (TT) vs baseline (HF)"): - log_message(LogLevel.ERROR, f"Baseline (TT) vs baseline (HF) metrics comparison failed for {tt_model_name}") - # raise ValueError(f"Baseline (TT) vs baseline (HF) metrics comparison failed for {tt_model_name}") + raise ValueError(f"Baseline (TT) vs baseline (HF) metrics comparison failed for {tt_model_name}") log_message(LogLevel.INFO, "--- Comparing other parallelism configurations (huggingface) ---") passed_tests = 0 From 4891a4783788ea448329be0a6134de169ae1dca4 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 24 Sep 2025 14:18:10 +0000 Subject: [PATCH 040/129] FSDP for llama in 1D works --- .../compare_distributed_run.py | 11 +++++----- .../infra/parallelize_hf_transformers.py | 22 +++++++++---------- .../model/hf_transformers_args.py | 16 ++++++++++++++ 3 files changed, 33 insertions(+), 16 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.py b/torchtitan/experiments/transformers_backend/compare_distributed_run.py index ec58b2e729..44adbbd57a 100644 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.py +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.py @@ -129,7 +129,7 @@ class CompareDistributedRun: # value chosen based on diff of llama3 1GPU DEFAULT_LOSS_ATOL = 0.02 DEFAULT_LOSS_RTOL = 1e-5 - DEFAULT_GRAD_NORM_ATOL = 0.005 + DEFAULT_GRAD_NORM_ATOL = 0.02 DEFAULT_GRAD_NORM_RTOL = 1e-5 MODEL_LISTS = { @@ -392,10 +392,6 @@ def _filter_log(log_file: Path) -> Path: 'torchrun ... 
--master_port=XXXX', line) line = re.sub(r'PID [0-9]+', 'PID XXXX', line) line = re.sub(r'localhost:[0-9]+', 'localhost:XXXX', line) - line = re.sub(r'memory: [0-9]+\.[0-9]+GiB', 'memory: XX.XXGiB', line) - line = re.sub(r'tps: [0-9,]+', 'tps: XXXXX', line) - line = re.sub(r'tflops: [0-9]+\.[0-9]+', 'tflops: XX.XX', line) - line = re.sub(r'mfu: [0-9]+\.[0-9]+%', 'mfu: XX.XX%', line) outfile.write(line) return filtered_file @@ -443,6 +439,7 @@ def run_training(self, config_file: Path, log_file: Path, config_name: str, mode try: # Capture output to include it in the exception, while still writing to log file + log_message(LogLevel.INFO, f"Running command: {' '.join(cmd)}") result = subprocess.run( cmd, cwd=self.torchtitan_root, @@ -619,6 +616,10 @@ def run(self) -> int: if tt_baseline_run_error: raise ValueError(f"TorchTitan baseline (FSDP) training failed for {tt_model_name}") from tt_baseline_run_error + diff_file_tt_baseline_vs_hf_baseline = self.results_dir / "diff_tt_baseline_vs_hf_baseline.log" + self.generate_diff(baseline_log_tt, baseline_log_hf, diff_file_tt_baseline_vs_hf_baseline) + log_message(LogLevel.INFO, f"Diff between baseline (TT) and baseline (HF) saved to: {diff_file_tt_baseline_vs_hf_baseline}") + tt_baseline_metrics = self.extract_metrics(baseline_log_tt) if not tt_baseline_metrics.loss or not tt_baseline_metrics.grad_norm: raise ValueError(f"Could not extract TorchTitan baseline metrics for {tt_model_name}") diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py index 76d2d8adb4..1d2b792898 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -315,7 +315,7 @@ def apply_non_moe_tp( model, tp_mesh, { - "tok_embeddings": RowwiseParallel( + "embed_tokens": RowwiseParallel( input_layouts=Replicate(), output_layouts=Shard(1), ), @@ -437,18 +437,18 @@ def apply_fsdp( f"Invalid reshard_after_forward_policy: {reshard_after_forward_policy}." 
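# (Annotation, assumption based on torchtitan's FSDP helpers: the accepted values for
# reshard_after_forward_policy are expected to be "default", "always", and "never";
# "always"/"never" force resharding on or off for every block, while "default" lets the
# last blocks skip resharding since FSDP prefetches them right after the forward pass.)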
) - if model.tok_embeddings is not None: + if model.embed_tokens is not None: fully_shard( - model.tok_embeddings, + model.embed_tokens, **fsdp_config, reshard_after_forward=reshard_after_forward, ) - for layer_id, transformer_block in model.layers.items(): + for transformer_block in model.layers: # NOTE: When EP is enabled, In an MoE layer, we use the following FSDP wrapping # - the router and the shared experts are sharded together with the TransformerBlock # - the routed experts are sharded with the remaining dp_mod_ep_mesh - if transformer_block.moe_enabled and ep_degree > 1: + if hasattr(transformer_block, "moe_enabled") and transformer_block.moe_enabled and ep_degree > 1: fsdp_mod_ep_config = fsdp_config.copy() fsdp_mod_ep_config["mesh"] = dp_mod_ep_mesh @@ -489,9 +489,9 @@ def apply_fsdp( # As an optimization, do not reshard_after_forward the last layers by default # since FSDP would prefetch them immediately after the forward pass - if model.norm is not None and model.output is not None: + if model.norm is not None and model.model.lm_head is not None: fully_shard( - [model.norm, model.output], + [model.norm, model.model.lm_head], **fsdp_config, reshard_after_forward=reshard_after_forward_policy == "always", ) @@ -507,8 +507,8 @@ def apply_fsdp( transformer_blocks = list(model.layers.values()) next_transformer_blocks = transformer_blocks[1:] + [None] - if model.tok_embeddings is not None and model.layers is not None: - model.tok_embeddings.set_modules_to_forward_prefetch([transformer_blocks[0]]) + if model.embed_tokens is not None and model.layers is not None: + model.embed_tokens.set_modules_to_forward_prefetch([transformer_blocks[0]]) for transformer_block, next_transformer_block in zip( transformer_blocks, next_transformer_blocks @@ -546,8 +546,8 @@ def apply_fsdp( transformer_block.set_modules_to_backward_prefetch( [prev_transformer_block] ) - elif model.tok_embeddings is not None: - transformer_block.set_modules_to_backward_prefetch([model.tok_embeddings]) + elif model.embed_tokens is not None: + transformer_block.set_modules_to_backward_prefetch([model.embed_tokens]) def apply_moe_ep_tp( diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 704f83a534..3ecdbddad6 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -294,6 +294,22 @@ def layers(self): # Add more cases here if needed for other model architectures raise AttributeError("Could not find layers in the model. Please check the model structure.") + @property + def embed_tokens(self): + """Returns the model's embed_tokens, handling different Hugging Face model structures.""" + if hasattr(self.model, "model") and hasattr(self.model.model, "embed_tokens"): # Llama-like + return self.model.model.embed_tokens + else: + raise AttributeError("Could not find embed_tokens in the model. Please check the model structure.") + + @property + def norm(self): + """Returns the model's norm, handling different Hugging Face model structures.""" + if hasattr(self.model, "model") and hasattr(self.model.model, "norm"): # Llama-like + return self.model.model.norm + else: + raise AttributeError("Could not find norm in the model. 
Please check the model structure.") + def forward(self, *args, **kwargs): output = self.model(*args, **kwargs) if isinstance(output, CausalLMOutputWithPast): From 9e260a0364758e45f85aa2365984d68cff6b4c74 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 24 Sep 2025 14:37:20 +0000 Subject: [PATCH 041/129] better formatting of compare_distributed_run + display min/max grad_norm and loss --- .../compare_distributed_run.py | 307 +++++++++++++----- 1 file changed, 225 insertions(+), 82 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.py b/torchtitan/experiments/transformers_backend/compare_distributed_run.py index 44adbbd57a..1ac6f8d0da 100644 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.py +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.py @@ -52,16 +52,20 @@ from dataclasses import dataclass, field from enum import Enum import torch +from rich.console import Console +from rich.panel import Panel +from rich.progress import ( + BarColumn, + Progress, + SpinnerColumn, + TextColumn, + TimeElapsedColumn, +) +from rich.table import Table + + +console = Console() -# Configure logging with colors -class Colors: - RED = '\033[0;31m' - GREEN = '\033[0;32m' - YELLOW = '\033[1;33m' - BLUE = '\033[0;34m' - MAGENTA = '\033[0;35m' - CYAN = '\033[0;36m' - NC = '\033[0m' # No Color class LogLevel(Enum): INFO = "INFO" @@ -71,17 +75,18 @@ class LogLevel(Enum): TEST_PASS = "TEST_PASS" TEST_FAIL = "TEST_FAIL" + def log_message(level: LogLevel, message: str) -> None: """Log a message with appropriate color coding.""" - color_map = { - LogLevel.INFO: Colors.BLUE, - LogLevel.SUCCESS: Colors.GREEN, - LogLevel.WARNING: Colors.YELLOW, - LogLevel.ERROR: Colors.RED, - LogLevel.TEST_PASS: Colors.GREEN, - LogLevel.TEST_FAIL: Colors.RED, + style_map = { + LogLevel.INFO: "blue", + LogLevel.SUCCESS: "green", + LogLevel.WARNING: "yellow", + LogLevel.ERROR: "bold red", + LogLevel.TEST_PASS: "green", + LogLevel.TEST_FAIL: "bold red", } - + prefix_map = { LogLevel.INFO: "[INFO]", LogLevel.SUCCESS: "[SUCCESS]", @@ -90,10 +95,11 @@ def log_message(level: LogLevel, message: str) -> None: LogLevel.TEST_PASS: "✅ TEST PASS", LogLevel.TEST_FAIL: "❌ TEST FAIL", } - - color = color_map[level] + + style = style_map[level] prefix = prefix_map[level] - print(f"{color}{prefix}{Colors.NC} {message}") + console.print(f"[{style}]{prefix}[/] {message}") + @dataclass class ParallelismConfig: @@ -257,10 +263,37 @@ def _get_factors(n: int) -> List[int]: self.parallelism_configs = unique_configs - log_message(LogLevel.INFO, f"Generated {len(self.parallelism_configs)} parallelism configurations for {ngpu} GPUs.") - if self.verbose: - for config in self.parallelism_configs: - log_message(LogLevel.INFO, f" - {config.name}: dp_replicate={config.dp_replicate}, dp_shard={config.dp_shard}, tp={config.tp}, pp={config.pp}, cp={config.cp}, ep={config.ep}, eptp={config.eptp}") + log_message( + LogLevel.INFO, + f"Generated {len(self.parallelism_configs)} parallelism configurations for {ngpu} GPUs.", + ) + table = Table( + title="[bold]Generated Parallelism Configurations[/bold]", + show_header=True, + header_style="bold magenta", + ) + table.add_column("Name", style="cyan", no_wrap=True) + table.add_column("dp_replicate", justify="right") + table.add_column("dp_shard", justify="right") + table.add_column("tp", justify="right") + table.add_column("pp", justify="right") + table.add_column("cp", justify="right") + table.add_column("ep", justify="right") + 
table.add_column("eptp", justify="right") + + for config in self.parallelism_configs: + table.add_row( + config.name, + str(config.dp_replicate), + str(config.dp_shard), + str(config.tp), + str(config.pp), + str(config.cp), + str(config.ep), + str(config.eptp), + ) + console.print(table) + console.print() def generate_config(self, config_dir: Path, config: ParallelismConfig, model_name: str, backend: str, filename: Optional[str] = None) -> Path: """Generate configuration file for a parallelism setup.""" @@ -313,7 +346,8 @@ def generate_config(self, config_dir: Path, config: ParallelismConfig, model_nam with open(config_file, 'w') as f: toml.dump(config_data, f) - log_message(LogLevel.INFO, f"Created config file: {config_file} for config '{config.name}' (model: {model_name})") + if self.verbose: + log_message(LogLevel.INFO, f"Created config file: {config_file} for config '{config.name}' (model: {model_name})") return config_file def extract_metrics(self, log_file: Path) -> TrainingMetrics: @@ -362,18 +396,26 @@ def compare_metrics(self, baseline_metrics: TrainingMetrics, test_metrics: Train grad_pass = torch.allclose(baseline_grad_norm, test_grad_norm, atol=self.grad_norm_atol, rtol=self.grad_norm_rtol) # Calculate max absolute differences for logging - loss_diff = torch.max(torch.abs(baseline_loss - test_loss)).item() if baseline_loss.numel() > 0 and test_loss.numel() > 0 else 0.0 + loss_max_diff = torch.max(torch.abs(baseline_loss - test_loss)).item() if baseline_loss.numel() > 0 and test_loss.numel() > 0 else 0.0 grad_norm_diff = torch.max(torch.abs(baseline_grad_norm - test_grad_norm)).item() if baseline_grad_norm.numel() > 0 and test_grad_norm.numel() > 0 else 0.0 + # Calculate min absolute differences for logging + loss_min_diff = torch.min(torch.abs(baseline_loss - test_loss)).item() if baseline_loss.numel() > 0 and test_loss.numel() > 0 else 0.0 + grad_norm_min_diff = torch.min(torch.abs(baseline_grad_norm - test_grad_norm)).item() if baseline_grad_norm.numel() > 0 and test_grad_norm.numel() > 0 else 0.0 + if loss_pass and grad_pass: log_message(LogLevel.TEST_PASS, - f"{config_name} - Max loss diff: {loss_diff:.2e}, " - f"Max grad norm diff: {grad_norm_diff:.2e}") + f"{config_name} - Max loss diff: {loss_max_diff:.2e}, " + f"Min loss diff: {loss_min_diff:.2e}, " + f"Max grad norm diff: {grad_norm_diff:.2e}, " + f"Min grad norm diff: {grad_norm_min_diff:.2e}") return True else: log_message(LogLevel.TEST_FAIL, - f"{config_name} - Max loss diff: {loss_diff:.2e}, " - f"Max grad norm diff: {grad_norm_diff:.2e}") + f"{config_name} - Max loss diff: {loss_max_diff:.2e}, " + f"Min loss diff: {loss_min_diff:.2e}, " + f"Max grad norm diff: {grad_norm_diff:.2e}, " + f"Min grad norm diff: {grad_norm_min_diff:.2e}") return False def generate_diff(self, baseline_log: Path, test_log: Path, diff_file: Path) -> None: @@ -439,7 +481,6 @@ def run_training(self, config_file: Path, log_file: Path, config_name: str, mode try: # Capture output to include it in the exception, while still writing to log file - log_message(LogLevel.INFO, f"Running command: {' '.join(cmd)}") result = subprocess.run( cmd, cwd=self.torchtitan_root, @@ -497,27 +538,56 @@ def _compare_one_parallelism_config( return True else: # Generate diff with baseline (HF) - diff_hf_baseline_vs_hf_nd_parallelism = test_dir / "diff_hf_baseline_vs_hf_nd_parallelism.log" - self.generate_diff(baseline_log_hf, log_path_hf, diff_hf_baseline_vs_hf_nd_parallelism) - log_message(LogLevel.INFO, f"Diff between baseline (HF) and current (HF) 
nd-parallelism run saved to: {diff_hf_baseline_vs_hf_nd_parallelism}") + diff_hf_baseline_vs_hf_nd_parallelism = ( + test_dir / "diff_hf_baseline_vs_hf_nd_parallelism.log" + ) + self.generate_diff( + baseline_log_hf, log_path_hf, diff_hf_baseline_vs_hf_nd_parallelism + ) + log_message( + LogLevel.INFO, + f"Diff between baseline (HF) and current (HF) nd-parallelism run saved to: {diff_hf_baseline_vs_hf_nd_parallelism}", + ) # Run TT counterpart and generated diff between nd-paralellism TT and current hf nd-parallelism run - config_filename_tt = f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" + config_filename_tt = ( + test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" + ) config_file_tt = self.generate_config(config_dir=test_dir, config=config, model_name=tt_model_name, backend="torchtitan", filename=config_filename_tt) log_path_tt = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.log" tt_run_error = self.run_training(config_file=config_file_tt, log_file=log_path_tt, config_name=config.name, model_name=tt_model_name) if tt_run_error: - raise ValueError(f"TorchTitan training failed for {tt_model_name}") from tt_run_error + raise ValueError( + f"TorchTitan training failed for {tt_model_name}" + ) from tt_run_error # generated diff between nd-paralellism TT and current hf nd-parallelism run - diff_file_tt_nd_parallelism_vs_hf_nd_parallelism = test_dir / "diff_tt_nd_parallelism_vs_hf_nd_parallelism.log" - self.generate_diff(log_path_tt, log_path_hf, diff_file_tt_nd_parallelism_vs_hf_nd_parallelism) - log_message(LogLevel.INFO, f"Diff between nd-paralellism TT and current (HF) nd-parallelism run saved to: {diff_file_tt_nd_parallelism_vs_hf_nd_parallelism}") + diff_file_tt_nd_parallelism_vs_hf_nd_parallelism = ( + test_dir / "diff_tt_nd_parallelism_vs_hf_nd_parallelism.log" + ) + self.generate_diff( + log_path_tt, + log_path_hf, + diff_file_tt_nd_parallelism_vs_hf_nd_parallelism, + ) + log_message( + LogLevel.INFO, + f"Diff between nd-paralellism TT and current (HF) nd-parallelism run saved to: {diff_file_tt_nd_parallelism_vs_hf_nd_parallelism}", + ) # generated diff between baseline TT and current hf nd-parallelism run - diff_file_tt_baseline_vs_hf_nd_parallelism = test_dir / "diff_tt_baseline_vs_hf_nd_parallelism.log" - self.generate_diff(baseline_log_tt, log_path_hf, diff_file_tt_baseline_vs_hf_nd_parallelism) - log_message(LogLevel.INFO, f"Diff between baseline TT and current (HF) nd-parallelism run saved to: {diff_file_tt_baseline_vs_hf_nd_parallelism}") + diff_file_tt_baseline_vs_hf_nd_parallelism = ( + test_dir / "diff_tt_baseline_vs_hf_nd_parallelism.log" + ) + self.generate_diff( + baseline_log_tt, + log_path_hf, + diff_file_tt_baseline_vs_hf_nd_parallelism, + ) + log_message( + LogLevel.INFO, + f"Diff between baseline TT and current (HF) nd-parallelism run saved to: {diff_file_tt_baseline_vs_hf_nd_parallelism}", + ) return False def run(self) -> int: @@ -557,20 +627,29 @@ def run(self) -> int: self.loss_rtol = args.loss_rtol self.grad_norm_atol = args.grad_norm_atol self.grad_norm_rtol = args.grad_norm_rtol - - log_message(LogLevel.INFO, "=== Distributed Parallelism Comparison ===") - log_message(LogLevel.INFO, f"GPUs: {self.ngpu}") - log_message(LogLevel.INFO, f"Steps: {self.steps}") - log_message(LogLevel.INFO, f"Seed: {self.seed}") - log_message(LogLevel.INFO, f"Model filter: {self.model_filter or 'all'}") - log_message(LogLevel.INFO, f"Model flavor: {self.flavor}") - print() - + + console.print( + Panel( + ( + 
f"[bold]GPUs:[/bold] {self.ngpu}\n" + f"[bold]Steps:[/bold] {self.steps}\n" + f"[bold]Seed:[/bold] {self.seed}\n" + f"[bold]Model filter:[/bold] {self.model_filter or 'all'}\n" + f"[bold]Model flavor:[/bold] {self.flavor}" + ), + title="[bold cyan]Distributed Parallelism Comparison[/bold cyan]", + expand=False, + border_style="blue", + padding=(1, 2), + ) + ) + console.print() + self.base_results_dir.mkdir(exist_ok=True) self.generate_parallelism_configs() - - #TODO(3outeille): make it more generic later + + # TODO(3outeille): make it more generic later if self.model_filter == "llama3": hf_model_name = "meta-llama/Llama-3.2-1B" tt_model_name = "llama3" @@ -588,9 +667,14 @@ def run(self) -> int: if self.verbose: log_message(LogLevel.INFO, f"Results directory: {self.results_dir}") - log_message(LogLevel.INFO, "--- Running baseline (FSDP) for huggingface backend ---") - - log_message(LogLevel.INFO, f"Testing model {hf_model_name} (HF) for {self.nd_parallel} parallelism") + console.print( + Panel( + "[bold cyan]Comparing baseline (FSDP) for huggingface & torchtitan[/bold cyan]", + expand=False, + border_style="blue", + padding=(0, 2), + ) + ) baseline_config = next((c for c in self.parallelism_configs if c.name == "fsdp"), None) @@ -604,10 +688,6 @@ def run(self) -> int: hf_baseline_metrics = self.extract_metrics(baseline_log_hf) if not hf_baseline_metrics.loss or not hf_baseline_metrics.grad_norm: raise ValueError(f"Could not extract huggingface baseline metrics for {hf_model_name}") - - log_message(LogLevel.INFO, "--- Running baseline (FSDP) for torchtitan backend ---") - - log_message(LogLevel.INFO, f"Testing model {hf_model_name} (TT) for {self.nd_parallel} parallelism") baseline_config_filename_tt = f"baseline_{baseline_config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" baseline_config_file_tt = self.generate_config(config_dir=self.results_dir, config=baseline_config, model_name=tt_model_name, backend="torchtitan", filename=baseline_config_filename_tt) @@ -624,40 +704,103 @@ def run(self) -> int: if not tt_baseline_metrics.loss or not tt_baseline_metrics.grad_norm: raise ValueError(f"Could not extract TorchTitan baseline metrics for {tt_model_name}") - if not self.compare_metrics(tt_baseline_metrics, hf_baseline_metrics, "baseline (TT) vs baseline (HF)"): - raise ValueError(f"Baseline (TT) vs baseline (HF) metrics comparison failed for {tt_model_name}") + if not self.compare_metrics( + tt_baseline_metrics, hf_baseline_metrics, "baseline (TT) vs baseline (HF)" + ): + raise ValueError( + f"Baseline (TT) vs baseline (HF) metrics comparison failed for {tt_model_name}" + ) - log_message(LogLevel.INFO, "--- Comparing other parallelism configurations (huggingface) ---") - passed_tests = 0 + console.print() + console.print( + Panel( + "[bold cyan]Comparing ND Parallelism Configurations[/bold cyan]", + expand=False, + border_style="blue", + padding=(0, 2), + ) + ) + passed_tests = 1 # +1 for the baseline (FSDP) failed_tests = 0 test_configs = [c for c in self.parallelism_configs if c.name != "fsdp"] - total_tests = len(test_configs) - - for config in test_configs: - passed = self._compare_one_parallelism_config( - config, - hf_model_name, - tt_model_name, - hf_baseline_metrics, - baseline_log_hf, - baseline_log_tt, + total_tests = len(test_configs) + 1 # +1 for the baseline (FSDP) + results = [] + + console.print() + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + BarColumn(), + TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), + 
TimeElapsedColumn(), + console=console, + ) as progress: + task = progress.add_task( + "[cyan]Comparing configurations...", total=total_tests + ) + for config in test_configs: + progress.update( + task, description=f"[cyan]Testing [bold]{config.name}[/bold]" + ) + passed = self._compare_one_parallelism_config( + config, + hf_model_name, + tt_model_name, + hf_baseline_metrics, + baseline_log_hf, + baseline_log_tt, + ) + results.append((config.name, passed)) + if passed: + passed_tests += 1 + else: + failed_tests += 1 + progress.advance(task) + console.print() + + console.print( + Panel( + "[bold cyan]Final Summary[/bold cyan]", + expand=False, + border_style="blue", + padding=(0, 2), ) - if passed: - passed_tests += 1 - else: - failed_tests += 1 + ) + + summary_table = Table(show_header=True, header_style="bold magenta") + summary_table.add_column("Configuration", style="cyan") + summary_table.add_column("Status", justify="center") + + for name, passed in results: + status = ( + "[bold green]✅ PASS[/bold green]" + if passed + else "[bold red]❌ FAIL[/bold red]" + ) + summary_table.add_row(name, status) + + console.print(summary_table) + console.print() + + overall_summary = Table(title="Overall Test Summary") + overall_summary.add_column("Metric", style="cyan") + overall_summary.add_column("Value", justify="right") + overall_summary.add_row("Total Configurations Tested", str(total_tests)) + overall_summary.add_row("[green]Passed[/green]", str(passed_tests)) + overall_summary.add_row("[red]Failed[/red]", str(failed_tests)) + console.print(overall_summary) - print() - - log_message(LogLevel.INFO, "=== FINAL SUMMARY ===") if passed_tests == total_tests: log_message(LogLevel.SUCCESS, "All model tests passed! 🎉") return 0 else: log_message(LogLevel.TEST_FAIL, f"{failed_tests} model(s) had test failures") - log_message(LogLevel.INFO, f"Check the diff files in {self.results_dir} for details") + log_message( + LogLevel.INFO, f"Check the diff files in {self.results_dir} for details" + ) return 1 + def main(): """Entry point for the script.""" runner = CompareDistributedRun() From a604beea57ba25c446f4887eb2ded3f00c170d8b Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 25 Sep 2025 13:25:56 +0000 Subject: [PATCH 042/129] make FSDP work in a cleaner way (mapping instead of renaming) --- .../infra/parallelize_hf_transformers.py | 4 ++-- .../transformers_backend/model/hf_transformers_args.py | 9 +++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py index 1d2b792898..a97479b216 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -489,9 +489,9 @@ def apply_fsdp( # As an optimization, do not reshard_after_forward the last layers by default # since FSDP would prefetch them immediately after the forward pass - if model.norm is not None and model.model.lm_head is not None: + if model.norm is not None and model.output is not None: fully_shard( - [model.norm, model.model.lm_head], + [model.norm, model.output], **fsdp_config, reshard_after_forward=reshard_after_forward_policy == "always", ) diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 3ecdbddad6..3eb74c6b4b 100644 --- 
a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -310,6 +310,15 @@ def norm(self): else: raise AttributeError("Could not find norm in the model. Please check the model structure.") + @property + def output(self): + """Returns the model's output layer, handling different Hugging Face model structures.""" + if hasattr(self.model, "lm_head"): # For models like LlamaForCausalLM + return self.model.lm_head + else: + # Add more cases here if needed for other model architectures + raise AttributeError("Could not find output (lm_head) in the model. Please check the model structure.") + def forward(self, *args, **kwargs): output = self.model(*args, **kwargs) if isinstance(output, CausalLMOutputWithPast): From 0b38d0d0f8e605c3edd2f312264363172044c546 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Fri, 26 Sep 2025 14:32:22 +0000 Subject: [PATCH 043/129] Improve logging in compare_distributed_run --- .../compare_distributed_run.py | 162 ++++++++++++------ 1 file changed, 112 insertions(+), 50 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.py b/torchtitan/experiments/transformers_backend/compare_distributed_run.py index 1ac6f8d0da..1a432b68bd 100644 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.py +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.py @@ -68,6 +68,7 @@ class LogLevel(Enum): + COMMAND = "COMMAND" INFO = "INFO" SUCCESS = "SUCCESS" WARNING = "WARNING" @@ -76,9 +77,10 @@ class LogLevel(Enum): TEST_FAIL = "TEST_FAIL" -def log_message(level: LogLevel, message: str) -> None: +def log_message(level: LogLevel, message: str, indent: int = 0, dim: bool = False) -> None: """Log a message with appropriate color coding.""" style_map = { + LogLevel.COMMAND: "dim", LogLevel.INFO: "blue", LogLevel.SUCCESS: "green", LogLevel.WARNING: "yellow", @@ -88,6 +90,7 @@ def log_message(level: LogLevel, message: str) -> None: } prefix_map = { + LogLevel.COMMAND: "[COMMAND]", LogLevel.INFO: "[INFO]", LogLevel.SUCCESS: "[SUCCESS]", LogLevel.WARNING: "[WARNING]", @@ -98,7 +101,21 @@ def log_message(level: LogLevel, message: str) -> None: style = style_map[level] prefix = prefix_map[level] - console.print(f"[{style}]{prefix}[/] {message}") + if indent > 0: + indent_str = " " * (indent - 1) + "└─ " + else: + indent_str = "" + + output = "" + if level == LogLevel.COMMAND: + output = f"{indent_str}[{style}]{prefix} {message}[/]" + else: + output = f"{indent_str}[{style}]{prefix}[/] {message}" + + if dim: + console.print(f"[dim]{output}[/dim]") + else: + console.print(output) @dataclass @@ -196,7 +213,7 @@ def _get_factors(n: int) -> List[int]: return sorted(list(factors)) # Baseline FSDP - configs.append(ParallelismConfig(name="fsdp", dp_replicate=1, dp_shard=ngpu, tp=1, pp=1, pp_schedule="Interleaved1F1B", cp=1, ep=1, eptp=1)) + configs.append(ParallelismConfig(name="fsdp", dp_replicate=1, dp_shard=ngpu, tp=1, pp=1, pp_schedule="1F1B", cp=1, ep=1, eptp=1)) #NOTE(3outeille): No need to handle DDP (dp_replicate) as DDP is not supported > 1D parallelism" #(cf https://github.com/pytorch/torchtitan/blob/main/torchtitan/models/llama3/infra/parallelize.py#L139) @@ -228,7 +245,7 @@ def _get_factors(n: int) -> List[int]: dp_shard=dp_shard, tp=tp, pp=pp, - pp_schedule="Interleaved1F1B", + pp_schedule="1F1B", cp=cp, ep=1, eptp=1 @@ -243,7 +260,7 @@ def _get_factors(n: int) -> List[int]: dp_shard=dp_shard, tp=tp, pp=pp, - 
pp_schedule="Interleaved1F1B", + pp_schedule="1F1B", cp=cp, ep=dp_shard, eptp=1 @@ -295,7 +312,7 @@ def _get_factors(n: int) -> List[int]: console.print(table) console.print() - def generate_config(self, config_dir: Path, config: ParallelismConfig, model_name: str, backend: str, filename: Optional[str] = None) -> Path: + def generate_config(self, config_dir: Path, config: ParallelismConfig, model_name: str, backend: str, filename: Optional[str] = None, indent: int = 0, dim: bool = False) -> Path: """Generate configuration file for a parallelism setup.""" import toml @@ -322,7 +339,7 @@ def generate_config(self, config_dir: Path, config: ParallelismConfig, model_nam if self.flavor not in self.MODEL_FLAVORS[model_name]: log_message(LogLevel.WARNING, f"Flavor '{self.flavor}' not available for {model_name}. " - f"Available: {self.MODEL_FLAVORS[model_name]}") + f"Available: {self.MODEL_FLAVORS[model_name]}", indent=indent, dim=dim) # Update [training] section if "training" not in config_data: @@ -347,10 +364,10 @@ def generate_config(self, config_dir: Path, config: ParallelismConfig, model_nam toml.dump(config_data, f) if self.verbose: - log_message(LogLevel.INFO, f"Created config file: {config_file} for config '{config.name}' (model: {model_name})") + log_message(LogLevel.INFO, f"Created config file: {config_file} for config '{config.name}' (model: {model_name})", indent=indent, dim=dim) return config_file - def extract_metrics(self, log_file: Path) -> TrainingMetrics: + def extract_metrics(self, log_file: Path, indent: int = 0, dim: bool = False) -> TrainingMetrics: """Extract metrics from log file.""" metrics = TrainingMetrics() @@ -371,18 +388,18 @@ def extract_metrics(self, log_file: Path) -> TrainingMetrics: metrics.grad_norm.append(float(match.group(3))) except Exception as e: - log_message(LogLevel.WARNING, f"Could not extract metrics from {log_file}: {e}") + log_message(LogLevel.WARNING, f"Could not extract metrics from {log_file}: {e}", indent=indent, dim=dim) if not metrics.loss or not metrics.grad_norm: - log_message(LogLevel.WARNING, f"Could not extract metrics from {log_file}") + log_message(LogLevel.WARNING, f"Could not extract metrics from {log_file}", indent=indent, dim=dim) return metrics def compare_metrics(self, baseline_metrics: TrainingMetrics, test_metrics: TrainingMetrics, - config_name: str) -> bool: + config_name: str, indent: int = 0, dim: bool = False) -> bool: """Compare metrics between baseline and test configuration.""" if not baseline_metrics.loss or not test_metrics.loss: - log_message(LogLevel.TEST_FAIL, f"{config_name} - Unable to extract metrics") + log_message(LogLevel.TEST_FAIL, f"{config_name} - Unable to extract metrics", indent=indent, dim=dim) return False # Convert to tensors @@ -408,17 +425,17 @@ def compare_metrics(self, baseline_metrics: TrainingMetrics, test_metrics: Train f"{config_name} - Max loss diff: {loss_max_diff:.2e}, " f"Min loss diff: {loss_min_diff:.2e}, " f"Max grad norm diff: {grad_norm_diff:.2e}, " - f"Min grad norm diff: {grad_norm_min_diff:.2e}") + f"Min grad norm diff: {grad_norm_min_diff:.2e}", indent=indent, dim=dim) return True else: log_message(LogLevel.TEST_FAIL, f"{config_name} - Max loss diff: {loss_max_diff:.2e}, " f"Min loss diff: {loss_min_diff:.2e}, " f"Max grad norm diff: {grad_norm_diff:.2e}, " - f"Min grad norm diff: {grad_norm_min_diff:.2e}") + f"Min grad norm diff: {grad_norm_min_diff:.2e}", indent=indent, dim=dim) return False - def generate_diff(self, baseline_log: Path, test_log: Path, diff_file: Path) -> None: + 
def generate_diff(self, baseline_log: Path, test_log: Path, diff_file: Path, indent: int = 0, dim: bool = False) -> None: """Generate diff between baseline and test logs.""" def _filter_log(log_file: Path) -> Path: @@ -454,17 +471,17 @@ def _filter_log(log_file: Path) -> Path: test_filtered.unlink() except Exception as e: - log_message(LogLevel.WARNING, f"Could not generate diff: {e}") + log_message(LogLevel.WARNING, f"Could not generate diff: {e}", indent=indent, dim=dim) - def run_training(self, config_file: Path, log_file: Path, config_name: str, model_name: str) -> Optional[subprocess.CalledProcessError]: + def run_training(self, config_file: Path, log_file: Path, config_name: str, model_name: str, indent: int = 0, dim: bool = False) -> Optional[subprocess.CalledProcessError]: """Run training with given configuration.""" - log_message(LogLevel.INFO, f"Running training: {config_name} with model {model_name}") + log_message(LogLevel.INFO, f"Running training: {config_name} with model {model_name}", indent=indent, dim=dim) cmd = [ "torchrun", f"--nproc_per_node={self.ngpu}", "--rdzv_backend", "c10d", "--rdzv_endpoint=localhost:0", - "--local-ranks-filter", "0", + "--local-ranks-filter", str(self.ngpu - 1), "--role", "rank", "--tee", "3", "-m", "torchtitan.train", @@ -475,10 +492,10 @@ def run_training(self, config_file: Path, log_file: Path, config_name: str, mode env = os.environ.copy() env["SEED"] = str(self.seed) env["MODEL_TYPE"] = model_name - - if self.verbose: - log_message(LogLevel.INFO, f"Command: {' '.join(cmd)}") - + env["LOG_RANK"] = str(self.ngpu - 1) + + log_message(LogLevel.COMMAND, f"Command: {' '.join(cmd)}", indent=indent, dim=dim) + try: # Capture output to include it in the exception, while still writing to log file result = subprocess.run( @@ -494,17 +511,25 @@ def run_training(self, config_file: Path, log_file: Path, config_name: str, mode f.write(result.stdout) if self.verbose: - log_message(LogLevel.SUCCESS, f"Training completed: {config_name}") + log_message(LogLevel.SUCCESS, f"Training completed: {config_name}", indent=indent, dim=dim) return None except subprocess.CalledProcessError as e: - log_message(LogLevel.ERROR, f"Training failed: {config_name}") + log_message(LogLevel.ERROR, f"Training failed: {config_name}", indent=indent, dim=dim) # Write the failed output to the log file with open(log_file, 'w') as f: if e.stdout: f.write(e.stdout) + # Print the tail of the error log to the console for quick debugging + if e.stdout: + console.print("[bold red]--- Error Log Tail ---[/bold red]") + error_lines = e.stdout.strip().split('\n') + for line in error_lines[-15:]: + console.print(f"[red]{line}[/red]") + console.print("[bold red]--- End Error Log Tail ---[/bold red]") + e.add_note(f"\n--- Full output from failed process ---\n{e.stdout or ''}") return e @@ -514,8 +539,10 @@ def _compare_one_parallelism_config( hf_model_name: str, tt_model_name: str, hf_baseline_metrics: "TrainingMetrics", + tt_baseline_metrics: "TrainingMetrics", baseline_log_hf: Path, baseline_log_tt: Path, + indent: int = 0, ) -> bool: """Compares a single parallelism configuration against the baseline.""" # Create a subdirectory for each test configuration @@ -524,17 +551,23 @@ def _compare_one_parallelism_config( test_dir.mkdir(exist_ok=True) config_filename_hf = f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.toml" - config_file_hf = self.generate_config(config_dir=test_dir, config=config, model_name=hf_model_name, backend="huggingface", filename=config_filename_hf) + config_file_hf = 
self.generate_config(config_dir=test_dir, config=config, model_name=hf_model_name, backend="huggingface", filename=config_filename_hf, indent=indent) log_path_hf = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.log" - hf_run_error = self.run_training(config_file=config_file_hf, log_file=log_path_hf, config_name=config.name, model_name=hf_model_name) - successful_hf_run = hf_run_error is None - - # Compare metrics between baseline (HF) and current (HF) nd-parallelism run - hf_metrics = self.extract_metrics(log_path_hf) - successful_hf_extract = self.compare_metrics(hf_baseline_metrics, hf_metrics, f"{config.name} (huggingface)") + hf_run_error = self.run_training(config_file=config_file_hf, log_file=log_path_hf, config_name=config.name, model_name=hf_model_name, indent=indent) + + test_passed = True + hf_metrics = None + if hf_run_error: + log_message(LogLevel.TEST_FAIL, f"{config.name} (huggingface) - Training script failed.", indent=indent + 5, dim=True) + test_passed = False + else: + # Compare metrics only if training was successful + hf_metrics = self.extract_metrics(log_path_hf, indent=indent) + if not self.compare_metrics(hf_baseline_metrics, hf_metrics, f"{config.name} (huggingface)", indent=indent + 5, dim=True): + test_passed = False - if successful_hf_run and successful_hf_extract: + if test_passed: return True else: # Generate diff with baseline (HF) @@ -542,25 +575,29 @@ def _compare_one_parallelism_config( test_dir / "diff_hf_baseline_vs_hf_nd_parallelism.log" ) self.generate_diff( - baseline_log_hf, log_path_hf, diff_hf_baseline_vs_hf_nd_parallelism + baseline_log_hf, log_path_hf, diff_hf_baseline_vs_hf_nd_parallelism, indent=indent + 5, dim=True ) log_message( LogLevel.INFO, f"Diff between baseline (HF) and current (HF) nd-parallelism run saved to: {diff_hf_baseline_vs_hf_nd_parallelism}", + indent=indent + 5, + dim=True, ) # Run TT counterpart and generated diff between nd-paralellism TT and current hf nd-parallelism run config_filename_tt = ( test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" ) - config_file_tt = self.generate_config(config_dir=test_dir, config=config, model_name=tt_model_name, backend="torchtitan", filename=config_filename_tt) + config_file_tt = self.generate_config(config_dir=test_dir, config=config, model_name=tt_model_name, backend="torchtitan", filename=config_filename_tt, indent=indent + 5, dim=True) log_path_tt = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.log" - tt_run_error = self.run_training(config_file=config_file_tt, log_file=log_path_tt, config_name=config.name, model_name=tt_model_name) + tt_run_error = self.run_training(config_file=config_file_tt, log_file=log_path_tt, config_name=config.name, model_name=tt_model_name, indent=indent + 5, dim=True) if tt_run_error: raise ValueError( f"TorchTitan training failed for {tt_model_name}" ) from tt_run_error + tt_metrics = self.extract_metrics(log_path_tt, indent=indent + 5, dim=True) + # generated diff between nd-paralellism TT and current hf nd-parallelism run diff_file_tt_nd_parallelism_vs_hf_nd_parallelism = ( test_dir / "diff_tt_nd_parallelism_vs_hf_nd_parallelism.log" @@ -569,10 +606,22 @@ def _compare_one_parallelism_config( log_path_tt, log_path_hf, diff_file_tt_nd_parallelism_vs_hf_nd_parallelism, + indent=indent + 5, + dim=True, ) + if hf_metrics: + self.compare_metrics( + tt_metrics, + hf_metrics, + f"{config.name} (TT nd-parallel vs HF nd-parallel)", + indent=indent + 5, + dim=True, + ) log_message( LogLevel.INFO, f"Diff 
between nd-parallelism TT and current (HF) nd-parallelism run saved to: {diff_file_tt_nd_parallelism_vs_hf_nd_parallelism}", + indent=indent + 5, + dim=True, ) # generated diff between baseline TT and current hf nd-parallelism run @@ -583,10 +632,22 @@ def _compare_one_parallelism_config( baseline_log_tt, log_path_hf, diff_file_tt_baseline_vs_hf_nd_parallelism, + indent=indent + 5, + dim=True, ) + if hf_metrics: + self.compare_metrics( + tt_baseline_metrics, + hf_metrics, + f"{config.name} (TT baseline vs HF nd-parallel)", + indent=indent + 5, + dim=True, + ) log_message( LogLevel.INFO,
f"Diff between baseline (TT) and baseline (HF) saved to: {diff_file_tt_baseline_vs_hf_baseline}") - - tt_baseline_metrics = self.extract_metrics(baseline_log_tt) + tt_baseline_metrics = self.extract_metrics(baseline_log_tt, indent=0) if not tt_baseline_metrics.loss or not tt_baseline_metrics.grad_norm: raise ValueError(f"Could not extract TorchTitan baseline metrics for {tt_model_name}") if not self.compare_metrics( - tt_baseline_metrics, hf_baseline_metrics, "baseline (TT) vs baseline (HF)" + tt_baseline_metrics, hf_baseline_metrics, "baseline (TT) vs baseline (HF)", indent=0 ): raise ValueError( f"Baseline (TT) vs baseline (HF) metrics comparison failed for {tt_model_name}" @@ -738,7 +795,10 @@ def run(self) -> int: task = progress.add_task( "[cyan]Comparing configurations...", total=total_tests ) - for config in test_configs: + for i, config in enumerate(test_configs): + if i > 0: + console.rule(style="dim") + progress.update( task, description=f"[cyan]Testing [bold]{config.name}[/bold]" ) @@ -747,8 +807,10 @@ def run(self) -> int: hf_model_name, tt_model_name, hf_baseline_metrics, + tt_baseline_metrics, baseline_log_hf, baseline_log_tt, + indent=1, ) results.append((config.name, passed)) if passed: @@ -794,7 +856,7 @@ def run(self) -> int: log_message(LogLevel.SUCCESS, "All model tests passed! 🎉") return 0 else: - log_message(LogLevel.TEST_FAIL, f"{failed_tests} model(s) had test failures") + log_message(LogLevel.TEST_FAIL, f"{failed_tests} configuration(s) had test failures") log_message( LogLevel.INFO, f"Check the diff files in {self.results_dir} for details" ) From 025a86f9d411d3f2f9e8e0b27a551a1ae16c7bae Mon Sep 17 00:00:00 2001 From: 3outeille Date: Fri, 26 Sep 2025 14:33:08 +0000 Subject: [PATCH 044/129] PP for llama in 1D works --- .../transformers_backend/__init__.py | 4 +- .../infra/parallelize_hf_transformers.py | 14 +- .../transformers_backend/infra/pipeline_hf.py | 495 ++++++++++++++++++ .../model/hf_transformers_args.py | 28 +- 4 files changed, 530 insertions(+), 11 deletions(-) create mode 100644 torchtitan/experiments/transformers_backend/infra/pipeline_hf.py diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index fa8cc4c119..6e6894b109 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -13,7 +13,7 @@ from torchtitan.datasets.hf_datasets import build_hf_dataloader from torchtitan.components.tokenizer import build_hf_tokenizer -from torchtitan.models.llama3 import pipeline_llama +from .infra.pipeline_hf import pipeline_hf_transformers from torchtitan.protocols.train_spec import register_train_spec, TrainSpec from .infra.parallelize_hf_transformers import parallelize_hf_transformers @@ -143,7 +143,7 @@ class DeepSeekV3Args: model_cls=HFTransformerModel, model_args=flavors, parallelize_fn=parallelize_hf_transformers, - pipelining_fn=pipeline_llama, + pipelining_fn=pipeline_hf_transformers, build_optimizers_fn=build_optimizers, build_lr_schedulers_fn=build_lr_schedulers, build_dataloader_fn=build_hf_dataloader, diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py index a97479b216..4ac6d6cd83 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -315,7 +315,7 @@ def 
apply_non_moe_tp( model, tp_mesh, { - "embed_tokens": RowwiseParallel( + "tok_embeddings": RowwiseParallel( input_layouts=Replicate(), output_layouts=Shard(1), ), @@ -437,9 +437,9 @@ def apply_fsdp( f"Invalid reshard_after_forward_policy: {reshard_after_forward_policy}." ) - if model.embed_tokens is not None: + if model.tok_embeddings is not None: fully_shard( - model.embed_tokens, + model.tok_embeddings, **fsdp_config, reshard_after_forward=reshard_after_forward, ) @@ -507,8 +507,8 @@ def apply_fsdp( transformer_blocks = list(model.layers.values()) next_transformer_blocks = transformer_blocks[1:] + [None] - if model.embed_tokens is not None and model.layers is not None: - model.embed_tokens.set_modules_to_forward_prefetch([transformer_blocks[0]]) + if model.tok_embeddings is not None and model.layers is not None: + model.tok_embeddings.set_modules_to_forward_prefetch([transformer_blocks[0]]) for transformer_block, next_transformer_block in zip( transformer_blocks, next_transformer_blocks @@ -546,8 +546,8 @@ def apply_fsdp( transformer_block.set_modules_to_backward_prefetch( [prev_transformer_block] ) - elif model.embed_tokens is not None: - transformer_block.set_modules_to_backward_prefetch([model.embed_tokens]) + elif model.tok_embeddings is not None: + transformer_block.set_modules_to_backward_prefetch([model.tok_embeddings]) def apply_moe_ep_tp( diff --git a/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py b/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py new file mode 100644 index 0000000000..178610343a --- /dev/null +++ b/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py @@ -0,0 +1,495 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +import copy +import os +from typing import Callable + +import torch +import torch.nn as nn +from torch.distributed.device_mesh import DeviceMesh +from torch.distributed.pipelining import PipelineStage + +from torch.distributed.pipelining.schedules import ( + _PipelineSchedule, + _PipelineScheduleRuntime, + get_schedule_class, + PipelineScheduleMulti, + PipelineScheduleSingle, + ScheduleDualPipeV, + ScheduleZBVZeroBubble, +) + +from torchtitan.config import JobConfig +from torchtitan.tools.logging import logger + +from torchtitan.distributed import ParallelDims +from torchtitan.protocols.train_spec import BaseModelArgs, ParallelizeFunction +from torchtitan.components.loss import LossFunction + +import math + + +def build_pipeline_schedule( + job_config: JobConfig, stages: list[PipelineStage], loss_fn: Callable +) -> _PipelineSchedule: + """Builds a pipeline schedule for the given job configuration and stages. + + Args: + job_config (JobConfig): The job configuration. + stages (list[PipelineStage]): The stages to be scheduled. + loss_fn (Callable): The loss function. + + Returns: + _PipelineSchedule: The pipeline schedule for the given stages. + """ + pp_schedule_csv = job_config.parallelism.pipeline_parallel_schedule_csv + + # Validate that pp_schedule_csv is a valid path + if pp_schedule_csv: + if not os.path.isfile(pp_schedule_csv): + raise FileNotFoundError( + f"The specified path {pp_schedule_csv} does not exist or is not a file." 
+ ) + schedule_class = _PipelineScheduleRuntime + else: + schedule_class = get_schedule_class( + job_config.parallelism.pipeline_parallel_schedule + ) + + looped_schedule = issubclass(schedule_class, PipelineScheduleMulti) + microbatch_size = job_config.parallelism.pipeline_parallel_microbatch_size + batch_size = job_config.training.local_batch_size + # validate that the batch size is divisible by the microbatch_size otherwise we'll hang or error during training + if batch_size % microbatch_size != 0: + raise ValueError( + f"Batch size {job_config.training.local_batch_size} must be divisible by microbatch_size {microbatch_size}. " + "Update the config arguments for either batch_size or pipeline_parallel_microbatch_size." + ) + n_microbatches = batch_size // microbatch_size + # We expect that the number of local stages (`len(stages)`) is the same across all ranks + num_total_stages = job_config.parallelism.pipeline_parallel_degree * len(stages) + if n_microbatches < num_total_stages: + logger.warning( + f"Number of microbatches ({n_microbatches}) is less than the total number " + f"of stages ({num_total_stages}) which may result in a bubble in the pipeline." + ) + + schedule = schedule_class( + stages if looped_schedule else stages[0], + n_microbatches=n_microbatches, + loss_fn=loss_fn, + ) + logger.info( + f"Using pipeline schedule {job_config.parallelism.pipeline_parallel_schedule} " + f"with {n_microbatches} microbatches and {num_total_stages} stages." + ) + + if pp_schedule_csv: + assert schedule_class in [ + PipelineScheduleSingle, + PipelineScheduleMulti, + _PipelineScheduleRuntime, + ], ( + "Only PipelineScheduleSingle (single stage), PipelineScheduleMulti (multistage), " + "and _PipelineScheduleRuntime support csv schedules" + ) + schedule._load_csv(pp_schedule_csv) + + return schedule + + +# TODO(whc) should this be a utility inside torch.pipelining? +def stage_ids_this_rank( + pp_rank: int, pp_size: int, num_stages: int, style: str = "loop" +) -> tuple[int]: + """Compute the stage ids for the stages that will run on this pp rank for either a looped or V style schedule""" + assert ( + num_stages % pp_size == 0 + ), f"num_stages {num_stages} must be evenly divisible by pp_size {pp_size}" + stages_per_rank = num_stages // pp_size + if style == "loop": + return tuple(pp_rank + s * pp_size for s in range(stages_per_rank)) + elif style == "v": + assert ( + stages_per_rank == 2 + ), f"v schedules assume 2 stages per rank, got {stages_per_rank}" + stage_v_pairs = list( + zip(range(pp_size), range(num_stages - 1, pp_size - 1, -1)) + ) + return stage_v_pairs[pp_rank] + + +def generate_llm_fqn_per_model_part( + num_stages: int, + num_layers: int, + input_weight: int = 1, + output_weight: int = 1, +) -> list[list[str]]: + """ + Programmatically generates module names model part, focused on LLMs models. 
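    In other words: given a pipeline stage count and the number of transformer layers,
    it returns, for every stage, the fully-qualified module names (token embeddings,
    the assigned "model.model.layers.i" blocks, and on the last stage the final norm
    and LM head) that the stage should own.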
+ + Args: + num_stages: Number of pipeline stages + num_layers: Total number of transformer layers in the model + input_weight: Weight for input modules (embed_tokens) in layer calculation + output_weight: Weight for output modules (norm + output) in layer calculation + + Returns: + List of lists containing module names for each model part + + Example: + generate_llm_fqn_per_model_part(2, 3, input_weight=2, output_weight=2) + treats embeddings as 2 layers and norm+output as 2 layers for distribution + """ + if num_stages < 1: + raise ValueError("Number of stages must be at least 1") + + if num_stages == 1: + # Single stage gets everything + layer_names = [f"model.model.layers.{i}" for i in range(num_layers)] + return [ + ["model.model.embed_tokens"] + + layer_names + + ["model.model.norm", "model.lm_head", "model.model.rotary_emb"] + ] + + # Calculate effective layers including weights + num_effective_layers = num_layers + input_weight + output_weight + + if num_stages > num_effective_layers: + raise ValueError( + f"Number of stages ({num_stages}) cannot be greater than effective layers ({num_effective_layers})" + ) + + # Calculate layers per stage (distribute evenly) + layers_per_stage = num_effective_layers // num_stages + extra_layers = num_effective_layers % num_stages + + # Feasibility check: Ensure at least 1 layer in each PP stage + if layers_per_stage == 0: + raise ValueError( + f"Configuration would result in empty stages. " + f"With {num_stages} stages and {num_effective_layers} effective layers " + f"(num_layers={num_layers} + input_weight={input_weight} + output_weight={output_weight}), " + f"each stage would get {layers_per_stage} layers on average. " + f"Reduce num_stages or increase num_layers/weights." + ) + + # Balance check: Ensure weights don't exceed minimum layers per stage + if input_weight > layers_per_stage: + raise ValueError( + f"input_weight ({input_weight}) exceeds minimum layers per stage ({layers_per_stage})." + ) + if output_weight > layers_per_stage: + raise ValueError( + f"output_weight ({output_weight}) exceeds minimum layers per stage ({layers_per_stage})." 
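# Worked example with hypothetical numbers: num_layers=8, input_weight=1, output_weight=1
# and num_stages=5 give num_effective_layers=10, layers_per_stage=10//5=2 and
# extra_layers=0, so the first and last stages each spend one of their two slots on the
# embeddings and on norm+lm_head respectively, leaving 1+2+2+2+1=8 transformer layers.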
+ ) + + module_names_per_stage = [] + current_layer = 0 + + for stage_idx in range(num_stages): + stage_modules = [] + + # Calculate effective layers for this stage + effective_layers_for_stage = layers_per_stage + if stage_idx < extra_layers: + effective_layers_for_stage += 1 + + # First stage: handle input modules with weighting + if stage_idx == 0: + stage_modules.append("model.model.embed_tokens") + # Account for input weight in layer distribution + remaining_layers_for_stage = effective_layers_for_stage - input_weight + + # Add transformer layers + for _ in range(remaining_layers_for_stage): + if current_layer < num_layers: + stage_modules.append(f"model.model.layers.{current_layer}") + current_layer += 1 + + # Last stage: handle output modules with weighting + elif stage_idx == num_stages - 1: + # Account for output weight in layer distribution + remaining_layers_for_stage = effective_layers_for_stage - output_weight + + # Add transformer layers + for _ in range(remaining_layers_for_stage): + if current_layer < num_layers: + stage_modules.append(f"model.model.layers.{current_layer}") + current_layer += 1 + + # Add output modules + stage_modules.extend(["model.model.norm", "model.lm_head"]) + + # Middle stages: only transformer layers + else: + for _ in range(effective_layers_for_stage): + if current_layer < num_layers: + stage_modules.append(f"model.model.layers.{current_layer}") + current_layer += 1 + + stage_modules.append("model.model.rotary_emb") + module_names_per_stage.append(stage_modules) + + return module_names_per_stage + + +def pipeline_module_split( + whole_model: nn.Module, + pp_mesh: DeviceMesh, + pp_schedule: str, + device: torch.device, + module_names_per_stage: list[list[str]], +) -> tuple[list[PipelineStage], list[nn.Module]]: + """ + This API creates pipeline stages based on specified module names for each stage. + + Some model restrictions include: + - forward() method should tolerate deleted layers + - weight initialization methods should tolerate deleted layers + - Does not support nested moduledict and modulelist structures + + Args: + whole_model: The complete model to be split + pp_mesh: Pipeline parallel device mesh + pp_schedule: Name of pipeline parallelism schedule + device: Device + module_names_per_stage: List of lists, where each inner list contains the module names + that should be included in that stage. Module names should be + dot-separated paths. 
Examples: + - "embed_tokens" for token embeddings + - "layers.0", "layers.1" for specific transformer layers + - "norm" for the final normalization layer + - "output" for the output projection layer + + Returns: + Tuple of (stages, models) where stages are PipelineStage objects and models are the + corresponding model chunks + + Example usage: + module_names_per_stage = [ + ["embed_tokens", "layers.0"], # Stage 0: embeddings + first layer + ["layers.1", "layers.2"], # Stage 1: middle layers + ["norm", "output"] # Stage 2: final norm + output + ] + """ + pp_rank = pp_mesh.get_local_rank() + pp_size = pp_mesh.size() + + def _build_stage_from_modules( + stage_idx: int, module_names: list[str], num_stages: int + ) -> tuple[PipelineStage, nn.Module]: + model = copy.deepcopy(whole_model) + + # Create a set of modules to keep for faster lookup + modules_to_keep = set(module_names) + print(f"Stage {stage_idx}: Modules to keep: {modules_to_keep}") + + def _prune_modules_recursive(current_module: nn.Module, prefix: str): + for name, child in current_module.named_children(): + child_prefix = f"{prefix}{name}" + + # If the child module is a container, we need to check its children + if isinstance(child, (nn.ModuleDict, nn.ModuleList)): + layers_to_keep = { + m.split(".")[-1] + for m in modules_to_keep + if m.startswith(f"{child_prefix}.") + } + if layers_to_keep: + # This container has some layers we need to keep. + if isinstance(child, nn.ModuleDict): + for layer_name in list(child.keys()): + if layer_name not in layers_to_keep: + del child[layer_name] + elif isinstance(child, nn.ModuleList): + indices_to_keep = { + int(idx) for idx in layers_to_keep if idx.isdigit() + } + new_layers = nn.ModuleList( + [ + layer + for i, layer in enumerate(child) + if i in indices_to_keep + ] + ) + setattr(current_module, name, new_layers) + else: + # If no sub-modules are kept, replace with an empty container. 
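# Illustration (hypothetical split): a stage that keeps only
# {"model.model.norm", "model.lm_head", "model.model.rotary_emb"} ends up with
# model.model.layers replaced by an empty nn.ModuleList here, while pruned leaf
# modules such as model.model.embed_tokens are swapped for nn.Identity() below.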
+ if isinstance(child, nn.ModuleDict): + setattr(current_module, name, nn.ModuleDict()) + elif isinstance(child, nn.ModuleList): + setattr(current_module, name, nn.ModuleList()) + elif isinstance(child, nn.Module): + # For a generic nn.Module, check if it or its children should be kept + is_kept = child_prefix in modules_to_keep + is_parent_of_kept = any( + m.startswith(f"{child_prefix}.") for m in modules_to_keep + ) + + if is_kept or is_parent_of_kept: + _prune_modules_recursive(child, f"{child_prefix}.") + else: + # Handle simple module attributes (e.g., "linear", "norm") + setattr(current_module, name, nn.Identity()) + + _prune_modules_recursive(model, "") + + stage = PipelineStage( + model, + stage_idx, + num_stages, + device, + group=pp_mesh.get_group("pp"), + ) + return stage, model + + num_stages = len(module_names_per_stage) + stages = [] + models = [] + + schedule_class = get_schedule_class(pp_schedule) + style = ( + "v" if schedule_class in (ScheduleZBVZeroBubble, ScheduleDualPipeV) else "loop" + ) + + for stage_idx in stage_ids_this_rank(pp_rank, pp_size, num_stages, style=style): + module_names = module_names_per_stage[stage_idx] + stage, model_chunk = _build_stage_from_modules( + stage_idx, + module_names, + num_stages, + ) + logger.info( + f"PP rank {pp_rank} is building stage_idx {stage_idx} " + f"with modules {module_names}" + ) + stages.append(stage) + models.append(model_chunk) + + return stages, models + + +def pipeline_hf_transformers( + model: nn.Module, + parallel_dims: ParallelDims, + job_config: JobConfig, + device: torch.device, + model_args: BaseModelArgs, + parallelize_fn: ParallelizeFunction, + loss_fn: LossFunction, +) -> tuple[_PipelineSchedule, list[nn.Module], bool, bool]: + if job_config.parallelism.pipeline_parallel_split_points != []: + raise ValueError( + "pipeline_parallel_split_points is deprecated. Please use module_fqns_per_model_part instead." 
+ "You can generate module_fqns_per_model_part programmatically with generate_llm_fqn_per_model_part" + ) + + pp_mesh = parallel_dims.world_mesh["pp"] + + # Determine the number of virtual stages based on schedule type + schedule_class = get_schedule_class( + job_config.parallelism.pipeline_parallel_schedule + ) + is_single_stage_schedule = issubclass(schedule_class, PipelineScheduleSingle) + layers_per_stage = job_config.parallelism.pipeline_parallel_layers_per_stage + if hasattr(model_args, "n_layers"): + num_layers = model_args.n_layers + else: + raise ValueError("Model does not have n_layers attribute.") + + # You can adjust these weights based on the computational cost of embeddings and output layers + # Higher weights mean these modules are treated as "heavier" in the distribution + input_weight = job_config.parallelism.pipeline_parallel_first_stage_less_layers + output_weight = job_config.parallelism.pipeline_parallel_last_stage_less_layers + + # Calculate number of virtual stages + if layers_per_stage is not None: + + # Calculate number of virtual stages needed (using ceiling division) + # This allows for unequal distribution where stages can differ by at most 1 layer + num_virtual_stages = math.ceil( + (num_layers + input_weight + output_weight) / layers_per_stage + ) + + # Validation: check stages per rank based on schedule type + model_config_info = f"Model has {num_layers} layers with pipeline_parallel_layers_per_stage={layers_per_stage}" + stage_distribution_info = ( + f"resulting in {num_virtual_stages=} across {parallel_dims.pp} PP ranks" + ) + + if num_virtual_stages % parallel_dims.pp != 0: + raise ValueError( + f"Number of virtual stages ({num_virtual_stages}) must be divisible by " + f"pipeline parallel size ({parallel_dims.pp}). " + f"{model_config_info}. " + f"Please adjust pipeline_parallel_layers_per_stage to a value that results in a number of stages " + f"divisible by {parallel_dims.pp}." + ) + + stages_per_rank = num_virtual_stages // parallel_dims.pp + + if is_single_stage_schedule and stages_per_rank != 1: + raise ValueError( + f"Single stage schedule requires exactly 1 stage per rank, but got {stages_per_rank} stages per rank. " + f"{model_config_info}, {stage_distribution_info}. " + f"Please increase pipeline_parallel_layers_per_stage to {num_layers // parallel_dims.pp} or higher " + f"to achieve 1 stage per rank." + ) + + if not is_single_stage_schedule and stages_per_rank < 2: + raise ValueError( + f"Multi-stage schedule requires at least 2 stages per rank, but got {stages_per_rank} stages per rank. " + f"{model_config_info}, {stage_distribution_info}. " + f"Please decrease pipeline_parallel_layers_per_stage to achieve at least 2 stages per rank." 
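+                # (For example, with pp=4 and a looped schedule this needs
+                # num_virtual_stages >= 8, i.e. roughly
+                # layers_per_stage <= (num_layers + input_weight + output_weight) / 8.)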
+ ) + else: + # Fallback to default behavior when layers_per_stage is not provided + # For multi-stage schedules, default is 2 virtual stages per rank + # For single-stage schedules, default is 1 virtual stage per rank + stages_per_rank = 1 if is_single_stage_schedule else 2 + num_virtual_stages = parallel_dims.pp * stages_per_rank + + module_names_per_stage = job_config.parallelism.module_fqns_per_model_part + if module_names_per_stage is None: + module_names_per_stage = generate_llm_fqn_per_model_part( + num_virtual_stages, num_layers, input_weight, output_weight + ) + for i, stage_ms in enumerate(module_names_per_stage): + logger.debug(f"Stage {i}: {stage_ms}") + + stages, model_parts = pipeline_module_split( + model, + pp_mesh, + job_config.parallelism.pipeline_parallel_schedule, + device, + module_names_per_stage, + ) + + # For PP with looped schedules, each item in model_parts is one stage-model-chunk. + # We need to iterate through model_parts to apply SPMD parallelisms, compilation, + # optimizer, and checkpointing + for i, m in enumerate(model_parts): + # apply SPMD-style PT-D techniques + m = parallelize_fn(m, parallel_dims, job_config) + model_parts[i] = m + # NOTE: this is to update the model in the stage + # in case the model is modified e.g. by torch.compile + stages[i].submod = m + + pp_schedule = build_pipeline_schedule(job_config, stages, loss_fn) + + # This is used in the train loop to determine whether to pass in the input_ids and labels + has_first_stage = False + has_last_stage = False + for stage in stages: + if stage.is_first: + has_first_stage = True + if stage.is_last: + has_last_stage = True + + return pp_schedule, model_parts, has_first_stage, has_last_stage diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 3eb74c6b4b..9ac01bcb86 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -295,7 +295,7 @@ def layers(self): raise AttributeError("Could not find layers in the model. Please check the model structure.") @property - def embed_tokens(self): + def tok_embeddings(self): """Returns the model's embed_tokens, handling different Hugging Face model structures.""" if hasattr(self.model, "model") and hasattr(self.model.model, "embed_tokens"): # Llama-like return self.model.model.embed_tokens @@ -310,6 +310,13 @@ def norm(self): else: raise AttributeError("Could not find norm in the model. Please check the model structure.") + @norm.setter + def norm(self, value): + if hasattr(self.model, "model") and hasattr(self.model.model, "norm"): # Llama-like + setattr(self.model.model, "norm", value) + else: + raise AttributeError("Could not find norm in the model. 
Please check the model structure.") + @property def output(self): """Returns the model's output layer, handling different Hugging Face model structures.""" @@ -326,4 +333,21 @@ def forward(self, *args, **kwargs): return output def init_weights(self, *args, **kwargs): - self.model.post_init() \ No newline at end of file + # This method replicates the behavior of the original PreTrainedModel.init_weights, + # but with a custom weight initialization function that skips nn.Identity modules (when PP is enabled) + + if self.model.config.pruned_heads: + logger.info("Pruning heads as per model configuration.") + self.model.prune_heads(self.model.config.pruned_heads) + + original_init_weights_fn = self.model._init_weights + + def selective_init(module): + # For pipeline parallel, we need to skip nn.Identity modules + if not isinstance(module, nn.Identity): + original_init_weights_fn(module) + + logger.info("Applying selective weight initialization, skipping nn.Identity modules when PP is enabled.") + self.model.apply(selective_init) + + self.model.tie_weights() \ No newline at end of file From 590737f9dec1fd383e34afedcdc8bc892ce39a30 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Sun, 28 Sep 2025 10:49:40 +0000 Subject: [PATCH 045/129] simplify PP logic by flattening the named_children hierarchy. This will be easier for TP later --- .../transformers_backend/infra/pipeline_hf.py | 231 +++++------------- .../model/hf_transformers_args.py | 75 +++++- 2 files changed, 130 insertions(+), 176 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py b/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py index 178610343a..fb707b2509 100644 --- a/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py +++ b/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py @@ -4,123 +4,34 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. import copy -import os -from typing import Callable +import math import torch import torch.nn as nn -from torch.distributed.device_mesh import DeviceMesh -from torch.distributed.pipelining import PipelineStage - from torch.distributed.pipelining.schedules import ( _PipelineSchedule, - _PipelineScheduleRuntime, get_schedule_class, - PipelineScheduleMulti, PipelineScheduleSingle, - ScheduleDualPipeV, - ScheduleZBVZeroBubble, ) +from torchtitan.components.loss import LossFunction from torchtitan.config import JobConfig -from torchtitan.tools.logging import logger - from torchtitan.distributed import ParallelDims +from torchtitan.distributed.pipeline_parallel import ( + build_pipeline_schedule, + pipeline_module_split, + stage_ids_this_rank, +) +from torch.distributed.device_mesh import DeviceMesh +from torch.distributed.pipelining import PipelineStage from torchtitan.protocols.train_spec import BaseModelArgs, ParallelizeFunction -from torchtitan.components.loss import LossFunction - -import math - - -def build_pipeline_schedule( - job_config: JobConfig, stages: list[PipelineStage], loss_fn: Callable -) -> _PipelineSchedule: - """Builds a pipeline schedule for the given job configuration and stages. - - Args: - job_config (JobConfig): The job configuration. - stages (list[PipelineStage]): The stages to be scheduled. - loss_fn (Callable): The loss function. - - Returns: - _PipelineSchedule: The pipeline schedule for the given stages. 
- """ - pp_schedule_csv = job_config.parallelism.pipeline_parallel_schedule_csv - - # Validate that pp_schedule_csv is a valid path - if pp_schedule_csv: - if not os.path.isfile(pp_schedule_csv): - raise FileNotFoundError( - f"The specified path {pp_schedule_csv} does not exist or is not a file." - ) - schedule_class = _PipelineScheduleRuntime - else: - schedule_class = get_schedule_class( - job_config.parallelism.pipeline_parallel_schedule - ) - - looped_schedule = issubclass(schedule_class, PipelineScheduleMulti) - microbatch_size = job_config.parallelism.pipeline_parallel_microbatch_size - batch_size = job_config.training.local_batch_size - # validate that the batch size is divisible by the microbatch_size otherwise we'll hang or error during training - if batch_size % microbatch_size != 0: - raise ValueError( - f"Batch size {job_config.training.local_batch_size} must be divisible by microbatch_size {microbatch_size}. " - "Update the config arguments for either batch_size or pipeline_parallel_microbatch_size." - ) - n_microbatches = batch_size // microbatch_size - # We expect that the number of local stages (`len(stages)`) is the same across all ranks - num_total_stages = job_config.parallelism.pipeline_parallel_degree * len(stages) - if n_microbatches < num_total_stages: - logger.warning( - f"Number of microbatches ({n_microbatches}) is less than the total number " - f"of stages ({num_total_stages}) which may result in a bubble in the pipeline." - ) - - schedule = schedule_class( - stages if looped_schedule else stages[0], - n_microbatches=n_microbatches, - loss_fn=loss_fn, - ) - logger.info( - f"Using pipeline schedule {job_config.parallelism.pipeline_parallel_schedule} " - f"with {n_microbatches} microbatches and {num_total_stages} stages." - ) - - if pp_schedule_csv: - assert schedule_class in [ - PipelineScheduleSingle, - PipelineScheduleMulti, - _PipelineScheduleRuntime, - ], ( - "Only PipelineScheduleSingle (single stage), PipelineScheduleMulti (multistage), " - "and _PipelineScheduleRuntime support csv schedules" - ) - schedule._load_csv(pp_schedule_csv) - - return schedule - - -# TODO(whc) should this be a utility inside torch.pipelining? -def stage_ids_this_rank( - pp_rank: int, pp_size: int, num_stages: int, style: str = "loop" -) -> tuple[int]: - """Compute the stage ids for the stages that will run on this pp rank for either a looped or V style schedule""" - assert ( - num_stages % pp_size == 0 - ), f"num_stages {num_stages} must be evenly divisible by pp_size {pp_size}" - stages_per_rank = num_stages // pp_size - if style == "loop": - return tuple(pp_rank + s * pp_size for s in range(stages_per_rank)) - elif style == "v": - assert ( - stages_per_rank == 2 - ), f"v schedules assume 2 stages per rank, got {stages_per_rank}" - stage_v_pairs = list( - zip(range(pp_size), range(num_stages - 1, pp_size - 1, -1)) - ) - return stage_v_pairs[pp_rank] +from torchtitan.tools.logging import logger +from torch.distributed.pipelining.schedules import ( + ScheduleDualPipeV, + ScheduleZBVZeroBubble, +) +# NOTE(3outeille): the only modifications comes from replacing None to nn.Identity and adding rotary_emb per model_part def generate_llm_fqn_per_model_part( num_stages: int, @@ -130,16 +41,13 @@ def generate_llm_fqn_per_model_part( ) -> list[list[str]]: """ Programmatically generates module names model part, focused on LLMs models. 
- Args: num_stages: Number of pipeline stages num_layers: Total number of transformer layers in the model input_weight: Weight for input modules (embed_tokens) in layer calculation output_weight: Weight for output modules (norm + output) in layer calculation - Returns: List of lists containing module names for each model part - Example: generate_llm_fqn_per_model_part(2, 3, input_weight=2, output_weight=2) treats embeddings as 2 layers and norm+output as 2 layers for distribution @@ -149,11 +57,11 @@ def generate_llm_fqn_per_model_part( if num_stages == 1: # Single stage gets everything - layer_names = [f"model.model.layers.{i}" for i in range(num_layers)] + layer_names = [f"layers.{i}" for i in range(num_layers)] return [ - ["model.model.embed_tokens"] + ["tok_embeddings"] + layer_names - + ["model.model.norm", "model.lm_head", "model.model.rotary_emb"] + + ["norm", "output", "rotary_emb"] ] # Calculate effective layers including weights @@ -201,14 +109,14 @@ def generate_llm_fqn_per_model_part( # First stage: handle input modules with weighting if stage_idx == 0: - stage_modules.append("model.model.embed_tokens") + stage_modules.append("tok_embeddings") # Account for input weight in layer distribution remaining_layers_for_stage = effective_layers_for_stage - input_weight # Add transformer layers for _ in range(remaining_layers_for_stage): if current_layer < num_layers: - stage_modules.append(f"model.model.layers.{current_layer}") + stage_modules.append(f"layers.{current_layer}") current_layer += 1 # Last stage: handle output modules with weighting @@ -219,25 +127,24 @@ def generate_llm_fqn_per_model_part( # Add transformer layers for _ in range(remaining_layers_for_stage): if current_layer < num_layers: - stage_modules.append(f"model.model.layers.{current_layer}") + stage_modules.append(f"layers.{current_layer}") current_layer += 1 # Add output modules - stage_modules.extend(["model.model.norm", "model.lm_head"]) + stage_modules.extend(["norm", "output"]) # Middle stages: only transformer layers else: for _ in range(effective_layers_for_stage): if current_layer < num_layers: - stage_modules.append(f"model.model.layers.{current_layer}") + stage_modules.append(f"layers.{current_layer}") current_layer += 1 - stage_modules.append("model.model.rotary_emb") + stage_modules.append("rotary_emb") module_names_per_stage.append(stage_modules) return module_names_per_stage - def pipeline_module_split( whole_model: nn.Module, pp_mesh: DeviceMesh, @@ -261,7 +168,7 @@ def pipeline_module_split( module_names_per_stage: List of lists, where each inner list contains the module names that should be included in that stage. Module names should be dot-separated paths. 
Examples: - - "embed_tokens" for token embeddings + - "tok_embeddings" for token embeddings - "layers.0", "layers.1" for specific transformer layers - "norm" for the final normalization layer - "output" for the output projection layer @@ -272,7 +179,7 @@ def pipeline_module_split( Example usage: module_names_per_stage = [ - ["embed_tokens", "layers.0"], # Stage 0: embeddings + first layer + ["tok_embeddings", "layers.0"], # Stage 0: embeddings + first layer ["layers.1", "layers.2"], # Stage 1: middle layers ["norm", "output"] # Stage 2: final norm + output ] @@ -288,56 +195,42 @@ def _build_stage_from_modules( # Create a set of modules to keep for faster lookup modules_to_keep = set(module_names) print(f"Stage {stage_idx}: Modules to keep: {modules_to_keep}") - - def _prune_modules_recursive(current_module: nn.Module, prefix: str): - for name, child in current_module.named_children(): - child_prefix = f"{prefix}{name}" - - # If the child module is a container, we need to check its children - if isinstance(child, (nn.ModuleDict, nn.ModuleList)): - layers_to_keep = { - m.split(".")[-1] - for m in modules_to_keep - if m.startswith(f"{child_prefix}.") - } - if layers_to_keep: - # This container has some layers we need to keep. - if isinstance(child, nn.ModuleDict): - for layer_name in list(child.keys()): - if layer_name not in layers_to_keep: - del child[layer_name] - elif isinstance(child, nn.ModuleList): - indices_to_keep = { - int(idx) for idx in layers_to_keep if idx.isdigit() - } - new_layers = nn.ModuleList( - [ - layer - for i, layer in enumerate(child) - if i in indices_to_keep - ] - ) - setattr(current_module, name, new_layers) - else: - # If no sub-modules are kept, replace with an empty container. - if isinstance(child, nn.ModuleDict): - setattr(current_module, name, nn.ModuleDict()) - elif isinstance(child, nn.ModuleList): - setattr(current_module, name, nn.ModuleList()) - elif isinstance(child, nn.Module): - # For a generic nn.Module, check if it or its children should be kept - is_kept = child_prefix in modules_to_keep - is_parent_of_kept = any( - m.startswith(f"{child_prefix}.") for m in modules_to_keep - ) - - if is_kept or is_parent_of_kept: - _prune_modules_recursive(child, f"{child_prefix}.") - else: - # Handle simple module attributes (e.g., "linear", "norm") - setattr(current_module, name, nn.Identity()) - - _prune_modules_recursive(model, "") + for module_name, module_value in model.named_children(): + # Handle layer-like structures (e.g., "layers.0", "layers.1") + if isinstance(module_value, (nn.ModuleDict, nn.ModuleList)): + layers_to_keep = { + name.split(".", 1)[1] + for name in modules_to_keep + if name.startswith(f"{module_name}.") + } + if layers_to_keep: + # Keep only specified layers + if isinstance(module_value, nn.ModuleDict): + for layer_name in list(module_value.keys()): + if layer_name not in layers_to_keep: + del module_value[layer_name] + elif isinstance(module_value, nn.ModuleList): + indices_to_keep = { + int(idx) for idx in layers_to_keep if idx.isdigit() + } + new_layers = nn.ModuleList( + [ + layer + for i, layer in enumerate(module_value) + if i in indices_to_keep + ] + ) + setattr(model, module_name, new_layers) + else: + # No layers from this structure needed, set to empty structure + if isinstance(module_value, nn.ModuleDict): + setattr(model, module_name, nn.ModuleDict()) + elif isinstance(module_value, nn.ModuleList): + setattr(model, module_name, nn.ModuleList()) + # Handle simple module attributes (e.g., "linear", "norm") + elif module_name 
not in modules_to_keep: + # Replace with Identity + setattr(model, module_name, nn.Identity()) stage = PipelineStage( model, diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 9ac01bcb86..e74459760a 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -285,6 +285,21 @@ def __init__(self, model_args: HFTransformerModelArgs): ) self.model = model_cls(config=model_args) + @property + def tok_embeddings(self): + """Returns the model's embed_tokens, handling different Hugging Face model structures.""" + if hasattr(self.model, "model") and hasattr(self.model.model, "embed_tokens"): # Llama-like + return self.model.model.embed_tokens + else: + raise AttributeError("Could not find embed_tokens in the model. Please check the model structure.") + + @tok_embeddings.setter + def tok_embeddings(self, value): + if hasattr(self.model, "model") and hasattr(self.model.model, "embed_tokens"): # Llama-like + setattr(self.model.model, "embed_tokens", value) + else: + raise AttributeError("Could not find embed_tokens in the model. Please check the model structure.") + @property def layers(self): """Returns the model's layers, handling different Hugging Face model structures.""" @@ -294,13 +309,12 @@ def layers(self): # Add more cases here if needed for other model architectures raise AttributeError("Could not find layers in the model. Please check the model structure.") - @property - def tok_embeddings(self): - """Returns the model's embed_tokens, handling different Hugging Face model structures.""" - if hasattr(self.model, "model") and hasattr(self.model.model, "embed_tokens"): # Llama-like - return self.model.model.embed_tokens + @layers.setter + def layers(self, value): + if hasattr(self.model, "model") and hasattr(self.model.model, "layers"): # Llama-like + setattr(self.model.model, "layers", value) else: - raise AttributeError("Could not find embed_tokens in the model. Please check the model structure.") + raise AttributeError("Could not find layers in the model. Please check the model structure.") @property def norm(self): @@ -326,6 +340,28 @@ def output(self): # Add more cases here if needed for other model architectures raise AttributeError("Could not find output (lm_head) in the model. Please check the model structure.") + @output.setter + def output(self, value): + if hasattr(self.model, "lm_head"): # For models like LlamaForCausalLM + setattr(self.model, "lm_head", value) + else: + raise AttributeError("Could not find output (lm_head) in the model. Please check the model structure.") + + @property + def rotary_emb(self): + """Returns the model's rotary_emb, handling different Hugging Face model structures.""" + if hasattr(self.model, "model") and hasattr(self.model.model, "rotary_emb"): # Llama-like + return self.model.model.rotary_emb + else: + raise AttributeError("Could not find rotary_emb in the model. Please check the model structure.") + + @rotary_emb.setter + def rotary_emb(self, value): + if hasattr(self.model, "model") and hasattr(self.model.model, "rotary_emb"): # Llama-like + setattr(self.model.model, "rotary_emb", value) + else: + raise AttributeError("Could not find rotary_emb in the model. 
Please check the model structure.") + def forward(self, *args, **kwargs): output = self.model(*args, **kwargs) if isinstance(output, CausalLMOutputWithPast): @@ -350,4 +386,29 @@ def selective_init(module): logger.info("Applying selective weight initialization, skipping nn.Identity modules when PP is enabled.") self.model.apply(selective_init) - self.model.tie_weights() \ No newline at end of file + self.model.tie_weights() + + def named_children(self): + """ + Provides a flattened view of the model's main components, + making it compatible with TorchTitan's expectations. + """ + yield "tok_embeddings", self.tok_embeddings + yield "layers", self.layers + yield "norm", self.norm + yield "output", self.output + yield "rotary_emb", self.rotary_emb + + def __setattr__(self, name, value): + # If a property with a setter exists for this name, use it. + # This is to bypass the nn.Module.__setattr__ logic that + # directly registers modules and skips property setters. + cls = self.__class__ + if hasattr(cls, name): + prop = getattr(cls, name) + if isinstance(prop, property) and prop.fset is not None: + prop.fset(self, value) + return + + # Otherwise, fall back to the default nn.Module behavior. + super().__setattr__(name, value) \ No newline at end of file From 1a9af6884cf61ad5cd974a5156343ff9201240a5 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Sun, 28 Sep 2025 13:59:05 +0000 Subject: [PATCH 046/129] TP now works in 1D --- .../infra/parallelize_hf_transformers.py | 30 ++++---- .../model/hf_deepseek_v3_patch.py | 1 + .../model/hf_llama_patch.py | 74 ++++++++++++++++++- .../model/hf_transformers_args.py | 19 +++-- 4 files changed, 99 insertions(+), 25 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py index 4ac6d6cd83..d36bc0589a 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -351,29 +351,29 @@ def apply_non_moe_tp( ) # Apply tensor + sequence parallelism to every transformer block - for transformer_block in model.layers.values(): + for transformer_block in model.layers: layer_plan = { - "attention_norm": SequenceParallel(), - "attention": prepare_module_input( - input_layouts=(Shard(1), None), - desired_input_layouts=(Replicate(), None), + "input_layernorm": SequenceParallel(), + "self_attn": prepare_module_input( + input_kwarg_layouts={"hidden_states": Shard(1)}, + desired_input_kwarg_layouts={"hidden_states": Replicate()}, ), - "attention.wq": colwise_parallel(), - "attention.wk": colwise_parallel(), - "attention.wv": colwise_parallel(), - "attention.wo": rowwise_parallel(output_layouts=Shard(1)), - "ffn_norm": SequenceParallel(), + "self_attn.q_proj": colwise_parallel(), + "self_attn.k_proj": colwise_parallel(), + "self_attn.v_proj": colwise_parallel(), + "self_attn.o_proj": rowwise_parallel(output_layouts=Shard(1)), + "post_attention_layernorm": SequenceParallel(), } if not transformer_block.moe_enabled: layer_plan.update( { - "feed_forward": prepare_module_input( + "mlp": prepare_module_input( input_layouts=(Shard(1),), desired_input_layouts=(Replicate(),), ), - "feed_forward.w1": colwise_parallel(), - "feed_forward.w2": rowwise_parallel(output_layouts=Shard(1)), - "feed_forward.w3": colwise_parallel(), + "mlp.gate_proj": colwise_parallel(), + "mlp.up_proj": colwise_parallel(), + "mlp.down_proj": 
rowwise_parallel(output_layouts=Shard(1)), } ) @@ -557,7 +557,7 @@ def apply_moe_ep_tp( ep_tp_mesh: DeviceMesh | None, etp_enabled: bool, ): - for transformer_block in model.layers.values(): + for transformer_block in model.layers: if not transformer_block.moe_enabled: continue diff --git a/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py b/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py index 346a400260..68594dc2be 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py +++ b/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py @@ -37,6 +37,7 @@ def seeded_trunc_normal(*args, **kwargs): def _deepseek_v3_decoder_layer_init_patched(self, config: DeepseekV3Config, layer_idx: int): _original_deepseek_v3_decoder_layer_init(self, config, layer_idx) + self.layer_idx = layer_idx self.mlp.layer_idx = layer_idx if hasattr(self.mlp, 'experts'): diff --git a/torchtitan/experiments/transformers_backend/model/hf_llama_patch.py b/torchtitan/experiments/transformers_backend/model/hf_llama_patch.py index 28888f61a6..ddde904cae 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_llama_patch.py +++ b/torchtitan/experiments/transformers_backend/model/hf_llama_patch.py @@ -1,15 +1,18 @@ - - +import torch import torch.nn as nn - from transformers.models.llama.configuration_llama import LlamaConfig -from transformers.models.llama.modeling_llama import LlamaAttention, LlamaMLP, LlamaDecoderLayer +from transformers.models.llama.modeling_llama import LlamaModel, LlamaAttention, LlamaMLP, LlamaDecoderLayer from transformers.modeling_utils import PreTrainedModel +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_outputs import BaseModelOutputWithPast +from typing import Optional + _original_llama_decoder_layer_init = LlamaDecoderLayer.__init__ def _llama_decoder_layer_init_patched(self, config: LlamaConfig, layer_idx: int): _original_llama_decoder_layer_init(self, config, layer_idx) + self.layer_idx = layer_idx self.mlp.layer_idx = layer_idx def _initialize_weights_patched(self, module): @@ -83,8 +86,71 @@ def _init_weights_patched(self, module): if hasattr(module, "bias") and module.bias is not None: module.bias.data.zero_() +def _patched_forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + **kwargs, +) -> BaseModelOutputWithPast: + """ + A patched version of LlamaModel.forward that disables the causal mask. + This is a direct copy of the original method with one line changed. 
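+    The only functional change is that the causal mask is forced to None, so SDPA
+    rebuilds it from the query/key sequence lengths rather than from the
+    (possibly sequence-sharded) input embeddings.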
+ """ + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds: torch.Tensor = self.embed_tokens(input_ids) + + if use_cache and past_key_values is None: + past_key_values = DynamicCache() + + if cache_position is None: + past_seen_tokens = ( + past_key_values.get_seq_length() if past_key_values is not None else 0 + ) + cache_position: torch.Tensor = torch.arange( + past_seen_tokens, + past_seen_tokens + inputs_embeds.shape[1], + device=inputs_embeds.device, + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + # --- START OF PATCH --- + # NOTE(3outeille): When TP enabled, the causal_mask will be created based on input_embeds which has sharded seq_len. + # We set it to False so that SDPA is creating the causal mask based on query & key seq_len. + causal_mask = None + # --- END OF PATCH --- + + hidden_states = inputs_embeds + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + for decoder_layer in self.layers[: self.config.num_hidden_layers]: + hidden_states = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_value=past_key_values, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + + hidden_states = self.norm(hidden_states) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + ) def patch_hf_llama(): + LlamaModel.forward = _patched_forward LlamaDecoderLayer.__init__ = _llama_decoder_layer_init_patched PreTrainedModel._init_weights = _init_weights_patched PreTrainedModel._initialize_weights = _initialize_weights_patched \ No newline at end of file diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index e74459760a..66fa558a58 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -6,8 +6,7 @@ import importlib from dataclasses import dataclass -from typing import Optional - +import torch from torch import nn from torchtitan.config import JobConfig from torchtitan.protocols import BaseModelArgs @@ -285,6 +284,12 @@ def __init__(self, model_args: HFTransformerModelArgs): ) self.model = model_cls(config=model_args) + for layer in self.model.model.layers: + if hasattr(model_args, "first_k_dense_replace") and layer.layer_idx >= model_args.first_k_dense_replace: + layer.moe_enabled = True + else: + layer.moe_enabled = False + @property def tok_embeddings(self): """Returns the model's embed_tokens, handling different Hugging Face model structures.""" @@ -363,9 +368,10 @@ def rotary_emb(self, value): raise AttributeError("Could not find rotary_emb in the model. 
Please check the model structure.") def forward(self, *args, **kwargs): - output = self.model(*args, **kwargs) - if isinstance(output, CausalLMOutputWithPast): - return output.logits + position_ids = torch.arange(args[0].shape[1], device=args[0].device).unsqueeze(0) + kwargs["position_ids"] = position_ids + output = self.model.model(*args, **kwargs) + output = self.model.lm_head(output.last_hidden_state) return output def init_weights(self, *args, **kwargs): @@ -382,8 +388,9 @@ def selective_init(module): # For pipeline parallel, we need to skip nn.Identity modules if not isinstance(module, nn.Identity): original_init_weights_fn(module) + else: + logger.info("Skipping nn.Identity module during weight initialization.") - logger.info("Applying selective weight initialization, skipping nn.Identity modules when PP is enabled.") self.model.apply(selective_init) self.model.tie_weights() From e6b9ff5f8cc394130241b891c8d025b65d223819 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Sun, 28 Sep 2025 13:59:22 +0000 Subject: [PATCH 047/129] add test filtering in compare distributed run --- .../compare_distributed_run.py | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.py b/torchtitan/experiments/transformers_backend/compare_distributed_run.py index 1a432b68bd..5ee59e60ea 100644 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.py +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.py @@ -198,6 +198,7 @@ def __init__(self): self.grad_norm_rtol = self.DEFAULT_GRAD_NORM_RTOL self.parallelism_configs: List[ParallelismConfig] = [] self.results_dir: Optional[Path] = None + self.test_filter = "" def generate_parallelism_configs(self) -> None: """Generate parallelism configurations based on the number of GPUs.""" @@ -284,8 +285,16 @@ def _get_factors(n: int) -> List[int]: LogLevel.INFO, f"Generated {len(self.parallelism_configs)} parallelism configurations for {ngpu} GPUs.", ) + configs_to_display = self.parallelism_configs + table_title = "[bold]Generated Parallelism Configurations[/bold]" + + if self.test_filter: + # Keep fsdp baseline and anything that matches the filter + configs_to_display = [c for c in self.parallelism_configs if c.name == "fsdp" or self.test_filter in c.name] + table_title = f"[bold]Filtered Parallelism Configurations (filter: [cyan]'{self.test_filter}'[/cyan])[/bold]" + table = Table( - title="[bold]Generated Parallelism Configurations[/bold]", + title=table_title, show_header=True, header_style="bold magenta", ) @@ -298,7 +307,7 @@ def _get_factors(n: int) -> List[int]: table.add_column("ep", justify="right") table.add_column("eptp", justify="right") - for config in self.parallelism_configs: + for config in configs_to_display: table.add_row( config.name, str(config.dp_replicate), @@ -658,6 +667,8 @@ def run(self) -> int: ) parser.add_argument("-m", "--model-filter", default="", help="Filter models by name pattern (e.g., 'llama3')") + parser.add_argument("-t", "--test-filter", default="", + help="Filter parallelism configurations by name pattern (e.g., 'fsdp1_cp1_tp2_pp2')") parser.add_argument("-nd", "--nd_parallel", type=str, default="2d", help=f"Parallelism to use (default: {self.ND_PARALLEL_TO_NB_GPUS.keys()})") parser.add_argument("-s", "--steps", type=int, default=self.DEFAULT_STEPS, @@ -682,6 +693,7 @@ def run(self) -> int: self.ngpu = self.nd_parallel_to_nb_gpus[self.nd_parallel] self.steps = args.steps self.model_filter = 
args.model_filter + self.test_filter = args.test_filter self.flavor = args.flavor self.verbose = args.verbose self.loss_atol = args.loss_atol @@ -696,6 +708,7 @@ def run(self) -> int: f"[bold]Steps:[/bold] {self.steps}\n" f"[bold]Seed:[/bold] {self.seed}\n" f"[bold]Model filter:[/bold] {self.model_filter or 'all'}\n" + f"[bold]Test filter:[/bold] {self.test_filter or 'all'}\n" f"[bold]Model flavor:[/bold] {self.flavor}" ), title="[bold cyan]Distributed Parallelism Comparison[/bold cyan]", @@ -780,6 +793,11 @@ def run(self) -> int: passed_tests = 1 # +1 for the baseline (FSDP) failed_tests = 0 test_configs = [c for c in self.parallelism_configs if c.name != "fsdp"] + if self.test_filter: + filtered_configs = [c for c in test_configs if self.test_filter in c.name] + if not filtered_configs: + log_message(LogLevel.WARNING, f"Test filter '{self.test_filter}' did not match any test configurations.") + test_configs = filtered_configs total_tests = len(test_configs) + 1 # +1 for the baseline (FSDP) results = [] From a4cb8c3b39c542a3bd34d9fa53d9a49a7bc129ce Mon Sep 17 00:00:00 2001 From: 3outeille Date: Sun, 28 Sep 2025 14:06:22 +0000 Subject: [PATCH 048/129] dont generate EP config if model is not a MoE --- .../compare_distributed_run.py | 43 ++++++++++++------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.py b/torchtitan/experiments/transformers_backend/compare_distributed_run.py index 5ee59e60ea..cc8f54f51b 100644 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.py +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.py @@ -200,8 +200,18 @@ def __init__(self): self.results_dir: Optional[Path] = None self.test_filter = "" - def generate_parallelism_configs(self) -> None: + def generate_parallelism_configs(self, hf_model_name: str) -> None: """Generate parallelism configurations based on the number of GPUs.""" + from transformers import AutoConfig + + try: + model_config = AutoConfig.from_pretrained(hf_model_name) + is_moe = getattr(model_config, "num_local_experts", 0) > 1 + except Exception: + # Fallback for models not on Hub or other errors + is_moe = False + log_message(LogLevel.WARNING, f"Could not determine if {hf_model_name} is a MoE model from HuggingFace Hub. 
EP configurations will not be generated.") + ngpu = self.nd_parallel_to_nb_gpus[self.nd_parallel] configs = [] @@ -253,20 +263,21 @@ def _get_factors(n: int) -> List[int]: ) ) - # NOTE(3outeille): EP borrowing degree from dp_shard - configs.append( - ParallelismConfig( - name=f"fsdp{dp_shard}_cp{cp}_tp{tp}_pp{pp}_ep{dp_shard}", - dp_replicate=1, - dp_shard=dp_shard, - tp=tp, - pp=pp, - pp_schedule="1F1B", - cp=cp, - ep=dp_shard, - eptp=1 + if is_moe: + # NOTE(3outeille): EP borrowing degree from dp_shard + configs.append( + ParallelismConfig( + name=f"fsdp{dp_shard}_cp{cp}_tp{tp}_pp{pp}_ep{dp_shard}", + dp_replicate=1, + dp_shard=dp_shard, + tp=tp, + pp=pp, + pp_schedule="1F1B", + cp=cp, + ep=dp_shard, + eptp=1 + ) ) - ) # Remove duplicates and assign to instance @@ -721,8 +732,6 @@ def run(self) -> int: self.base_results_dir.mkdir(exist_ok=True) - self.generate_parallelism_configs() - # TODO(3outeille): make it more generic later if self.model_filter == "llama3": hf_model_name = "meta-llama/Llama-3.2-1B" @@ -733,6 +742,8 @@ def run(self) -> int: else: raise ValueError(f"Model filter {self.model_filter} not supported") + self.generate_parallelism_configs(hf_model_name) + model_owner, model_repo = hf_model_name.split("/", 1) nd_parallel_upper = self.nd_parallel.upper() self.results_dir = self.base_results_dir / model_owner / model_repo / nd_parallel_upper / self.flavor From 12c0c474a38340750afee1bf15da5c5f49720af7 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Sun, 28 Sep 2025 14:08:11 +0000 Subject: [PATCH 049/129] disable torch.utils.deterministic.fill_uninitialized_memory for Moe during testing --- torchtitan/train.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/torchtitan/train.py b/torchtitan/train.py index b15cd73e2c..881f353734 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -253,6 +253,9 @@ def __init__(self, job_config: JobConfig): del model for m in self.model_parts: + if is_torch_deterministic(): + # Otherwise, HF register buffer for ROPE (inv_freq) and this will be by default be initialized to Nan + torch.utils.deterministic.fill_uninitialized_memory = False m.to_empty(device=init_device) with torch.no_grad(): m.init_weights(buffer_device=buffer_device) From 13edc66cc4e8c1764590226fb6b1d16a1b171a1a Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 29 Sep 2025 12:37:53 +0000 Subject: [PATCH 050/129] CP is now supported --- .../infra/parallelize_hf_transformers.py | 2 +- .../model/hf_transformers_args.py | 1 - torchtitan/train.py | 13 ++++++++----- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py index d36bc0589a..56d6cf9ca6 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -194,7 +194,7 @@ def parallelize_hf_transformers( job_config.parallelism.context_parallel_degree > 1 and model.model_args.use_flex_attn ): - raise NotImplementedError("CP support for FlexAttention is still in progress.") + logger.warning("CP support for FlexAttention is still in progress.") if parallel_dims.tp_enabled: enable_float8_linear = "float8" in job_config.model.converters diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 66fa558a58..afafddd900 100644 --- 
a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -13,7 +13,6 @@ from torchtitan.tools.logging import logger from transformers import AutoConfig from transformers.configuration_utils import PretrainedConfig -from transformers.modeling_outputs import CausalLMOutputWithPast @dataclass class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): diff --git a/torchtitan/train.py b/torchtitan/train.py index 881f353734..735180ee5a 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -34,9 +34,6 @@ maybe_enable_profiling, ) -from transformers.models.llama.modeling_llama import CausalLMOutputWithPast - - class Trainer(torch.distributed.checkpoint.stateful.Stateful): # core configs job_config: JobConfig @@ -429,11 +426,17 @@ def forward_backward_step( # apply context parallelism if cp is enabled # ensure CP handles the separate freqs_cis buffer for each pp stage + cp_buffers = [inputs, labels] + cp_seq_dims = [1, 1] + if hasattr(model_parts[0], "freqs_cis"): + cp_buffers += [m.freqs_cis for m in model_parts] + cp_seq_dims += [0 for _ in model_parts] + optional_context_parallel_ctx = ( dist_utils.create_context_parallel_ctx( cp_mesh=parallel_dims.world_mesh["cp"], - cp_buffers=[inputs, labels] + [m.freqs_cis for m in model_parts], - cp_seq_dims=[1, 1] + [0 for _ in model_parts], + cp_buffers=cp_buffers, + cp_seq_dims=cp_seq_dims, cp_no_restore_buffers={inputs, labels}, cp_rotate_method=self.job_config.parallelism.context_parallel_rotate_method, ) From 52250fb4de667727b4d92eb3be1de3afe8a1f92f Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 29 Sep 2025 12:43:30 +0000 Subject: [PATCH 051/129] some cleaning --- .../compare_distributed_run.sh | 9 +- .../transformers_backend/compare_tt_hf_run.sh | 104 ----------- .../configs/debug_1_gpu_hf.toml | 62 ------- .../configs/debug_fsdp_2_gpu.toml | 65 ------- ...debug_1_gpu_tt.toml => test_template.toml} | 3 + .../reference_diff_deepseekv3_1gpu.log | 163 ------------------ .../reference_diff_llama3_1gpu.log | 133 -------------- .../transformers_backend/run_train.sh | 44 ----- 8 files changed, 9 insertions(+), 574 deletions(-) delete mode 100755 torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh delete mode 100644 torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml delete mode 100644 torchtitan/experiments/transformers_backend/configs/debug_fsdp_2_gpu.toml rename torchtitan/experiments/transformers_backend/configs/{debug_1_gpu_tt.toml => test_template.toml} (95%) delete mode 100644 torchtitan/experiments/transformers_backend/reference_diff_deepseekv3_1gpu.log delete mode 100644 torchtitan/experiments/transformers_backend/reference_diff_llama3_1gpu.log delete mode 100755 torchtitan/experiments/transformers_backend/run_train.sh diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.sh b/torchtitan/experiments/transformers_backend/compare_distributed_run.sh index d7e5b77bcb..2ca9bbee62 100755 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.sh +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.sh @@ -1,5 +1,8 @@ #!/usr/bin/bash -python compare_distributed_run.py --steps 5 --model-filter llama3 --flavor debugmodel --nd_parallel 0d --verbose - -# debugpy-run compare_distributed_run.py --steps 5 --model-filter llama3 --flavor debugmodel --nd_parallel 0d +if [[ "$1" == "--debug" ]]; then + shift + debugpy-run compare_distributed_run.py 
--steps 10 --model-filter llama3 --flavor debugmodel --nd_parallel 1d "$@" +else + python compare_distributed_run.py --steps 10 --model-filter llama3 --flavor debugmodel --nd_parallel 1d "$@" +fi diff --git a/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh b/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh deleted file mode 100755 index 703a9b55c9..0000000000 --- a/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/bash - -set -ex -set -o pipefail - -# Common settings -NGPU=${NGPU:-"1"} -export LOG_RANK=${LOG_RANK:-0} - -# Parse command line arguments for model selection -MODEL_TYPE=${1:-"llama"} -export MODEL_TYPE -SEED=${SEED:-42} -export SEED -# Set model names based on argument -case $MODEL_TYPE in - "llama") - TT_MODEL_NAME="llama3" - HF_MODEL_NAME="meta-llama/Llama-3.2-1B" - ;; - "deepseek") - TT_MODEL_NAME="deepseek_v3" - HF_MODEL_NAME="deepseek-ai/DeepSeek-V3" - ;; - *) - echo "Error: Unsupported model type '$MODEL_TYPE'" - echo "Usage: $0 [llama|deepseek] [additional_args...]" - echo " llama - Uses llama3 for TT and meta-llama/Llama-3.2-1B for HF" - echo " deepseek - Uses deepseek_v3 for TT and deepseek-ai/DeepSeek-V3 for HF" - exit 1 - ;; -esac - -echo "Using model type: $MODEL_TYPE" -echo " TT model: $TT_MODEL_NAME" -echo " HF model: $HF_MODEL_NAME" - -# Shift to remove the model type argument, pass remaining args to training -shift - -run_tt() { - echo "##############################################" - echo "### Running TorchTitan (native) training ###" - echo "##############################################" - TT_CONFIG="/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.toml" - - # Use CUDA_VISIBLE_DEVICES=0 for TT run - CUDA_VISIBLE_DEVICES=0 \ - torchrun --nproc_per_node=${NGPU} --master_port 1234 --rdzv_backend c10d --rdzv_endpoint="localhost:0" \ - --local-ranks-filter ${LOG_RANK} --role rank --tee 3 \ - -m torchtitan.train --job.config_file ${TT_CONFIG} --training.seed ${SEED} --training.deterministic --model.name ${TT_MODEL_NAME} "$@" -} - -run_hf() { - echo "#######################################################" - echo "### Running TorchTitan with HF backend training ###" - echo "#######################################################" - HF_CONFIG="/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml" - - # Use CUDA_VISIBLE_DEVICES=1 for HF run - CUDA_VISIBLE_DEVICES=1 \ - torchrun --nproc_per_node=${NGPU} --master_port 1235 --rdzv_backend c10d --rdzv_endpoint="localhost:0" \ - --local-ranks-filter ${LOG_RANK} --role rank --tee 3 \ - -m torchtitan.train --job.config_file ${HF_CONFIG} --training.seed ${SEED} --training.deterministic --model.name ${HF_MODEL_NAME} "$@" -} - -TT_LOG="tt_run.log" -HF_LOG="hf_run.log" -DIFF_LOG="run_diff.log" - -export DEBUG_JSON_PATH="/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/debug_mode_hf" -run_hf "$@" 2>&1 | tee ${HF_LOG} || true -export DEBUG_JSON_PATH="/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/debug_mode_tt" -run_tt "$@" 2>&1 | tee ${TT_LOG} || true -# run_tt "$@" 2>&1 | tee ${HF_LOG} - - -# Filter logs to remove noisy differences -TT_LOG_FILTERED="${TT_LOG}.filtered" -HF_LOG_FILTERED="${HF_LOG}.filtered" - -# This sed command removes timestamps, PIDs, master ports, and other -# volatile details 
that change between runs. -# Feel free to adjust the regex patterns to better suit your log format. -sed -E \ - -e 's/([0-9]{4}-[0-9]{2}-[0-9]{2} )?[0-9]{2}:[0-9]{2}:[0-9]{2}(,[0-9]+)?/TIMESTAMP/g' \ - -e 's/torchrun.*--master_port[= ]([0-9]+)/torchrun ... --master_port=XXXX/g' \ - -e 's/PID [0-9]+/PID XXXX/g' \ - -e 's/localhost:[0-9]+/localhost:XXXX/g' \ - < "${TT_LOG}" > "${TT_LOG_FILTERED}" - -sed -E \ - -e 's/([0-9]{4}-[0-9]{2}-[0-9]{2} )?[0-9]{2}:[0-9]{2}:[0-9]{2}(,[0-9]+)?/TIMESTAMP/g' \ - -e 's/torchrun.*--master_port[= ]([0-9]+)/torchrun ... --master_port=XXXX/g' \ - -e 's/PID [0-9]+/PID XXXX/g' \ - -e 's/localhost:[0-9]+/localhost:XXXX/g' \ - < "${HF_LOG}" > "${HF_LOG_FILTERED}" - -echo "############################################" -echo "### Diff between TT and HF run logs ###" -echo "############################################" -echo "### Log diff is being saved to ${DIFF_LOG}" -echo "############################################" -git diff --no-index --color=always --word-diff=color "${TT_LOG_FILTERED}" "${HF_LOG_FILTERED}" | tee "${DIFF_LOG}" || true diff --git a/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml b/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml deleted file mode 100644 index 95aa9599b2..0000000000 --- a/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml +++ /dev/null @@ -1,62 +0,0 @@ -[job] -dump_folder = "./outputs" -description = "HF Llama 3 debug training" -print_args = false -use_for_integration_test = true - -[profiling] -enable_profiling = true -save_traces_folder = "profile_trace_hf" -profile_freq = 5 -enable_memory_snapshot = false -save_memory_snapshot_folder = "memory_snapshot" - -[metrics] -log_freq = 1 -disable_color_printing = false -enable_tensorboard = false -save_tb_folder = "tb" -enable_wandb = false - -[model] -name = "meta-llama/Llama-3.2-1B" -flavor = "debugmodel" -tokenizer_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" - -[optimizer] -name = "AdamW" -lr = 8e-4 -eps = 1e-8 - -[lr_scheduler] -warmup_steps = 2 -decay_ratio = 0.8 -decay_type = "linear" -min_lr_factor = 0.0 - -[training] -local_batch_size = 8 -seq_len = 2048 -max_norm = 1.0 -steps = 10 -compile = false -dataset = "c4_test" -dataset_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test" - -[parallelism] -data_parallel_replicate_degree = 1 -data_parallel_shard_degree = 1 -tensor_parallel_degree = 1 -pipeline_parallel_degree = 1 -context_parallel_degree = 1 -expert_parallel_degree = 1 - -[checkpoint] -enable_checkpoint = false - -[activation_checkpoint] -mode = "selective" -selective_ac_option = '2' - -[validation] -enabled = false \ No newline at end of file diff --git a/torchtitan/experiments/transformers_backend/configs/debug_fsdp_2_gpu.toml b/torchtitan/experiments/transformers_backend/configs/debug_fsdp_2_gpu.toml deleted file mode 100644 index db97c9b339..0000000000 --- a/torchtitan/experiments/transformers_backend/configs/debug_fsdp_2_gpu.toml +++ /dev/null @@ -1,65 +0,0 @@ -# FSDP-only configuration for a 2-GPU setup. -# Model is sharded across GPUs. 
- -[job] -dump_folder = "./outputs" -description = "Llama 3 debug training with FSDP on 2 GPUs" -print_args = false -use_for_integration_test = true - -[profiling] -enable_profiling = false -save_traces_folder = "profile_trace" -profile_freq = 10 -enable_memory_snapshot = false -save_memory_snapshot_folder = "memory_snapshot" - -[metrics] -log_freq = 1 -disable_color_printing = false -enable_tensorboard = false -save_tb_folder = "tb" -enable_wandb = false - -[model] -name = "llama3" -flavor = "debugmodel" -tokenizer_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" - -[optimizer] -name = "AdamW" -lr = 8e-4 -eps = 1e-8 - -[lr_scheduler] -warmup_steps = 2 -decay_ratio = 0.8 -decay_type = "linear" -min_lr_factor = 0.0 - -[training] -local_batch_size = 8 -seq_len = 2048 -max_norm = 1.0 -steps = 10 -compile = false -dataset = "c4_test" -dataset_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test" - -[parallelism] -data_parallel_replicate_degree = 1 -data_parallel_shard_degree = 2 -tensor_parallel_degree = 1 -pipeline_parallel_degree = 1 -context_parallel_degree = 1 -expert_parallel_degree = 1 - -[checkpoint] -enable_checkpoint = false - -[activation_checkpoint] -mode = "selective" -selective_ac_option = '2' - -[validation] -enabled = false \ No newline at end of file diff --git a/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.toml b/torchtitan/experiments/transformers_backend/configs/test_template.toml similarity index 95% rename from torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.toml rename to torchtitan/experiments/transformers_backend/configs/test_template.toml index b153a98f21..f56a0332d7 100644 --- a/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.toml +++ b/torchtitan/experiments/transformers_backend/configs/test_template.toml @@ -53,7 +53,10 @@ fsdp_reshard_after_forward = "default" # default / never / always tensor_parallel_degree = 1 enable_async_tensor_parallel = false pipeline_parallel_degree = 1 +pipeline_parallel_schedule = "1F1B" context_parallel_degree = 1 +expert_parallel_degree = 1 +expert_tensor_parallel_degree = 1 [checkpoint] enable = false diff --git a/torchtitan/experiments/transformers_backend/reference_diff_deepseekv3_1gpu.log b/torchtitan/experiments/transformers_backend/reference_diff_deepseekv3_1gpu.log deleted file mode 100644 index 1155c9a5db..0000000000 --- a/torchtitan/experiments/transformers_backend/reference_diff_deepseekv3_1gpu.log +++ /dev/null @@ -1,163 +0,0 @@ -diff --git a/tt_run.log.filtered b/hf_run.log.filtered -index 9726db6..84b6138 100644 ---- a/tt_run.log.filtered -+++ b/hf_run.log.filtered -@@ -1,85 +1,153 @@ -+ echo '##############################################' -##############################################'#######################################################' -####################################################### -+ echo '### Running TorchTitan (native)with HF backend training ###' -### Running TorchTitan (native)with HF backend training ### -+ echo '##############################################' -##############################################'#######################################################' -####################################################### -+ 
TT_CONFIG=/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.tomlHF_CONFIG=/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml -+ CUDA_VISIBLE_DEVICES=0CUDA_VISIBLE_DEVICES=1 -+ torchrun ... --master_port=XXXX --rdzv_backend c10d --rdzv_endpoint=localhost:XXXX --local-ranks-filter 0 --role rank --tee 3 -m torchtitan.train --job.config_file /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.toml/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml --training.seed 42 --training.deterministic --model.name deepseek_v3deepseek-ai/DeepSeek-V3 -[rank0]:/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/transformers/src/transformers/utils/hub.py:111: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. -[rank0]: warnings.warn( -[rank0]:[titan] TIMESTAMP - root - WARNING - tokenizer_path is deprecated, use model.hf_assets_path instead. Setting hf_assets_path to tokenizer_path temporarily. -[rank0]:[titan] TIMESTAMP - root - INFO - Starting job: HF Llama 3 debug training -[rank0]:[titan] TIMESTAMP - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config -[rank0]:[titan] TIMESTAMP - root - INFO - Building 0-D device mesh with [], [] -[rank0]:[titan] TIMESTAMP - root - INFO - [GC] Initial GC collection 0.00 seconds -[rank0]:[titan] TIMESTAMP - root - INFO - Deterministic algorithm enabled (expect perf degradation). -[rank0]:[titan] TIMESTAMP - root - INFO - Loading tokenizer from tokenizer.json -[rank0]:[titan] TIMESTAMP - root - INFO - Preparing c4_test dataset from /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test -[rank0]:[titan] TIMESTAMP - root - INFO - Building deepseek_v3deepseek-ai/DeepSeek-V3 debugmodel with DeepSeekV3ModelArgs(_enforced='This field is used to enforce all fields have defaults.', max_batch_size=8, max_seq_len=2048, vocab_size=2000, dim=256, inter_dim=1024, moe_inter_dim=256, n_layers=2, n_dense_layers=1, n_heads=16, norm_eps=1e-05, moe_args=MoEArgs(num_experts=8,HFTransformerModelArgs( -[rank0]:attn_implementation='sdpa' -[rank0]:attn_mask_type='causal' -[rank0]:beta_fast=None -[rank0]:beta_slow=None -[rank0]:depth_init=True -[rank0]:dim=256 -[rank0]:eos_id=0 -[rank0]:ffn_dim_multiplier=None -[rank0]:inter_dim=1024 -[rank0]:kv_lora_rank=512 -[rank0]:max_seq_len=2048 -[rank0]:moe_args=MoEArgs(num_experts=8, num_shared_experts=2, score_func='softmax', route_norm=True, route_scale=1.0, score_before_experts=False, top_k=3, use_grouped_mm=True, load_balance_coeff=0.001), n_expert_groups=1, n_limited_groups=1, q_lora_rank=0, kv_lora_rank=512, qk_nope_head_dim=128, qk_rope_head_dim=64, v_head_dim=128, use_flex_attn=False, attn_mask_type='causal', original_seq_len=4096, rope_theta=10000.0, rope_factor=40, beta_fast=32, beta_slow=1, mscale=0.7)load_balance_coeff=0.001) -[rank0]:moe_inter_dim=256 -[rank0]:moe_intermediate_size=256 -[rank0]:mscale=0.7 -[rank0]:multiple_of=256 -[rank0]:n_dense_layers=1 -[rank0]:n_expert_groups=None -[rank0]:n_group=2 -[rank0]:n_heads=16 -[rank0]:n_kv_heads=16 -[rank0]:n_layers=2 -[rank0]:n_limited_groups=None -[rank0]:n_routed_experts=8 -[rank0]:n_shared_experts=2 -[rank0]:norm_eps=1e-05 -[rank0]:num_experts_per_tok=3 
-[rank0]:original_seq_len=None -[rank0]:partial_rotary_factor=4.0 -[rank0]:q_lora_rank=None -[rank0]:qk_nope_head_dim=128 -[rank0]:qk_rope_head_dim=64 -[rank0]:rope_factor=None -[rank0]:rope_theta=10000 -[rank0]:topk_group=1 -[rank0]:use_flex_attn=False -[rank0]:v_head_dim=128 -[rank0]:vocab_size=2000 -[rank0]:) -[rank0]:[titan] TIMESTAMP - root - INFO - CUDA capacity: NVIDIA H100 80GB HBM3 with 79.44GiB memory -[rank0]:[titan] TIMESTAMP - root - INFO - Total parameter count: dense 8,923,392, sparse 1,968,128, active 9,908,480 -[rank0]:[titan] TIMESTAMP - root - INFO - Model Structure Parameter Breakdown: -[rank0]:[titan] TIMESTAMP - root - INFO - DeepSeekV3ModelHFTransformerModel - 10,891,520 params -[rank0]:[titan] TIMESTAMP - root - INFO - (tok_embeddings):(embed_tokens): Embedding - 512,000 params -[rank0]:[titan] TIMESTAMP - root - INFO - (layers): ModuleDictModuleList - 9,867,264 params -[rank0]:[titan] TIMESTAMP - root - INFO - (0): TransformerBlockDeepseekV3DecoderLayer - 4,342,784 params -[rank0]:[titan] TIMESTAMP - root - INFO - (attention): Attention(self_attn): DeepseekV3Attention - 3,555,840 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wq):(q_proj): Linear - 786,432 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wkv_a):(kv_a_proj_with_mqa): Linear - 147,456 params -[rank0]:[titan] TIMESTAMP - root - INFO - (kv_norm): RMSNorm(kv_a_layernorm): DeepseekV3RMSNorm - 512 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wkv_b):(kv_b_proj): Linear - 2,097,152 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wo):(o_proj): Linear - 524,288 params -[rank0]:[titan] TIMESTAMP - root - INFO - (attention_norm): RMSNorm(mlp): DeepseekV3MLP - 256786,432 params -[rank0]:[titan] TIMESTAMP - root - INFO - (ffn_norm): RMSNorm(gate_proj): Linear - 256262,144 params -[rank0]:[titan] TIMESTAMP - root - INFO - (feed_forward): FeedForward(up_proj): Linear - 786,432262,144 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w1):(down_proj): Linear - 262,144 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w2): Linear(input_layernorm): DeepseekV3RMSNorm - 262,144256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w3): Linear(post_attention_layernorm): DeepseekV3RMSNorm - 262,144256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (1): TransformerBlockDeepseekV3DecoderLayer - 5,524,480 params -[rank0]:[titan] TIMESTAMP - root - INFO - (attention): Attention(self_attn): DeepseekV3Attention - 3,555,840 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wq):(q_proj): Linear - 786,432 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wkv_a):(kv_a_proj_with_mqa): Linear - 147,456 params -[rank0]:[titan] TIMESTAMP - root - INFO - (kv_norm): RMSNorm(kv_a_layernorm): DeepseekV3RMSNorm - 512 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wkv_b):(kv_b_proj): Linear - 2,097,152 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wo):(o_proj): Linear - 524,288 params -[rank0]:[titan] TIMESTAMP - root - INFO - (attention_norm): RMSNorm(mlp): DeepseekV3MoE - 2561,968,128 params -[rank0]:[titan] TIMESTAMP - root - INFO - (ffn_norm): RMSNorm(experts): ModuleList - 2561,572,864 params -[rank0]:[titan] TIMESTAMP - root - INFO - (moe): MoE(0): DeepseekV3MLP - 1,968,128196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (experts): GroupedExperts(gate_proj): Linear - 1,572,86465,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (router): TokenChoiceTopKRouter(up_proj): Linear - 2,04865,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (gate):(down_proj): Linear - 65,536 params 
-[rank0]:[titan] TIMESTAMP - root - INFO - (1): DeepseekV3MLP - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (gate_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (up_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (down_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (2): DeepseekV3MLP - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (gate_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (up_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (down_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (3): DeepseekV3MLP - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (gate_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (up_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (down_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (4): DeepseekV3MLP - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (gate_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (up_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (down_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (5): DeepseekV3MLP - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (gate_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (up_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (down_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (6): DeepseekV3MLP - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (gate_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (up_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (down_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (7): DeepseekV3MLP - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (gate_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (up_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (down_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (gate): DeepseekV3TopkRouter - 2,048 params -[rank0]:[titan] TIMESTAMP - root - INFO - (shared_experts): FeedForwardDeepseekV3MLP - 393,216 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w1):(gate_proj): Linear - 131,072 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w2):(up_proj): Linear - 131,072 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w3):(down_proj): Linear - 131,072 params -[rank0]:[titan] TIMESTAMP - root - INFO - (input_layernorm): DeepseekV3RMSNorm - 256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (post_attention_layernorm): DeepseekV3RMSNorm - 256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (norm): RMSNormDeepseekV3RMSNorm - 256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (output):(lm_head): Linear - 512,000 params -[rank0]:[titan] TIMESTAMP - root - INFO - Model deepseek_v3deepseek-ai/DeepSeek-V3 debugmodel size: 10,891,520 total parameters -[rank0]:[titan] TIMESTAMP - root - INFO - Applied selective activation checkpointing to the model -[rank0]:[titan] TIMESTAMP - root - INFO - Peak FLOPS used for computing MFU: 9.890e+14 -[rank0]:[titan] TIMESTAMP - root - INFO - CUDA memory usage for model: 0.05GiB(0.06%) -[rank0]:[titan] TIMESTAMP - root - INFO - Mixed precision training is handled by AMP -[rank0]:[titan] TIMESTAMP - root - INFO - Trainer is 
initialized with local batch size 8, global batch size 8, gradient accumulation steps 1, sequence length 2048, total steps 10 (warmup 2) -[rank0]:[titan] TIMESTAMP - root - INFO - Training starts at step 1 -[rank0]:[titan] TIMESTAMP - root - INFO - Profiling active. Traces will be saved at ./outputs/profile_trace -[rank0]:/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/env_torchtitan_official/lib/python3.12/site-packages/torch/nn/functional.py:2920: UserWarning: Mismatch dtype between input and weight: input dtype = c10::BFloat16, weight dtype = float, Cannot dispatch to fused implementation. (Triggered internally at /pytorch/aten/src/ATen/native/layer_norm.cpp:344.) -[rank0]: return torch.rms_norm(input, normalized_shape, weight, eps)./outputs/profile_trace_hf -[rank0]:[titan] TIMESTAMP - root - INFO - step: 1 loss: 8.13818.1218 grad_norm: 2.73742.7807 memory: 2.14GiB(2.70%)2.48GiB(3.13%) tps: 18,02411,445 tflops: 1.240.89 mfu: 0.13%0.09% -[rank0]:[titan] TIMESTAMP - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40 -[rank0]:[titan] TIMESTAMP - root - INFO - step: 2 loss: 7.02086.8905 grad_norm: 3.26153.2709 memory: 2.15GiB(2.71%)2.49GiB(3.13%) tps: 20,23217,755 tflops: 1.401.38 mfu: 0.14% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 3 loss: 5.26425.1682 grad_norm: 2.87352.8229 memory: 2.15GiB(2.71%)2.49GiB(3.13%) tps: 325,066119,606 tflops: 22.429.32 mfu: 2.27%0.94% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 4 loss: 4.82864.7719 grad_norm: 2.18852.2433 memory: 2.15GiB(2.71%)2.51GiB(3.15%) tps: 345,536135,937 tflops: 23.8310.59 mfu: 2.41%1.07% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 5 loss: 4.43704.3827 grad_norm: 2.30532.3779 memory: 2.15GiB(2.71%)2.51GiB(3.15%) tps: 296,009133,266 tflops: 20.4110.39 mfu: 2.06%1.05% -[rank0]:[titan] TIMESTAMP - root - INFO - Dumping profiler traces at step 5 -[rank0]:[titan] TIMESTAMP - root - INFO - Finished dumping profiler traces in 0.030.05 seconds -[rank0]:[titan] TIMESTAMP - root - INFO - step: 6 loss: 4.30634.2368 grad_norm: 2.24452.2557 memory: 2.15GiB(2.71%)2.71GiB(3.41%) tps: 136,06566,465 tflops: 9.385.18 mfu: 0.95%0.52% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 7 loss: 4.12534.0403 grad_norm: 1.96261.9132 memory: 2.15GiB(2.71%)2.71GiB(3.41%) tps: 299,863131,077 tflops: 20.6810.22 mfu: 2.09%1.03% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 8 loss: 4.06453.9796 grad_norm: 1.82991.8154 memory: 2.15GiB(2.71%)2.71GiB(3.41%) tps: 343,855147,955 tflops: 23.7111.53 mfu: 2.40%1.17% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 9 loss: 4.47584.4010 grad_norm: 1.47431.4965 memory: 2.15GiB(2.71%)2.71GiB(3.41%) tps: 346,707139,416 tflops: 23.9110.87 mfu: 2.42%1.10% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 10 loss: 3.94833.8448 grad_norm: 1.62401.6185 memory: 2.15GiB(2.71%)2.71GiB(3.41%) tps: 303,029139,581 tflops: 20.9010.88 mfu: 2.11%1.10% -[rank0]:[titan] TIMESTAMP - root - INFO - Dumping profiler traces at step 10 -[rank0]:[titan] TIMESTAMP - root - INFO - Finished dumping profiler traces in 0.020.04 seconds -[rank0]:[titan] TIMESTAMP - root - INFO - Sleeping 2 seconds for other ranks to complete -[rank0]:[titan] TIMESTAMP - root - INFO - Training completed -[rank0]:[titan] TIMESTAMP - root - INFO - Process group destroyed diff --git a/torchtitan/experiments/transformers_backend/reference_diff_llama3_1gpu.log b/torchtitan/experiments/transformers_backend/reference_diff_llama3_1gpu.log deleted file mode 100644 index 84eff10ff8..0000000000 --- 
a/torchtitan/experiments/transformers_backend/reference_diff_llama3_1gpu.log +++ /dev/null @@ -1,133 +0,0 @@ -diff --git a/tt_run.log.filtered b/hf_run.log.filtered -index 1f72d39..c1856a6 100644 ---- a/tt_run.log.filtered -+++ b/hf_run.log.filtered -@@ -1,125 +1,125 @@ -+ echo '##############################################' -##############################################'#######################################################' -####################################################### -+ echo '### Running TorchTitan (native)with HF backend training ###' -### Running TorchTitan (native)with HF backend training ### -+ echo '##############################################' -##############################################'#######################################################' -####################################################### -+ TT_CONFIG=/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.tomlHF_CONFIG=/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml -+ CUDA_VISIBLE_DEVICES=0CUDA_VISIBLE_DEVICES=1 -+ torchrun ... --master_port=XXXX --rdzv_backend c10d --rdzv_endpoint=localhost:XXXX --local-ranks-filter 0 --role rank --tee 3 -m torchtitan.train --job.config_file /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.toml/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml --training.seed 42 --training.deterministic --model.name llama3meta-llama/Llama-3.2-1B -[rank0]:/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/transformers/src/transformers/utils/hub.py:111: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. -[rank0]: warnings.warn( -[rank0]:[titan] TIMESTAMP - root - WARNING - tokenizer_path is deprecated, use model.hf_assets_path instead. Setting hf_assets_path to tokenizer_path temporarily. -[rank0]:[titan] TIMESTAMP - root - INFO - Starting job: HF Llama 3 debug training -[rank0]:[titan] TIMESTAMP - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config -[rank0]:[titan] TIMESTAMP - root - INFO - Building 0-D device mesh with [], [] -[rank0]:[titan] TIMESTAMP - root - INFO - [GC] Initial GC collection 0.00 seconds -[rank0]:[titan] TIMESTAMP - root - INFO - Deterministic algorithm enabled (expect perf degradation). 
-[rank0]:[titan] TIMESTAMP - root - INFO - Loading tokenizer from tokenizer.json -[rank0]:[titan] TIMESTAMP - root - INFO - Preparing c4_test dataset from /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test -[rank0]:[titan] TIMESTAMP - root - INFO - Building llama3meta-llama/Llama-3.2-1B debugmodel with TransformerModelArgs(_enforced='This field is used to enforce all fields have defaults.', dim=256,HFTransformerModelArgs(dim=256, n_layers=6, n_heads=16, n_kv_heads=None,n_kv_heads=16, vocab_size=2000, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, rope_theta=500000, max_seq_len=2048, depth_init=True, use_flex_attn=False, attn_mask_type='causal', eos_id=0)eos_id=0, attn_implementation='sdpa') -[rank0]:[titan] TIMESTAMP - root - INFO - CUDA capacity: NVIDIA H100 80GB HBM3 with 79.44GiB memory -[rank0]:[titan] TIMESTAMP - root - INFO - Model Structure Parameter Breakdown: -[rank0]:[titan] TIMESTAMP - root - INFO - TransformerHFTransformerModel - 6,139,136 params -[rank0]:[titan] TIMESTAMP - root - INFO - (tok_embeddings):(embed_tokens): Embedding - 512,000 params -[rank0]:[titan] TIMESTAMP - root - INFO - (layers): ModuleDictModuleList - 5,114,880 params -[rank0]:[titan] TIMESTAMP - root - INFO - (0): TransformerBlockLlamaDecoderLayer - 852,480 params -[rank0]:[titan] TIMESTAMP - root - INFO - (attention): Attention(self_attn): LlamaAttention - 262,144 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wq):(q_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wk):(k_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wv):(v_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wo):(o_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (feed_forward): FeedForward(mlp): LlamaMLP - 589,824 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w1):(gate_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w2):(up_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w3):(down_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (attention_norm): RMSNorm(input_layernorm): LlamaRMSNorm - 256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (ffn_norm): RMSNorm(post_attention_layernorm): LlamaRMSNorm - 256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (1): TransformerBlockLlamaDecoderLayer - 852,480 params -[rank0]:[titan] TIMESTAMP - root - INFO - (attention): Attention(self_attn): LlamaAttention - 262,144 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wq):(q_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wk):(k_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wv):(v_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wo):(o_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (feed_forward): FeedForward(mlp): LlamaMLP - 589,824 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w1):(gate_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w2):(up_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w3):(down_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (attention_norm): RMSNorm(input_layernorm): LlamaRMSNorm - 256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (ffn_norm): RMSNorm(post_attention_layernorm): LlamaRMSNorm - 256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (2): TransformerBlockLlamaDecoderLayer - 852,480 params 
-[rank0]:[titan] TIMESTAMP - root - INFO - (attention): Attention(self_attn): LlamaAttention - 262,144 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wq):(q_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wk):(k_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wv):(v_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wo):(o_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (feed_forward): FeedForward(mlp): LlamaMLP - 589,824 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w1):(gate_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w2):(up_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w3):(down_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (attention_norm): RMSNorm(input_layernorm): LlamaRMSNorm - 256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (ffn_norm): RMSNorm(post_attention_layernorm): LlamaRMSNorm - 256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (3): TransformerBlockLlamaDecoderLayer - 852,480 params -[rank0]:[titan] TIMESTAMP - root - INFO - (attention): Attention(self_attn): LlamaAttention - 262,144 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wq):(q_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wk):(k_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wv):(v_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wo):(o_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (feed_forward): FeedForward(mlp): LlamaMLP - 589,824 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w1):(gate_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w2):(up_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w3):(down_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (attention_norm): RMSNorm(input_layernorm): LlamaRMSNorm - 256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (ffn_norm): RMSNorm(post_attention_layernorm): LlamaRMSNorm - 256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (4): TransformerBlockLlamaDecoderLayer - 852,480 params -[rank0]:[titan] TIMESTAMP - root - INFO - (attention): Attention(self_attn): LlamaAttention - 262,144 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wq):(q_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wk):(k_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wv):(v_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wo):(o_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (feed_forward): FeedForward(mlp): LlamaMLP - 589,824 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w1):(gate_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w2):(up_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w3):(down_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (attention_norm): RMSNorm(input_layernorm): LlamaRMSNorm - 256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (ffn_norm): RMSNorm(post_attention_layernorm): LlamaRMSNorm - 256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (5): TransformerBlockLlamaDecoderLayer - 852,480 params -[rank0]:[titan] TIMESTAMP - root - INFO - (attention): Attention(self_attn): LlamaAttention - 262,144 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wq):(q_proj): Linear - 65,536 params 
-[rank0]:[titan] TIMESTAMP - root - INFO - (wk):(k_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wv):(v_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wo):(o_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (feed_forward): FeedForward(mlp): LlamaMLP - 589,824 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w1):(gate_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w2):(up_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w3):(down_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (attention_norm): RMSNorm(input_layernorm): LlamaRMSNorm - 256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (ffn_norm): RMSNorm(post_attention_layernorm): LlamaRMSNorm - 256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (norm): RMSNormLlamaRMSNorm - 256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (output):(lm_head): Linear - 512,000 params -[rank0]:[titan] TIMESTAMP - root - INFO - Model llama3meta-llama/Llama-3.2-1B debugmodel size: 6,139,136 total parameters -[rank0]:[titan] TIMESTAMP - root - INFO - Applied selective activation checkpointing to the model -[rank0]:[titan] TIMESTAMP - root - INFO - Peak FLOPS used for computing MFU: 9.890e+14 -[rank0]:[titan] TIMESTAMP - root - INFO - CUDA memory usage for model: 0.04GiB(0.05%) -[rank0]:[titan] TIMESTAMP - root - WARNING - model.safetensors.index.json not found at hf_assets_path: /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer/model.safetensors.index.json. Defaulting to saving a single safetensors file if checkpoint is saved in HF format -[rank0]:[titan] TIMESTAMP - root - INFO - Mixed precision training is handled by AMP -[rank0]:[titan] TIMESTAMP - root - INFO - Trainer is initialized with local batch size 8, global batch size 8, gradient accumulation steps 1, sequence length 2048, total steps 10 (warmup 2) -[rank0]:[titan] TIMESTAMP - root - INFO - Training starts at step 1 -[rank0]:[titan] TIMESTAMP - root - INFO - Profiling active. 
Traces will be saved at ./outputs/profile_trace./outputs/profile_trace_hf -[rank0]:[titan] TIMESTAMP - root - INFO - step: 1 loss: 7.87237.8704 grad_norm: 1.51671.5185 memory: 1.39GiB(1.75%)1.67GiB(2.10%) tps: 43,37532,685 tflops: 3.102.44 mfu: 0.31%0.25% -[rank0]:[titan] TIMESTAMP - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40 -[rank0]:[titan] TIMESTAMP - root - INFO - step: 2 loss: 7.52467.5209 grad_norm: 1.63591.6373 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 20,83419,798 tflops: 1.491.48 mfu: 0.15% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 3 loss: 6.79006.7789 grad_norm: 2.03452.0390 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 338,323199,161 tflops: 24.1914.85 mfu: 2.45%1.50% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 4 loss: 5.98295.9673 grad_norm: 2.41292.4176 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 362,741207,198 tflops: 25.9415.45 mfu: 2.62%1.56% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 5 loss: 5.05365.0388 grad_norm: 2.53052.5275 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 287,315187,882 tflops: 20.5514.01 mfu: 2.08%1.42% -[rank0]:[titan] TIMESTAMP - root - INFO - Dumping profiler traces at step 5 -[rank0]:[titan] TIMESTAMP - root - INFO - Finished dumping profiler traces in 0.030.04 seconds -[rank0]:[titan] TIMESTAMP - root - INFO - step: 6 loss: 4.63704.6283 grad_norm: 2.28262.2818 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 130,12183,115 tflops: 9.316.20 mfu: 0.94%0.63% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 7 loss: 4.31334.3077 grad_norm: 2.10192.1023 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 295,546174,068 tflops: 21.1312.98 mfu: 2.14%1.31% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 8 loss: 4.13984.1349 grad_norm: 1.93421.9334 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 361,129206,837 tflops: 25.8215.43 mfu: 2.61%1.56% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 9 loss: 4.53264.5289 grad_norm: 1.51111.5103 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 363,728208,233 tflops: 26.0115.53 mfu: 2.63%1.57% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 10 loss: 3.98593.9828 grad_norm: 1.77991.7849 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 294,013188,295 tflops: 21.0314.04 mfu: 2.13%1.42% -[rank0]:[titan] TIMESTAMP - root - INFO - Dumping profiler traces at step 10 -[rank0]:[titan] TIMESTAMP - root - INFO - Finished dumping profiler traces in 0.030.04 seconds -[rank0]:[titan] TIMESTAMP - root - INFO - Sleeping 2 seconds for other ranks to complete -[rank0]:[titan] TIMESTAMP - root - INFO - Training completed -[rank0]:[titan] TIMESTAMP - root - INFO - Process group destroyed diff --git a/torchtitan/experiments/transformers_backend/run_train.sh b/torchtitan/experiments/transformers_backend/run_train.sh deleted file mode 100755 index 6151fcda64..0000000000 --- a/torchtitan/experiments/transformers_backend/run_train.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/bash -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. - -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -set -ex - -# use envs as local overwrites for convenience -# e.g. 
-# BACKEND=tt LOG_RANK=0,1 NGPU=4 ./run_train.sh -NGPU=${NGPU:-"8"} -export LOG_RANK=${LOG_RANK:-0} - -DEBUG_PORT=${DEBUG_PORT:-5678} -# Option to switch between debug and train -MODE=${MODE:-"train"} # Set MODE=debug or MODE=train - -# Option to switch between hf and tt backend -BACKEND=${BACKEND:-"hf"} - -if [ "$BACKEND" = "tt" ]; then - CONFIG_FILE=${CONFIG_FILE:-"/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/models/llama3/train_configs/my_debug_model.toml"} -elif [ "$BACKEND" = "hf" ]; then - CONFIG_FILE=${CONFIG_FILE:-"configs/debug_1_gpu_hf.toml"} -else - echo "Invalid BACKEND set: ${BACKEND}" - exit 1 -fi - -if [ "$MODE" = "debug" ]; then - PYTHON_CMD="debugpy-run -p ${DEBUG_PORT} -m torch.distributed.run --" -else - PYTHON_CMD="torchrun" -fi - -TORCHFT_LIGHTHOUSE=${TORCHFT_LIGHTHOUSE:-"http://localhost:29510"} - -PYTORCH_ALLOC_CONF="expandable_segments:True" \ -TORCHFT_LIGHTHOUSE=${TORCHFT_LIGHTHOUSE} \ -$PYTHON_CMD --nproc_per_node=${NGPU} --rdzv_backend c10d --rdzv_endpoint="localhost:0" \ ---local-ranks-filter ${LOG_RANK} --role rank --tee 3 \ --m torchtitan.train --job.config_file ${CONFIG_FILE} "$@" \ No newline at end of file From c523ede6e930d30a84553b4f2233f8fd0691d1d6 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 29 Sep 2025 14:09:06 +0000 Subject: [PATCH 052/129] cleaner way to make create_causal_mask = None --- .../infra/parallelize_hf_transformers.py | 1 - .../model/hf_llama_patch.py | 69 +------------------ .../model/hf_transformers_args.py | 6 +- 3 files changed, 6 insertions(+), 70 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py index 56d6cf9ca6..469c3407a8 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -192,7 +192,6 @@ def parallelize_hf_transformers( if ( job_config.parallelism.context_parallel_degree > 1 - and model.model_args.use_flex_attn ): logger.warning("CP support for FlexAttention is still in progress.") diff --git a/torchtitan/experiments/transformers_backend/model/hf_llama_patch.py b/torchtitan/experiments/transformers_backend/model/hf_llama_patch.py index ddde904cae..c3557f6973 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_llama_patch.py +++ b/torchtitan/experiments/transformers_backend/model/hf_llama_patch.py @@ -1,11 +1,8 @@ import torch import torch.nn as nn from transformers.models.llama.configuration_llama import LlamaConfig -from transformers.models.llama.modeling_llama import LlamaModel, LlamaAttention, LlamaMLP, LlamaDecoderLayer +from transformers.models.llama.modeling_llama import LlamaAttention, LlamaMLP, LlamaDecoderLayer from transformers.modeling_utils import PreTrainedModel -from transformers.cache_utils import Cache, DynamicCache -from transformers.modeling_outputs import BaseModelOutputWithPast -from typing import Optional _original_llama_decoder_layer_init = LlamaDecoderLayer.__init__ @@ -86,71 +83,7 @@ def _init_weights_patched(self, module): if hasattr(module, "bias") and module.bias is not None: module.bias.data.zero_() -def _patched_forward( - self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Cache] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - cache_position: 
Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - **kwargs, -) -> BaseModelOutputWithPast: - """ - A patched version of LlamaModel.forward that disables the causal mask. - This is a direct copy of the original method with one line changed. - """ - if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError("You must specify exactly one of input_ids or inputs_embeds") - - if inputs_embeds is None: - inputs_embeds: torch.Tensor = self.embed_tokens(input_ids) - - if use_cache and past_key_values is None: - past_key_values = DynamicCache() - - if cache_position is None: - past_seen_tokens = ( - past_key_values.get_seq_length() if past_key_values is not None else 0 - ) - cache_position: torch.Tensor = torch.arange( - past_seen_tokens, - past_seen_tokens + inputs_embeds.shape[1], - device=inputs_embeds.device, - ) - - if position_ids is None: - position_ids = cache_position.unsqueeze(0) - - # --- START OF PATCH --- - # NOTE(3outeille): When TP enabled, the causal_mask will be created based on input_embeds which has sharded seq_len. - # We set it to False so that SDPA is creating the causal mask based on query & key seq_len. - causal_mask = None - # --- END OF PATCH --- - - hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) - - for decoder_layer in self.layers[: self.config.num_hidden_layers]: - hidden_states = decoder_layer( - hidden_states, - attention_mask=causal_mask, - position_ids=position_ids, - past_key_value=past_key_values, - cache_position=cache_position, - position_embeddings=position_embeddings, - **kwargs, - ) - - hidden_states = self.norm(hidden_states) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=past_key_values, - ) - def patch_hf_llama(): - LlamaModel.forward = _patched_forward LlamaDecoderLayer.__init__ = _llama_decoder_layer_init_patched PreTrainedModel._init_weights = _init_weights_patched PreTrainedModel._initialize_weights = _initialize_weights_patched \ No newline at end of file diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index afafddd900..21fe8f1786 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -13,6 +13,8 @@ from torchtitan.tools.logging import logger from transformers import AutoConfig from transformers.configuration_utils import PretrainedConfig +from transformers.modeling_utils import AttentionInterface +from transformers.integrations.sdpa_attention import sdpa_attention_forward @dataclass class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): @@ -47,7 +49,7 @@ def __init__( titan_args, deepseek_v3_args=None, # HuggingFace specific args - attn_implementation: str = "sdpa", + attn_implementation: str = "sdpa_torchtitan", **kwargs, ): assert titan_args is not None, "titan_args is required" @@ -72,6 +74,8 @@ def __init__( # HuggingFace specific args self.attn_implementation = attn_implementation + #NOTE:(3outeille):This will force create_causal_mask to return None + AttentionInterface._global_mapping[attn_implementation] = sdpa_attention_forward # Start with passed_args as just titan_args self._passed_args = {**titan_args.__dict__, "attn_implementation": attn_implementation} From f9f5c66b1ceea0d49de22344078dfeaa724c7dc9 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 29 Sep 2025 14:58:41 +0000 
Subject: [PATCH 053/129] uniformize llama and moe args passing --- .../transformers_backend/__init__.py | 106 +++++++----------- .../compare_distributed_run.py | 10 +- .../model/hf_transformers_args.py | 12 +- 3 files changed, 61 insertions(+), 67 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 6e6894b109..ac0431ec3f 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -20,10 +20,6 @@ from .model.hf_transformers_args import HFTransformerModelArgs, HFTransformerModel from torchtitan.models.moe import MoEArgs -from .model.hf_llama_patch import patch_hf_llama -from .model.hf_deepseek_v3_patch import patch_hf_deepseek_v3 - - __all__ = [ "HFTransformerModelArgs", @@ -74,69 +70,49 @@ class DeepSeekV3Args: mscale: Optional[float] = None partial_rotary_factor: Optional[float] = None -# #TODO(3outeille): identify that if MoE model is used, we add a moe_args field - -if os.environ.get("MODEL_TYPE") == "llama3" or os.environ.get("MODEL_TYPE") == "meta-llama/Llama-3.2-1B": - print("Using llama model") - patch_hf_llama() - flavors = { - "debugmodel": HFTransformerModelArgs( - titan_args=TitanModelArgs( - max_seq_len=2048, - dim=256, - n_layers=6, - n_heads=16, - n_kv_heads=16, - vocab_size=2000, - rope_theta=500000 - ), - ), - "medium": HFTransformerModelArgs( - titan_args=TitanModelArgs( - dim=1024, - n_layers=12, - ), - ), - "full": HFTransformerModelArgs( - titan_args=TitanModelArgs(), +flavors = { + "debugmodel": HFTransformerModelArgs( + titan_args=TitanModelArgs( + vocab_size=2000, + dim=256, + n_layers=6, + n_heads=16, + n_kv_heads=16, ), - } -else: - print("Using deepseek model") - patch_hf_deepseek_v3() - flavors = { - "debugmodel": HFTransformerModelArgs( - titan_args=TitanModelArgs( - vocab_size=2000, - dim=256, - n_layers=2, - n_heads=16, - n_kv_heads=16, - ), - deepseek_v3_args=DeepSeekV3Args( - partial_rotary_factor=4.0, - inter_dim=1024, - moe_inter_dim=256, - n_dense_layers=1, - n_group=2, - topk_group=1, - kv_lora_rank=512, - q_lora_rank=0, - qk_nope_head_dim=128, - qk_rope_head_dim=64, - v_head_dim=128, - mscale=0.70, - moe_args=MoEArgs( - num_experts=8, - num_shared_experts=2, - top_k=3, - score_func="softmax", - route_norm=True, - score_before_experts=False, - ), - ) + deepseek_v3_args=None + # deepseek_v3_args=DeepSeekV3Args( + # partial_rotary_factor=4.0, + # inter_dim=1024, + # moe_inter_dim=256, + # n_dense_layers=1, + # n_group=2, + # topk_group=1, + # kv_lora_rank=512, + # q_lora_rank=0, + # qk_nope_head_dim=128, + # qk_rope_head_dim=64, + # v_head_dim=128, + # mscale=0.70, + # moe_args=MoEArgs( + # num_experts=8, + # num_shared_experts=2, + # top_k=3, + # score_func="softmax", + # route_norm=True, + # score_before_experts=False, + # ), + # ) + ), + "medium": HFTransformerModelArgs( + titan_args=TitanModelArgs( + dim=1024, + n_layers=12, ), - } + ), + "full": HFTransformerModelArgs( + titan_args=TitanModelArgs(), + ), +} hf_train_spec = TrainSpec( name="hf_auto_model", diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.py b/torchtitan/experiments/transformers_backend/compare_distributed_run.py index cc8f54f51b..3211326caf 100644 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.py +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.py @@ -511,7 +511,6 @@ def run_training(self, config_file: Path, log_file: Path, config_name: 
str, mode ] env = os.environ.copy() env["SEED"] = str(self.seed) - env["MODEL_TYPE"] = model_name env["LOG_RANK"] = str(self.ngpu - 1) log_message(LogLevel.COMMAND, f"Command: {' '.join(cmd)}", indent=indent, dim=dim) @@ -788,6 +787,15 @@ def run(self) -> int: if not self.compare_metrics( tt_baseline_metrics, hf_baseline_metrics, "baseline (TT) vs baseline (HF)", indent=0 ): + # generate diff between baseline TT and baseline HF + diff_file_tt_baseline_vs_hf_baseline = ( + self.results_dir / "diff_tt_baseline_vs_hf_baseline.log" + ) + self.generate_diff( + baseline_log_tt, baseline_log_hf, diff_file_tt_baseline_vs_hf_baseline, indent=0 + ) + log_message(LogLevel.INFO, f"Diff between baseline TT and baseline HF saved to: {diff_file_tt_baseline_vs_hf_baseline}", indent=0) + raise ValueError( f"Baseline (TT) vs baseline (HF) metrics comparison failed for {tt_model_name}" ) diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 21fe8f1786..2b9cec5678 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -15,6 +15,8 @@ from transformers.configuration_utils import PretrainedConfig from transformers.modeling_utils import AttentionInterface from transformers.integrations.sdpa_attention import sdpa_attention_forward +from torchtitan.experiments.transformers_backend.model.hf_llama_patch import patch_hf_llama +from torchtitan.experiments.transformers_backend.model.hf_deepseek_v3_patch import patch_hf_deepseek_v3 @dataclass class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): @@ -81,7 +83,7 @@ def __init__( self._passed_args = {**titan_args.__dict__, "attn_implementation": attn_implementation} self._passed_args.update(kwargs) - # If DeepSeekV3 args are provided, fill the rest + #NOTE(3outeille): Wait for transformers uniformization of MoE args if deepseek_v3_args is not None: # For DeepSeekV3, setting q_lora_rank to 0 in TorchTitan is equivalent to # setting it to None in HuggingFace. @@ -285,6 +287,14 @@ def __init__(self, model_args: HFTransformerModelArgs): f"Could not find model class '{model_class_name}' in globals or transformers. " f"Make sure the class is available. 
Original error: {e}" ) + + if model_args.architectures[0] == "DeepseekV3Model": + print("Patching deepseek") + patch_hf_deepseek_v3() + else: + print("Patching llama") + patch_hf_llama() + self.model = model_cls(config=model_args) for layer in self.model.model.layers: From 5a875b66a0947f87053369a2b565f731e11777be Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 29 Sep 2025 15:24:10 +0000 Subject: [PATCH 054/129] cleaning code --- .../model/hf_deepseek_v3_patch.py | 29 +---------- .../model/hf_transformers_args.py | 25 +--------- torchtitan/models/deepseek_v3/__init__.py | 2 +- torchtitan/models/deepseek_v3/model/args.py | 16 ------ torchtitan/models/deepseek_v3/model/model.py | 32 +----------- torchtitan/models/llama3/infra/parallelize.py | 1 + torchtitan/models/llama3/model/args.py | 16 ------ torchtitan/models/moe.py | 31 +----------- torchtitan/train.py | 11 +---- torchtitan/utils/test_utils.py | 49 +++++++++++++++++++ 10 files changed, 58 insertions(+), 154 deletions(-) create mode 100644 torchtitan/utils/test_utils.py diff --git a/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py b/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py index 68594dc2be..c2cb960ac5 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py +++ b/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py @@ -1,38 +1,13 @@ import os -import torch import torch.nn as nn -import functools +from torchtitan.utils.test_utils import seeded_init_decorator_for_test from transformers.models.deepseek_v3.configuration_deepseek_v3 import DeepseekV3Config from transformers.models.deepseek_v3.modeling_deepseek_v3 import DeepseekV3Attention, DeepseekV3MLP, DeepseekV3MoE, DeepseekV3DecoderLayer from transformers.modeling_utils import PreTrainedModel -_original_deepseek_v3_decoder_layer_init = DeepseekV3DecoderLayer.__init__ - -def seeded_init_decorator_for_test(seed): - """ - Decorator that adds torch.manual_seed before every nn.init.trunc_normal_ call - and prints layer weights after initialization. 
- """ - import lovely_tensors as lt; lt.monkey_patch() - def decorator(func): - @functools.wraps(func) - def wrapper(self, module): - original_trunc_normal = nn.init.trunc_normal_ - def seeded_trunc_normal(*args, **kwargs): - torch.manual_seed(seed) - tensor = args[0] # First argument is always the tensor - result = original_trunc_normal(*args, **kwargs) - # module_name = getattr(module, "__class__", type(module)).__name__ - # print(f"Module: {module_name}, Tensor value: {tensor}") - return result - - nn.init.trunc_normal_ = seeded_trunc_normal - return func(self, module) - - return wrapper - return decorator +_original_deepseek_v3_decoder_layer_init = DeepseekV3DecoderLayer.__init__ def _deepseek_v3_decoder_layer_init_patched(self, config: DeepseekV3Config, layer_idx: int): _original_deepseek_v3_decoder_layer_init(self, config, layer_idx) diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 2b9cec5678..917d50a43f 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -12,6 +12,7 @@ from torchtitan.protocols import BaseModelArgs from torchtitan.tools.logging import logger from transformers import AutoConfig +from transformers.utils import is_torch_deterministic from transformers.configuration_utils import PretrainedConfig from transformers.modeling_utils import AttentionInterface from transformers.integrations.sdpa_attention import sdpa_attention_forward @@ -247,30 +248,6 @@ def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, in return nparams, num_flops_per_token - def debug_structure_param(self, model: nn.Module): - logger.info("Model Structure Parameter Breakdown:") - - def _format_module(module: nn.Module, prefix: str = ""): - for name, sub_module in module.named_children(): - sub_module_params = sum(p.numel() for p in sub_module.parameters()) - if sub_module_params == 0: - continue - - # For HF models, we want to "unwrap" the ".model" attribute - # to get a view comparable to the native TorchTitan models. 
- if name == "model": - _format_module(sub_module, prefix) - else: - logger.info( - f"{prefix}({name}): {sub_module.__class__.__name__} - {sub_module_params:,} params" - ) - _format_module(sub_module, prefix + " ") - - total_params = sum(p.numel() for p in model.parameters()) - logger.info(f"{model.__class__.__name__} - {total_params:,} params") - _format_module(model, " ") - - class HFTransformerModel(nn.Module): def __init__(self, model_args: HFTransformerModelArgs): super().__init__() diff --git a/torchtitan/models/deepseek_v3/__init__.py b/torchtitan/models/deepseek_v3/__init__.py index 3322ad0a83..1c3d2b19d2 100644 --- a/torchtitan/models/deepseek_v3/__init__.py +++ b/torchtitan/models/deepseek_v3/__init__.py @@ -35,7 +35,7 @@ dim=256, inter_dim=1024, moe_inter_dim=256, - n_layers=2, + n_layers=3, n_dense_layers=1, n_heads=16, moe_args=MoEArgs( diff --git a/torchtitan/models/deepseek_v3/model/args.py b/torchtitan/models/deepseek_v3/model/args.py index 9451f01b01..d6afedfa34 100644 --- a/torchtitan/models/deepseek_v3/model/args.py +++ b/torchtitan/models/deepseek_v3/model/args.py @@ -159,19 +159,3 @@ def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, in ) return nparams, num_flops_per_token - - def debug_structure_param(self, model: nn.Module): - logger.info("Model Structure Parameter Breakdown:") - - def _format_module(module: nn.Module, prefix: str = ""): - for name, sub_module in module.named_children(): - sub_module_params = sum(p.numel() for p in sub_module.parameters()) - if sub_module_params > 0: - logger.info( - f"{prefix}({name}): {sub_module.__class__.__name__} - {sub_module_params:,} params" - ) - _format_module(sub_module, prefix + " ") - - total_params = sum(p.numel() for p in model.parameters()) - logger.info(f"{model.__class__.__name__} - {total_params:,} params") - _format_module(model, " ") diff --git a/torchtitan/models/deepseek_v3/model/model.py b/torchtitan/models/deepseek_v3/model/model.py index 5547840e27..260c7bf49a 100644 --- a/torchtitan/models/deepseek_v3/model/model.py +++ b/torchtitan/models/deepseek_v3/model/model.py @@ -6,7 +6,6 @@ import math import os -import functools from typing import Tuple import torch @@ -15,39 +14,10 @@ from torchtitan.models.attention import build_attention from torchtitan.models.moe import FeedForward, MoE from torchtitan.protocols.train_spec import ModelProtocol - +from torchtitan.utils.test_utils import seeded_init_decorator_for_test from .args import DeepSeekV3ModelArgs -def seeded_init_decorator_for_test(seed): - """ - Decorator that adds torch.manual_seed before every nn.init.trunc_normal_ call - and prints layer weights after initialization. 
- """ - import lovely_tensors as lt; lt.monkey_patch() - def decorator(func): - @functools.wraps(func) - def wrapper(*args, **kwargs): - original_trunc_normal = nn.init.trunc_normal_ - - def seeded_trunc_normal(*trunc_args, **trunc_kwargs): - torch.manual_seed(seed) - tensor = trunc_args[0] # First argument is always the tensor - result = original_trunc_normal(*trunc_args, **trunc_kwargs) - # # Try to get module info from the calling context - # module_name = "Unknown" - # if len(args) > 0 and hasattr(args[0], "__class__"): - # module_name = args[0].__class__.__name__ - # print(f"Module: {module_name}, Tensor value: {tensor}") - return result - - nn.init.trunc_normal_ = seeded_trunc_normal - return func(*args, **kwargs) - - return wrapper - return decorator - - # Adapted from https://github.com/DeepSeek-ai/DeepSeek-V3/blob/main/inference/model.py#L294 def precompute_freqs_cis(args: DeepSeekV3ModelArgs) -> torch.Tensor: """ diff --git a/torchtitan/models/llama3/infra/parallelize.py b/torchtitan/models/llama3/infra/parallelize.py index 1a2528be6d..7d0b5de92b 100644 --- a/torchtitan/models/llama3/infra/parallelize.py +++ b/torchtitan/models/llama3/infra/parallelize.py @@ -34,6 +34,7 @@ from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp from torchtitan.tools.logging import logger + def parallelize_llama( model: nn.Module, parallel_dims: ParallelDims, diff --git a/torchtitan/models/llama3/model/args.py b/torchtitan/models/llama3/model/args.py index 5aaf3839ed..e2f698f8b1 100644 --- a/torchtitan/models/llama3/model/args.py +++ b/torchtitan/models/llama3/model/args.py @@ -75,19 +75,3 @@ def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, in num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t return nparams, num_flops_per_token - - def debug_structure_param(self, model: nn.Module): - logger.info("Model Structure Parameter Breakdown:") - - def _format_module(module: nn.Module, prefix: str = ""): - for name, sub_module in module.named_children(): - sub_module_params = sum(p.numel() for p in sub_module.parameters()) - if sub_module_params > 0: - logger.info( - f"{prefix}({name}): {sub_module.__class__.__name__} - {sub_module_params:,} params" - ) - _format_module(sub_module, prefix + " ") - - total_params = sum(p.numel() for p in model.parameters()) - logger.info(f"{model.__class__.__name__} - {total_params:,} params") - _format_module(model, " ") \ No newline at end of file diff --git a/torchtitan/models/moe.py b/torchtitan/models/moe.py index 5ba63b9157..e2e3981625 100644 --- a/torchtitan/models/moe.py +++ b/torchtitan/models/moe.py @@ -13,36 +13,7 @@ from torchtitan.distributed.expert_parallel import expert_parallel import os -import functools - - -def seeded_init_decorator_for_test(seed): - """ - Decorator that adds torch.manual_seed before every nn.init.trunc_normal_ call - and prints layer weights after initialization. 
- """ - import lovely_tensors as lt; lt.monkey_patch() - def decorator(func): - @functools.wraps(func) - def wrapper(*args, **kwargs): - original_trunc_normal = nn.init.trunc_normal_ - - def seeded_trunc_normal(*trunc_args, **trunc_kwargs): - torch.manual_seed(seed) - tensor = trunc_args[0] # First argument is always the tensor - result = original_trunc_normal(*trunc_args, **trunc_kwargs) - # # Try to get module info from the calling context - # module_name = "Unknown" - # if len(args) > 0 and hasattr(args[0], "__class__"): - # module_name = args[0].__class__.__name__ - # print(f"Module: {module_name}, Tensor value: {tensor}") - return result - - nn.init.trunc_normal_ = seeded_trunc_normal - return func(*args, **kwargs) - - return wrapper - return decorator +from torchtitan.utils.test_utils import seeded_init_decorator_for_test @dataclass class MoEArgs: diff --git a/torchtitan/train.py b/torchtitan/train.py index 735180ee5a..6fee3d587f 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -8,12 +8,11 @@ import os import time from datetime import timedelta -from transformers.utils import is_torch_deterministic from typing import Any, Generator, Iterable, Optional import torch from torch.distributed.elastic.multiprocessing.errors import record - +from torchtitan.utils.test_utils import debug_structure_param import torchtitan.protocols.train_spec as train_spec_module from torchtitan.components.checkpoint import CheckpointManager from torchtitan.components.dataloader import DataloaderExhaustedError @@ -178,7 +177,7 @@ def __init__(self, job_config: JobConfig): self.metrics_processor.num_flops_per_token, ) = model_args.get_nparams_and_flops(model, job_config.training.seq_len) - model_args.debug_structure_param(model) + debug_structure_param(model) logger.info( f"{color.blue}Model {self.train_spec.name} {job_config.model.flavor} " @@ -250,9 +249,6 @@ def __init__(self, job_config: JobConfig): del model for m in self.model_parts: - if is_torch_deterministic(): - # Otherwise, HF register buffer for ROPE (inv_freq) and this will be by default be initialized to Nan - torch.utils.deterministic.fill_uninitialized_memory = False m.to_empty(device=init_device) with torch.no_grad(): m.init_weights(buffer_device=buffer_device) @@ -263,9 +259,6 @@ def __init__(self, job_config: JobConfig): else: # apply PT-D Tensor Parallel, activation checkpointing, torch.compile, Data Parallel model = self.train_spec.parallelize_fn(model, parallel_dims, job_config) - if is_torch_deterministic(): - # Otherwise, HF register buffer for ROPE (inv_freq) and this will be by default be initialized to Nan - torch.utils.deterministic.fill_uninitialized_memory = False model.to_empty(device=init_device) with torch.no_grad(): model.init_weights(buffer_device=buffer_device) diff --git a/torchtitan/utils/test_utils.py b/torchtitan/utils/test_utils.py new file mode 100644 index 0000000000..77db8bcfe6 --- /dev/null +++ b/torchtitan/utils/test_utils.py @@ -0,0 +1,49 @@ +import torch +import functools +import torch.nn as nn +from torchtitan.tools.logging import logger +from transformers.utils import is_torch_deterministic +import lovely_tensors as lt; lt.monkey_patch() + +def debug_structure_param(model: nn.Module): + """Print a breakdown of model parameters by module structure.""" + logger.info("Model Structure Parameter Breakdown:") + + if is_torch_deterministic(): + # Otherwise, HF register buffer for ROPE (inv_freq) and this will be by default be initialized to Nan + torch.utils.deterministic.fill_uninitialized_memory = False 
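A minimal usage sketch of the helper being added in torchtitan/utils/test_utils.py (illustrative only — TinyBlock and its init_weights are hypothetical and not part of this patch): wrapping an init routine with seeded_init_decorator_for_test swaps nn.init.trunc_normal_ for a version that reseeds the RNG on every call, so the TorchTitan and HF backends draw identical weights regardless of how many tensors were initialized earlier.

    import torch
    import torch.nn as nn
    from torchtitan.utils.test_utils import seeded_init_decorator_for_test

    class TinyBlock(nn.Module):  # hypothetical module, for illustration only
        def __init__(self, dim: int = 256):
            super().__init__()
            self.proj = nn.Linear(dim, dim, bias=False)

        @seeded_init_decorator_for_test(seed=42)
        def init_weights(self):
            # Inside the wrapped call, torch.manual_seed(42) runs before sampling,
            # making this draw reproducible across backends.
            nn.init.trunc_normal_(self.proj.weight, mean=0.0, std=0.02)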
+ + def _format_module(module: nn.Module, prefix: str = ""): + for name, sub_module in module.named_children(): + sub_module_params = sum(p.numel() for p in sub_module.parameters()) + if sub_module_params > 0: + logger.info( + f"{prefix}({name}): {sub_module.__class__.__name__} - {sub_module_params:,} params" + ) + _format_module(sub_module, prefix + " ") + + total_params = sum(p.numel() for p in model.parameters()) + logger.info(f"{model.__class__.__name__} - {total_params:,} params") + _format_module(model, " ") + +def seeded_init_decorator_for_test(seed): + """ + Decorator that adds torch.manual_seed before every nn.init.trunc_normal_ call + and prints layer weights after initialization. + """ + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + original_trunc_normal = nn.init.trunc_normal_ + + def seeded_trunc_normal(*trunc_args, **trunc_kwargs): + torch.manual_seed(seed) + tensor = trunc_args[0] # First argument is always the tensor + result = original_trunc_normal(*trunc_args, **trunc_kwargs) + return result + + nn.init.trunc_normal_ = seeded_trunc_normal + return func(*args, **kwargs) + + return wrapper + return decorator From e4d963c5bfff9b66cc3c1569447cfcd8381cc4df Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 30 Sep 2025 13:40:02 +0000 Subject: [PATCH 055/129] fix same global_batch_size across training + fix float32 for test (even for fsdp) --- .../compare_distributed_run.py | 45 +++++++++++++++---- .../configs/test_template.toml | 5 ++- 2 files changed, 40 insertions(+), 10 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.py b/torchtitan/experiments/transformers_backend/compare_distributed_run.py index 3211326caf..345dc91d33 100644 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.py +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.py @@ -11,6 +11,7 @@ - train the nd-// TT counterpart - diff between TT nd-// and HF nd-// - diff between TT FSDP (baseline) and HF nd-// + - diff between TT FSDP (baseline) and TF nd-// results/ |_ meta-llama |_ Llama-3.2-1B @@ -668,6 +669,32 @@ def _compare_one_parallelism_config( indent=indent + 5, dim=True, ) + + # generated diff between baseline TT and current tt nd-parallelism run + diff_file_tt_baseline_vs_tt_nd_parallelism = ( + test_dir / "diff_tt_baseline_vs_tt_nd_parallelism.log" + ) + self.generate_diff( + baseline_log_tt, + log_path_tt, + diff_file_tt_baseline_vs_tt_nd_parallelism, + indent=indent + 5, + dim=True, + ) + if tt_metrics: + self.compare_metrics( + tt_baseline_metrics, + tt_metrics, + f"{config.name} (TT baseline vs TT nd-parallel)", + indent=indent + 5, + dim=True, + ) + log_message( + LogLevel.INFO, + f"Diff between baseline TT and current (TT) nd-parallelism run saved to: {diff_file_tt_baseline_vs_tt_nd_parallelism}", + indent=indent + 5, + dim=True, + ) return False def run(self) -> int: @@ -784,18 +811,18 @@ def run(self) -> int: if not tt_baseline_metrics.loss or not tt_baseline_metrics.grad_norm: raise ValueError(f"Could not extract TorchTitan baseline metrics for {tt_model_name}") + # generate diff between baseline TT and baseline HF + diff_file_tt_baseline_vs_hf_baseline = ( + self.results_dir / "diff_tt_baseline_vs_hf_baseline.log" + ) + self.generate_diff( + baseline_log_tt, baseline_log_hf, diff_file_tt_baseline_vs_hf_baseline, indent=0 + ) + log_message(LogLevel.INFO, f"Diff between baseline TT and baseline HF saved to: {diff_file_tt_baseline_vs_hf_baseline}", indent=0) + if not 
self.compare_metrics( tt_baseline_metrics, hf_baseline_metrics, "baseline (TT) vs baseline (HF)", indent=0 ): - # generate diff between baseline TT and baseline HF - diff_file_tt_baseline_vs_hf_baseline = ( - self.results_dir / "diff_tt_baseline_vs_hf_baseline.log" - ) - self.generate_diff( - baseline_log_tt, baseline_log_hf, diff_file_tt_baseline_vs_hf_baseline, indent=0 - ) - log_message(LogLevel.INFO, f"Diff between baseline TT and baseline HF saved to: {diff_file_tt_baseline_vs_hf_baseline}", indent=0) - raise ValueError( f"Baseline (TT) vs baseline (HF) metrics comparison failed for {tt_model_name}" ) diff --git a/torchtitan/experiments/transformers_backend/configs/test_template.toml b/torchtitan/experiments/transformers_backend/configs/test_template.toml index f56a0332d7..238f325ba2 100644 --- a/torchtitan/experiments/transformers_backend/configs/test_template.toml +++ b/torchtitan/experiments/transformers_backend/configs/test_template.toml @@ -39,12 +39,15 @@ decay_type = "linear" min_lr_factor = 0.0 [training] -local_batch_size = 8 +global_batch_size = 4 +local_batch_size = 2 seq_len = 2048 max_norm = 1.0 # grad norm clipping steps = 10 dataset = "c4_test" # supported datasets: c4_test (2K), c4 (177M) dataset_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test" +mixed_precision_param = "float32" # force float32 for comparison +mixed_precision_reduce = "float32" [parallelism] data_parallel_replicate_degree = 1 From 957cc4a90007e1822430e435acad8456f6104b49 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 30 Sep 2025 15:29:12 +0000 Subject: [PATCH 056/129] refactor compare_distributed_run to make it slurm compatible --- .../compare_distributed_run.py | 281 ++++++++++-------- 1 file changed, 158 insertions(+), 123 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.py b/torchtitan/experiments/transformers_backend/compare_distributed_run.py index 345dc91d33..8ec761fda2 100644 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.py +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.py @@ -514,7 +514,7 @@ def run_training(self, config_file: Path, log_file: Path, config_name: str, mode env["SEED"] = str(self.seed) env["LOG_RANK"] = str(self.ngpu - 1) - log_message(LogLevel.COMMAND, f"Command: {' '.join(cmd)}", indent=indent, dim=dim) + log_message(LogLevel.COMMAND, f"{' '.join(cmd)}", indent=indent, dim=dim) try: # Capture output to include it in the exception, while still writing to log file @@ -565,137 +565,134 @@ def _compare_one_parallelism_config( indent: int = 0, ) -> bool: """Compares a single parallelism configuration against the baseline.""" - # Create a subdirectory for each test configuration + # New flow: launch all training, then all diff, then all extract/compare metrics + + # --- 1. 
Setup directories and config files --- test_dir_name = f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface" test_dir = self.results_dir / test_dir_name test_dir.mkdir(exist_ok=True) config_filename_hf = f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.toml" - config_file_hf = self.generate_config(config_dir=test_dir, config=config, model_name=hf_model_name, backend="huggingface", filename=config_filename_hf, indent=indent) + config_file_hf = self.generate_config( + config_dir=test_dir, + config=config, + model_name=hf_model_name, + backend="huggingface", + filename=config_filename_hf, + indent=indent, + ) log_path_hf = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.log" - hf_run_error = self.run_training(config_file=config_file_hf, log_file=log_path_hf, config_name=config.name, model_name=hf_model_name, indent=indent) - - test_passed = True - hf_metrics = None - if hf_run_error: - log_message(LogLevel.TEST_FAIL, f"{config.name} (huggingface) - Training script failed.", indent=indent + 5, dim=True) - test_passed = False - else: - # Compare metrics only if training was successful - hf_metrics = self.extract_metrics(log_path_hf, indent=indent) - if not self.compare_metrics(hf_baseline_metrics, hf_metrics, f"{config.name} (huggingface)", indent=indent + 5, dim=True): - test_passed = False + config_filename_tt = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" + config_file_tt = self.generate_config( + config_dir=test_dir, + config=config, + model_name=tt_model_name, + backend="torchtitan", + filename=config_filename_tt, + indent=indent + 5, + dim=True, + ) + log_path_tt = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.log" + + # --- 2. Launch all training (HF and TT) --- + hf_run_error = self.run_training( + config_file=config_file_hf, + log_file=log_path_hf, + config_name=config.name, + model_name=hf_model_name, + indent=indent, + ) + tt_run_error = self.run_training( + config_file=config_file_tt, + log_file=log_path_tt, + config_name=config.name, + model_name=tt_model_name, + indent=indent + 5, + dim=True, + ) - if test_passed: - return True - else: - # Generate diff with baseline (HF) - diff_hf_baseline_vs_hf_nd_parallelism = ( - test_dir / "diff_hf_baseline_vs_hf_nd_parallelism.log" - ) - self.generate_diff( - baseline_log_hf, log_path_hf, diff_hf_baseline_vs_hf_nd_parallelism, indent=indent + 5, dim=True - ) + # If either training failed, log and skip further steps for this config + if hf_run_error: log_message( - LogLevel.INFO, - f"Diff between baseline (HF) and current (HF) nd-parallelism run saved to: {diff_hf_baseline_vs_hf_nd_parallelism}", + LogLevel.TEST_FAIL, + f"{config.name} (huggingface) - Training script failed.", indent=indent + 5, dim=True, ) + return False - # Run TT counterpart and generated diff between nd-paralellism TT and current hf nd-parallelism run - config_filename_tt = ( - test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" - ) - config_file_tt = self.generate_config(config_dir=test_dir, config=config, model_name=tt_model_name, backend="torchtitan", filename=config_filename_tt, indent=indent + 5, dim=True) - log_path_tt = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.log" - tt_run_error = self.run_training(config_file=config_file_tt, log_file=log_path_tt, config_name=config.name, model_name=tt_model_name, indent=indent + 5, dim=True) - if tt_run_error: - raise ValueError( - f"TorchTitan training failed for {tt_model_name}" - ) from tt_run_error - 
- tt_metrics = self.extract_metrics(log_path_tt, indent=indent + 5, dim=True) - - # generated diff between nd-paralellism TT and current hf nd-parallelism run - diff_file_tt_nd_parallelism_vs_hf_nd_parallelism = ( - test_dir / "diff_tt_nd_parallelism_vs_hf_nd_parallelism.log" - ) - self.generate_diff( - log_path_tt, - log_path_hf, - diff_file_tt_nd_parallelism_vs_hf_nd_parallelism, + if tt_run_error: + log_message( + LogLevel.TEST_FAIL, + f"{config.name} (torchtitan) - Training script failed.", indent=indent + 5, dim=True, ) - if hf_metrics: - self.compare_metrics( + return False + + # --- 3. Generate all diffs --- + list_of_diffs = { + "HF baseline vs HF nd-parallel": (baseline_log_hf, log_path_hf, test_dir / "diff_hf_baseline_vs_hf_nd_parallelism.log"), + "TT nd-parallel vs HF nd-parallel": (log_path_tt, log_path_hf, test_dir / "diff_tt_nd_parallelism_vs_hf_nd_parallelism.log"), + "TT baseline vs HF nd-parallel": (baseline_log_tt, log_path_hf, test_dir / "diff_tt_baseline_vs_hf_nd_parallelism.log"), + "TT baseline vs TT nd-parallel": (baseline_log_tt, log_path_tt, test_dir / "diff_tt_baseline_vs_tt_nd_parallelism.log"), + } + for src, dst, output in list_of_diffs.values(): + self.generate_diff(src, dst, output, indent=indent + 5, dim=True) + + # --- 4. Extract all metrics --- + hf_metrics = self.extract_metrics(log_path_hf, indent=indent) + tt_metrics = self.extract_metrics(log_path_tt, indent=indent + 5, dim=True) + + # --- 5. Compare metrics and determine pass/fail --- + test_passed = True + + for diff_name, (src, dst, output) in list_of_diffs.items(): + if "TT nd-parallel vs HF nd-parallel" == diff_name: + metrics_passed = self.compare_metrics( tt_metrics, hf_metrics, - f"{config.name} (TT nd-parallel vs HF nd-parallel)", + diff_name, indent=indent + 5, dim=True, ) - log_message( - LogLevel.INFO, - f"Diff between nd-paralellism TT and current (HF) nd-parallelism run saved to: {diff_file_tt_nd_parallelism_vs_hf_nd_parallelism}", - indent=indent + 5, - dim=True, - ) - - # generated diff between baseline TT and current hf nd-parallelism run - diff_file_tt_baseline_vs_hf_nd_parallelism = ( - test_dir / "diff_tt_baseline_vs_hf_nd_parallelism.log" - ) - self.generate_diff( - baseline_log_tt, - log_path_hf, - diff_file_tt_baseline_vs_hf_nd_parallelism, - indent=indent + 5, - dim=True, - ) - if hf_metrics: - self.compare_metrics( + elif "TT baseline vs TT nd-parallel" == diff_name: + metrics_passed = self.compare_metrics( tt_baseline_metrics, - hf_metrics, - f"{config.name} (TT baseline vs HF nd-parallel)", + tt_metrics, + diff_name, indent=indent + 5, dim=True, ) - log_message( - LogLevel.INFO, - f"Diff between baseline TT and current (HF) nd-parallelism run saved to: {diff_file_tt_baseline_vs_hf_nd_parallelism}", - indent=indent + 5, - dim=True, - ) - - # generated diff between baseline TT and current tt nd-parallelism run - diff_file_tt_baseline_vs_tt_nd_parallelism = ( - test_dir / "diff_tt_baseline_vs_tt_nd_parallelism.log" - ) - self.generate_diff( - baseline_log_tt, - log_path_tt, - diff_file_tt_baseline_vs_tt_nd_parallelism, - indent=indent + 5, - dim=True, - ) - if tt_metrics: - self.compare_metrics( + elif "TT baseline vs HF nd-parallel" == diff_name: + metrics_passed = self.compare_metrics( tt_baseline_metrics, - tt_metrics, - f"{config.name} (TT baseline vs TT nd-parallel)", + hf_metrics, + diff_name, indent=indent + 5, dim=True, ) + else: # HF baseline vs HF nd-parallel == diff_name + metrics_passed = self.compare_metrics( + hf_baseline_metrics, + hf_metrics, + diff_name, + 
indent=indent + 5, + dim=True, + ) + + if not metrics_passed: + test_passed = False + log_message( LogLevel.INFO, - f"Diff between baseline TT and current (TT) nd-parallelism run saved to: {diff_file_tt_baseline_vs_tt_nd_parallelism}", - indent=indent + 5, + f"Diff between {diff_name} saved to: {output}", + indent=indent + 10, dim=True, ) - return False + + return test_passed def run(self) -> int: """Main execution function. Runs all test suites for all models.""" @@ -788,44 +785,82 @@ def run(self) -> int: ) baseline_config = next((c for c in self.parallelism_configs if c.name == "fsdp"), None) - + # --- 1. Generate configs --- baseline_config_filename_hf = f"baseline_{baseline_config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.toml" - baseline_config_file_hf = self.generate_config(config_dir=self.results_dir, config=baseline_config, model_name=hf_model_name, backend="huggingface", filename=baseline_config_filename_hf, indent=0) + baseline_config_file_hf = self.generate_config( + config_dir=self.results_dir, + config=baseline_config, + model_name=hf_model_name, + backend="huggingface", + filename=baseline_config_filename_hf, + indent=0 + ) baseline_log_hf = self.results_dir / f"baseline_hf_{baseline_config.name}_{self.ngpu}gpu.log" - hf_baseline_run_error = self.run_training(config_file=baseline_config_file_hf, log_file=baseline_log_hf, config_name=baseline_config.name, model_name=hf_model_name, indent=0) + + baseline_config_filename_tt = f"baseline_{baseline_config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" + baseline_config_file_tt = self.generate_config( + config_dir=self.results_dir, + config=baseline_config, + model_name=tt_model_name, + backend="torchtitan", + filename=baseline_config_filename_tt, + indent=0 + ) + baseline_log_tt = self.results_dir / f"baseline_tt_{baseline_config.name}_{self.ngpu}gpu.log" + + # --- 2. Launch all training --- + hf_baseline_run_error = self.run_training( + config_file=baseline_config_file_hf, + log_file=baseline_log_hf, + config_name=baseline_config.name, + model_name=hf_model_name, + indent=0 + ) if hf_baseline_run_error: raise ValueError(f"Huggingface baseline (FSDP) training failed for {hf_model_name}") from hf_baseline_run_error + tt_baseline_run_error = self.run_training( + config_file=baseline_config_file_tt, + log_file=baseline_log_tt, + config_name=baseline_config.name, + model_name=tt_model_name, + indent=0 + ) + if tt_baseline_run_error: + raise ValueError(f"TorchTitan baseline (FSDP) training failed for {tt_model_name}") from tt_baseline_run_error + + # --- 3. Generate diff --- + diff_file_tt_baseline_vs_hf_baseline = self.results_dir / "diff_tt_baseline_vs_hf_baseline.log" + self.generate_diff( + baseline_log_tt, + baseline_log_hf, + diff_file_tt_baseline_vs_hf_baseline, + indent=0 + ) + log_message( + LogLevel.INFO, + f"Diff between baseline TT and baseline HF saved to: {diff_file_tt_baseline_vs_hf_baseline}", + indent=5, + dim=True + ) + + # --- 4. 
Extract metrics --- hf_baseline_metrics = self.extract_metrics(baseline_log_hf, indent=0) if not hf_baseline_metrics.loss or not hf_baseline_metrics.grad_norm: raise ValueError(f"Could not extract huggingface baseline metrics for {hf_model_name}") - baseline_config_filename_tt = f"baseline_{baseline_config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" - baseline_config_file_tt = self.generate_config(config_dir=self.results_dir, config=baseline_config, model_name=tt_model_name, backend="torchtitan", filename=baseline_config_filename_tt, indent=0) - baseline_log_tt = self.results_dir / f"baseline_tt_{baseline_config.name}_{self.ngpu}gpu.log" - tt_baseline_run_error = self.run_training(config_file=baseline_config_file_tt, log_file=baseline_log_tt, config_name=baseline_config.name, model_name=tt_model_name, indent=0) - if tt_baseline_run_error: - raise ValueError(f"TorchTitan baseline (FSDP) training failed for {tt_model_name}") from tt_baseline_run_error - tt_baseline_metrics = self.extract_metrics(baseline_log_tt, indent=0) if not tt_baseline_metrics.loss or not tt_baseline_metrics.grad_norm: raise ValueError(f"Could not extract TorchTitan baseline metrics for {tt_model_name}") - - # generate diff between baseline TT and baseline HF - diff_file_tt_baseline_vs_hf_baseline = ( - self.results_dir / "diff_tt_baseline_vs_hf_baseline.log" - ) - self.generate_diff( - baseline_log_tt, baseline_log_hf, diff_file_tt_baseline_vs_hf_baseline, indent=0 - ) - log_message(LogLevel.INFO, f"Diff between baseline TT and baseline HF saved to: {diff_file_tt_baseline_vs_hf_baseline}", indent=0) - + + # --- 5. Compare metrics --- if not self.compare_metrics( - tt_baseline_metrics, hf_baseline_metrics, "baseline (TT) vs baseline (HF)", indent=0 + tt_baseline_metrics, + hf_baseline_metrics, + "baseline (TT) vs baseline (HF)", + indent=5 ): - raise ValueError( - f"Baseline (TT) vs baseline (HF) metrics comparison failed for {tt_model_name}" - ) + raise ValueError(f"Baseline (TT) vs baseline (HF) metrics comparison failed for {tt_model_name}") console.print() console.print( From a317c53dcddc0e0685c63bd2e21af0cfa13631c1 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 1 Oct 2025 12:31:03 +0000 Subject: [PATCH 057/129] breaking test --- .../compare_distributed_run.py | 206 ++++++++++++++---- 1 file changed, 169 insertions(+), 37 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.py b/torchtitan/experiments/transformers_backend/compare_distributed_run.py index 8ec761fda2..b42e8b0138 100644 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.py +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.py @@ -21,7 +21,7 @@ |_ baseline_tt_fsdp_4gpu.log |_ baseline_fsdp_debugmodel_4gpu_huggingface.toml |_ baseline_fsdp_debugmodel_4gpu_torchtitan.toml - |_ fsdp1_cp1_tp2_pp2_debugmodel_4gpu_huggingface/ + |_ fsdp1_cp1_tp2_pp2_debugmodel_4gpu/ |_ fsdp1_cp1_tp2_pp2_debugmodel_4gpu_huggingface.toml |_ fsdp1_cp1_tp2_pp2_debugmodel_4gpu_torchtitan.toml |_ fsdp1_cp1_tp2_pp2_debugmodel_4gpu_huggingface.log @@ -33,7 +33,7 @@ |_ baseline_tt_fsdp_4gpu.log |_ baseline_fsdp_full_4gpu_huggingface.toml |_ baseline_fsdp_full_4gpu_torchtitan.toml - |_ fsdp1_cp1_tp2_pp2_full_4gpu_huggingface/ + |_ fsdp1_cp1_tp2_pp2_full_4gpu/ |_ fsdp1_cp1_tp2_pp2_full_4gpu_huggingface.toml |_ fsdp1_cp1_tp2_pp2_full_4gpu_torchtitan.toml |_ fsdp1_cp1_tp2_pp2_full_4gpu_huggingface.log @@ -494,7 +494,7 @@ def _filter_log(log_file: Path) -> Path: except Exception as e: 
log_message(LogLevel.WARNING, f"Could not generate diff: {e}", indent=indent, dim=dim) - def run_training(self, config_file: Path, log_file: Path, config_name: str, model_name: str, indent: int = 0, dim: bool = False) -> Optional[subprocess.CalledProcessError]: + def run_training_local(self, config_file: Path, log_file: Path, config_name: str, model_name: str, indent: int = 0, dim: bool = False) -> Optional[subprocess.CalledProcessError]: """Run training with given configuration.""" log_message(LogLevel.INFO, f"Running training: {config_name} with model {model_name}", indent=indent, dim=dim) cmd = [ @@ -553,6 +553,9 @@ def run_training(self, config_file: Path, log_file: Path, config_name: str, mode e.add_note(f"\n--- Full output from failed process ---\n{e.stdout or ''}") return e + def run_training_slurm(self): + pass + def _compare_one_parallelism_config( self, config: "ParallelismConfig", @@ -568,7 +571,7 @@ def _compare_one_parallelism_config( # New flow: launch all training, then all diff, then all extract/compare metrics # --- 1. Setup directories and config files --- - test_dir_name = f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface" + test_dir_name = f"{config.name}_{self.flavor}_{self.ngpu}gpu" test_dir = self.results_dir / test_dir_name test_dir.mkdir(exist_ok=True) @@ -596,14 +599,14 @@ def _compare_one_parallelism_config( log_path_tt = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.log" # --- 2. Launch all training (HF and TT) --- - hf_run_error = self.run_training( + hf_run_error = self.run_training_local( config_file=config_file_hf, log_file=log_path_hf, config_name=config.name, model_name=hf_model_name, indent=indent, ) - tt_run_error = self.run_training( + tt_run_error = self.run_training_local( config_file=config_file_tt, log_file=log_path_tt, config_name=config.name, @@ -694,35 +697,8 @@ def _compare_one_parallelism_config( return test_passed - def run(self) -> int: + def run_local(self, args: argparse.Namespace) -> int: """Main execution function. Runs all test suites for all models.""" - parser = argparse.ArgumentParser( - description="Test different parallelism configurations against a baseline FSDP model.", - ) - parser.add_argument("-m", "--model-filter", default="", - help="Filter models by name pattern (e.g., 'llama3')") - parser.add_argument("-t", "--test-filter", default="", - help="Filter parallelism configurations by name pattern (e.g., 'fsdp1_cp1_tp2_pp2')") - parser.add_argument("-nd", "--nd_parallel", type=str, default="2d", - help=f"Parallelism to use (default: {self.ND_PARALLEL_TO_NB_GPUS.keys()})") - parser.add_argument("-s", "--steps", type=int, default=self.DEFAULT_STEPS, - help=f"Training steps (default: {self.DEFAULT_STEPS})") - parser.add_argument("--flavor", default=self.DEFAULT_FLAVOR, - help=f"Model flavor/size (default: {self.DEFAULT_FLAVOR}). 
" - f"Available: llama3=[debugmodel, medium, full], deepseek_v3=[debugmodel]") - parser.add_argument("-v", "--verbose", action="store_true", - help="Verbose output") - parser.add_argument("--loss-atol", type=float, default=self.DEFAULT_LOSS_ATOL, - help=f"Absolute tolerance for loss comparison (default: {self.DEFAULT_LOSS_ATOL})") - parser.add_argument("--loss-rtol", type=float, default=self.DEFAULT_LOSS_RTOL, - help=f"Relative tolerance for loss comparison (default: {self.DEFAULT_LOSS_RTOL})") - parser.add_argument("--grad-norm-atol", type=float, default=self.DEFAULT_GRAD_NORM_ATOL, - help=f"Absolute tolerance for grad norm comparison (default: {self.DEFAULT_GRAD_NORM_ATOL})") - parser.add_argument("--grad-norm-rtol", type=float, default=self.DEFAULT_GRAD_NORM_RTOL, - help=f"Relative tolerance for grad norm comparison (default: {self.DEFAULT_GRAD_NORM_RTOL})") - - args = parser.parse_args() - self.nd_parallel = args.nd_parallel self.ngpu = self.nd_parallel_to_nb_gpus[self.nd_parallel] self.steps = args.steps @@ -809,7 +785,7 @@ def run(self) -> int: baseline_log_tt = self.results_dir / f"baseline_tt_{baseline_config.name}_{self.ngpu}gpu.log" # --- 2. Launch all training --- - hf_baseline_run_error = self.run_training( + hf_baseline_run_error = self.run_training_local( config_file=baseline_config_file_hf, log_file=baseline_log_hf, config_name=baseline_config.name, @@ -819,7 +795,7 @@ def run(self) -> int: if hf_baseline_run_error: raise ValueError(f"Huggingface baseline (FSDP) training failed for {hf_model_name}") from hf_baseline_run_error - tt_baseline_run_error = self.run_training( + tt_baseline_run_error = self.run_training_local( config_file=baseline_config_file_tt, log_file=baseline_log_tt, config_name=baseline_config.name, @@ -960,12 +936,168 @@ def run(self) -> int: LogLevel.INFO, f"Check the diff files in {self.results_dir} for details" ) return 1 + + def run_slurm(self, args: argparse.Namespace) -> int: + """Main execution function. 
Runs all test suites for all models.""" + self.nd_parallel = args.nd_parallel + self.ngpu = self.nd_parallel_to_nb_gpus[self.nd_parallel] + self.steps = args.steps + self.model_filter = args.model_filter + self.test_filter = args.test_filter + self.flavor = args.flavor + self.verbose = args.verbose + self.loss_atol = args.loss_atol + self.loss_rtol = args.loss_rtol + self.grad_norm_atol = args.grad_norm_atol + self.grad_norm_rtol = args.grad_norm_rtol + + console.print( + Panel( + ( + f"[bold]GPUs:[/bold] {self.ngpu}\n" + f"[bold]Steps:[/bold] {self.steps}\n" + f"[bold]Seed:[/bold] {self.seed}\n" + f"[bold]Model filter:[/bold] {self.model_filter or 'all'}\n" + f"[bold]Test filter:[/bold] {self.test_filter or 'all'}\n" + f"[bold]Model flavor:[/bold] {self.flavor}" + ), + title="[bold cyan]Distributed Parallelism Comparison[/bold cyan]", + expand=False, + border_style="blue", + padding=(1, 2), + ) + ) + console.print() + + self.base_results_dir.mkdir(exist_ok=True) + + # TODO(3outeille): make it more generic later + if self.model_filter == "llama3": + hf_model_name = "meta-llama/Llama-3.2-1B" + tt_model_name = "llama3" + elif self.model_filter == "deepseek_v3": + hf_model_name = "deepseek-ai/DeepSeek-V3" + tt_model_name = "deepseek_v3" + else: + raise ValueError(f"Model filter {self.model_filter} not supported") + + self.generate_parallelism_configs(hf_model_name) + + model_owner, model_repo = hf_model_name.split("/", 1) + nd_parallel_upper = self.nd_parallel.upper() + self.results_dir = self.base_results_dir / model_owner / model_repo / nd_parallel_upper / self.flavor + self.results_dir.mkdir(parents=True, exist_ok=True) + + if self.verbose: + log_message(LogLevel.INFO, f"Results directory: {self.results_dir}") + + console.print( + Panel( + "[bold cyan]Comparing baseline (FSDP) for huggingface & torchtitan[/bold cyan]", + expand=False, + border_style="blue", + padding=(0, 2), + ) + ) + + # --- 1. 
Generate configs --- + + L = [] + + for config in self.parallelism_configs: + + config_dir = self.results_dir if config.name == "fsdp" else self.results_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu" + config_dir.mkdir(exist_ok=True) + + config_filename_hf = f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.toml" + config_file_hf = self.generate_config( + config_dir=config_dir, + config=config, + model_name=hf_model_name, + backend="huggingface", + filename=config_filename_hf, + indent=0 + ) + config_filename_tt = f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" + config_file_tt = self.generate_config( + config_dir=config_dir, + config=config, + model_name=tt_model_name, + backend="torchtitan", + filename=config_filename_tt, + indent=0 + ) + log_path_hf = config_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.log" + log_path_tt = config_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.log" + L.append((config_file_hf, config_file_tt, log_path_hf, log_path_tt)) + + + # Launch slurm training + jobs = [] + from slurm_utils import Job, Status + for config_file_hf, config_file_tt, log_path_hf, log_path_tt in L: + job_hf = Job(config_file_hf, log_path_hf, qos="high") + job_tt = Job(config_file_tt, log_path_tt, qos="high") + + job_tt.set_status(Status.INIT) + job_hf.set_status(Status.INIT) + jobs.append(job_hf) + jobs.append(job_tt) + + scheduler = Scheduler() + + scheduler.create_slurm_script(jobs) + # submit in subprocess + scheduler.submit_jobs(jobs) # -> job.set_status(Status.PENDING) + + scheduler.wait_for_all_jobs_to_complete() # spawn tmux to monitor jobs + #NOTE(3outeille): run_slurm() should not be run if + + def run_tests_slurm(self, args: argparse.Namespace) -> int: + # TODO(3outeille): do diff + compare metrics + pass def main(): """Entry point for the script.""" + parser = argparse.ArgumentParser( + description="Test different parallelism configurations against a baseline FSDP model.", + ) + parser.add_argument("--use_slurm", action="store_true", + help="Use SLURM for job submission") + parser.add_argument("--run_tests_slurm", action="store_true", + help="Run tests with SLURM") + parser.add_argument("-m", "--model-filter", default="", + help="Filter models by name pattern (e.g., 'llama3')") + parser.add_argument("-t", "--test-filter", default="", + help="Filter parallelism configurations by name pattern (e.g., 'fsdp1_cp1_tp2_pp2')") + parser.add_argument("-nd", "--nd_parallel", type=str, default="2d", + help=f"Parallelism to use (default: {CompareDistributedRun.ND_PARALLEL_TO_NB_GPUS.keys()})") + parser.add_argument("-s", "--steps", type=int, default=CompareDistributedRun.DEFAULT_STEPS, + help=f"Training steps (default: {CompareDistributedRun.DEFAULT_STEPS})") + parser.add_argument("--flavor", default=CompareDistributedRun.DEFAULT_FLAVOR, + help=f"Model flavor/size (default: {CompareDistributedRun.DEFAULT_FLAVOR}). 
" + f"Available: llama3=[debugmodel, medium, full], deepseek_v3=[debugmodel]") + parser.add_argument("-v", "--verbose", action="store_true", + help="Verbose output") + parser.add_argument("--loss-atol", type=float, default=CompareDistributedRun.DEFAULT_LOSS_ATOL, + help=f"Absolute tolerance for loss comparison (default: {CompareDistributedRun.DEFAULT_LOSS_ATOL})") + parser.add_argument("--loss-rtol", type=float, default=CompareDistributedRun.DEFAULT_LOSS_RTOL, + help=f"Relative tolerance for loss comparison (default: {CompareDistributedRun.DEFAULT_LOSS_RTOL})") + parser.add_argument("--grad-norm-atol", type=float, default=CompareDistributedRun.DEFAULT_GRAD_NORM_ATOL, + help=f"Absolute tolerance for grad norm comparison (default: {CompareDistributedRun.DEFAULT_GRAD_NORM_ATOL})") + parser.add_argument("--grad-norm-rtol", type=float, default=CompareDistributedRun.DEFAULT_GRAD_NORM_RTOL, + help=f"Relative tolerance for grad norm comparison (default: {CompareDistributedRun.DEFAULT_GRAD_NORM_RTOL})") + + args = parser.parse_args() + runner = CompareDistributedRun() - return runner.run() + if args.use_slurm: + return runner.run_slurm(args) + elif args.run_tests_slurm: + return runner.run_tests_slurm(args) + else: + return runner.run_local(args) if __name__ == "__main__": sys.exit(main()) From d2f80a213564bf64d86bb9c48c446d95a7bb4692 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Sat, 4 Oct 2025 10:41:44 +0000 Subject: [PATCH 058/129] refactor test --- .../configs/template.slurm | 54 +++ .../configs/test_template.toml | 5 +- .../test_hf_integration.py | 340 ++++++++++++++++++ 3 files changed, 397 insertions(+), 2 deletions(-) create mode 100644 torchtitan/experiments/transformers_backend/configs/template.slurm create mode 100644 torchtitan/experiments/transformers_backend/test_hf_integration.py diff --git a/torchtitan/experiments/transformers_backend/configs/template.slurm b/torchtitan/experiments/transformers_backend/configs/template.slurm new file mode 100644 index 0000000000..3d4d5d587d --- /dev/null +++ b/torchtitan/experiments/transformers_backend/configs/template.slurm @@ -0,0 +1,54 @@ +#!/bin/bash +#SBATCH --job-name={{ name }} +#SBATCH --output={{ root_path }}/slurm_%j.out +#SBATCH --error={{ root_path }}/slurm_%j.out +#SBATCH --nodes={{ nodes }} +#SBATCH --ntasks-per-node={{ n_proc_per_node }} +#SBATCH --gpus-per-task=1 +#SBATCH --qos={{ qos }} +#SBATCH --cpus-per-task=12 + +# Misc initializations. 
+echo "========================" +echo "START TIME: $(date)" +source /etc/profile.d/modules.sh +source /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/env_torchtitan_official/bin/activate +echo python3 version = $(python3 --version) +echo "===========" + +# Slurm stuff +export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") +export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) +export MASTER_PORT=$((1024 + RANDOM % 64511)) + +export TMPDIR=/scratch +export TORCH_HOME="/fsx/ferdinandmom/cache/torch" +export HF_HOME="/fsx/ferdinandmom/cache/huggingface" +export HF_DATASETS_CACHE="/fsx/ferdinandmom/cache/huggingface/datasets" +export TRANSFORMERS_CACHE="/fsx/ferdinandmom/cache/huggingface/transformers" +export CUBLAS_WORKSPACE_CONFIG=":4096:8" +export CUDA_DEVICE_MAX_CONNECTIONS="1" +export UV_CACHE_DIR="/fsx/ferdinandmom/.cache/uv" + +module load cuda/12.4 + +echo "Running training job: {{ name }}" +echo "Config file: {{ config_path }}" + +{% if name == "seed_checkpoint" %} +python /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/scripts/download_hf_assets.py --repo_id {{ repo_id }} --local_dir {{ root_path }} --assets tokenizer +{% endif %} + +torchrun \ + --nproc_per_node {{ n_proc_per_node }} \ + --nnodes {{ nodes }} \ + --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ + --rdzv_backend c10d \ + --max_restarts 0 \ + --tee 3 \ + -m torchtitan.train \ + --checkpoint.enable \ + {% if name == "seed_checkpoint" %} --checkpoint.create_seed_checkpoint {% else %} --checkpoint.initial_load_path {{ initial_load_path }} {% endif %} \ + --training.seed 42 \ + --training.deterministic \ + --job.config_file {{ config_path }} diff --git a/torchtitan/experiments/transformers_backend/configs/test_template.toml b/torchtitan/experiments/transformers_backend/configs/test_template.toml index 238f325ba2..8521b351a6 100644 --- a/torchtitan/experiments/transformers_backend/configs/test_template.toml +++ b/torchtitan/experiments/transformers_backend/configs/test_template.toml @@ -4,7 +4,7 @@ dump_folder = "./outputs" description = "Llama 3 debug training" print_args = false -use_for_integration_test = true +use_for_integration_test = false [profiling] enable_profiling = true @@ -24,7 +24,8 @@ enable_wandb = false name = "llama3" flavor = "debugmodel" # test folder with tokenizer.json, for debug purpose only -hf_assets_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" +#hf_assets_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" +hf_assets_path = "" # converters = ["float8"] [optimizer] diff --git a/torchtitan/experiments/transformers_backend/test_hf_integration.py b/torchtitan/experiments/transformers_backend/test_hf_integration.py new file mode 100644 index 0000000000..ef645eaac7 --- /dev/null +++ b/torchtitan/experiments/transformers_backend/test_hf_integration.py @@ -0,0 +1,340 @@ +import toml +from argparse import ArgumentParser +from pathlib import Path +import re +import os +import subprocess +from enum import Enum +from jinja2 import Template + + +def _create_slurm_script( + config: dict, + config_path: Path, + script_path: Path, + job_name: str, + initial_load_path: str = None, + repo_id: str = None, +): + with open(config_path, "r") as file: + config = toml.load(file) + + pp = config["parallelism"]["pipeline_parallel_degree"] + dp = config["parallelism"]["data_parallel_shard_degree"] + tp = config["parallelism"]["tensor_parallel_degree"] + cp = 
config["parallelism"]["context_parallel_degree"] + world_size = pp * dp * tp * cp + + nodes = max(1, world_size // 8) + n_proc_per_node = min(8, world_size // nodes) + + print(f"world_size: {world_size}, nodes: {nodes}, n_proc_per_node: {n_proc_per_node}") + + # Read the SLURM script template from the file + template_path = Path(__file__).parent / "configs/template.slurm" + with open(template_path, "r") as f: + slurm_script_template = f.read() + base_bench_template = Template(slurm_script_template) + + context_bench = { + "name": job_name, + "nodes": nodes, + "n_proc_per_node": n_proc_per_node, + "root_path": script_path.parent, + "config_path": config_path, + "initial_load_path": initial_load_path, + "repo_id": repo_id, + "qos": "high" if nodes > 1 else "normal", # Example logic for qos + } + + with open(script_path, "w") as file: + file.write(base_bench_template.render(context_bench)) + + print(f"Slurm script created at {script_path}") + + +def create_configs(model_name: str, out_dir: str, flavor: str): + """ + results/ + |_ meta-llama + |_ Llama-3.2-1B + |_ debugmodel/ + |_ seed_checkpoint/ + |_ config.toml + |_ seed.slurm + |_ step-0/ + |_ .... + |_baseline_fsdp2/ + |_ config.toml + |_ nd_parallelism.slurm + |_ nd_parallelism.log + |_ fsdp2_tp2_cp1_pp1/ + |_ config.toml + |_ nd_parallelism.slurm + |_ nd_parallelism.log + |_ diff_baseline_vs_nd_parallelism.log + |_ fsdp2_tp1_cp1_pp2/ + |_ config.toml + |_ nd_parallelism.slurm + |_ nd_parallelism.log + |_ diff_baseline_vs_nd_parallelism.log + |_ fsdp2_tp1_cp2_pp1/ + |_ config.toml + |_ nd_parallelism.slurm + |_ nd_parallelism.log + |_ diff_baseline_vs_nd_parallelism.log + |_ fsdp2_tp1_cp2_pp2/ + |_ config.toml + |_ nd_parallelism.slurm + |_ nd_parallelism.log + |_ diff_baseline_vs_nd_parallelism.log + |_ fsdp2_tp2_cp2_pp1/ + |_ config.toml + |_ nd_parallelism.slurm + |_ nd_parallelism.log + |_ diff_baseline_vs_nd_parallelism.log + |_ fsdp2_tp2_cp2_pp2/ + |_ config.toml + |_ nd_parallelism.slurm + |_ nd_parallelism.log + |_ diff_baseline_vs_nd_parallelism.log` + |_ full/ + ... 
+ |_ llama3 #torchtitan model + """ + + base_config = "configs/test_template.toml" + with open(base_config, "r") as f: + config = toml.load(f) + + config["model"]["name"] = model_name + config["model"]["flavor"] = flavor + + parallelism_configs = [ + "fsdp2_tp1_cp1_pp1", # baseline + "fsdp2_tp2_cp1_pp1", + "fsdp2_tp1_cp1_pp2", + "fsdp2_tp1_cp2_pp1", + "fsdp2_tp1_cp2_pp2", + "fsdp2_tp2_cp2_pp1", + "fsdp2_tp2_cp2_pp2", + ] + + out_path = Path(out_dir) / model_name / flavor + out_path.mkdir(parents=True, exist_ok=True) + + # Create seed checkpoint + seed_config = toml.loads(toml.dumps(config)) + seed_config["parallelism"]["data_parallel_shard_degree"] = 1 + seed_config["parallelism"]["tensor_parallel_degree"] = 1 + seed_config["parallelism"]["pipeline_parallel_degree"] = 1 + seed_config["parallelism"]["context_parallel_degree"] = 1 + seed_checkpoint_dir = out_path / "seed_checkpoint" + seed_checkpoint_dir.mkdir(exist_ok=True) + seed_config["model"]["hf_assets_path"] = str(seed_checkpoint_dir / Path(model_name).name) + seed_config["model"]["tokenizer_path"] = str(seed_checkpoint_dir / Path(model_name).name) + seed_config_path = seed_checkpoint_dir / "config.toml" + with open(seed_config_path, "w") as f: + toml.dump(seed_config, f) + print(f"Created {seed_config_path}") + _create_slurm_script( + seed_config, + seed_config_path, + seed_checkpoint_dir / "seed.slurm", + "seed_checkpoint", + repo_id=model_name, + ) + + # Create parallelism configs + for pc in parallelism_configs: + iter_config = toml.loads(toml.dumps(config)) + + m = re.match(r"fsdp(\d+)_tp(\d+)_cp(\d+)_pp(\d+)", pc) + if not m: + print(f"Skipping invalid config string: {pc}") + continue + + fsdp, tp, cp, pp = map(int, m.groups()) + + pc_dir = out_path / pc + pc_dir.mkdir(exist_ok=True) + + iter_config["parallelism"]["data_parallel_shard_degree"] = fsdp + iter_config["parallelism"]["tensor_parallel_degree"] = tp + iter_config["parallelism"]["context_parallel_degree"] = cp + iter_config["parallelism"]["pipeline_parallel_degree"] = pp + iter_config["parallelism"]["pipeline_parallel_schedule"] = "1F1B" + iter_config["model"]["hf_assets_path"] = str(seed_checkpoint_dir / Path(model_name).name) + + config_path = pc_dir / "config.toml" + with open(config_path, "w") as f: + toml.dump(iter_config, f) + print(f"Created {config_path}") + _create_slurm_script( + iter_config, + config_path, + pc_dir / "nd_parallelism.slurm", + pc, + initial_load_path=str(seed_checkpoint_dir / "step-0"), + repo_id=model_name, + ) + +class Status(Enum): + # INIT -> PENDING -> [RUNNING | FAIL] -> COMPLETED + INIT = "init" # Job is created + PENDING = "pending" # Job is waiting for ressources + RUNNING = "running" # Job is running + FAIL = "fail" # Job failed + COMPLETED = "completed" # Job is completed + +class Job: + def __init__(self, root_path: str, qos: str) -> None: + self.root_path = root_path + self.name = os.path.basename(root_path) + if self.name == os.path.basename(os.path.normpath(args.inp_dir)): + self.name = "baseline_fsdp2" + self.config = os.path.join(root_path, "baseline_fsdp2_config.toml") + self.slurm_script = os.path.join(root_path, "baseline_fsdp2.slurm") + else: + self.config = os.path.join(root_path, "config.toml") + self.slurm_script = os.path.join(root_path, "nd_parallelism.slurm") + + self.qos = qos + + # Check if the status.txt file exists + status_file_path = os.path.join(self.root_path, "status.txt") + if not os.path.exists(status_file_path): + # Create the status.txt file with INIT status + with open(status_file_path, "w") as f: + 
f.write(Status.INIT.value) + self.status = self.get_status() + + def get_status(self) -> Status: + """ + Read the status of the job from `status.txt` and return it + """ + is_existing = lambda value_to_check: any( + value.value == value_to_check for value in Status.__members__.values() + ) + + status_file_path = os.path.join(self.root_path, "status.txt") + with open(status_file_path, "r") as f: + status = f.read().strip() + if not is_existing(status): + raise ValueError(f"Invalid status: {status}") + return Status(status) + + def set_status(self, status: Status) -> Status: + """ + Update the status of the job in `status.txt` and return the new status + """ + status_file_path = os.path.join(self.root_path, "status.txt") + with open(status_file_path, "w") as f: + f.write(status.value) + return status + +class Scheduler: + def __init__(self, inp_dir: str, qos: str) -> None: + # Find all leaf directories, and the top-level directory if it contains a config. + jobs_directory_paths = [] + for root, dirs, files in os.walk(inp_dir): + is_job_dir = any(f.endswith(".toml") for f in files) + if is_job_dir: + if not dirs: # leaf node + jobs_directory_paths.append(os.path.abspath(root)) + # also capture baseline job in root + elif root == inp_dir: + jobs_directory_paths.append(os.path.abspath(root)) + + self.job_lists = [Job(job_path, qos) for job_path in jobs_directory_paths] + + def keep_only_jobs(self, status: Status): + return [job for job in self.job_lists if job.status == status] + + def filter_out_jobs(self, status: Status): + return [job for job in self.job_lists if job.status != status] + + def check_status(self): + status_files = [os.path.join(job.root_path, "status.txt") for job in self.job_lists] + + status_counts = {status.value: 0 for status in Status} + + for status_file in status_files: + with open(status_file, "r") as f: + status = f.read().strip() + if status in status_counts: + status_counts[status] += 1 + else: + raise ValueError(f"Invalid status: {status}") + + total = sum(status_counts.values()) + + print(f"{'Status':<10} | {'Count':<6}") + print(f"{'-'*10}-|-{'-'*6}") + for status, count in status_counts.items(): + print(f"{status.capitalize():<10} | {count:<6}") + + print(f"{'-'*10}-|-{'-'*6}") + print(f"{'Total':<10} | {total:<6}") + + +def submit_jobs(inp_dir, qos, only: str = None, seed_checkpoint: str = None): + scheduler = Scheduler(inp_dir, qos) + + env_vars = os.environ.copy() + total_jobs = len(scheduler.job_lists) + + if only: + try: + status_to_filter = Status(only) + scheduler.job_lists = scheduler.keep_only_jobs(status_to_filter) + except ValueError: + print(f"Invalid status for --only: {only}") + return + + if only is not None: + filtered_jobs = len(scheduler.job_lists) + if filtered_jobs == 0: + print(f"No '{only}' jobs to resubmit") + return + print( + f"Only {filtered_jobs}/{total_jobs} jobs with status '{only}' will be resubmitted" + ) + + scheduler.job_lists = scheduler.filter_out_jobs(Status.COMPLETED) + + for job in scheduler.job_lists: + subprocess.run(["sbatch", job.slurm_script], env=env_vars) + job.set_status(Status.PENDING) + + +def report(inp_dir: str): + scheduler = Scheduler(inp_dir, qos="N/A") + scheduler.check_status() + + +if __name__ == "__main__": + parser = ArgumentParser() + subparsers = parser.add_subparsers(dest="action") + + create_configs_parser = subparsers.add_parser("create_configs") + create_configs_parser.add_argument("--model_name", type=str, required=True) + create_configs_parser.add_argument("--out_dir", type=str, required=True) + 
create_configs_parser.add_argument("--flavor", type=str, required=True) + submit_jobs_parser = subparsers.add_parser("submit_jobs") + submit_jobs_parser.add_argument("--inp_dir", type=str, required=True) + submit_jobs_parser.add_argument("--seed_checkpoint", type=str, default=None) + submit_jobs_parser.add_argument("--qos", type=str, required=True, choices=["low", "normal", "high", "prod"]) + submit_jobs_parser.add_argument("--only", type=str, default=None, choices=[s.value for s in Status]) + + report_parser = subparsers.add_parser("report") + report_parser.add_argument("--inp_dir", type=str, required=True) + + args = parser.parse_args() + + if args.action == "create_configs": + create_configs(args.model_name, args.out_dir, args.flavor) + elif args.action == "submit_jobs": + submit_jobs(args.inp_dir, args.qos, args.only, args.seed_checkpoint) + elif args.action == "report": + report(args.inp_dir) \ No newline at end of file From 6454e40a93ef8a41b39b81893b2a7f4081e7ed03 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Sun, 5 Oct 2025 20:40:04 +0000 Subject: [PATCH 059/129] fix running job to slurm --- .../transformers_backend/configs/template.slurm | 6 ++---- .../transformers_backend/configs/test_template.toml | 3 +-- .../transformers_backend/test_hf_integration.py | 7 +++---- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/configs/template.slurm b/torchtitan/experiments/transformers_backend/configs/template.slurm index 3d4d5d587d..31016c37f2 100644 --- a/torchtitan/experiments/transformers_backend/configs/template.slurm +++ b/torchtitan/experiments/transformers_backend/configs/template.slurm @@ -35,16 +35,14 @@ module load cuda/12.4 echo "Running training job: {{ name }}" echo "Config file: {{ config_path }}" -{% if name == "seed_checkpoint" %} -python /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/scripts/download_hf_assets.py --repo_id {{ repo_id }} --local_dir {{ root_path }} --assets tokenizer -{% endif %} - torchrun \ --nproc_per_node {{ n_proc_per_node }} \ --nnodes {{ nodes }} \ --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ --rdzv_backend c10d \ --max_restarts 0 \ + --local-ranks-filter {{ n_proc_per_node - 1 }} \ + --role rank \ --tee 3 \ -m torchtitan.train \ --checkpoint.enable \ diff --git a/torchtitan/experiments/transformers_backend/configs/test_template.toml b/torchtitan/experiments/transformers_backend/configs/test_template.toml index 8521b351a6..fa0c763ed7 100644 --- a/torchtitan/experiments/transformers_backend/configs/test_template.toml +++ b/torchtitan/experiments/transformers_backend/configs/test_template.toml @@ -24,8 +24,7 @@ enable_wandb = false name = "llama3" flavor = "debugmodel" # test folder with tokenizer.json, for debug purpose only -#hf_assets_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" -hf_assets_path = "" +hf_assets_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" # converters = ["float8"] [optimizer] diff --git a/torchtitan/experiments/transformers_backend/test_hf_integration.py b/torchtitan/experiments/transformers_backend/test_hf_integration.py index ef645eaac7..0e0dbe3e78 100644 --- a/torchtitan/experiments/transformers_backend/test_hf_integration.py +++ b/torchtitan/experiments/transformers_backend/test_hf_integration.py @@ -131,8 +131,7 @@ def create_configs(model_name: str, out_dir: str, flavor: str): seed_config["parallelism"]["context_parallel_degree"] = 1 seed_checkpoint_dir = out_path / 
"seed_checkpoint" seed_checkpoint_dir.mkdir(exist_ok=True) - seed_config["model"]["hf_assets_path"] = str(seed_checkpoint_dir / Path(model_name).name) - seed_config["model"]["tokenizer_path"] = str(seed_checkpoint_dir / Path(model_name).name) + seed_config["job"]["dump_folder"] = str(seed_checkpoint_dir) seed_config_path = seed_checkpoint_dir / "config.toml" with open(seed_config_path, "w") as f: toml.dump(seed_config, f) @@ -164,7 +163,7 @@ def create_configs(model_name: str, out_dir: str, flavor: str): iter_config["parallelism"]["context_parallel_degree"] = cp iter_config["parallelism"]["pipeline_parallel_degree"] = pp iter_config["parallelism"]["pipeline_parallel_schedule"] = "1F1B" - iter_config["model"]["hf_assets_path"] = str(seed_checkpoint_dir / Path(model_name).name) + iter_config["job"]["dump_folder"] = str(pc_dir) config_path = pc_dir / "config.toml" with open(config_path, "w") as f: @@ -175,7 +174,7 @@ def create_configs(model_name: str, out_dir: str, flavor: str): config_path, pc_dir / "nd_parallelism.slurm", pc, - initial_load_path=str(seed_checkpoint_dir / "step-0"), + initial_load_path=str(seed_checkpoint_dir / "checkpoint/step-0"), repo_id=model_name, ) From b99a4d2c082ffd44e2c949c929229276a2f3f778 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Sun, 5 Oct 2025 21:22:34 +0000 Subject: [PATCH 060/129] finally have a better testing xp with slurm --- .../test_hf_integration.py | 491 ++++++++++++++++-- 1 file changed, 443 insertions(+), 48 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/test_hf_integration.py b/torchtitan/experiments/transformers_backend/test_hf_integration.py index 0e0dbe3e78..38c4982319 100644 --- a/torchtitan/experiments/transformers_backend/test_hf_integration.py +++ b/torchtitan/experiments/transformers_backend/test_hf_integration.py @@ -6,6 +6,54 @@ import subprocess from enum import Enum from jinja2 import Template +from rich.console import Console +from rich.panel import Panel +from rich.table import Table +from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn + +console = Console() + +class LogLevel(Enum): + INFO = "INFO" + SUCCESS = "SUCCESS" + WARNING = "WARNING" + ERROR = "ERROR" + TEST_PASS = "TEST_PASS" + TEST_FAIL = "TEST_FAIL" + +def log_message(level: LogLevel, message: str, indent: int = 0, dim: bool = False) -> None: + """Log a message with appropriate color coding.""" + style_map = { + LogLevel.INFO: "blue", + LogLevel.SUCCESS: "green", + LogLevel.WARNING: "yellow", + LogLevel.ERROR: "bold red", + LogLevel.TEST_PASS: "green", + LogLevel.TEST_FAIL: "bold red", + } + + prefix_map = { + LogLevel.INFO: "[INFO]", + LogLevel.SUCCESS: "[SUCCESS]", + LogLevel.WARNING: "[WARNING]", + LogLevel.ERROR: "[ERROR]", + LogLevel.TEST_PASS: "✅ TEST PASS", + LogLevel.TEST_FAIL: "❌ TEST FAIL", + } + + style = style_map[level] + prefix = prefix_map[level] + if indent > 0: + indent_str = " " * (indent - 1) + "└─ " + else: + indent_str = "" + + output = f"{indent_str}[{style}]{prefix}[/] {message}" + + if dim: + console.print(f"[dim]{output}[/dim]") + else: + console.print(output) def _create_slurm_script( @@ -64,7 +112,7 @@ def create_configs(model_name: str, out_dir: str, flavor: str): |_ seed.slurm |_ step-0/ |_ .... 
- |_baseline_fsdp2/ + |_ fsdp2_tp1_cp1_pp1/ |_ config.toml |_ nd_parallelism.slurm |_ nd_parallelism.log @@ -84,16 +132,6 @@ def create_configs(model_name: str, out_dir: str, flavor: str): |_ nd_parallelism.log |_ diff_baseline_vs_nd_parallelism.log |_ fsdp2_tp1_cp2_pp2/ - |_ config.toml - |_ nd_parallelism.slurm - |_ nd_parallelism.log - |_ diff_baseline_vs_nd_parallelism.log - |_ fsdp2_tp2_cp2_pp1/ - |_ config.toml - |_ nd_parallelism.slurm - |_ nd_parallelism.log - |_ diff_baseline_vs_nd_parallelism.log - |_ fsdp2_tp2_cp2_pp2/ |_ config.toml |_ nd_parallelism.slurm |_ nd_parallelism.log @@ -187,15 +225,15 @@ class Status(Enum): COMPLETED = "completed" # Job is completed class Job: - def __init__(self, root_path: str, qos: str) -> None: + def __init__(self, root_path: str, qos: str, inp_dir: str = None) -> None: self.root_path = root_path self.name = os.path.basename(root_path) - if self.name == os.path.basename(os.path.normpath(args.inp_dir)): - self.name = "baseline_fsdp2" - self.config = os.path.join(root_path, "baseline_fsdp2_config.toml") - self.slurm_script = os.path.join(root_path, "baseline_fsdp2.slurm") + + self.config = os.path.join(root_path, "config.toml") + seed_slurm = os.path.join(root_path, "seed.slurm") + if os.path.exists(seed_slurm): + self.slurm_script = seed_slurm else: - self.config = os.path.join(root_path, "config.toml") self.slurm_script = os.path.join(root_path, "nd_parallelism.slurm") self.qos = qos @@ -245,7 +283,7 @@ def __init__(self, inp_dir: str, qos: str) -> None: elif root == inp_dir: jobs_directory_paths.append(os.path.abspath(root)) - self.job_lists = [Job(job_path, qos) for job_path in jobs_directory_paths] + self.job_lists = [Job(job_path, qos, inp_dir) for job_path in jobs_directory_paths] def keep_only_jobs(self, status: Status): return [job for job in self.job_lists if job.status == status] @@ -253,31 +291,8 @@ def keep_only_jobs(self, status: Status): def filter_out_jobs(self, status: Status): return [job for job in self.job_lists if job.status != status] - def check_status(self): - status_files = [os.path.join(job.root_path, "status.txt") for job in self.job_lists] - - status_counts = {status.value: 0 for status in Status} - - for status_file in status_files: - with open(status_file, "r") as f: - status = f.read().strip() - if status in status_counts: - status_counts[status] += 1 - else: - raise ValueError(f"Invalid status: {status}") - - total = sum(status_counts.values()) - - print(f"{'Status':<10} | {'Count':<6}") - print(f"{'-'*10}-|-{'-'*6}") - for status, count in status_counts.items(): - print(f"{status.capitalize():<10} | {count:<6}") - - print(f"{'-'*10}-|-{'-'*6}") - print(f"{'Total':<10} | {total:<6}") - -def submit_jobs(inp_dir, qos, only: str = None, seed_checkpoint: str = None): +def submit_jobs(inp_dir, qos, only: str = None): scheduler = Scheduler(inp_dir, qos) env_vars = os.environ.copy() @@ -307,10 +322,385 @@ def submit_jobs(inp_dir, qos, only: str = None, seed_checkpoint: str = None): job.set_status(Status.PENDING) -def report(inp_dir: str): - scheduler = Scheduler(inp_dir, qos="N/A") - scheduler.check_status() +def check_status(inp_dir: str): + """ + Display a table showing the count of jobs in each status. + Reads status.txt from all job directories found in inp_dir. 
+ """ + # Find all directories with status.txt files + jobs_directory_paths = [] + for root, dirs, files in os.walk(inp_dir): + if "status.txt" in files: + jobs_directory_paths.append(os.path.abspath(root)) + + if not jobs_directory_paths: + print(f"No jobs found in {inp_dir}") + return + + # Count jobs by status + status_counts = {status: 0 for status in Status} + for job_path in jobs_directory_paths: + job = Job(job_path, qos="N/A") + status_counts[job.status] += 1 + + total = len(jobs_directory_paths) + + # Print table + print("\nJob Status Summary") + print("=" * 30) + print(f"{'Status':<12} | {'Count':>5}") + print("-" * 30) + print(f"{'Init':<12} | {status_counts[Status.INIT]:>5}") + print(f"{'Pending':<12} | {status_counts[Status.PENDING]:>5}") + print(f"{'Running':<12} | {status_counts[Status.RUNNING]:>5}") + print(f"{'Fail':<12} | {status_counts[Status.FAIL]:>5}") + print(f"{'Completed':<12} | {status_counts[Status.COMPLETED]:>5}") + print("-" * 30) + print(f"{'Total':<12} | {total:>5}") + print("=" * 30) + +def report(inp_dir: str): + """ + Generate diff reports between baseline (fsdp2_tp1_cp1_pp1) and all other parallelism configs. + Creates diff_baseline_vs_nd_parallelism.log in each non-baseline config directory. + Automatically discovers all model/flavor combinations under inp_dir. + """ + # Add imports + import torch + from dataclasses import dataclass, field + from typing import List + + @dataclass + class TrainingMetrics: + """Training metrics extracted from logs.""" + steps: List[int] = field(default_factory=list) + loss: List[float] = field(default_factory=list) + grad_norm: List[float] = field(default_factory=list) + + # Default tolerance values (matching compare_distributed_run.py) + DEFAULT_LOSS_ATOL = 0.02 + DEFAULT_LOSS_RTOL = 1e-5 + DEFAULT_GRAD_NORM_ATOL = 0.02 + DEFAULT_GRAD_NORM_RTOL = 1e-5 + + def _extract_metrics(log_file: Path) -> TrainingMetrics: + """Extract metrics from log file.""" + metrics = TrainingMetrics() + + try: + with open(log_file, 'r') as f: + content = f.read() + + # Regex to capture all metrics from a log line, ignoring ANSI color codes + pattern = re.compile( + r"step:\s*(\d+)\s*" + r".*?loss:\s*([0-9]+\.?[0-9]*)\s*" + r".*?grad_norm:\s*([0-9]+\.?[0-9]*)\s*" + ) + + for match in pattern.finditer(content): + metrics.steps.append(int(match.group(1))) + metrics.loss.append(float(match.group(2))) + metrics.grad_norm.append(float(match.group(3))) + + except Exception as e: + log_message(LogLevel.WARNING, f"Could not extract metrics: {e}", indent=3, dim=True) + + return metrics + + def _compare_metrics(baseline_metrics: TrainingMetrics, test_metrics: TrainingMetrics, + config_name: str) -> tuple[bool, str]: + """Compare metrics between baseline and test configuration. 
+ + Returns: + tuple[bool, str]: (passed, summary_message) + """ + if not baseline_metrics.loss or not test_metrics.loss: + return False, f"Unable to extract metrics" + + # Convert to tensors + baseline_loss = torch.tensor(baseline_metrics.loss) + test_loss = torch.tensor(test_metrics.loss) + baseline_grad_norm = torch.tensor(baseline_metrics.grad_norm) + test_grad_norm = torch.tensor(test_metrics.grad_norm) + + # Check if tensors are close + loss_pass = torch.allclose(baseline_loss, test_loss, atol=DEFAULT_LOSS_ATOL, rtol=DEFAULT_LOSS_RTOL) + grad_pass = torch.allclose(baseline_grad_norm, test_grad_norm, atol=DEFAULT_GRAD_NORM_ATOL, rtol=DEFAULT_GRAD_NORM_RTOL) + + # Calculate max absolute differences for logging + loss_max_diff = torch.max(torch.abs(baseline_loss - test_loss)).item() if baseline_loss.numel() > 0 and test_loss.numel() > 0 else 0.0 + grad_norm_diff = torch.max(torch.abs(baseline_grad_norm - test_grad_norm)).item() if baseline_grad_norm.numel() > 0 and test_grad_norm.numel() > 0 else 0.0 + + # Calculate min absolute differences for logging + loss_min_diff = torch.min(torch.abs(baseline_loss - test_loss)).item() if baseline_loss.numel() > 0 and test_loss.numel() > 0 else 0.0 + grad_norm_min_diff = torch.min(torch.abs(baseline_grad_norm - test_grad_norm)).item() if baseline_grad_norm.numel() > 0 and test_grad_norm.numel() > 0 else 0.0 + + summary = (f"Max loss diff: {loss_max_diff:.2e}, " + f"Min loss diff: {loss_min_diff:.2e}, " + f"Max grad norm diff: {grad_norm_diff:.2e}, " + f"Min grad norm diff: {grad_norm_min_diff:.2e}") + + return (loss_pass and grad_pass), summary + + def _filter_log(log_file: Path) -> Path: + """Filter log file to normalize volatile information (timestamps, PIDs, ports).""" + filtered_file = log_file.with_suffix(log_file.suffix + '.filtered') + + with open(log_file, 'r') as infile, open(filtered_file, 'w') as outfile: + for line in infile: + # Apply filtering patterns to remove volatile information + line = re.sub(r'([0-9]{4}-[0-9]{2}-[0-9]{2} )?[0-9]{2}:[0-9]{2}:[0-9]{2}(,[0-9]+)?', + 'TIMESTAMP', line) + line = re.sub(r'torchrun.*--master_port[= ]([0-9]+)', + 'torchrun ... --master_port=XXXX', line) + line = re.sub(r'PID [0-9]+', 'PID XXXX', line) + line = re.sub(r'localhost:[0-9]+', 'localhost:XXXX', line) + outfile.write(line) + + return filtered_file + + def _generate_diff(baseline_log: Path, test_log: Path, diff_file: Path) -> tuple[bool, str]: + """Generate diff between baseline and test logs using git diff. 
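        Both logs are first passed through _filter_log so that timestamps, PIDs, and
        rendezvous ports do not show up as spurious differences.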
+ + Returns: + tuple[bool, str]: (success, diff_output or error_message) + """ + # Filter logs to remove timestamps and volatile information + baseline_filtered = _filter_log(baseline_log) + test_filtered = _filter_log(test_log) + + try: + # Generate colored diff using git diff + cmd = ["git", "diff", "--no-index", "--color=always", "--word-diff=color", + str(baseline_filtered), str(test_filtered)] + + result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + + # git diff returns exit code 1 when files differ (which is expected), not an error + if result.returncode not in [0, 1]: + error_msg = f"git diff failed with code {result.returncode}\n{result.stderr}" + return False, error_msg + + # Write diff to file + with open(diff_file, 'w') as f: + f.write(result.stdout) + + return True, result.stdout + + finally: + # Clean up filtered files + if baseline_filtered.exists(): + baseline_filtered.unlink() + if test_filtered.exists(): + test_filtered.unlink() + + def _process_flavor_dir(flavor_dir: Path) -> tuple[int, int]: + """Process a single model/flavor directory. + + Returns: + tuple[int, int]: (passed_count, failed_count) + """ + # Find baseline directory + baseline_dir = flavor_dir / "fsdp2_tp1_cp1_pp1" + if not baseline_dir.exists(): + log_message(LogLevel.WARNING, f"No baseline directory found in {flavor_dir.relative_to(inp_path)}, skipping", indent=1) + return 0, 0 + + # Find baseline .out file + baseline_out_files = list(baseline_dir.glob("*.out")) + if not baseline_out_files: + log_message(LogLevel.WARNING, f"No .out file found in baseline {baseline_dir.relative_to(inp_path)}, skipping", indent=1) + return 0, 0 + baseline_out = baseline_out_files[0] + + # Extract baseline metrics + log_message(LogLevel.INFO, f"Extracting baseline metrics from {baseline_out.name}...", indent=1) + baseline_metrics = _extract_metrics(baseline_out) + if not baseline_metrics.loss or not baseline_metrics.grad_norm: + log_message(LogLevel.WARNING, "Could not extract baseline metrics, skipping comparisons", indent=1) + return 0, 0 + + # Find all parallelism config directories (excluding seed_checkpoint and baseline) + config_dirs = [] + for item in flavor_dir.iterdir(): + if item.is_dir() and item.name.startswith("fsdp2_") and item.name != "fsdp2_tp1_cp1_pp1": + config_dirs.append(item) + + if not config_dirs: + log_message(LogLevel.INFO, f"No test configurations found in {flavor_dir.relative_to(inp_path)}", indent=1) + return 0, 0 + + console.print() + console.print( + Panel( + f"[cyan]Baseline:[/cyan] {baseline_out.relative_to(flavor_dir)}\n" + f"[cyan]Configurations to compare:[/cyan] {len(config_dirs)}", + title=f"[bold cyan]Processing {flavor_dir.relative_to(inp_path)}[/bold cyan]", + expand=False, + border_style="cyan", + padding=(0, 2), + ) + ) + + # Track results for summary + results = [] + + # Generate diffs for each config + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + BarColumn(), + TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), + TimeElapsedColumn(), + console=console, + ) as progress: + task = progress.add_task("[cyan]Processing configurations...", total=len(config_dirs)) + + for i, config_dir in enumerate(sorted(config_dirs)): + if i > 0: + console.rule(style="dim") + + progress.update(task, description=f"[cyan]Testing [bold]{config_dir.name}[/bold]") + + # Find .out file in config directory + test_out_files = list(config_dir.glob("*.out")) + if not test_out_files: + log_message(LogLevel.WARNING, 
f"{config_dir.name}: No .out file found, skipping", indent=1) + results.append((config_dir.name, False, "No .out file found")) + progress.advance(task) + continue + + test_out = test_out_files[0] + diff_file = config_dir / "diff_baseline_vs_nd_parallelism.log" + + # Extract test metrics + test_metrics = _extract_metrics(test_out) + + # Compare metrics + if test_metrics.loss and test_metrics.grad_norm: + test_passed, metrics_summary = _compare_metrics(baseline_metrics, test_metrics, config_dir.name) + + if test_passed: + log_message(LogLevel.TEST_PASS, f"{config_dir.name} - {metrics_summary}", indent=1) + results.append((config_dir.name, True, metrics_summary)) + else: + log_message(LogLevel.TEST_FAIL, f"{config_dir.name} - {metrics_summary}", indent=1) + results.append((config_dir.name, False, metrics_summary)) + else: + log_message(LogLevel.TEST_FAIL, f"{config_dir.name} - Unable to extract metrics", indent=1) + results.append((config_dir.name, False, "Unable to extract metrics")) + + # Generate diff + try: + success, output = _generate_diff(baseline_out, test_out, diff_file) + + if success: + log_message(LogLevel.INFO, f"Diff between baseline vs HF nd-parallel saved to:", indent=5, dim=True) + console.print(f" [dim]{diff_file.relative_to(flavor_dir)}[/dim]") + else: + log_message(LogLevel.WARNING, f"Failed to generate diff: {output}", indent=5, dim=True) + + except Exception as e: + log_message(LogLevel.WARNING, f"Failed to generate diff - {e}", indent=5, dim=True) + + progress.advance(task) + + console.print() + # Create summary table + summary_table = Table( + title=f"[bold]Summary for {flavor_dir.relative_to(inp_path)}[/bold]", + show_header=True, + header_style="bold magenta" + ) + summary_table.add_column("Configuration", style="cyan") + summary_table.add_column("Status", justify="center") + summary_table.add_column("Metrics", style="dim") + + for name, passed, summary in results: + status = "[bold green]✅ PASS[/bold green]" if passed else "[bold red]❌ FAIL[/bold red]" + # Truncate summary if too long + display_summary = summary if len(summary) < 60 else summary[:57] + "..." 
+ summary_table.add_row(name, status, display_summary) + + console.print(summary_table) + console.print() + + passed_count = sum(1 for _, passed, _ in results if passed) + failed_count = len(results) - passed_count + + return passed_count, failed_count + + inp_path = Path(inp_dir) + + if not inp_path.exists(): + console.print(f"[bold red]Error:[/bold red] Directory not found: {inp_path}") + return + + console.print( + Panel( + "[bold cyan]HuggingFace Integration Test Report Generator[/bold cyan]", + expand=False, + border_style="blue", + padding=(1, 2), + ) + ) + console.print() + + # Find all directories that contain a baseline (fsdp2_tp1_cp1_pp1) subdirectory + flavor_dirs = [] + for root, dirs, files in os.walk(inp_path): + if "fsdp2_tp1_cp1_pp1" in dirs: + flavor_dirs.append(Path(root)) + + if not flavor_dirs: + log_message(LogLevel.ERROR, f"No directories with baseline configuration found under {inp_path}") + console.print("[yellow]Expected to find directories containing 'fsdp2_tp1_cp1_pp1' subdirectory[/yellow]") + return + + log_message(LogLevel.INFO, f"Found {len(flavor_dirs)} model/flavor combination(s) to process:") + for flavor_dir in flavor_dirs: + console.print(f" [cyan]•[/cyan] {flavor_dir.relative_to(inp_path)}") + + # Process each flavor directory + total_passed = 0 + total_failed = 0 + + for flavor_dir in flavor_dirs: + passed, failed = _process_flavor_dir(flavor_dir) + total_passed += passed + total_failed += failed + + # Final summary + console.print() + console.print( + Panel( + "[bold cyan]Overall Summary[/bold cyan]", + expand=False, + border_style="blue", + padding=(0, 2), + ) + ) + + overall_table = Table(show_header=True, header_style="bold magenta") + overall_table.add_column("Metric", style="cyan") + overall_table.add_column("Value", justify="right") + + total_tests = total_passed + total_failed + overall_table.add_row("Total Configurations Tested", str(total_tests)) + overall_table.add_row("[green]Passed[/green]", str(total_passed)) + overall_table.add_row("[red]Failed[/red]", str(total_failed)) + + console.print(overall_table) + console.print() + + if total_failed == 0 and total_tests > 0: + log_message(LogLevel.SUCCESS, "All tests passed! 
🎉") + elif total_tests > 0: + log_message(LogLevel.WARNING, f"{total_failed} configuration(s) had test failures") + + log_message(LogLevel.SUCCESS, "Diff generation complete!") if __name__ == "__main__": parser = ArgumentParser() @@ -320,20 +710,25 @@ def report(inp_dir: str): create_configs_parser.add_argument("--model_name", type=str, required=True) create_configs_parser.add_argument("--out_dir", type=str, required=True) create_configs_parser.add_argument("--flavor", type=str, required=True) + submit_jobs_parser = subparsers.add_parser("submit_jobs") submit_jobs_parser.add_argument("--inp_dir", type=str, required=True) - submit_jobs_parser.add_argument("--seed_checkpoint", type=str, default=None) submit_jobs_parser.add_argument("--qos", type=str, required=True, choices=["low", "normal", "high", "prod"]) submit_jobs_parser.add_argument("--only", type=str, default=None, choices=[s.value for s in Status]) report_parser = subparsers.add_parser("report") report_parser.add_argument("--inp_dir", type=str, required=True) + check_status_parser = subparsers.add_parser("check_status") + check_status_parser.add_argument("--inp_dir", type=str, required=True) + args = parser.parse_args() if args.action == "create_configs": create_configs(args.model_name, args.out_dir, args.flavor) elif args.action == "submit_jobs": - submit_jobs(args.inp_dir, args.qos, args.only, args.seed_checkpoint) + submit_jobs(args.inp_dir, args.qos, args.only) elif args.action == "report": - report(args.inp_dir) \ No newline at end of file + report(args.inp_dir) + elif args.action == "check_status": + check_status(args.inp_dir) \ No newline at end of file From 218f40071d77ad471869d033c3ce6e3cdb0ef215 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 9 Oct 2025 12:09:50 +0000 Subject: [PATCH 061/129] now everything works (1D/2D/3D/4D). need to fix correctness with PP --- .../configs/template.slurm | 73 +++++++++++++++++-- .../infra/parallelize_hf_transformers.py | 41 ++++++++--- .../model/hf_transformers_args.py | 21 +++++- .../test_hf_integration.py | 35 +++++++-- 4 files changed, 143 insertions(+), 27 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/configs/template.slurm b/torchtitan/experiments/transformers_backend/configs/template.slurm index 31016c37f2..493b569e95 100644 --- a/torchtitan/experiments/transformers_backend/configs/template.slurm +++ b/torchtitan/experiments/transformers_backend/configs/template.slurm @@ -3,8 +3,8 @@ #SBATCH --output={{ root_path }}/slurm_%j.out #SBATCH --error={{ root_path }}/slurm_%j.out #SBATCH --nodes={{ nodes }} -#SBATCH --ntasks-per-node={{ n_proc_per_node }} -#SBATCH --gpus-per-task=1 +#SBATCH --gres=gpu:{{ n_proc_per_node }} +#SBATCH --ntasks-per-node=1 #SBATCH --qos={{ qos }} #SBATCH --cpus-per-task=12 @@ -30,23 +30,86 @@ export CUBLAS_WORKSPACE_CONFIG=":4096:8" export CUDA_DEVICE_MAX_CONNECTIONS="1" export UV_CACHE_DIR="/fsx/ferdinandmom/.cache/uv" +# EFA settings +export FI_PROVIDER=efa +export FI_EFA_FORK_SAFE=1 +export FI_EFA_ENABLE_SHM_TRANSFER=1 +export NCCL_PROTO=simple +export NCCL_SOCKET_IFNAME=enp + module load cuda/12.4 echo "Running training job: {{ name }}" echo "Config file: {{ config_path }}" -torchrun \ +# Function to update status based on squeue output +update_status() { + job_id=$1 + status_file=$2 + # For unknown reasons, it doenst update status for pending. 
It only works for running + while true; do + job_status=$(squeue --job $job_id --noheader --format=%T) + echo "Job status: $job_status" + if [ -z "$job_status" ]; then + # Job has finished or is not found + break + elif [ "$job_status" = "RUNNING" ]; then + printf "running" > $status_file + break + fi + sleep 10 + done +} + +# Update status to "pending" or "running" in the background +update_status $job_id {{ root_path }}/status.txt & + +# LOG_DIR="{{ root_path }}/logs" +# mkdir -p ${LOG_DIR} + +# CMD="torchrun \ +# --nproc_per_node {{ n_proc_per_node }} \ +# --nnodes {{ nodes }} \ +# --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ +# --rdzv_backend c10d \ +# --max_restarts 0 \ +# --log-dir ${LOG_DIR} \ +# --role rank \ +# --tee 3 \ +# -m torchtitan.train \ +# --checkpoint.enable \ +# {% if name == "seed_checkpoint" %} --checkpoint.create_seed_checkpoint {% else %} --checkpoint.initial_load_path {{ initial_load_path }} {% endif %} \ +# --training.seed 42 \ +# --training.deterministic \ +# --training.steps 1 \ +# --job.config_file {{ config_path }}" + + +CMD="torchrun \ --nproc_per_node {{ n_proc_per_node }} \ --nnodes {{ nodes }} \ --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ --rdzv_backend c10d \ --max_restarts 0 \ - --local-ranks-filter {{ n_proc_per_node - 1 }} \ --role rank \ + --local_ranks_filter {{ n_proc_per_node - 1 }} \ --tee 3 \ -m torchtitan.train \ --checkpoint.enable \ {% if name == "seed_checkpoint" %} --checkpoint.create_seed_checkpoint {% else %} --checkpoint.initial_load_path {{ initial_load_path }} {% endif %} \ --training.seed 42 \ --training.deterministic \ - --job.config_file {{ config_path }} + --job.config_file {{ config_path }}" + +# Run the main command +echo "Running command: srun -u $CMD" +srun -u $CMD +exit_status=$? + + +# Update status based on the exit status of `srun` +if [ $exit_status -eq 0 ]; then + printf "completed" > {{ root_path }}/status.txt +else + printf "fail" > {{ root_path }}/status.txt +fi diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py index 469c3407a8..b512ca026c 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -196,6 +196,7 @@ def parallelize_hf_transformers( logger.warning("CP support for FlexAttention is still in progress.") if parallel_dims.tp_enabled: + model.set_tp_mesh(world_mesh["tp"]) enable_float8_linear = "float8" in job_config.model.converters float8_is_rowwise = job_config.float8.recipe_name in ( "rowwise", @@ -281,6 +282,7 @@ def parallelize_hf_transformers( logger.info("Applied FSDP to the model") if parallel_dims.cp_enabled: + model.set_cp_mesh(world_mesh["cp"]) logger.info("Applied Context Parallel to the model") if job_config.training.enable_cpu_offload: @@ -296,6 +298,9 @@ def parallelize_hf_transformers( enable_compiled_autograd=job_config.parallelism.enable_compiled_autograd, ) + if parallel_dims.pp_enabled: + model.set_pp_mesh(world_mesh["pp"]) + return model @@ -310,22 +315,36 @@ def apply_non_moe_tp( # transformer block's inputs) # 2. Parallelize the root norm layer over the sequence dim # 3. 
Parallelize the final linear output layer - parallelize_module( - model, - tp_mesh, - { - "tok_embeddings": RowwiseParallel( + + # skipping nn.Identity modules (which are added by pipeline parallelism for unused modules) + root_plan = {} + + if hasattr(model, 'tok_embeddings'): + if isinstance(model.tok_embeddings, nn.Identity): + root_plan["tok_embeddings"] = NoParallel() + else: + root_plan["tok_embeddings"] = RowwiseParallel( input_layouts=Replicate(), output_layouts=Shard(1), - ), - "norm": SequenceParallel(), - "output": ColwiseParallel( + ) + + if hasattr(model, 'norm'): + if isinstance(model.norm, nn.Identity): + root_plan["norm"] = NoParallel() + else: + root_plan["norm"] = SequenceParallel() + + if hasattr(model, 'output'): + if isinstance(model.output, nn.Identity): + root_plan["output"] = NoParallel() + else: + root_plan["output"] = ColwiseParallel( input_layouts=Shard(1), output_layouts=Shard(-1) if loss_parallel else Replicate(), use_local_output=not loss_parallel, - ), - }, - ) + ) + if root_plan: # Only call if there's something to parallelize + parallelize_module(model, tp_mesh, root_plan) # Parallel styles used for transformer block linear weights and their # inputs may be different for float8 linears with tensorwise scaling. diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 917d50a43f..7bc444f1eb 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -273,13 +273,27 @@ def __init__(self, model_args: HFTransformerModelArgs): patch_hf_llama() self.model = model_cls(config=model_args) - + self.max_seq_len = model_args.max_seq_len + for layer in self.model.model.layers: if hasattr(model_args, "first_k_dense_replace") and layer.layer_idx >= model_args.first_k_dense_replace: layer.moe_enabled = True else: layer.moe_enabled = False + self.cp_mesh = None + self.tp_mesh = None + self.pp_mesh = None + + def set_cp_mesh(self, mesh): + self.cp_mesh = mesh + + def set_tp_mesh(self, mesh): + self.tp_mesh = mesh + + def set_pp_mesh(self, mesh): + self.pp_mesh = mesh + @property def tok_embeddings(self): """Returns the model's embed_tokens, handling different Hugging Face model structures.""" @@ -358,8 +372,9 @@ def rotary_emb(self, value): raise AttributeError("Could not find rotary_emb in the model. 
Please check the model structure.") def forward(self, *args, **kwargs): - position_ids = torch.arange(args[0].shape[1], device=args[0].device).unsqueeze(0) - kwargs["position_ids"] = position_ids + local_seq_len = self.max_seq_len + local_seq_len //= self.cp_mesh.size() if self.cp_mesh is not None and self.cp_mesh.size() > 1 else 1 + kwargs["position_ids"] = torch.arange(local_seq_len, device=args[0].device).unsqueeze(0) output = self.model.model(*args, **kwargs) output = self.model.lm_head(output.last_hidden_state) return output diff --git a/torchtitan/experiments/transformers_backend/test_hf_integration.py b/torchtitan/experiments/transformers_backend/test_hf_integration.py index 38c4982319..d886549ff0 100644 --- a/torchtitan/experiments/transformers_backend/test_hf_integration.py +++ b/torchtitan/experiments/transformers_backend/test_hf_integration.py @@ -11,6 +11,9 @@ from rich.table import Table from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn +BASELINE = "fsdp2_tp1_cp1_pp1" +# BASELINE = "fsdp1_tp1_cp1_pp1" + console = Console() class LogLevel(Enum): @@ -149,15 +152,28 @@ def create_configs(model_name: str, out_dir: str, flavor: str): config["model"]["flavor"] = flavor parallelism_configs = [ - "fsdp2_tp1_cp1_pp1", # baseline + BASELINE, # baseline "fsdp2_tp2_cp1_pp1", "fsdp2_tp1_cp1_pp2", "fsdp2_tp1_cp2_pp1", "fsdp2_tp1_cp2_pp2", "fsdp2_tp2_cp2_pp1", + "fsdp2_tp2_cp1_pp2", "fsdp2_tp2_cp2_pp2", ] + # parallelism_configs = [ + # BASELINE, # baseline + # "fsdp1_tp2_cp1_pp1", + # "fsdp1_tp1_cp1_pp2", + # "fsdp1_tp1_cp2_pp1", + # "fsdp1_tp1_cp2_pp2", + # "fsdp1_tp2_cp2_pp1", + # "fsdp1_tp2_cp1_pp2", + # "fsdp1_tp2_cp2_pp2", + # ] + + out_path = Path(out_dir) / model_name / flavor out_path.mkdir(parents=True, exist_ok=True) @@ -184,6 +200,7 @@ def create_configs(model_name: str, out_dir: str, flavor: str): # Create parallelism configs for pc in parallelism_configs: + iter_config = toml.loads(toml.dumps(config)) m = re.match(r"fsdp(\d+)_tp(\d+)_cp(\d+)_pp(\d+)", pc) @@ -200,8 +217,10 @@ def create_configs(model_name: str, out_dir: str, flavor: str): iter_config["parallelism"]["tensor_parallel_degree"] = tp iter_config["parallelism"]["context_parallel_degree"] = cp iter_config["parallelism"]["pipeline_parallel_degree"] = pp - iter_config["parallelism"]["pipeline_parallel_schedule"] = "1F1B" + iter_config["parallelism"]["pipeline_parallel_schedule"] = "GPipe" iter_config["job"]["dump_folder"] = str(pc_dir) + if pc == BASELINE or pc == "fsdp2_tp1_cp1_pp2": + iter_config["training"]["local_batch_size"] = 2 config_path = pc_dir / "config.toml" with open(config_path, "w") as f: @@ -379,9 +398,9 @@ class TrainingMetrics: grad_norm: List[float] = field(default_factory=list) # Default tolerance values (matching compare_distributed_run.py) - DEFAULT_LOSS_ATOL = 0.02 + DEFAULT_LOSS_ATOL = 5e-2 DEFAULT_LOSS_RTOL = 1e-5 - DEFAULT_GRAD_NORM_ATOL = 0.02 + DEFAULT_GRAD_NORM_ATOL = 7e-1 DEFAULT_GRAD_NORM_RTOL = 1e-5 def _extract_metrics(log_file: Path) -> TrainingMetrics: @@ -503,7 +522,7 @@ def _process_flavor_dir(flavor_dir: Path) -> tuple[int, int]: tuple[int, int]: (passed_count, failed_count) """ # Find baseline directory - baseline_dir = flavor_dir / "fsdp2_tp1_cp1_pp1" + baseline_dir = flavor_dir / BASELINE if not baseline_dir.exists(): log_message(LogLevel.WARNING, f"No baseline directory found in {flavor_dir.relative_to(inp_path)}, skipping", indent=1) return 0, 0 @@ -525,7 +544,7 @@ def _process_flavor_dir(flavor_dir: Path) -> tuple[int, int]: # Find all 
parallelism config directories (excluding seed_checkpoint and baseline) config_dirs = [] for item in flavor_dir.iterdir(): - if item.is_dir() and item.name.startswith("fsdp2_") and item.name != "fsdp2_tp1_cp1_pp1": + if item.is_dir() and item.name not in {BASELINE, "seed_checkpoint"}: config_dirs.append(item) if not config_dirs: @@ -598,7 +617,7 @@ def _process_flavor_dir(flavor_dir: Path) -> tuple[int, int]: if success: log_message(LogLevel.INFO, f"Diff between baseline vs HF nd-parallel saved to:", indent=5, dim=True) - console.print(f" [dim]{diff_file.relative_to(flavor_dir)}[/dim]") + console.print(f" [dim]{diff_file}[/dim]") else: log_message(LogLevel.WARNING, f"Failed to generate diff: {output}", indent=5, dim=True) @@ -651,7 +670,7 @@ def _process_flavor_dir(flavor_dir: Path) -> tuple[int, int]: # Find all directories that contain a baseline (fsdp2_tp1_cp1_pp1) subdirectory flavor_dirs = [] for root, dirs, files in os.walk(inp_path): - if "fsdp2_tp1_cp1_pp1" in dirs: + if BASELINE in dirs: flavor_dirs.append(Path(root)) if not flavor_dirs: From bb080ad7187e3322d163196dbb110be55c50ebec Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 14 Oct 2025 16:23:20 +0000 Subject: [PATCH 062/129] fix and uniformize weight init of llama-like model + various fix --- .../transformers_backend/__init__.py | 10 +- .../infra/parallelize_hf_transformers.py | 39 +++-- .../model/hf_llama_like_patch.py | 165 ++++++++++++++++++ .../model/hf_transformers_args.py | 73 ++++++-- .../test_hf_integration.py | 65 ++++--- torchtitan/utils/test_utils.py | 7 +- 6 files changed, 305 insertions(+), 54 deletions(-) create mode 100644 torchtitan/experiments/transformers_backend/model/hf_llama_like_patch.py diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index ac0431ec3f..c29b3a5aa1 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -35,7 +35,7 @@ class TitanModelArgs: n_layers: int = 32 n_heads: int = 32 n_kv_heads: Optional[int] = None - vocab_size: int = 128256 + vocab_size: Optional[int] = None multiple_of: int = 256 ffn_dim_multiplier: Optional[float] = None norm_eps: float = 1e-5 @@ -69,17 +69,19 @@ class DeepSeekV3Args: beta_slow: Optional[int] = None mscale: Optional[float] = None partial_rotary_factor: Optional[float] = None + rope_interleave: bool = True + flavors = { "debugmodel": HFTransformerModelArgs( titan_args=TitanModelArgs( - vocab_size=2000, + vocab_size=51200, dim=256, - n_layers=6, + n_layers=1, n_heads=16, n_kv_heads=16, ), - deepseek_v3_args=None + pad_token_id=None, # deepseek_v3_args=DeepSeekV3Args( # partial_rotary_factor=4.0, # inter_dim=1024, diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py index b512ca026c..16e33251ae 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -43,12 +43,12 @@ torch.ops.aten.mm.default, torch.ops.aten._scaled_dot_product_efficient_attention.default, torch.ops.aten._scaled_dot_product_flash_attention.default, + torch._higher_order_ops.flex_attention, torch.ops._c10d_functional.reduce_scatter_tensor.default, # for low precision training, it's useful to always save # the result of max, since the absolute maximum is # used to compute the scaling factor 
for quantization. torch.ops.aten.max.default, - torch._higher_order_ops.flex_attention, } def _apply_ac_to_transformer_block( @@ -379,21 +379,34 @@ def apply_non_moe_tp( "self_attn.q_proj": colwise_parallel(), "self_attn.k_proj": colwise_parallel(), "self_attn.v_proj": colwise_parallel(), - "self_attn.o_proj": rowwise_parallel(output_layouts=Shard(1)), "post_attention_layernorm": SequenceParallel(), } + + # Handle different names for the output projection layer, e.g. o_proj vs dense + o_proj_name = "o_proj" if hasattr(transformer_block.self_attn, "o_proj") else "dense" + layer_plan[f"self_attn.{o_proj_name}"] = rowwise_parallel(output_layouts=Shard(1)) + if not transformer_block.moe_enabled: - layer_plan.update( - { - "mlp": prepare_module_input( - input_layouts=(Shard(1),), - desired_input_layouts=(Replicate(),), - ), - "mlp.gate_proj": colwise_parallel(), - "mlp.up_proj": colwise_parallel(), - "mlp.down_proj": rowwise_parallel(output_layouts=Shard(1)), - } - ) + mlp_plan = { + "mlp": prepare_module_input( + input_layouts=(Shard(1),), + desired_input_layouts=(Replicate(),), + ), + } + # Handle different names for MLP layers, e.g. gate_proj vs fc1 + gate_proj_name = "gate_proj" if hasattr(transformer_block.mlp, "gate_proj") else "fc1" + mlp_plan[f"mlp.{gate_proj_name}"] = colwise_parallel() + + if hasattr(transformer_block.mlp, "up_proj"): + mlp_plan["mlp.up_proj"] = colwise_parallel() + + down_proj_name = "down_proj" if hasattr(transformer_block.mlp, "down_proj") else "fc2" + mlp_plan[f"mlp.{down_proj_name}"] = rowwise_parallel(output_layouts=Shard(1)) + layer_plan.update(mlp_plan) + + # Some models like Phi-2 don't have post_attention_layernorm + if not hasattr(transformer_block, "post_attention_layernorm"): + layer_plan.pop("post_attention_layernorm") parallelize_module( module=transformer_block, diff --git a/torchtitan/experiments/transformers_backend/model/hf_llama_like_patch.py b/torchtitan/experiments/transformers_backend/model/hf_llama_like_patch.py new file mode 100644 index 0000000000..563c5e289b --- /dev/null +++ b/torchtitan/experiments/transformers_backend/model/hf_llama_like_patch.py @@ -0,0 +1,165 @@ +import torch +import torch.nn as nn +from transformers.configuration_utils import PretrainedConfig +from transformers.modeling_utils import PreTrainedModel +import math +from torch.nn import init + + +def patch_hf_llama_like(decoder_layer_cls, attention_cls, mlp_cls=None): + """ + This patch modifies a Hugging Face Llama-like model's weight initialization to match + the initialization scheme used in TorchTitan. This is crucial for ensuring + bit-for-bit reproducibility when converting checkpoints between the native + TorchTitan format and the Hugging Face format. + + The patch targets the following aspects of the model: + - `PreTrainedModel._initialize_weights`: Handles meta device initialization correctly. + - `PreTrainedModel._init_weights`: Implements TorchTitan's specific initialization + for attention, MLP, embedding, and layer norm layers. This includes depth-dependent + initialization for attention and MLP layers. + - `DecoderLayer.__init__`: Adds `layer_idx` to attention and MLP modules within + each decoder layer, which is required for the depth-dependent initialization. + + By applying this patch, we can ensure that a model loaded in the transformers + backend will have the exact same weights as a model trained with the native + TorchTitan backend, which is essential for seamless conversion and debugging. 
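+
+    A minimal usage sketch (the class names are the stock Llama ones from
+    transformers.models.llama.modeling_llama; any Llama-like trio works, and
+    mlp_cls may be left as None for models without a per-layer MLP class):
+
+        patch_hf_llama_like(
+            decoder_layer_cls=LlamaDecoderLayer,
+            attention_cls=LlamaAttention,
+            mlp_cls=LlamaMLP,
+        )
+
+    The patch must run before the model is instantiated so that the overridden
+    DecoderLayer.__init__ and PreTrainedModel._init_weights take effect.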
+ """ + + _original_decoder_layer_init = decoder_layer_cls.__init__ + + def _decoder_layer_init_patched(self, config: PretrainedConfig, layer_idx: int): + _original_decoder_layer_init(self, config, layer_idx) + self.layer_idx = layer_idx + # Ensure both attention and mlp modules have layer_idx for depth-based init + if hasattr(self, "self_attn"): + self.self_attn.layer_idx = layer_idx + # some models might not have mlp in each layer + if hasattr(self, "mlp") and self.mlp is not None: + self.mlp.layer_idx = layer_idx + + def _initialize_weights_patched(self, module): + # NOTE(3outeille): monkey-patch PreTrainedModel to handle meta device initialization correctly + # The default _initialize_weights sets _is_hf_initialized = True even on a meta device, + # which prevents subsequent proper initialization. + if getattr(module, "_is_hf_initialized", False): + return + + for param in module.parameters(recurse=True): + if param.device.type == "meta": + return + + # If not on a meta device, call the original weight initialization + self._init_weights(module) + module._is_hf_initialized = True + + def _init_weights_patched(self, module): + """ + Patched version of _init_weights to match TorchTitan's initialization for Llama-like models. + `self` is a PreTrainedModel instance. + """ + config = self.config + + # check if layer is (resid_dropout): Dropout(p=0.1, inplace=False) + if hasattr(module, "resid_dropout"): + print() + + # Build tuple of classes to check for layer_idx-based init_std calculation + layer_idx_classes = [attention_cls] + if mlp_cls: + layer_idx_classes.append(mlp_cls) + layer_idx_classes = tuple(layer_idx_classes) + + if isinstance(module, layer_idx_classes): + if not hasattr(module, "layer_idx"): + return + layer_idx = module.layer_idx + + if hasattr(config, "depth_init") and config.depth_init: + init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 + else: + init_std = 0.02 / (2 * config.num_hidden_layers) ** 0.5 + + if isinstance(module, attention_cls): + # Initialize weights and biases for q, k, v projections + for proj_name in ["q_proj", "k_proj", "v_proj"]: + proj = getattr(module, proj_name) + nn.init.trunc_normal_(proj.weight, mean=0.0, std=0.02) + if proj.bias is not None: + fan_in, _ = init._calculate_fan_in_and_fan_out(proj.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + init.uniform_(proj.bias, -bound, bound) + + # Handle different names for the output projection layer + o_proj = getattr(module, "o_proj", getattr(module, "dense", None)) + if o_proj is not None: + nn.init.trunc_normal_(o_proj.weight, mean=0.0, std=init_std) + if o_proj.bias is not None: + fan_in, _ = init._calculate_fan_in_and_fan_out(o_proj.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + init.uniform_(o_proj.bias, -bound, bound) + + elif mlp_cls and isinstance(module, mlp_cls): + # Handle different names for MLP layers + gate_proj = getattr(module, "gate_proj", getattr(module, "fc1", None)) + up_proj = getattr(module, "up_proj", None) + down_proj = getattr(module, "down_proj", getattr(module, "fc2", None)) + + # gate_proj (or fc1) should always use std=0.02 for numerical stability. + if gate_proj is not None: + nn.init.trunc_normal_(gate_proj.weight, mean=0.0, std=0.02) + if gate_proj.bias is not None: + fan_in, _ = init._calculate_fan_in_and_fan_out(gate_proj.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + init.uniform_(gate_proj.bias, -bound, bound) + # up_proj and down_proj (or fc2) use the depth-dependent init_std. 
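+            # (with depth_init, init_std = 0.02 / sqrt(2 * (layer_idx + 1)), as
+            # computed at the top of this function)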
+ if up_proj is not None: + nn.init.trunc_normal_(up_proj.weight, mean=0.0, std=init_std) + if up_proj.bias is not None: + fan_in, _ = init._calculate_fan_in_and_fan_out(up_proj.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + init.uniform_(up_proj.bias, -bound, bound) + if down_proj is not None: + nn.init.trunc_normal_(down_proj.weight, mean=0.0, std=init_std) + if down_proj.bias is not None: + fan_in, _ = init._calculate_fan_in_and_fan_out(down_proj.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + init.uniform_(down_proj.bias, -bound, bound) + + elif module is getattr( + self, "lm_head", None + ): # TODO(3outeille): find a better way to detect lm_head + final_out_std = config.hidden_size**-0.5 + cutoff_factor = 3 + nn.init.trunc_normal_( + module.weight, + mean=0.0, + std=final_out_std, + a=-cutoff_factor * final_out_std, + b=cutoff_factor * final_out_std, + ) + if module.bias is not None: + module.bias.data.zero_() + + elif isinstance(module, nn.Embedding): + std = config.initializer_range + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + elif ( + isinstance( + module, (nn.GroupNorm, nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d) + ) + or "LayerNorm" in module.__class__.__name__ + or "RMSNorm" in module.__class__.__name__ + ): + # Norms can exist without weights (in which case they are None from torch primitives) + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(1.0) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.zero_() + + decoder_layer_cls.__init__ = _decoder_layer_init_patched + PreTrainedModel._init_weights = _init_weights_patched + PreTrainedModel._initialize_weights = _initialize_weights_patched diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 7bc444f1eb..db1880b1dc 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -16,8 +16,7 @@ from transformers.configuration_utils import PretrainedConfig from transformers.modeling_utils import AttentionInterface from transformers.integrations.sdpa_attention import sdpa_attention_forward -from torchtitan.experiments.transformers_backend.model.hf_llama_patch import patch_hf_llama -from torchtitan.experiments.transformers_backend.model.hf_deepseek_v3_patch import patch_hf_deepseek_v3 +from torchtitan.experiments.transformers_backend.model.hf_llama_like_patch import patch_hf_llama_like @dataclass class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): @@ -55,6 +54,7 @@ def __init__( attn_implementation: str = "sdpa_torchtitan", **kwargs, ): + super().__init__(attn_implementation=attn_implementation, **kwargs) assert titan_args is not None, "titan_args is required" active_mappings = {} @@ -68,6 +68,11 @@ def __init__( self._create_dynamic_properties() + # Set HF attributes from titan_args based on mappings + for titan_name, hf_name in self._active_mappings.items(): + if hasattr(titan_args, titan_name): + setattr(self, hf_name, getattr(titan_args, titan_name)) + # Fill all TorchTitan-specific args (no HF equivalent) self.multiple_of = titan_args.multiple_of self.ffn_dim_multiplier = titan_args.ffn_dim_multiplier @@ -95,6 +100,7 @@ def __init__( self._passed_args.update(**deepseek_v3_args.__dict__) + self.rope_interleave = 
deepseek_v3_args.rope_interleave self.partial_rotary_factor = deepseek_v3_args.partial_rotary_factor if deepseek_v3_args.moe_args is not None: @@ -132,7 +138,7 @@ def __repr__(self) -> str: # doesn't work well with how HFTransformerModelArgs is initialized. # This custom __repr__ provides a dataclass-like representation that correctly # displays the arguments passed during initialization. - args_lines = [f"{k}={v!r}" for k, v in sorted(self._passed_args.items())] + args_lines = [f"{k}={getattr(self, k)!r}" for k in sorted(self._passed_args.keys())] args_str = "\n".join(args_lines) return f"{self.__class__.__name__}(\n{args_str}\n)" @@ -141,15 +147,24 @@ def update_from_config(self, job_config: JobConfig): hf_model_config = AutoConfig.from_pretrained( job_config.model.name, attn_implementation=self.attn_implementation, + trust_remote_code=True ) - self.__dict__.update(hf_model_config.__dict__) - + # Explicitly update attributes based on mappings + for titan_name, hf_name in self._active_mappings.items(): + if hasattr(hf_model_config, hf_name): + setattr(self, titan_name, getattr(hf_model_config, hf_name)) + + # Copy any other attributes that might not be in the mapping + # This is safer than a direct __dict__ update + for key, value in hf_model_config.to_dict().items(): + setattr(self, key, value) + # Update our attributes with the passed args from flavors for key, value in self._passed_args.items(): - if hasattr(self, key): + if hasattr(self, key) and value is not None: setattr(self, key, value) - + # MoE if hasattr(self, "qk_nope_head_dim") and hasattr(self, "qk_rope_head_dim"): self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim @@ -265,12 +280,40 @@ def __init__(self, model_args: HFTransformerModelArgs): f"Make sure the class is available. Original error: {e}" ) - if model_args.architectures[0] == "DeepseekV3Model": - print("Patching deepseek") - patch_hf_deepseek_v3() - else: - print("Patching llama") - patch_hf_llama() + # agnostic weight initialization patching + try: + model_name_prefix = model_class_name.replace("ForCausalLM", "") + model_module = importlib.import_module(model_cls.__module__) + + attention_cls = getattr(model_module, f"{model_name_prefix}Attention", None) + mlp_cls = getattr(model_module, f"{model_name_prefix}MLP", None) + decoder_layer_cls = getattr(model_module, f"{model_name_prefix}DecoderLayer", None) + + if all([attention_cls, decoder_layer_cls]): + logger.info(f"Applying Llama-like patch for {model_name_prefix}") + patch_hf_llama_like( + decoder_layer_cls=decoder_layer_cls, + attention_cls=attention_cls, + mlp_cls=mlp_cls, # mlp_cls can be None + ) + else: + missing = [ + cls_name + for cls, cls_name in [ + (attention_cls, "Attention"), + (decoder_layer_cls, "DecoderLayer"), + ] + if not cls + ] + logger.warning( + f"Could not find required classes ({', '.join(missing)}) for {model_name_prefix}. " + "Skipping Llama-like patch." + ) + except Exception as e: + logger.warning( + f"Failed to apply agnostic patch for {model_class_name} due to: {e}. " + "Weight initialization might not match TorchTitan." 
+ ) self.model = model_cls(config=model_args) self.max_seq_len = model_args.max_seq_len @@ -330,6 +373,8 @@ def norm(self): """Returns the model's norm, handling different Hugging Face model structures.""" if hasattr(self.model, "model") and hasattr(self.model.model, "norm"): # Llama-like return self.model.model.norm + elif hasattr(self.model, "model") and hasattr(self.model.model, "final_layernorm"): # Phi-like + return self.model.model.final_layernorm else: raise AttributeError("Could not find norm in the model. Please check the model structure.") @@ -337,6 +382,8 @@ def norm(self): def norm(self, value): if hasattr(self.model, "model") and hasattr(self.model.model, "norm"): # Llama-like setattr(self.model.model, "norm", value) + elif hasattr(self.model, "model") and hasattr(self.model.model, "final_layernorm"): # Phi-like + setattr(self.model.model, "final_layernorm", value) else: raise AttributeError("Could not find norm in the model. Please check the model structure.") diff --git a/torchtitan/experiments/transformers_backend/test_hf_integration.py b/torchtitan/experiments/transformers_backend/test_hf_integration.py index d886549ff0..4838133618 100644 --- a/torchtitan/experiments/transformers_backend/test_hf_integration.py +++ b/torchtitan/experiments/transformers_backend/test_hf_integration.py @@ -11,8 +11,8 @@ from rich.table import Table from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn -BASELINE = "fsdp2_tp1_cp1_pp1" -# BASELINE = "fsdp1_tp1_cp1_pp1" +# BASELINE = "fsdp2_tp1_cp1_pp1" +BASELINE = "fsdp1_tp1_cp1_pp1" console = Console() @@ -151,28 +151,32 @@ def create_configs(model_name: str, out_dir: str, flavor: str): config["model"]["name"] = model_name config["model"]["flavor"] = flavor - parallelism_configs = [ - BASELINE, # baseline - "fsdp2_tp2_cp1_pp1", - "fsdp2_tp1_cp1_pp2", - "fsdp2_tp1_cp2_pp1", - "fsdp2_tp1_cp2_pp2", - "fsdp2_tp2_cp2_pp1", - "fsdp2_tp2_cp1_pp2", - "fsdp2_tp2_cp2_pp2", - ] + # parallelism_configs = [ + # BASELINE, # baseline + # "fsdp2_tp2_cp1_pp1", + # # "fsdp2_tp1_cp1_pp2", + # # "fsdp2_tp1_cp2_pp1", + # # "fsdp2_tp1_cp2_pp2", + # # "fsdp2_tp2_cp2_pp1", + # # "fsdp2_tp2_cp1_pp2", + # # "fsdp2_tp2_cp2_pp2", + # ] # parallelism_configs = [ # BASELINE, # baseline - # "fsdp1_tp2_cp1_pp1", - # "fsdp1_tp1_cp1_pp2", - # "fsdp1_tp1_cp2_pp1", - # "fsdp1_tp1_cp2_pp2", - # "fsdp1_tp2_cp2_pp1", - # "fsdp1_tp2_cp1_pp2", - # "fsdp1_tp2_cp2_pp2", + # # "fsdp1_tp2_cp1_pp1", + # # "fsdp1_tp1_cp1_pp2", + # # "fsdp1_tp1_cp2_pp1", + # # "fsdp1_tp1_cp2_pp2", + # # "fsdp1_tp2_cp2_pp1", + # # "fsdp1_tp2_cp1_pp2", + # # "fsdp1_tp2_cp2_pp2", # ] + parallelism_configs = [ + BASELINE, # baseline + "fsdp1_tp2_cp1_pp1", + ] out_path = Path(out_dir) / model_name / flavor out_path.mkdir(parents=True, exist_ok=True) @@ -219,6 +223,11 @@ def create_configs(model_name: str, out_dir: str, flavor: str): iter_config["parallelism"]["pipeline_parallel_degree"] = pp iter_config["parallelism"]["pipeline_parallel_schedule"] = "GPipe" iter_config["job"]["dump_folder"] = str(pc_dir) + + # if pc == "fsdp1_tp1_cp1_pp2" or pc == BASELINE: + # iter_config["training"]["global_batch_size"] = 1 + # iter_config["training"]["local_batch_size"] = 1 + if pc == BASELINE or pc == "fsdp2_tp1_cp1_pp2": iter_config["training"]["local_batch_size"] = 2 @@ -379,7 +388,7 @@ def check_status(inp_dir: str): print("=" * 30) -def report(inp_dir: str): +def report(inp_dir: str, only: str = None): """ Generate diff reports between baseline (fsdp2_tp1_cp1_pp1) and all other 
parallelism configs. Creates diff_baseline_vs_nd_parallelism.log in each non-baseline config directory. @@ -673,9 +682,20 @@ def _process_flavor_dir(flavor_dir: Path) -> tuple[int, int]: if BASELINE in dirs: flavor_dirs.append(Path(root)) + # Filter by --only if provided + if only: + original_count = len(flavor_dirs) + flavor_dirs = [ + d for d in flavor_dirs if only in str(d.relative_to(inp_path)) + ] + log_message( + LogLevel.INFO, + f"Filtered from {original_count} to {len(flavor_dirs)} director{'ies' if len(flavor_dirs) != 1 else 'y'} matching '[bold]{only}[/bold]'", + ) + if not flavor_dirs: log_message(LogLevel.ERROR, f"No directories with baseline configuration found under {inp_path}") - console.print("[yellow]Expected to find directories containing 'fsdp2_tp1_cp1_pp1' subdirectory[/yellow]") + console.print("[yellow]Expected to find directories containing 'fsdp2_tp1_cp1' subdirectory[/yellow]") return log_message(LogLevel.INFO, f"Found {len(flavor_dirs)} model/flavor combination(s) to process:") @@ -737,6 +757,7 @@ def _process_flavor_dir(flavor_dir: Path) -> tuple[int, int]: report_parser = subparsers.add_parser("report") report_parser.add_argument("--inp_dir", type=str, required=True) + report_parser.add_argument("--only", type=str, default=None) check_status_parser = subparsers.add_parser("check_status") check_status_parser.add_argument("--inp_dir", type=str, required=True) @@ -748,6 +769,6 @@ def _process_flavor_dir(flavor_dir: Path) -> tuple[int, int]: elif args.action == "submit_jobs": submit_jobs(args.inp_dir, args.qos, args.only) elif args.action == "report": - report(args.inp_dir) + report(args.inp_dir, args.only) elif args.action == "check_status": check_status(args.inp_dir) \ No newline at end of file diff --git a/torchtitan/utils/test_utils.py b/torchtitan/utils/test_utils.py index 77db8bcfe6..efb8ac478d 100644 --- a/torchtitan/utils/test_utils.py +++ b/torchtitan/utils/test_utils.py @@ -42,8 +42,11 @@ def seeded_trunc_normal(*trunc_args, **trunc_kwargs): result = original_trunc_normal(*trunc_args, **trunc_kwargs) return result - nn.init.trunc_normal_ = seeded_trunc_normal - return func(*args, **kwargs) + try: + nn.init.trunc_normal_ = seeded_trunc_normal + return func(*args, **kwargs) + finally: + nn.init.trunc_normal_ = original_trunc_normal return wrapper return decorator From 3168f9e2785cc5589226059ea2dc01481577641c Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 15 Oct 2025 08:50:13 +0000 Subject: [PATCH 063/129] support moe init and fix with moe layer (TP for lora layers) --- .../infra/parallelize_hf_transformers.py | 32 ++++- .../model/hf_moe_like_patch.py | 135 ++++++++++++++++++ .../model/hf_transformers_args.py | 74 +++++++--- 3 files changed, 215 insertions(+), 26 deletions(-) create mode 100644 torchtitan/experiments/transformers_backend/model/hf_moe_like_patch.py diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py index 16e33251ae..422b307cd4 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -370,18 +370,44 @@ def apply_non_moe_tp( # Apply tensor + sequence parallelism to every transformer block for transformer_block in model.layers: + is_deepseek_v3 = "deepseek_v3" in transformer_block.self_attn.__class__.__module__ layer_plan = { "input_layernorm": SequenceParallel(), "self_attn": 
prepare_module_input( input_kwarg_layouts={"hidden_states": Shard(1)}, desired_input_kwarg_layouts={"hidden_states": Replicate()}, ), - "self_attn.q_proj": colwise_parallel(), - "self_attn.k_proj": colwise_parallel(), - "self_attn.v_proj": colwise_parallel(), "post_attention_layernorm": SequenceParallel(), } + if is_deepseek_v3: + if getattr(transformer_block.self_attn, "q_lora_rank", None) is None: + layer_plan["self_attn.q_proj"] = colwise_parallel() + else: + layer_plan.update({ + "self_attn.q_a_proj": NoParallel(), + "self_attn.q_a_layernorm": NoParallel(), + "self_attn.q_b_proj": colwise_parallel(), + }) + + if getattr(transformer_block.self_attn, "kv_lora_rank", None) is None: + layer_plan.update({ + "self_attn.k_proj": colwise_parallel(), + "self_attn.v_proj": colwise_parallel(), + }) + else: + layer_plan.update({ + "self_attn.kv_a_proj_with_mqa": NoParallel(), + "self_attn.kv_a_layernorm": NoParallel(), + "self_attn.kv_b_proj": colwise_parallel(), + }) + else: + layer_plan.update({ + "self_attn.q_proj": colwise_parallel(), + "self_attn.k_proj": colwise_parallel(), + "self_attn.v_proj": colwise_parallel(), + }) + # Handle different names for the output projection layer, e.g. o_proj vs dense o_proj_name = "o_proj" if hasattr(transformer_block.self_attn, "o_proj") else "dense" layer_plan[f"self_attn.{o_proj_name}"] = rowwise_parallel(output_layouts=Shard(1)) diff --git a/torchtitan/experiments/transformers_backend/model/hf_moe_like_patch.py b/torchtitan/experiments/transformers_backend/model/hf_moe_like_patch.py new file mode 100644 index 0000000000..dc18e0b455 --- /dev/null +++ b/torchtitan/experiments/transformers_backend/model/hf_moe_like_patch.py @@ -0,0 +1,135 @@ +import torch.nn as nn +from transformers.configuration_utils import PretrainedConfig +from transformers.modeling_utils import PreTrainedModel + + +def patch_hf_moe_like(decoder_layer_cls, attention_cls, mlp_cls, moe_cls): + """ + This patch modifies a Hugging Face MoE (Mixture-of-Experts) model's weight + initialization to match the initialization scheme used in TorchTitan, + drawing from patterns in models like DeepseekV3. + + The patch targets: + - `PreTrainedModel._initialize_weights`: For correct meta device initialization. + - `PreTrainedModel._init_weights`: To implement TorchTitan's specific initialization + for attention, MLP, MoE, embedding, and layer norm layers. + - `DecoderLayer.__init__`: Adds `layer_idx` to attention, MLP, and MoE expert + modules, required for depth-dependent initialization. + """ + + _original_decoder_layer_init = decoder_layer_cls.__init__ + + def _decoder_layer_init_patched(self, config: PretrainedConfig, layer_idx: int): + _original_decoder_layer_init(self, config, layer_idx) + self.layer_idx = layer_idx + + if hasattr(self, "self_attn"): + self.self_attn.layer_idx = layer_idx + + if hasattr(self, "mlp"): + self.mlp.layer_idx = layer_idx + if hasattr(self.mlp, "experts"): + for expert in self.mlp.experts: + expert.layer_idx = layer_idx + if hasattr(self.mlp, "shared_experts"): + # Not all MoE models have shared experts + if self.mlp.shared_experts is not None: + self.mlp.shared_experts.layer_idx = layer_idx + + def _initialize_weights_patched(self, module): + if getattr(module, "_is_hf_initialized", False): + return + for param in module.parameters(recurse=True): + if param.device.type == "meta": + return + self._init_weights(module) + module._is_hf_initialized = True + + def _init_weights_patched(self, module): + """ + Patched version of _init_weights for MoE models. 
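+        Covers attention projections (including the q/kv LoRA variants),
+        dense MLPs, MoE gates/experts/shared experts, lm_head, embeddings,
+        and norm layers; see the branches below.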
+ """ + config = self.config + init_std = None + + if isinstance(module, (attention_cls, mlp_cls, moe_cls)): + if hasattr(module, "layer_idx"): + layer_idx = module.layer_idx + if hasattr(config, "depth_init") and config.depth_init: + init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 + else: + # Fallback for models without depth_init + init_std = 0.02 / (2 * config.num_hidden_layers) ** 0.5 + + if isinstance(module, attention_cls): + # Handle different attention projection layer names by initializing if they exist + if hasattr(module, "q_proj"): + nn.init.trunc_normal_(module.q_proj.weight, mean=0.0, std=0.02) + if hasattr(module, "k_proj"): + nn.init.trunc_normal_(module.k_proj.weight, mean=0.0, std=0.02) + if hasattr(module, "v_proj"): + nn.init.trunc_normal_(module.v_proj.weight, mean=0.0, std=0.02) + + if hasattr(module, "q_a_proj"): + nn.init.trunc_normal_(module.q_a_proj.weight, mean=0.0, std=0.02) + if hasattr(module, "q_b_proj"): + nn.init.trunc_normal_(module.q_b_proj.weight, mean=0.0, std=0.02) + + if hasattr(module, "kv_a_proj_with_mqa"): + nn.init.trunc_normal_(module.kv_a_proj_with_mqa.weight, mean=0.0, std=0.02) + if hasattr(module, "kv_b_proj"): + nn.init.trunc_normal_(module.kv_b_proj.weight, mean=0.0, std=0.02) + + if hasattr(module, "o_proj") and init_std is not None: + nn.init.trunc_normal_(module.o_proj.weight, mean=0.0, std=init_std) + + elif isinstance(module, mlp_cls): + nn.init.trunc_normal_(module.gate_proj.weight, mean=0.0, std=0.02) + # DeepseekV3 uses std=0.02 for up_proj, unlike Llama + nn.init.trunc_normal_(module.up_proj.weight, mean=0.0, std=0.02) + if init_std is not None: + nn.init.trunc_normal_(module.down_proj.weight, mean=0.0, std=init_std) + + elif isinstance(module, moe_cls): + if hasattr(module, "gate") and init_std is not None: + nn.init.trunc_normal_(module.gate.weight, mean=0.0, std=init_std) + if hasattr(module, "experts"): + for expert in module.experts: + nn.init.trunc_normal_(expert.gate_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(expert.up_proj.weight, mean=0.0, std=0.02) + if init_std is not None: + nn.init.trunc_normal_(expert.down_proj.weight, mean=0.0, std=init_std) + if hasattr(module, "shared_experts") and module.shared_experts is not None: + nn.init.trunc_normal_(module.shared_experts.gate_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(module.shared_experts.up_proj.weight, mean=0.0, std=0.02) + if init_std is not None: + nn.init.trunc_normal_(module.shared_experts.down_proj.weight, mean=0.0, std=init_std) + + elif module is getattr(self, "lm_head", None): + final_out_std = config.hidden_size**-0.5 + cutoff_factor = 3 + nn.init.trunc_normal_( + module.weight, + mean=0.0, + std=final_out_std, + a=-cutoff_factor * final_out_std, + b=cutoff_factor * final_out_std, + ) + if module.bias is not None: + module.bias.data.zero_() + + elif isinstance(module, nn.Embedding): + std = config.initializer_range + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + elif "LayerNorm" in module.__class__.__name__ or "RMSNorm" in module.__class__.__name__: + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(1.0) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.zero_() + + decoder_layer_cls.__init__ = _decoder_layer_init_patched + PreTrainedModel._init_weights = _init_weights_patched + PreTrainedModel._initialize_weights = _initialize_weights_patched diff --git 
a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index db1880b1dc..4bc65aa0d2 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -17,6 +17,7 @@ from transformers.modeling_utils import AttentionInterface from transformers.integrations.sdpa_attention import sdpa_attention_forward from torchtitan.experiments.transformers_backend.model.hf_llama_like_patch import patch_hf_llama_like +from torchtitan.experiments.transformers_backend.model.hf_moe_like_patch import patch_hf_moe_like @dataclass class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): @@ -138,7 +139,11 @@ def __repr__(self) -> str: # doesn't work well with how HFTransformerModelArgs is initialized. # This custom __repr__ provides a dataclass-like representation that correctly # displays the arguments passed during initialization. - args_lines = [f"{k}={getattr(self, k)!r}" for k in sorted(self._passed_args.keys())] + args_lines = [ + f"{k}={getattr(self, k)!r}" + for k in sorted(self._passed_args.keys()) + if hasattr(self, k) + ] args_str = "\n".join(args_lines) return f"{self.__class__.__name__}(\n{args_str}\n)" @@ -156,7 +161,6 @@ def update_from_config(self, job_config: JobConfig): setattr(self, titan_name, getattr(hf_model_config, hf_name)) # Copy any other attributes that might not be in the mapping - # This is safer than a direct __dict__ update for key, value in hf_model_config.to_dict().items(): setattr(self, key, value) @@ -191,7 +195,7 @@ def update_from_config(self, job_config: JobConfig): def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: # Check if this is a MoE model by looking for MoE attributes - is_moe = hasattr(self, 'n_routed_experts') and hasattr(self, 'num_experts_per_tok') + is_moe = hasattr(self, 'n_routed_experts') if is_moe: # MoE parameter counting (adapted from DeepSeek V3 implementation) @@ -280,7 +284,7 @@ def __init__(self, model_args: HFTransformerModelArgs): f"Make sure the class is available. 
Original error: {e}" ) - # agnostic weight initialization patching + # Attempt to patch model weight initialization based on architecture type try: model_name_prefix = model_class_name.replace("ForCausalLM", "") model_module = importlib.import_module(model_cls.__module__) @@ -289,26 +293,50 @@ def __init__(self, model_args: HFTransformerModelArgs): mlp_cls = getattr(model_module, f"{model_name_prefix}MLP", None) decoder_layer_cls = getattr(model_module, f"{model_name_prefix}DecoderLayer", None) - if all([attention_cls, decoder_layer_cls]): - logger.info(f"Applying Llama-like patch for {model_name_prefix}") - patch_hf_llama_like( - decoder_layer_cls=decoder_layer_cls, - attention_cls=attention_cls, - mlp_cls=mlp_cls, # mlp_cls can be None - ) + is_moe = hasattr(model_args, "n_routed_experts") #TODO(3outeille): check if this is the most reliable to detect a moe model + if is_moe: + moe_cls = getattr(model_module, f"{model_name_prefix}MoE", None) + required_classes = { + "Attention": attention_cls, + "MLP": mlp_cls, + "DecoderLayer": decoder_layer_cls, + "MoE": moe_cls + } + + if all(required_classes.values()): + logger.info(f"Applying MoE-like patch for {model_name_prefix}") + patch_hf_moe_like( + decoder_layer_cls=decoder_layer_cls, + attention_cls=attention_cls, + mlp_cls=mlp_cls, + moe_cls=moe_cls + ) + else: + missing = [name for name, cls in required_classes.items() if not cls] + logger.warning( + f"Could not find required classes ({', '.join(missing)}) for MoE patching of {model_name_prefix}. " + "Skipping MoE-like patch." + ) else: - missing = [ - cls_name - for cls, cls_name in [ - (attention_cls, "Attention"), - (decoder_layer_cls, "DecoderLayer"), - ] - if not cls - ] - logger.warning( - f"Could not find required classes ({', '.join(missing)}) for {model_name_prefix}. " - "Skipping Llama-like patch." - ) + required_classes = { + "Attention": attention_cls, + "DecoderLayer": decoder_layer_cls + } + + if all(required_classes.values()): + logger.info(f"Applying Llama-like patch for {model_name_prefix}") + patch_hf_llama_like( + decoder_layer_cls=decoder_layer_cls, + attention_cls=attention_cls, + mlp_cls=mlp_cls # mlp_cls can be None + ) + else: + missing = [name for name, cls in required_classes.items() if not cls] + logger.warning( + f"Could not find required classes ({', '.join(missing)}) for {model_name_prefix}. " + "Skipping Llama-like patch." + ) + except Exception as e: logger.warning( f"Failed to apply agnostic patch for {model_class_name} due to: {e}. 
" From a9a65b7b95188cab173056271f03aa6a70fa9d8a Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 15 Oct 2025 13:17:13 +0000 Subject: [PATCH 064/129] begin TP + EP with MoE model --- .../infra/parallelize_hf_transformers.py | 119 +++++++++--------- 1 file changed, 63 insertions(+), 56 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py index 422b307cd4..1bfe6ab779 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -370,7 +370,6 @@ def apply_non_moe_tp( # Apply tensor + sequence parallelism to every transformer block for transformer_block in model.layers: - is_deepseek_v3 = "deepseek_v3" in transformer_block.self_attn.__class__.__module__ layer_plan = { "input_layernorm": SequenceParallel(), "self_attn": prepare_module_input( @@ -380,37 +379,32 @@ def apply_non_moe_tp( "post_attention_layernorm": SequenceParallel(), } - if is_deepseek_v3: - if getattr(transformer_block.self_attn, "q_lora_rank", None) is None: - layer_plan["self_attn.q_proj"] = colwise_parallel() - else: - layer_plan.update({ - "self_attn.q_a_proj": NoParallel(), - "self_attn.q_a_layernorm": NoParallel(), - "self_attn.q_b_proj": colwise_parallel(), - }) - - if getattr(transformer_block.self_attn, "kv_lora_rank", None) is None: - layer_plan.update({ - "self_attn.k_proj": colwise_parallel(), - "self_attn.v_proj": colwise_parallel(), - }) - else: - layer_plan.update({ - "self_attn.kv_a_proj_with_mqa": NoParallel(), - "self_attn.kv_a_layernorm": NoParallel(), - "self_attn.kv_b_proj": colwise_parallel(), - }) - else: + if getattr(transformer_block.self_attn, "q_lora_rank", None) is None: layer_plan.update({ "self_attn.q_proj": colwise_parallel(), "self_attn.k_proj": colwise_parallel(), "self_attn.v_proj": colwise_parallel(), }) + else: + layer_plan.update({ + "self_attn.q_a_proj": NoParallel(), + "self_attn.q_a_layernorm": NoParallel(), + "self_attn.q_b_proj": colwise_parallel(), + "self_attn.kv_a_proj_with_mqa": NoParallel(), + "self_attn.kv_a_layernorm": NoParallel(), + "self_attn.kv_b_proj": colwise_parallel(), + }) # Handle different names for the output projection layer, e.g. o_proj vs dense o_proj_name = "o_proj" if hasattr(transformer_block.self_attn, "o_proj") else "dense" layer_plan[f"self_attn.{o_proj_name}"] = rowwise_parallel(output_layouts=Shard(1)) + + # For Qwen3 RMSNorm on Q and K + # TODO(3outeille): we should probably shard(1) then replicate => then use SequenceParallel but for now I am fed up + if hasattr(transformer_block.self_attn, "q_norm"): + layer_plan["self_attn.q_norm"] = NoParallel() + if hasattr(transformer_block.self_attn, "k_norm"): + layer_plan["self_attn.k_norm"] = NoParallel() if not transformer_block.moe_enabled: mlp_plan = { @@ -508,7 +502,7 @@ def apply_fsdp( if hasattr(transformer_block, "moe_enabled") and transformer_block.moe_enabled and ep_degree > 1: fsdp_mod_ep_config = fsdp_config.copy() fsdp_mod_ep_config["mesh"] = dp_mod_ep_mesh - + moe_block = transformer_block.mlp # NOTE: EP alreadys shards the routed experts on dim 0 (num_experts). # When dp_mod_ep * ep > num_experts, FSDP default dim-0 sharding # causes inefficiency, so we choose to do FSDP sharding on dim-1. @@ -517,15 +511,14 @@ def apply_fsdp( # shard_placement_fn on the outer TransformerBlock-level FSDP. 
_experts_shard_placement_fn = None assert dp_mod_ep_mesh is not None - assert hasattr(transformer_block, "moe") if ( dp_mod_ep_mesh.size() * ep_degree - > transformer_block.moe.experts.num_experts + > moe_block.experts.num_experts ): _experts_shard_placement_fn = lambda param: Shard(1) fully_shard( - transformer_block.moe.experts, + moe_block.experts, **fsdp_mod_ep_config, reshard_after_forward=reshard_after_forward, shard_placement_fn=_experts_shard_placement_fn, @@ -534,7 +527,7 @@ def apply_fsdp( # NOTE: # Although the FSDP sharding of experts is done on a mesh of # a different size than other parameters, the gradient division # factor should be consistent with data. - transformer_block.moe.experts.set_gradient_divide_factor( + moe_block.experts.set_gradient_divide_factor( gradient_divide_factor, ) @@ -573,7 +566,7 @@ def apply_fsdp( if next_transformer_block is not None: if next_transformer_block.moe_enabled: transformer_block.set_modules_to_forward_prefetch( - [next_transformer_block, next_transformer_block.moe.experts] + [next_transformer_block, next_transformer_block.mlp.experts] ) else: transformer_block.set_modules_to_forward_prefetch( @@ -597,7 +590,7 @@ def apply_fsdp( if prev_transformer_block is not None: if prev_transformer_block.moe_enabled: transformer_block.set_modules_to_backward_prefetch( - [prev_transformer_block, prev_transformer_block.moe.experts] + [prev_transformer_block, prev_transformer_block.mlp.experts] ) else: transformer_block.set_modules_to_backward_prefetch( @@ -618,11 +611,12 @@ def apply_moe_ep_tp( if not transformer_block.moe_enabled: continue + moe_block = transformer_block.mlp if tp_mesh is not None: moe_layer_plan = { # input / output sharding on the seqlen dim # all-gather for input, reduce-scatter for output - "moe": PrepareModuleInputOutput( + "mlp": PrepareModuleInputOutput( input_layouts=(Shard(1),), desired_input_layouts=(Replicate(),), use_local_input=True, @@ -630,22 +624,22 @@ def apply_moe_ep_tp( desired_output_layouts=(Shard(1),), ), # replicate computation for the router - "moe.router.gate": NoParallel(), + "mlp.gate": NoParallel(), } if ep_mesh is not None and not etp_enabled: # If TP is borrowed for EP, then split the tokens across TP ranks so that # the reorderer, the all-to-all comms, and routed experts computation # are effectively running Sequence Parallel (split along the folded bs*slen dim) - moe_layer_plan.update({"moe.reorderer": ReordererSequenceParallel()}) - if transformer_block.moe.shared_experts is not None: + moe_layer_plan.update({"mlp.reorderer": ReordererSequenceParallel()}) + if moe_block.shared_experts is not None: # input Replicate, output Partial moe_layer_plan.update( { - "moe.shared_experts.w1": ColwiseParallel(), - "moe.shared_experts.w2": RowwiseParallel( + "mlp.shared_experts.gate_proj": ColwiseParallel(), + "mlp.shared_experts.up_proj": ColwiseParallel(), + "mlp.shared_experts.down_proj": RowwiseParallel( output_layouts=Partial() ), - "moe.shared_experts.w3": ColwiseParallel(), } ) parallelize_module( @@ -654,27 +648,40 @@ def apply_moe_ep_tp( parallelize_plan=moe_layer_plan, ) - experts_mesh, experts_plan = None, None - if ep_mesh is None: + if ep_mesh is None: # This is the TP-only case for experts experts_mesh = tp_mesh - # input Replicate, output Partial - experts_plan = TensorParallel() - elif tp_mesh is None: - experts_mesh = ep_mesh - # input / output sharding on the batch / tokens dim - experts_plan = ExpertParallel() - elif etp_enabled: - experts_mesh = ep_tp_mesh - experts_plan = 
ExpertTensorParallel(tp_mesh=tp_mesh, ep_mesh=ep_mesh) - else: - experts_mesh = ep_mesh - experts_plan = ExpertParallel() + expert_tp_plan = {} + for i in range(len(moe_block.experts)): + expert_tp_plan.update( + { + f"{i}.gate_proj": ColwiseParallel(), + f"{i}.up_proj": ColwiseParallel(), + f"{i}.down_proj": RowwiseParallel(output_layouts=Partial()), + } + ) + parallelize_module( + module=moe_block.experts, + device_mesh=experts_mesh, + parallelize_plan=expert_tp_plan, + ) + else: # EP or ETP enabled + experts_mesh, experts_plan = None, None + if tp_mesh is None: + experts_mesh = ep_mesh + # input / output sharding on the batch / tokens dim + experts_plan = ExpertParallel() + elif etp_enabled: + experts_mesh = ep_tp_mesh + experts_plan = ExpertTensorParallel(tp_mesh=tp_mesh, ep_mesh=ep_mesh) + else: + experts_mesh = ep_mesh + experts_plan = ExpertParallel() - parallelize_module( - module=transformer_block.moe.experts, - device_mesh=experts_mesh, - parallelize_plan=experts_plan, - ) + parallelize_module( + module=moe_block.experts, + device_mesh=experts_mesh, + parallelize_plan=experts_plan, + ) def apply_compile(model: nn.Module): From b4a1b8882da64a81e49f0d1067619c3babb8eb62 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 15 Oct 2025 13:22:34 +0000 Subject: [PATCH 065/129] cleaning --- .../transformers_backend/__init__.py | 48 +- .../compare_distributed_run.py | 1103 ----------------- .../compare_distributed_run.sh | 8 - .../model/hf_deepseek_v3_patch.py | 113 -- .../model/hf_llama_like_patch.py | 4 - .../model/hf_llama_patch.py | 89 -- torchtitan/models/attention.py | 4 +- torchtitan/models/deepseek_v3/model/model.py | 6 +- torchtitan/models/moe.py | 7 +- 9 files changed, 28 insertions(+), 1354 deletions(-) delete mode 100644 torchtitan/experiments/transformers_backend/compare_distributed_run.py delete mode 100755 torchtitan/experiments/transformers_backend/compare_distributed_run.sh delete mode 100644 torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py delete mode 100644 torchtitan/experiments/transformers_backend/model/hf_llama_patch.py diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index c29b3a5aa1..fb21837a6b 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -75,35 +75,35 @@ class DeepSeekV3Args: flavors = { "debugmodel": HFTransformerModelArgs( titan_args=TitanModelArgs( - vocab_size=51200, dim=256, - n_layers=1, + n_layers=6, n_heads=16, n_kv_heads=16, ), pad_token_id=None, - # deepseek_v3_args=DeepSeekV3Args( - # partial_rotary_factor=4.0, - # inter_dim=1024, - # moe_inter_dim=256, - # n_dense_layers=1, - # n_group=2, - # topk_group=1, - # kv_lora_rank=512, - # q_lora_rank=0, - # qk_nope_head_dim=128, - # qk_rope_head_dim=64, - # v_head_dim=128, - # mscale=0.70, - # moe_args=MoEArgs( - # num_experts=8, - # num_shared_experts=2, - # top_k=3, - # score_func="softmax", - # route_norm=True, - # score_before_experts=False, - # ), - # ) + #TODO(3outeille): use os.environ to switch between models + deepseek_v3_args=DeepSeekV3Args( + partial_rotary_factor=4.0, + inter_dim=1024, + moe_inter_dim=256, + n_dense_layers=1, + n_group=2, + topk_group=1, + kv_lora_rank=512, + q_lora_rank=0, + qk_nope_head_dim=128, + qk_rope_head_dim=64, + v_head_dim=128, + mscale=0.70, + moe_args=MoEArgs( + num_experts=8, + num_shared_experts=2, + top_k=3, + score_func="softmax", + route_norm=True, + 
score_before_experts=False, + ) + ) if os.environ.get("USE_MOE", "0") == "1" else None, ), "medium": HFTransformerModelArgs( titan_args=TitanModelArgs( diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.py b/torchtitan/experiments/transformers_backend/compare_distributed_run.py deleted file mode 100644 index b42e8b0138..0000000000 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.py +++ /dev/null @@ -1,1103 +0,0 @@ -""" -python compare_distributed_run.py --steps 5 --model-filter llama3 --flavor debugmodel --nd_parallel 2d --verbose -python compare_distributed_run.py --steps 5 --model-filter llama3 --flavor flavor --nd_parallel 2d --verbose - -Methodology: - - train on FSDP with TT (baseline) - - train on FSDP with HF (baseline) - - For all parallelism, train with nd-// with HF - - If one train fails: - - generated diff between HF FSDP (baseline) HF nd-// - - train the nd-// TT counterpart - - diff between TT nd-// and HF nd-// - - diff between TT FSDP (baseline) and HF nd-// - - diff between TT FSDP (baseline) and TF nd-// -results/ -|_ meta-llama - |_ Llama-3.2-1B - |_ 2D - |_ debugmodel - |_ baseline_hf_fsdp_4gpu.log - |_ baseline_tt_fsdp_4gpu.log - |_ baseline_fsdp_debugmodel_4gpu_huggingface.toml - |_ baseline_fsdp_debugmodel_4gpu_torchtitan.toml - |_ fsdp1_cp1_tp2_pp2_debugmodel_4gpu/ - |_ fsdp1_cp1_tp2_pp2_debugmodel_4gpu_huggingface.toml - |_ fsdp1_cp1_tp2_pp2_debugmodel_4gpu_torchtitan.toml - |_ fsdp1_cp1_tp2_pp2_debugmodel_4gpu_huggingface.log - |_ diff_hf_baseline_vs_hf_nd_parallelism.log - |_ diff_tt_nd_parallelism_vs_hf_nd_parallelism.log - |_ diff_tt_baseline_vs_hf_nd_parallelism.log - |_ full - |_ baseline_hf_fsdp_4gpu.log - |_ baseline_tt_fsdp_4gpu.log - |_ baseline_fsdp_full_4gpu_huggingface.toml - |_ baseline_fsdp_full_4gpu_torchtitan.toml - |_ fsdp1_cp1_tp2_pp2_full_4gpu/ - |_ fsdp1_cp1_tp2_pp2_full_4gpu_huggingface.toml - |_ fsdp1_cp1_tp2_pp2_full_4gpu_torchtitan.toml - |_ fsdp1_cp1_tp2_pp2_full_4gpu_huggingface.log - |_ diff_hf_baseline_vs_hf_nd_parallelism.log - |_ diff_tt_nd_parallelism_vs_hf_nd_parallelism.log - |_ diff_tt_baseline_vs_hf_nd_parallelism.log - -""" -import argparse -import os -import re -import shutil -import subprocess -import sys -from pathlib import Path -from typing import List, Optional -from dataclasses import dataclass, field -from enum import Enum -import torch -from rich.console import Console -from rich.panel import Panel -from rich.progress import ( - BarColumn, - Progress, - SpinnerColumn, - TextColumn, - TimeElapsedColumn, -) -from rich.table import Table - - -console = Console() - - -class LogLevel(Enum): - COMMAND = "COMMAND" - INFO = "INFO" - SUCCESS = "SUCCESS" - WARNING = "WARNING" - ERROR = "ERROR" - TEST_PASS = "TEST_PASS" - TEST_FAIL = "TEST_FAIL" - - -def log_message(level: LogLevel, message: str, indent: int = 0, dim: bool = False) -> None: - """Log a message with appropriate color coding.""" - style_map = { - LogLevel.COMMAND: "dim", - LogLevel.INFO: "blue", - LogLevel.SUCCESS: "green", - LogLevel.WARNING: "yellow", - LogLevel.ERROR: "bold red", - LogLevel.TEST_PASS: "green", - LogLevel.TEST_FAIL: "bold red", - } - - prefix_map = { - LogLevel.COMMAND: "[COMMAND]", - LogLevel.INFO: "[INFO]", - LogLevel.SUCCESS: "[SUCCESS]", - LogLevel.WARNING: "[WARNING]", - LogLevel.ERROR: "[ERROR]", - LogLevel.TEST_PASS: "✅ TEST PASS", - LogLevel.TEST_FAIL: "❌ TEST FAIL", - } - - style = style_map[level] - prefix = prefix_map[level] - if indent > 0: - indent_str = " " * (indent - 1) + "└─ " 
- else: - indent_str = "" - - output = "" - if level == LogLevel.COMMAND: - output = f"{indent_str}[{style}]{prefix} {message}[/]" - else: - output = f"{indent_str}[{style}]{prefix}[/] {message}" - - if dim: - console.print(f"[dim]{output}[/dim]") - else: - console.print(output) - - -@dataclass -class ParallelismConfig: - """Configuration for a parallelism setup.""" - name: str - dp_replicate: int - dp_shard: int - tp: int - pp: int - pp_schedule: str - cp: int - ep: int - eptp: int - -@dataclass -class TrainingMetrics: - """Training metrics extracted from logs.""" - steps: List[int] = field(default_factory=list) - loss: List[float] = field(default_factory=list) - grad_norm: List[float] = field(default_factory=list) - memory: List[float] = field(default_factory=list) - tps: List[int] = field(default_factory=list) - tflops: List[float] = field(default_factory=list) - mfu: List[float] = field(default_factory=list) - -class CompareDistributedRun: - """Main class for running distributed parallelism comparison tests.""" - - # Default values - DEFAULT_STEPS = 10 - DEFAULT_SEED = 42 - DEFAULT_FLAVOR = "debugmodel" - # value chosen based on diff of llama3 1GPU - DEFAULT_LOSS_ATOL = 0.02 - DEFAULT_LOSS_RTOL = 1e-5 - DEFAULT_GRAD_NORM_ATOL = 0.02 - DEFAULT_GRAD_NORM_RTOL = 1e-5 - - MODEL_LISTS = { - "torchtitan": ["llama3", "deepseek_v3"], - "huggingface": ["meta-llama/Llama-3.2-1B", "deepseek-ai/DeepSeek-V3"] - } - - MODEL_FLAVORS = { - "llama3": ["debugmodel", "medium", "full"], - "deepseek_v3": ["debugmodel"], - "meta-llama/Llama-3.2-1B": ["debugmodel", "medium", "full"], - "deepseek-ai/DeepSeek-V3": ["debugmodel"], - } - - #TODO(3outeille): handle slurm later for 4D/5D. Might need to rethink the whole script for that - # Available ND parallelisms <-> number of GPUs - ND_PARALLEL_TO_NB_GPUS = { - "0d": 1, - "1d": 2, - "2d": 4, - "3d": 8, - "4d": 16, - "5d": 32, - } - - def __init__(self): - self.script_dir = Path(__file__).parent.absolute() - self.torchtitan_root = self.script_dir.parent.parent - self.base_results_dir = self.script_dir / "results" - - # Configuration parameters - self.nd_parallel_to_nb_gpus = self.ND_PARALLEL_TO_NB_GPUS - self.steps = self.DEFAULT_STEPS - self.seed = self.DEFAULT_SEED - self.model_filter = "" - self.flavor = self.DEFAULT_FLAVOR - self.verbose = False - self.use_slurm = False - self.slurm_options = [] - self.loss_atol = self.DEFAULT_LOSS_ATOL - self.loss_rtol = self.DEFAULT_LOSS_RTOL - self.grad_norm_atol = self.DEFAULT_GRAD_NORM_ATOL - self.grad_norm_rtol = self.DEFAULT_GRAD_NORM_RTOL - self.parallelism_configs: List[ParallelismConfig] = [] - self.results_dir: Optional[Path] = None - self.test_filter = "" - - def generate_parallelism_configs(self, hf_model_name: str) -> None: - """Generate parallelism configurations based on the number of GPUs.""" - from transformers import AutoConfig - - try: - model_config = AutoConfig.from_pretrained(hf_model_name) - is_moe = getattr(model_config, "num_local_experts", 0) > 1 - except Exception: - # Fallback for models not on Hub or other errors - is_moe = False - log_message(LogLevel.WARNING, f"Could not determine if {hf_model_name} is a MoE model from HuggingFace Hub. 
EP configurations will not be generated.") - - ngpu = self.nd_parallel_to_nb_gpus[self.nd_parallel] - configs = [] - - def _get_factors(n: int) -> List[int]: - factors = set() - for i in range(1, int(n**0.5) + 1): - if n % i == 0: - factors.add(i) - factors.add(n // i) - return sorted(list(factors)) - - # Baseline FSDP - configs.append(ParallelismConfig(name="fsdp", dp_replicate=1, dp_shard=ngpu, tp=1, pp=1, pp_schedule="1F1B", cp=1, ep=1, eptp=1)) - - #NOTE(3outeille): No need to handle DDP (dp_replicate) as DDP is not supported > 1D parallelism" - #(cf https://github.com/pytorch/torchtitan/blob/main/torchtitan/models/llama3/infra/parallelize.py#L139) - possible_fsdp = _get_factors(ngpu) # dp_shard - possible_cp = _get_factors(ngpu) - possible_tp = _get_factors(ngpu) - possible_pp = _get_factors(ngpu) - - #TODO(3outeille): handle HSDP later - - for dp_shard in possible_fsdp: - for cp in possible_cp: - for tp in possible_tp: - for pp in possible_pp: - - if dp_shard * cp * tp * pp != ngpu: - continue - - num_parallelisms_used = sum(parallel_degree > 1 for parallel_degree in [dp_shard, cp, tp, pp]) - ndims_required = int(self.nd_parallel[0]) - #NOTE(3outeille): if 2D//, we need at least 2 parallelisms to be active (> 1). For 3D //, least 3 parallelisms > 1 etc. - if ndims_required > 1 and num_parallelisms_used < ndims_required: - continue - - configs.append( - ParallelismConfig( - name=f"fsdp{dp_shard}_cp{cp}_tp{tp}_pp{pp}", - dp_replicate=1, - dp_shard=dp_shard, - tp=tp, - pp=pp, - pp_schedule="1F1B", - cp=cp, - ep=1, - eptp=1 - ) - ) - - if is_moe: - # NOTE(3outeille): EP borrowing degree from dp_shard - configs.append( - ParallelismConfig( - name=f"fsdp{dp_shard}_cp{cp}_tp{tp}_pp{pp}_ep{dp_shard}", - dp_replicate=1, - dp_shard=dp_shard, - tp=tp, - pp=pp, - pp_schedule="1F1B", - cp=cp, - ep=dp_shard, - eptp=1 - ) - ) - - - # Remove duplicates and assign to instance - unique_configs = [] - seen_configs = set() - for config in configs: - # Create a tuple of the config values to check for duplicates - config_tuple = (config.dp_replicate, config.dp_shard, config.tp, config.pp, config.cp, config.ep, config.eptp) - if config_tuple not in seen_configs: - unique_configs.append(config) - seen_configs.add(config_tuple) - - self.parallelism_configs = unique_configs - - log_message( - LogLevel.INFO, - f"Generated {len(self.parallelism_configs)} parallelism configurations for {ngpu} GPUs.", - ) - configs_to_display = self.parallelism_configs - table_title = "[bold]Generated Parallelism Configurations[/bold]" - - if self.test_filter: - # Keep fsdp baseline and anything that matches the filter - configs_to_display = [c for c in self.parallelism_configs if c.name == "fsdp" or self.test_filter in c.name] - table_title = f"[bold]Filtered Parallelism Configurations (filter: [cyan]'{self.test_filter}'[/cyan])[/bold]" - - table = Table( - title=table_title, - show_header=True, - header_style="bold magenta", - ) - table.add_column("Name", style="cyan", no_wrap=True) - table.add_column("dp_replicate", justify="right") - table.add_column("dp_shard", justify="right") - table.add_column("tp", justify="right") - table.add_column("pp", justify="right") - table.add_column("cp", justify="right") - table.add_column("ep", justify="right") - table.add_column("eptp", justify="right") - - for config in configs_to_display: - table.add_row( - config.name, - str(config.dp_replicate), - str(config.dp_shard), - str(config.tp), - str(config.pp), - str(config.cp), - str(config.ep), - str(config.eptp), - ) - console.print(table) - 
console.print() - - def generate_config(self, config_dir: Path, config: ParallelismConfig, model_name: str, backend: str, filename: Optional[str] = None, indent: int = 0, dim: bool = False) -> Path: - """Generate configuration file for a parallelism setup.""" - import toml - - if filename: - config_file = config_dir / filename - else: - config_file = config_dir / f"{config.name}_{self.flavor}_{self.nd_parallel_to_nb_gpus[self.nd_parallel]}gpu_{backend}.toml" - - base_config = self.script_dir / "configs" / "test_template.toml" - shutil.copy2(base_config, config_file) - - # Load the TOML file as a dict - with open(config_file, 'r') as f: - config_data = toml.load(f) - - # Update [model] section - if "model" not in config_data: - config_data["model"] = {} - config_data["model"]["name"] = model_name - config_data["model"]["flavor"] = self.flavor - - # Validate flavor for model type - if model_name in self.MODEL_FLAVORS: - if self.flavor not in self.MODEL_FLAVORS[model_name]: - log_message(LogLevel.WARNING, - f"Flavor '{self.flavor}' not available for {model_name}. " - f"Available: {self.MODEL_FLAVORS[model_name]}", indent=indent, dim=dim) - - # Update [training] section - if "training" not in config_data: - config_data["training"] = {} - config_data["training"]["steps"] = self.steps - config_data["training"]["seed"] = self.seed - - # Update [parallelism] section - if "parallelism" not in config_data: - config_data["parallelism"] = {} - config_data["parallelism"]["data_parallel_replicate_degree"] = config.dp_replicate - config_data["parallelism"]["data_parallel_shard_degree"] = config.dp_shard - config_data["parallelism"]["tensor_parallel_degree"] = config.tp - config_data["parallelism"]["pipeline_parallel_degree"] = config.pp - config_data["parallelism"]["pipeline_parallel_schedule"] = config.pp_schedule - config_data["parallelism"]["context_parallel_degree"] = config.cp - config_data["parallelism"]["expert_parallel_degree"] = config.ep - config_data["parallelism"]["expert_tensor_parallel_degree"] = config.eptp - - # Write back the modified TOML - with open(config_file, 'w') as f: - toml.dump(config_data, f) - - if self.verbose: - log_message(LogLevel.INFO, f"Created config file: {config_file} for config '{config.name}' (model: {model_name})", indent=indent, dim=dim) - return config_file - - def extract_metrics(self, log_file: Path, indent: int = 0, dim: bool = False) -> TrainingMetrics: - """Extract metrics from log file.""" - metrics = TrainingMetrics() - - try: - with open(log_file, 'r') as f: - content = f.read() - - # Regex to capture all metrics from a log line, ignoring ANSI color codes - pattern = re.compile( - r"step:\s*(\d+)\s*" - r".*?loss:\s*([0-9]+\.?[0-9]*)\s*" - r".*?grad_norm:\s*([0-9]+\.?[0-9]*)\s*" - ) - - for match in pattern.finditer(content): - metrics.steps.append(int(match.group(1))) - metrics.loss.append(float(match.group(2))) - metrics.grad_norm.append(float(match.group(3))) - - except Exception as e: - log_message(LogLevel.WARNING, f"Could not extract metrics from {log_file}: {e}", indent=indent, dim=dim) - - if not metrics.loss or not metrics.grad_norm: - log_message(LogLevel.WARNING, f"Could not extract metrics from {log_file}", indent=indent, dim=dim) - - return metrics - - def compare_metrics(self, baseline_metrics: TrainingMetrics, test_metrics: TrainingMetrics, - config_name: str, indent: int = 0, dim: bool = False) -> bool: - """Compare metrics between baseline and test configuration.""" - if not baseline_metrics.loss or not test_metrics.loss: - 
log_message(LogLevel.TEST_FAIL, f"{config_name} - Unable to extract metrics", indent=indent, dim=dim) - return False - - # Convert to tensors - baseline_loss = torch.tensor(baseline_metrics.loss) - test_loss = torch.tensor(test_metrics.loss) - baseline_grad_norm = torch.tensor(baseline_metrics.grad_norm) - test_grad_norm = torch.tensor(test_metrics.grad_norm) - - # Check if tensors are close - loss_pass = torch.allclose(baseline_loss, test_loss, atol=self.loss_atol, rtol=self.loss_rtol) - grad_pass = torch.allclose(baseline_grad_norm, test_grad_norm, atol=self.grad_norm_atol, rtol=self.grad_norm_rtol) - - # Calculate max absolute differences for logging - loss_max_diff = torch.max(torch.abs(baseline_loss - test_loss)).item() if baseline_loss.numel() > 0 and test_loss.numel() > 0 else 0.0 - grad_norm_diff = torch.max(torch.abs(baseline_grad_norm - test_grad_norm)).item() if baseline_grad_norm.numel() > 0 and test_grad_norm.numel() > 0 else 0.0 - - # Calculate min absolute differences for logging - loss_min_diff = torch.min(torch.abs(baseline_loss - test_loss)).item() if baseline_loss.numel() > 0 and test_loss.numel() > 0 else 0.0 - grad_norm_min_diff = torch.min(torch.abs(baseline_grad_norm - test_grad_norm)).item() if baseline_grad_norm.numel() > 0 and test_grad_norm.numel() > 0 else 0.0 - - if loss_pass and grad_pass: - log_message(LogLevel.TEST_PASS, - f"{config_name} - Max loss diff: {loss_max_diff:.2e}, " - f"Min loss diff: {loss_min_diff:.2e}, " - f"Max grad norm diff: {grad_norm_diff:.2e}, " - f"Min grad norm diff: {grad_norm_min_diff:.2e}", indent=indent, dim=dim) - return True - else: - log_message(LogLevel.TEST_FAIL, - f"{config_name} - Max loss diff: {loss_max_diff:.2e}, " - f"Min loss diff: {loss_min_diff:.2e}, " - f"Max grad norm diff: {grad_norm_diff:.2e}, " - f"Min grad norm diff: {grad_norm_min_diff:.2e}", indent=indent, dim=dim) - return False - - def generate_diff(self, baseline_log: Path, test_log: Path, diff_file: Path, indent: int = 0, dim: bool = False) -> None: - """Generate diff between baseline and test logs.""" - - def _filter_log(log_file: Path) -> Path: - """Filter log file to normalize volatile information.""" - filtered_file = log_file.with_suffix(log_file.suffix + '.filtered') - - with open(log_file, 'r') as infile, open(filtered_file, 'w') as outfile: - for line in infile: - # Apply filtering patterns - line = re.sub(r'([0-9]{4}-[0-9]{2}-[0-9]{2} )?[0-9]{2}:[0-9]{2}:[0-9]{2}(,[0-9]+)?', - 'TIMESTAMP', line) - line = re.sub(r'torchrun.*--master_port[= ]([0-9]+)', - 'torchrun ... 
--master_port=XXXX', line) - line = re.sub(r'PID [0-9]+', 'PID XXXX', line) - line = re.sub(r'localhost:[0-9]+', 'localhost:XXXX', line) - outfile.write(line) - - return filtered_file - try: - # Filter logs to remove timestamps and volatile information - baseline_filtered = _filter_log(baseline_log) - test_filtered = _filter_log(test_log) - - # Generate colored diff using git diff - cmd = ["git", "diff", "--no-index", "--color=always", "--word-diff=color", - str(baseline_filtered), str(test_filtered)] - - with open(diff_file, 'w') as f: - subprocess.run(cmd, stdout=f, stderr=subprocess.DEVNULL) - - # Clean up filtered files - baseline_filtered.unlink() - test_filtered.unlink() - - except Exception as e: - log_message(LogLevel.WARNING, f"Could not generate diff: {e}", indent=indent, dim=dim) - - def run_training_local(self, config_file: Path, log_file: Path, config_name: str, model_name: str, indent: int = 0, dim: bool = False) -> Optional[subprocess.CalledProcessError]: - """Run training with given configuration.""" - log_message(LogLevel.INFO, f"Running training: {config_name} with model {model_name}", indent=indent, dim=dim) - cmd = [ - "torchrun", - f"--nproc_per_node={self.ngpu}", - "--rdzv_backend", "c10d", - "--rdzv_endpoint=localhost:0", - "--local-ranks-filter", str(self.ngpu - 1), - "--role", "rank", - "--tee", "3", - "-m", "torchtitan.train", - "--training.seed", str(self.seed), - "--training.deterministic", - "--job.config_file", str(config_file) - ] - env = os.environ.copy() - env["SEED"] = str(self.seed) - env["LOG_RANK"] = str(self.ngpu - 1) - - log_message(LogLevel.COMMAND, f"{' '.join(cmd)}", indent=indent, dim=dim) - - try: - # Capture output to include it in the exception, while still writing to log file - result = subprocess.run( - cmd, - cwd=self.torchtitan_root, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, # decodes stdout/stderr as text - env=env, - check=True - ) - with open(log_file, 'w') as f: - f.write(result.stdout) - - if self.verbose: - log_message(LogLevel.SUCCESS, f"Training completed: {config_name}", indent=indent, dim=dim) - return None - - except subprocess.CalledProcessError as e: - log_message(LogLevel.ERROR, f"Training failed: {config_name}", indent=indent, dim=dim) - - # Write the failed output to the log file - with open(log_file, 'w') as f: - if e.stdout: - f.write(e.stdout) - - # Print the tail of the error log to the console for quick debugging - if e.stdout: - console.print("[bold red]--- Error Log Tail ---[/bold red]") - error_lines = e.stdout.strip().split('\n') - for line in error_lines[-15:]: - console.print(f"[red]{line}[/red]") - console.print("[bold red]--- End Error Log Tail ---[/bold red]") - - e.add_note(f"\n--- Full output from failed process ---\n{e.stdout or ''}") - return e - - def run_training_slurm(self): - pass - - def _compare_one_parallelism_config( - self, - config: "ParallelismConfig", - hf_model_name: str, - tt_model_name: str, - hf_baseline_metrics: "TrainingMetrics", - tt_baseline_metrics: "TrainingMetrics", - baseline_log_hf: Path, - baseline_log_tt: Path, - indent: int = 0, - ) -> bool: - """Compares a single parallelism configuration against the baseline.""" - # New flow: launch all training, then all diff, then all extract/compare metrics - - # --- 1. 
Setup directories and config files --- - test_dir_name = f"{config.name}_{self.flavor}_{self.ngpu}gpu" - test_dir = self.results_dir / test_dir_name - test_dir.mkdir(exist_ok=True) - - config_filename_hf = f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.toml" - config_file_hf = self.generate_config( - config_dir=test_dir, - config=config, - model_name=hf_model_name, - backend="huggingface", - filename=config_filename_hf, - indent=indent, - ) - log_path_hf = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.log" - - config_filename_tt = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" - config_file_tt = self.generate_config( - config_dir=test_dir, - config=config, - model_name=tt_model_name, - backend="torchtitan", - filename=config_filename_tt, - indent=indent + 5, - dim=True, - ) - log_path_tt = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.log" - - # --- 2. Launch all training (HF and TT) --- - hf_run_error = self.run_training_local( - config_file=config_file_hf, - log_file=log_path_hf, - config_name=config.name, - model_name=hf_model_name, - indent=indent, - ) - tt_run_error = self.run_training_local( - config_file=config_file_tt, - log_file=log_path_tt, - config_name=config.name, - model_name=tt_model_name, - indent=indent + 5, - dim=True, - ) - - # If either training failed, log and skip further steps for this config - if hf_run_error: - log_message( - LogLevel.TEST_FAIL, - f"{config.name} (huggingface) - Training script failed.", - indent=indent + 5, - dim=True, - ) - return False - - if tt_run_error: - log_message( - LogLevel.TEST_FAIL, - f"{config.name} (torchtitan) - Training script failed.", - indent=indent + 5, - dim=True, - ) - return False - - # --- 3. Generate all diffs --- - list_of_diffs = { - "HF baseline vs HF nd-parallel": (baseline_log_hf, log_path_hf, test_dir / "diff_hf_baseline_vs_hf_nd_parallelism.log"), - "TT nd-parallel vs HF nd-parallel": (log_path_tt, log_path_hf, test_dir / "diff_tt_nd_parallelism_vs_hf_nd_parallelism.log"), - "TT baseline vs HF nd-parallel": (baseline_log_tt, log_path_hf, test_dir / "diff_tt_baseline_vs_hf_nd_parallelism.log"), - "TT baseline vs TT nd-parallel": (baseline_log_tt, log_path_tt, test_dir / "diff_tt_baseline_vs_tt_nd_parallelism.log"), - } - for src, dst, output in list_of_diffs.values(): - self.generate_diff(src, dst, output, indent=indent + 5, dim=True) - - # --- 4. Extract all metrics --- - hf_metrics = self.extract_metrics(log_path_hf, indent=indent) - tt_metrics = self.extract_metrics(log_path_tt, indent=indent + 5, dim=True) - - # --- 5. 
Compare metrics and determine pass/fail --- - test_passed = True - - for diff_name, (src, dst, output) in list_of_diffs.items(): - if "TT nd-parallel vs HF nd-parallel" == diff_name: - metrics_passed = self.compare_metrics( - tt_metrics, - hf_metrics, - diff_name, - indent=indent + 5, - dim=True, - ) - elif "TT baseline vs TT nd-parallel" == diff_name: - metrics_passed = self.compare_metrics( - tt_baseline_metrics, - tt_metrics, - diff_name, - indent=indent + 5, - dim=True, - ) - elif "TT baseline vs HF nd-parallel" == diff_name: - metrics_passed = self.compare_metrics( - tt_baseline_metrics, - hf_metrics, - diff_name, - indent=indent + 5, - dim=True, - ) - else: # HF baseline vs HF nd-parallel == diff_name - metrics_passed = self.compare_metrics( - hf_baseline_metrics, - hf_metrics, - diff_name, - indent=indent + 5, - dim=True, - ) - - if not metrics_passed: - test_passed = False - - log_message( - LogLevel.INFO, - f"Diff between {diff_name} saved to: {output}", - indent=indent + 10, - dim=True, - ) - - return test_passed - - def run_local(self, args: argparse.Namespace) -> int: - """Main execution function. Runs all test suites for all models.""" - self.nd_parallel = args.nd_parallel - self.ngpu = self.nd_parallel_to_nb_gpus[self.nd_parallel] - self.steps = args.steps - self.model_filter = args.model_filter - self.test_filter = args.test_filter - self.flavor = args.flavor - self.verbose = args.verbose - self.loss_atol = args.loss_atol - self.loss_rtol = args.loss_rtol - self.grad_norm_atol = args.grad_norm_atol - self.grad_norm_rtol = args.grad_norm_rtol - - console.print( - Panel( - ( - f"[bold]GPUs:[/bold] {self.ngpu}\n" - f"[bold]Steps:[/bold] {self.steps}\n" - f"[bold]Seed:[/bold] {self.seed}\n" - f"[bold]Model filter:[/bold] {self.model_filter or 'all'}\n" - f"[bold]Test filter:[/bold] {self.test_filter or 'all'}\n" - f"[bold]Model flavor:[/bold] {self.flavor}" - ), - title="[bold cyan]Distributed Parallelism Comparison[/bold cyan]", - expand=False, - border_style="blue", - padding=(1, 2), - ) - ) - console.print() - - self.base_results_dir.mkdir(exist_ok=True) - - # TODO(3outeille): make it more generic later - if self.model_filter == "llama3": - hf_model_name = "meta-llama/Llama-3.2-1B" - tt_model_name = "llama3" - elif self.model_filter == "deepseek_v3": - hf_model_name = "deepseek-ai/DeepSeek-V3" - tt_model_name = "deepseek_v3" - else: - raise ValueError(f"Model filter {self.model_filter} not supported") - - self.generate_parallelism_configs(hf_model_name) - - model_owner, model_repo = hf_model_name.split("/", 1) - nd_parallel_upper = self.nd_parallel.upper() - self.results_dir = self.base_results_dir / model_owner / model_repo / nd_parallel_upper / self.flavor - self.results_dir.mkdir(parents=True, exist_ok=True) - - if self.verbose: - log_message(LogLevel.INFO, f"Results directory: {self.results_dir}") - - console.print( - Panel( - "[bold cyan]Comparing baseline (FSDP) for huggingface & torchtitan[/bold cyan]", - expand=False, - border_style="blue", - padding=(0, 2), - ) - ) - - baseline_config = next((c for c in self.parallelism_configs if c.name == "fsdp"), None) - # --- 1. 
Generate configs --- - baseline_config_filename_hf = f"baseline_{baseline_config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.toml" - baseline_config_file_hf = self.generate_config( - config_dir=self.results_dir, - config=baseline_config, - model_name=hf_model_name, - backend="huggingface", - filename=baseline_config_filename_hf, - indent=0 - ) - baseline_log_hf = self.results_dir / f"baseline_hf_{baseline_config.name}_{self.ngpu}gpu.log" - - baseline_config_filename_tt = f"baseline_{baseline_config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" - baseline_config_file_tt = self.generate_config( - config_dir=self.results_dir, - config=baseline_config, - model_name=tt_model_name, - backend="torchtitan", - filename=baseline_config_filename_tt, - indent=0 - ) - baseline_log_tt = self.results_dir / f"baseline_tt_{baseline_config.name}_{self.ngpu}gpu.log" - - # --- 2. Launch all training --- - hf_baseline_run_error = self.run_training_local( - config_file=baseline_config_file_hf, - log_file=baseline_log_hf, - config_name=baseline_config.name, - model_name=hf_model_name, - indent=0 - ) - if hf_baseline_run_error: - raise ValueError(f"Huggingface baseline (FSDP) training failed for {hf_model_name}") from hf_baseline_run_error - - tt_baseline_run_error = self.run_training_local( - config_file=baseline_config_file_tt, - log_file=baseline_log_tt, - config_name=baseline_config.name, - model_name=tt_model_name, - indent=0 - ) - if tt_baseline_run_error: - raise ValueError(f"TorchTitan baseline (FSDP) training failed for {tt_model_name}") from tt_baseline_run_error - - # --- 3. Generate diff --- - diff_file_tt_baseline_vs_hf_baseline = self.results_dir / "diff_tt_baseline_vs_hf_baseline.log" - self.generate_diff( - baseline_log_tt, - baseline_log_hf, - diff_file_tt_baseline_vs_hf_baseline, - indent=0 - ) - log_message( - LogLevel.INFO, - f"Diff between baseline TT and baseline HF saved to: {diff_file_tt_baseline_vs_hf_baseline}", - indent=5, - dim=True - ) - - # --- 4. Extract metrics --- - hf_baseline_metrics = self.extract_metrics(baseline_log_hf, indent=0) - if not hf_baseline_metrics.loss or not hf_baseline_metrics.grad_norm: - raise ValueError(f"Could not extract huggingface baseline metrics for {hf_model_name}") - - tt_baseline_metrics = self.extract_metrics(baseline_log_tt, indent=0) - if not tt_baseline_metrics.loss or not tt_baseline_metrics.grad_norm: - raise ValueError(f"Could not extract TorchTitan baseline metrics for {tt_model_name}") - - # --- 5. 
Compare metrics --- - if not self.compare_metrics( - tt_baseline_metrics, - hf_baseline_metrics, - "baseline (TT) vs baseline (HF)", - indent=5 - ): - raise ValueError(f"Baseline (TT) vs baseline (HF) metrics comparison failed for {tt_model_name}") - - console.print() - console.print( - Panel( - "[bold cyan]Comparing ND Parallelism Configurations[/bold cyan]", - expand=False, - border_style="blue", - padding=(0, 2), - ) - ) - passed_tests = 1 # +1 for the baseline (FSDP) - failed_tests = 0 - test_configs = [c for c in self.parallelism_configs if c.name != "fsdp"] - if self.test_filter: - filtered_configs = [c for c in test_configs if self.test_filter in c.name] - if not filtered_configs: - log_message(LogLevel.WARNING, f"Test filter '{self.test_filter}' did not match any test configurations.") - test_configs = filtered_configs - total_tests = len(test_configs) + 1 # +1 for the baseline (FSDP) - results = [] - - console.print() - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), - TimeElapsedColumn(), - console=console, - ) as progress: - task = progress.add_task( - "[cyan]Comparing configurations...", total=total_tests - ) - for i, config in enumerate(test_configs): - if i > 0: - console.rule(style="dim") - - progress.update( - task, description=f"[cyan]Testing [bold]{config.name}[/bold]" - ) - passed = self._compare_one_parallelism_config( - config, - hf_model_name, - tt_model_name, - hf_baseline_metrics, - tt_baseline_metrics, - baseline_log_hf, - baseline_log_tt, - indent=1, - ) - results.append((config.name, passed)) - if passed: - passed_tests += 1 - else: - failed_tests += 1 - progress.advance(task) - console.print() - - console.print( - Panel( - "[bold cyan]Final Summary[/bold cyan]", - expand=False, - border_style="blue", - padding=(0, 2), - ) - ) - - summary_table = Table(show_header=True, header_style="bold magenta") - summary_table.add_column("Configuration", style="cyan") - summary_table.add_column("Status", justify="center") - - for name, passed in results: - status = ( - "[bold green]✅ PASS[/bold green]" - if passed - else "[bold red]❌ FAIL[/bold red]" - ) - summary_table.add_row(name, status) - - console.print(summary_table) - console.print() - - overall_summary = Table(title="Overall Test Summary") - overall_summary.add_column("Metric", style="cyan") - overall_summary.add_column("Value", justify="right") - overall_summary.add_row("Total Configurations Tested", str(total_tests)) - overall_summary.add_row("[green]Passed[/green]", str(passed_tests)) - overall_summary.add_row("[red]Failed[/red]", str(failed_tests)) - console.print(overall_summary) - - if passed_tests == total_tests: - log_message(LogLevel.SUCCESS, "All model tests passed! 🎉") - return 0 - else: - log_message(LogLevel.TEST_FAIL, f"{failed_tests} configuration(s) had test failures") - log_message( - LogLevel.INFO, f"Check the diff files in {self.results_dir} for details" - ) - return 1 - - def run_slurm(self, args: argparse.Namespace) -> int: - """Main execution function. 
Runs all test suites for all models.""" - self.nd_parallel = args.nd_parallel - self.ngpu = self.nd_parallel_to_nb_gpus[self.nd_parallel] - self.steps = args.steps - self.model_filter = args.model_filter - self.test_filter = args.test_filter - self.flavor = args.flavor - self.verbose = args.verbose - self.loss_atol = args.loss_atol - self.loss_rtol = args.loss_rtol - self.grad_norm_atol = args.grad_norm_atol - self.grad_norm_rtol = args.grad_norm_rtol - - console.print( - Panel( - ( - f"[bold]GPUs:[/bold] {self.ngpu}\n" - f"[bold]Steps:[/bold] {self.steps}\n" - f"[bold]Seed:[/bold] {self.seed}\n" - f"[bold]Model filter:[/bold] {self.model_filter or 'all'}\n" - f"[bold]Test filter:[/bold] {self.test_filter or 'all'}\n" - f"[bold]Model flavor:[/bold] {self.flavor}" - ), - title="[bold cyan]Distributed Parallelism Comparison[/bold cyan]", - expand=False, - border_style="blue", - padding=(1, 2), - ) - ) - console.print() - - self.base_results_dir.mkdir(exist_ok=True) - - # TODO(3outeille): make it more generic later - if self.model_filter == "llama3": - hf_model_name = "meta-llama/Llama-3.2-1B" - tt_model_name = "llama3" - elif self.model_filter == "deepseek_v3": - hf_model_name = "deepseek-ai/DeepSeek-V3" - tt_model_name = "deepseek_v3" - else: - raise ValueError(f"Model filter {self.model_filter} not supported") - - self.generate_parallelism_configs(hf_model_name) - - model_owner, model_repo = hf_model_name.split("/", 1) - nd_parallel_upper = self.nd_parallel.upper() - self.results_dir = self.base_results_dir / model_owner / model_repo / nd_parallel_upper / self.flavor - self.results_dir.mkdir(parents=True, exist_ok=True) - - if self.verbose: - log_message(LogLevel.INFO, f"Results directory: {self.results_dir}") - - console.print( - Panel( - "[bold cyan]Comparing baseline (FSDP) for huggingface & torchtitan[/bold cyan]", - expand=False, - border_style="blue", - padding=(0, 2), - ) - ) - - # --- 1. 
Generate configs --- - - L = [] - - for config in self.parallelism_configs: - - config_dir = self.results_dir if config.name == "fsdp" else self.results_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu" - config_dir.mkdir(exist_ok=True) - - config_filename_hf = f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.toml" - config_file_hf = self.generate_config( - config_dir=config_dir, - config=config, - model_name=hf_model_name, - backend="huggingface", - filename=config_filename_hf, - indent=0 - ) - config_filename_tt = f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" - config_file_tt = self.generate_config( - config_dir=config_dir, - config=config, - model_name=tt_model_name, - backend="torchtitan", - filename=config_filename_tt, - indent=0 - ) - log_path_hf = config_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.log" - log_path_tt = config_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.log" - - L.append((config_file_hf, config_file_tt, log_path_hf, log_path_tt)) - - - # Launch slurm training - jobs = [] - from slurm_utils import Job, Status - for config_file_hf, config_file_tt, log_path_hf, log_path_tt in L: - job_hf = Job(config_file_hf, log_path_hf, qos="high") - job_tt = Job(config_file_tt, log_path_tt, qos="high") - - job_tt.set_status(Status.INIT) - job_hf.set_status(Status.INIT) - jobs.append(job_hf) - jobs.append(job_tt) - - scheduler = Scheduler() - - scheduler.create_slurm_script(jobs) - # submit in subprocess - scheduler.submit_jobs(jobs) # -> job.set_status(Status.PENDING) - - scheduler.wait_for_all_jobs_to_complete() # spawn tmux to monitor jobs - #NOTE(3outeille): run_slurm() should not be run if - - def run_tests_slurm(self, args: argparse.Namespace) -> int: - # TODO(3outeille): do diff + compare metrics - pass - -def main(): - """Entry point for the script.""" - parser = argparse.ArgumentParser( - description="Test different parallelism configurations against a baseline FSDP model.", - ) - parser.add_argument("--use_slurm", action="store_true", - help="Use SLURM for job submission") - parser.add_argument("--run_tests_slurm", action="store_true", - help="Run tests with SLURM") - parser.add_argument("-m", "--model-filter", default="", - help="Filter models by name pattern (e.g., 'llama3')") - parser.add_argument("-t", "--test-filter", default="", - help="Filter parallelism configurations by name pattern (e.g., 'fsdp1_cp1_tp2_pp2')") - parser.add_argument("-nd", "--nd_parallel", type=str, default="2d", - help=f"Parallelism to use (default: {CompareDistributedRun.ND_PARALLEL_TO_NB_GPUS.keys()})") - parser.add_argument("-s", "--steps", type=int, default=CompareDistributedRun.DEFAULT_STEPS, - help=f"Training steps (default: {CompareDistributedRun.DEFAULT_STEPS})") - parser.add_argument("--flavor", default=CompareDistributedRun.DEFAULT_FLAVOR, - help=f"Model flavor/size (default: {CompareDistributedRun.DEFAULT_FLAVOR}). 
" - f"Available: llama3=[debugmodel, medium, full], deepseek_v3=[debugmodel]") - parser.add_argument("-v", "--verbose", action="store_true", - help="Verbose output") - parser.add_argument("--loss-atol", type=float, default=CompareDistributedRun.DEFAULT_LOSS_ATOL, - help=f"Absolute tolerance for loss comparison (default: {CompareDistributedRun.DEFAULT_LOSS_ATOL})") - parser.add_argument("--loss-rtol", type=float, default=CompareDistributedRun.DEFAULT_LOSS_RTOL, - help=f"Relative tolerance for loss comparison (default: {CompareDistributedRun.DEFAULT_LOSS_RTOL})") - parser.add_argument("--grad-norm-atol", type=float, default=CompareDistributedRun.DEFAULT_GRAD_NORM_ATOL, - help=f"Absolute tolerance for grad norm comparison (default: {CompareDistributedRun.DEFAULT_GRAD_NORM_ATOL})") - parser.add_argument("--grad-norm-rtol", type=float, default=CompareDistributedRun.DEFAULT_GRAD_NORM_RTOL, - help=f"Relative tolerance for grad norm comparison (default: {CompareDistributedRun.DEFAULT_GRAD_NORM_RTOL})") - - args = parser.parse_args() - - runner = CompareDistributedRun() - if args.use_slurm: - return runner.run_slurm(args) - elif args.run_tests_slurm: - return runner.run_tests_slurm(args) - else: - return runner.run_local(args) - -if __name__ == "__main__": - sys.exit(main()) diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.sh b/torchtitan/experiments/transformers_backend/compare_distributed_run.sh deleted file mode 100755 index 2ca9bbee62..0000000000 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/bash - -if [[ "$1" == "--debug" ]]; then - shift - debugpy-run compare_distributed_run.py --steps 10 --model-filter llama3 --flavor debugmodel --nd_parallel 1d "$@" -else - python compare_distributed_run.py --steps 10 --model-filter llama3 --flavor debugmodel --nd_parallel 1d "$@" -fi diff --git a/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py b/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py deleted file mode 100644 index c2cb960ac5..0000000000 --- a/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py +++ /dev/null @@ -1,113 +0,0 @@ -import os -import torch.nn as nn -from torchtitan.utils.test_utils import seeded_init_decorator_for_test - -from transformers.models.deepseek_v3.configuration_deepseek_v3 import DeepseekV3Config -from transformers.models.deepseek_v3.modeling_deepseek_v3 import DeepseekV3Attention, DeepseekV3MLP, DeepseekV3MoE, DeepseekV3DecoderLayer -from transformers.modeling_utils import PreTrainedModel - - -_original_deepseek_v3_decoder_layer_init = DeepseekV3DecoderLayer.__init__ - -def _deepseek_v3_decoder_layer_init_patched(self, config: DeepseekV3Config, layer_idx: int): - _original_deepseek_v3_decoder_layer_init(self, config, layer_idx) - - self.layer_idx = layer_idx - self.mlp.layer_idx = layer_idx - - if hasattr(self.mlp, 'experts'): - for expert in self.mlp.experts: - expert.layer_idx = layer_idx - self.mlp.shared_experts.layer_idx = layer_idx - -def _initialize_weights_patched(self, module): - # NOTE(3outeille): monkey-patch PreTrainedModel to handle meta device initialization correctly - # The default _initialize_weights sets _is_hf_initialized = True even on a meta device, - # which prevents subsequent proper initialization. 
- if getattr(module, "_is_hf_initialized", False): - return - - for param in module.parameters(recurse=True): - if param.device.type == "meta": - return - - # If not on a meta device, call the original weight initialization - self._init_weights(module) - module._is_hf_initialized = True - -@seeded_init_decorator_for_test(seed=os.environ.get("SEED")) -def _init_weights_patched(self, module): - """ - Patched version of _init_weights to match TorchTitan's initialization for Llama. - `self` is a LlamaPreTrainedModel instance. - """ - config = self.config - - #TODO(3outeille): only out_proj/down_proj needs std=init_std. so we can refactor to loop over module and only init last layer with std=init_std - if isinstance(module, (DeepseekV3Attention, DeepseekV3MLP, DeepseekV3MoE)): - layer_idx = module.layer_idx - init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 - - if isinstance(module, DeepseekV3Attention): - if hasattr(module, 'q_proj'): - nn.init.trunc_normal_(module.q_proj.weight, mean=0.0, std=0.02) - else: - nn.init.trunc_normal_(module.q_a_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(module.q_b_proj.weight, mean=0.0, std=0.02) - - nn.init.trunc_normal_(module.kv_a_proj_with_mqa.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(module.kv_b_proj.weight, mean=0.0, std=0.02) - - nn.init.trunc_normal_(module.o_proj.weight, mean=0.0, std=init_std) - - elif isinstance(module, DeepseekV3MLP): - nn.init.trunc_normal_(module.gate_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(module.up_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(module.down_proj.weight, mean=0.0, std=init_std) - - elif isinstance(module, DeepseekV3MoE): - nn.init.trunc_normal_(module.gate.weight, mean=0.0, std=init_std) - for expert in module.experts: - nn.init.trunc_normal_(expert.gate_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(expert.up_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(expert.down_proj.weight, mean=0.0, std=init_std) - - nn.init.trunc_normal_(module.shared_experts.gate_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(module.shared_experts.up_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(module.shared_experts.down_proj.weight, mean=0.0, std=init_std) - - elif module is getattr(self, "lm_head", None): #TODO(3outeille): find a better way to detect lm_head - final_out_std = config.hidden_size**-0.5 - cutoff_factor = 3 - nn.init.trunc_normal_( - module.weight, - mean=0.0, - std=final_out_std, - a=-cutoff_factor * final_out_std, - b=cutoff_factor * final_out_std, - ) - if module.bias is not None: - module.bias.data.zero_() - - elif isinstance(module, nn.Embedding): - std = config.initializer_range - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - elif ( - isinstance(module, (nn.GroupNorm, nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)) - or "LayerNorm" in module.__class__.__name__ - or "RMSNorm" in module.__class__.__name__ - ): - # Norms can exist without weights (in which case they are None from torch primitives) - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(1.0) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.zero_() - - -def patch_hf_deepseek_v3(): - DeepseekV3DecoderLayer.__init__ = _deepseek_v3_decoder_layer_init_patched - PreTrainedModel._init_weights = _init_weights_patched - PreTrainedModel._initialize_weights = _initialize_weights_patched diff --git 
a/torchtitan/experiments/transformers_backend/model/hf_llama_like_patch.py b/torchtitan/experiments/transformers_backend/model/hf_llama_like_patch.py index 563c5e289b..f1ada96928 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_llama_like_patch.py +++ b/torchtitan/experiments/transformers_backend/model/hf_llama_like_patch.py @@ -20,10 +20,6 @@ def patch_hf_llama_like(decoder_layer_cls, attention_cls, mlp_cls=None): initialization for attention and MLP layers. - `DecoderLayer.__init__`: Adds `layer_idx` to attention and MLP modules within each decoder layer, which is required for the depth-dependent initialization. - - By applying this patch, we can ensure that a model loaded in the transformers - backend will have the exact same weights as a model trained with the native - TorchTitan backend, which is essential for seamless conversion and debugging. """ _original_decoder_layer_init = decoder_layer_cls.__init__ diff --git a/torchtitan/experiments/transformers_backend/model/hf_llama_patch.py b/torchtitan/experiments/transformers_backend/model/hf_llama_patch.py deleted file mode 100644 index c3557f6973..0000000000 --- a/torchtitan/experiments/transformers_backend/model/hf_llama_patch.py +++ /dev/null @@ -1,89 +0,0 @@ -import torch -import torch.nn as nn -from transformers.models.llama.configuration_llama import LlamaConfig -from transformers.models.llama.modeling_llama import LlamaAttention, LlamaMLP, LlamaDecoderLayer -from transformers.modeling_utils import PreTrainedModel - - -_original_llama_decoder_layer_init = LlamaDecoderLayer.__init__ - -def _llama_decoder_layer_init_patched(self, config: LlamaConfig, layer_idx: int): - _original_llama_decoder_layer_init(self, config, layer_idx) - self.layer_idx = layer_idx - self.mlp.layer_idx = layer_idx - -def _initialize_weights_patched(self, module): - # NOTE(3outeille): monkey-patch PreTrainedModel to handle meta device initialization correctly - # The default _initialize_weights sets _is_hf_initialized = True even on a meta device, - # which prevents subsequent proper initialization. - if getattr(module, "_is_hf_initialized", False): - return - - for param in module.parameters(recurse=True): - if param.device.type == "meta": - return - - # If not on a meta device, call the original weight initialization - self._init_weights(module) - module._is_hf_initialized = True - -def _init_weights_patched(self, module): - """ - Patched version of _init_weights to match TorchTitan's initialization for Llama. - `self` is a LlamaPreTrainedModel instance. 
- """ - config = self.config - - if isinstance(module, (LlamaAttention, LlamaMLP)): - layer_idx = module.layer_idx - - if config.depth_init: - init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 - else: - init_std = 0.02 / (2 * config.num_hidden_layers) ** 0.5 - - if isinstance(module, LlamaAttention): - nn.init.trunc_normal_(module.q_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(module.k_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(module.v_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(module.o_proj.weight, mean=0.0, std=init_std) - - elif isinstance(module, LlamaMLP): - nn.init.trunc_normal_(module.gate_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(module.up_proj.weight, mean=0.0, std=init_std) - nn.init.trunc_normal_(module.down_proj.weight, mean=0.0, std=init_std) - - elif module is getattr(self, "lm_head", None): #TODO(3outeille): find a better way to detect lm_head - final_out_std = config.hidden_size**-0.5 - cutoff_factor = 3 - nn.init.trunc_normal_( - module.weight, - mean=0.0, - std=final_out_std, - a=-cutoff_factor * final_out_std, - b=cutoff_factor * final_out_std, - ) - if module.bias is not None: - module.bias.data.zero_() - - elif isinstance(module, nn.Embedding): - std = config.initializer_range - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - elif ( - isinstance(module, (nn.GroupNorm, nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)) - or "LayerNorm" in module.__class__.__name__ - or "RMSNorm" in module.__class__.__name__ - ): - # Norms can exist without weights (in which case they are None from torch primitives) - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(1.0) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.zero_() - -def patch_hf_llama(): - LlamaDecoderLayer.__init__ = _llama_decoder_layer_init_patched - PreTrainedModel._init_weights = _init_weights_patched - PreTrainedModel._initialize_weights = _initialize_weights_patched \ No newline at end of file diff --git a/torchtitan/models/attention.py b/torchtitan/models/attention.py index 9d99622cc1..f66361a6d2 100644 --- a/torchtitan/models/attention.py +++ b/torchtitan/models/attention.py @@ -205,9 +205,9 @@ def _init_backend(cls) -> None: # Add CuDNN on B200 w/ highest priority cls.backends = [ - # SDPBackend.FLASH_ATTENTION, + SDPBackend.FLASH_ATTENTION, SDPBackend.EFFICIENT_ATTENTION, - # SDPBackend.MATH, + SDPBackend.MATH, ] if has_cuda_capability(10, 0): cls.backends.insert(0, SDPBackend.CUDNN_ATTENTION) diff --git a/torchtitan/models/deepseek_v3/model/model.py b/torchtitan/models/deepseek_v3/model/model.py index 260c7bf49a..e2c4bbeda9 100644 --- a/torchtitan/models/deepseek_v3/model/model.py +++ b/torchtitan/models/deepseek_v3/model/model.py @@ -5,7 +5,6 @@ # LICENSE file in the root directory of this source tree. 
import math -import os from typing import Tuple import torch @@ -14,7 +13,7 @@ from torchtitan.models.attention import build_attention from torchtitan.models.moe import FeedForward, MoE from torchtitan.protocols.train_spec import ModelProtocol -from torchtitan.utils.test_utils import seeded_init_decorator_for_test + from .args import DeepSeekV3ModelArgs @@ -241,7 +240,6 @@ def forward( output = output.view(bsz, seqlen, -1) # (bsz, seqlen, n_heads * v_head_dim) return self.wo(output) # (bsz, seqlen, dim) - @seeded_init_decorator_for_test(seed=os.environ.get("SEED")) def init_weights(self, init_std: float): linear_list = [ self.wkv_a, @@ -304,7 +302,6 @@ def forward(self, x: torch.Tensor, freqs_cis: torch.Tensor): x = x + self.feed_forward(self.ffn_norm(x)) return x - @seeded_init_decorator_for_test(seed=os.environ.get("SEED")) def init_weights(self, buffer_device: torch.device): for norm in (self.attention_norm, self.ffn_norm): norm.reset_parameters() @@ -342,7 +339,6 @@ def __init__(self, model_args: DeepSeekV3ModelArgs): self.model_args = model_args self.init_weights() - @seeded_init_decorator_for_test(seed=os.environ.get("SEED")) def init_weights(self, buffer_device: torch.device | None = None) -> None: buffer_device = buffer_device or self.freqs_cis.device with torch.device(buffer_device): diff --git a/torchtitan/models/moe.py b/torchtitan/models/moe.py index e2e3981625..8be14ecbf0 100644 --- a/torchtitan/models/moe.py +++ b/torchtitan/models/moe.py @@ -12,8 +12,7 @@ from torch import nn from torchtitan.distributed.expert_parallel import expert_parallel -import os -from torchtitan.utils.test_utils import seeded_init_decorator_for_test + @dataclass class MoEArgs: @@ -58,7 +57,6 @@ def __init__( def forward(self, x: torch.Tensor) -> torch.Tensor: return self.w2(F.silu(self.w1(x)) * self.w3(x)) - @seeded_init_decorator_for_test(seed=os.environ.get("SEED")) def init_weights(self, init_std: float = 0.02): nn.init.trunc_normal_(self.w1.weight, mean=0.0, std=0.02) for linear in (self.w2, self.w3): @@ -155,7 +153,6 @@ def forward( self.w1, self.w2, self.w3, x, num_tokens_per_expert ) - @seeded_init_decorator_for_test(seed=os.environ.get("SEED")) def init_weights(self, init_std: float): nn.init.trunc_normal_(self.w1, mean=0.0, std=0.02) nn.init.trunc_normal_(self.w2, mean=0.0, std=init_std) @@ -249,7 +246,6 @@ def forward( return top_scores, selected_experts_indices, num_tokens_per_expert - @seeded_init_decorator_for_test(seed=os.environ.get("SEED")) def init_weights(self, init_std: float): nn.init.trunc_normal_(self.gate.weight, mean=0.0, std=init_std) @@ -439,7 +435,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: out = out.reshape(bs, slen, dim) return out - @seeded_init_decorator_for_test(seed=os.environ.get("SEED")) def init_weights( self, init_std: float, From 5f1075b372bb0a0ed12e0c0928bc3ed7eda8bc5d Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 15 Oct 2025 17:29:33 +0000 Subject: [PATCH 066/129] add small example scripts --- .../configs/qwen3_fsdp2_tp2_pp2.toml | 89 +++++++++++++++++++ .../transformers_backend/run_train.sh | 33 +++++++ 2 files changed, 122 insertions(+) create mode 100644 torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml create mode 100755 torchtitan/experiments/transformers_backend/run_train.sh diff --git a/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml b/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml new file mode 100644 index 0000000000..5f40ec41b3 --- /dev/null +++ 
b/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml @@ -0,0 +1,89 @@ +# torchtitan Config.toml + +[job] +dump_folder = "./outputs" +description = "Qwen 3 debug training" +print_args = false +use_for_integration_test = false + +[profiling] +enable_profiling = true +save_traces_folder = "profile_trace" +profile_freq = 5 +enable_memory_snapshot = false +save_memory_snapshot_folder = "memory_snapshot" + +[metrics] +log_freq = 1 +disable_color_printing = false +enable_tensorboard = false +save_tb_folder = "tb" +enable_wandb = false + +[model] +name = "Qwen/Qwen3-4B-Instruct-2507" +flavor = "debugmodel" +# test folder with tokenizer.json, for debug purpose only +hf_assets_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" +# converters = ["float8"] + +[optimizer] +name = "AdamW" +lr = 8e-4 +eps = 1e-8 + +[lr_scheduler] +warmup_steps = 2 # lr scheduler warm up, normally 20% of the train steps +decay_ratio = 0.8 # lr scheduler decay ratio, 80% of the train steps +decay_type = "linear" +min_lr_factor = 0.0 + +[training] +global_batch_size = 4 +local_batch_size = 2 +seq_len = 2048 +max_norm = 1.0 # grad norm clipping +steps = 10 +dataset = "c4_test" # supported datasets: c4_test (2K), c4 (177M) +dataset_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test" +mixed_precision_param = "float32" # force float32 for comparison +mixed_precision_reduce = "float32" + +[parallelism] +data_parallel_replicate_degree = 1 +data_parallel_shard_degree = 2 +fsdp_reshard_after_forward = "default" # default / never / always +tensor_parallel_degree = 2 +enable_async_tensor_parallel = false +pipeline_parallel_degree = 2 +pipeline_parallel_schedule = "1F1B" +context_parallel_degree = 1 +expert_parallel_degree = 1 +expert_tensor_parallel_degree = 1 + +[checkpoint] +enable = false +folder = "checkpoint" +interval = 10 +last_save_model_only = false +export_dtype = "float32" +async_mode = "disabled" # ["disabled", "async", "async_with_pinned_mem"] + +[activation_checkpoint] +mode = "selective" # ["none", "selective", "full"] +selective_ac_option = '2' # 'int' = ac every positive int layer or 'op', ac based on ops policy + +[compile] +enable=false +components = ["model", "loss"] + +[float8] +enable_fsdp_float8_all_gather = false +precompute_float8_dynamic_scale_for_fsdp = false +filter_fqns = ["output"] + +[validation] +enable = false +dataset = "c4_validation" +freq = 5 +steps = 10 diff --git a/torchtitan/experiments/transformers_backend/run_train.sh b/torchtitan/experiments/transformers_backend/run_train.sh new file mode 100755 index 0000000000..3b82ad07f3 --- /dev/null +++ b/torchtitan/experiments/transformers_backend/run_train.sh @@ -0,0 +1,33 @@ +#!/usr/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -ex + +# use envs as local overwrites for convenience +# e.g. 
+# LOG_RANK=0,1 NGPU=4 ./run_train.sh +NGPU=${NGPU:-"8"} +export LOG_RANK=${LOG_RANK:-0} + +# Option to switch between debug and train +MODE=${MODE:-"train"} # Set MODE=debug or MODE=train + +CONFIG_FILE=${CONFIG_FILE:-"configs/qwen3_fsdp2_tp2_pp2.toml"} + +if [ "$MODE" = "debug" ]; then + PYTHON_CMD="debugpy-run -m torch.distributed.run --" +else + PYTHON_CMD="torchrun" +fi + +TORCHFT_LIGHTHOUSE=${TORCHFT_LIGHTHOUSE:-"http://localhost:29510"} + +PYTORCH_ALLOC_CONF="expandable_segments:True" \ +TORCHFT_LIGHTHOUSE=${TORCHFT_LIGHTHOUSE} \ +$PYTHON_CMD --nproc_per_node=${NGPU} --rdzv_backend c10d --rdzv_endpoint="localhost:0" \ +--local-ranks-filter ${LOG_RANK} --role rank --tee 3 \ +-m torchtitan.train --job.config_file ${CONFIG_FILE} "$@" \ No newline at end of file From c35ccfcc74a4ef72b0f1585b3353f92ab940dc52 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 20 Oct 2025 11:24:02 +0000 Subject: [PATCH 067/129] fix all the merge issues --- .../transformers_backend/__init__.py | 9 +---- .../configs/test_template.toml | 5 +-- .../infra/parallelize_hf_transformers.py | 5 +-- .../transformers_backend/infra/pipeline_hf.py | 38 ++++++++++++------- torchtitan/protocols/train_spec.py | 19 +++------- torchtitan/train.py | 1 + 6 files changed, 36 insertions(+), 41 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index fb21837a6b..34892cfcc2 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -105,19 +105,12 @@ class DeepSeekV3Args: ) ) if os.environ.get("USE_MOE", "0") == "1" else None, ), - "medium": HFTransformerModelArgs( - titan_args=TitanModelArgs( - dim=1024, - n_layers=12, - ), - ), "full": HFTransformerModelArgs( titan_args=TitanModelArgs(), ), } hf_train_spec = TrainSpec( - name="hf_auto_model", model_cls=HFTransformerModel, model_args=flavors, parallelize_fn=parallelize_hf_transformers, @@ -129,4 +122,4 @@ class DeepSeekV3Args: build_loss_fn=build_cross_entropy_loss, ) -register_train_spec(hf_train_spec) \ No newline at end of file +register_train_spec("hf_placeholder_name", hf_train_spec) \ No newline at end of file diff --git a/torchtitan/experiments/transformers_backend/configs/test_template.toml b/torchtitan/experiments/transformers_backend/configs/test_template.toml index fa0c763ed7..0964cf640e 100644 --- a/torchtitan/experiments/transformers_backend/configs/test_template.toml +++ b/torchtitan/experiments/transformers_backend/configs/test_template.toml @@ -3,8 +3,7 @@ [job] dump_folder = "./outputs" description = "Llama 3 debug training" -print_args = false -use_for_integration_test = false +print_config = true [profiling] enable_profiling = true @@ -77,7 +76,7 @@ selective_ac_option = '2' # 'int' = ac every positive int layer or 'op', ac bas enable=false components = ["model", "loss"] -[float8] +[quantize.linear.float8] enable_fsdp_float8_all_gather = false precompute_float8_dynamic_scale_for_fsdp = false filter_fqns = ["output"] diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py index 1bfe6ab779..32e122ab75 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -25,12 +25,11 @@ SequenceParallel, ) from torchtitan.config import JobConfig, TORCH_DTYPE_MAP -from 
torchtitan.distributed import ParallelDims +from torchtitan.distributed import ParallelDims, NoParallel from torchtitan.distributed.expert_parallel import ( ExpertParallel, ExpertTensorParallel, - NoParallel, ReordererSequenceParallel, TensorParallel, ) @@ -198,7 +197,7 @@ def parallelize_hf_transformers( if parallel_dims.tp_enabled: model.set_tp_mesh(world_mesh["tp"]) enable_float8_linear = "float8" in job_config.model.converters - float8_is_rowwise = job_config.float8.recipe_name in ( + float8_is_rowwise = job_config.quantize.linear.float8.recipe_name in ( "rowwise", "rowwise_with_gw_hp", ) diff --git a/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py b/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py index fb707b2509..cd599ac2a5 100644 --- a/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py +++ b/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py @@ -19,8 +19,7 @@ from torchtitan.distributed import ParallelDims from torchtitan.distributed.pipeline_parallel import ( build_pipeline_schedule, - pipeline_module_split, - stage_ids_this_rank, + pipeline_module_split ) from torch.distributed.device_mesh import DeviceMesh from torch.distributed.pipelining import PipelineStage @@ -145,6 +144,7 @@ def generate_llm_fqn_per_model_part( return module_names_per_stage + def pipeline_module_split( whole_model: nn.Module, pp_mesh: DeviceMesh, @@ -185,7 +185,7 @@ def pipeline_module_split( ] """ pp_rank = pp_mesh.get_local_rank() - pp_size = pp_mesh.size() + pp_degree = pp_mesh.size() def _build_stage_from_modules( stage_idx: int, module_names: list[str], num_stages: int @@ -194,7 +194,6 @@ def _build_stage_from_modules( # Create a set of modules to keep for faster lookup modules_to_keep = set(module_names) - print(f"Stage {stage_idx}: Modules to keep: {modules_to_keep}") for module_name, module_value in model.named_children(): # Handle layer-like structures (e.g., "layers.0", "layers.1") if isinstance(module_value, (nn.ModuleDict, nn.ModuleList)): @@ -250,7 +249,27 @@ def _build_stage_from_modules( "v" if schedule_class in (ScheduleZBVZeroBubble, ScheduleDualPipeV) else "loop" ) - for stage_idx in stage_ids_this_rank(pp_rank, pp_size, num_stages, style=style): + def _get_stage_indices() -> tuple[int]: + """ + Compute the stage ids for the stages that will run on this pp rank + for either a looped or V style schedule + """ + assert ( + num_stages % pp_degree == 0 + ), f"num_stages {num_stages} must be evenly divisible by pp_degree {pp_degree}" + stages_per_rank = num_stages // pp_degree + if style == "loop": + return tuple(pp_rank + s * pp_degree for s in range(stages_per_rank)) + elif style == "v": + assert ( + stages_per_rank == 2 + ), f"v schedules assume 2 stages per rank, got {stages_per_rank}" + stage_v_pairs = list( + zip(range(pp_degree), range(num_stages - 1, pp_degree - 1, -1)) + ) + return stage_v_pairs[pp_rank] + + for stage_idx in _get_stage_indices(): module_names = module_names_per_stage[stage_idx] stage, model_chunk = _build_stage_from_modules( stage_idx, @@ -266,7 +285,6 @@ def _build_stage_from_modules( return stages, models - def pipeline_hf_transformers( model: nn.Module, parallel_dims: ParallelDims, @@ -276,12 +294,6 @@ def pipeline_hf_transformers( parallelize_fn: ParallelizeFunction, loss_fn: LossFunction, ) -> tuple[_PipelineSchedule, list[nn.Module], bool, bool]: - if job_config.parallelism.pipeline_parallel_split_points != []: - raise ValueError( - "pipeline_parallel_split_points is deprecated. 
Please use module_fqns_per_model_part instead." - "You can generate module_fqns_per_model_part programmatically with generate_llm_fqn_per_model_part" - ) - pp_mesh = parallel_dims.world_mesh["pp"] # Determine the number of virtual stages based on schedule type @@ -385,4 +397,4 @@ def pipeline_hf_transformers( if stage.is_last: has_last_stage = True - return pp_schedule, model_parts, has_first_stage, has_last_stage + return pp_schedule, model_parts, has_first_stage, has_last_stage \ No newline at end of file diff --git a/torchtitan/protocols/train_spec.py b/torchtitan/protocols/train_spec.py index bc81c9928e..f04d6ac269 100644 --- a/torchtitan/protocols/train_spec.py +++ b/torchtitan/protocols/train_spec.py @@ -52,6 +52,7 @@ class TrainSpec: build_dataloader_fn: DataLoaderBuilder build_tokenizer_fn: TokenizerBuilder | None build_loss_fn: LossFunctionBuilder + name: str | None = None build_validator_fn: ValidatorBuilder | None = None build_metrics_processor_fn: MetricsProcessorBuilder | None = None state_dict_adapter: type[BaseStateDictAdapter] | None = None @@ -70,23 +71,13 @@ def register_train_spec(name: str, train_spec: TrainSpec) -> None: def get_train_spec(name: str) -> TrainSpec: -<<<<<<< HEAD - global _train_specs - - if "/" in name: # HF model (dynamic loading) - hf_spec = _train_specs["hf_auto_model"] - new_spec = dataclasses.replace(hf_spec, name=name) - _train_specs[name] = new_spec - elif name not in _train_specs: # Torchtitan - raise ValueError(f"Model {name} is not registered.") - - return _train_specs[name] -======= # user-defined TrainSpec has higher priority global _extra_train_specs - if name in _extra_train_specs: + if "/" in name: # HF model (dynamic loading) + hf_spec = _extra_train_specs["hf_placeholder_name"] + return dataclasses.replace(hf_spec, name=name) + elif name in _extra_train_specs: return _extra_train_specs[name] ->>>>>>> main from torchtitan.experiments import _supported_experiments from torchtitan.models import _supported_models diff --git a/torchtitan/train.py b/torchtitan/train.py index 3ef25e2886..6bb28d4a8d 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -31,6 +31,7 @@ maybe_enable_memory_snapshot, maybe_enable_profiling, ) +import torchtitan.experiments.transformers_backend # noqa: F401 class Trainer(torch.distributed.checkpoint.stateful.Stateful): # core configs From d5ce2e9132d2380c1867aaef0184980e6dccc787 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 20 Oct 2025 11:29:49 +0000 Subject: [PATCH 068/129] get rid of hf patches files and put it in hf_transformer_args --- .../model/hf_llama_like_patch.py | 161 ---------- .../model/hf_moe_like_patch.py | 135 -------- .../model/hf_transformers_args.py | 291 +++++++++++++++++- 3 files changed, 286 insertions(+), 301 deletions(-) delete mode 100644 torchtitan/experiments/transformers_backend/model/hf_llama_like_patch.py delete mode 100644 torchtitan/experiments/transformers_backend/model/hf_moe_like_patch.py diff --git a/torchtitan/experiments/transformers_backend/model/hf_llama_like_patch.py b/torchtitan/experiments/transformers_backend/model/hf_llama_like_patch.py deleted file mode 100644 index f1ada96928..0000000000 --- a/torchtitan/experiments/transformers_backend/model/hf_llama_like_patch.py +++ /dev/null @@ -1,161 +0,0 @@ -import torch -import torch.nn as nn -from transformers.configuration_utils import PretrainedConfig -from transformers.modeling_utils import PreTrainedModel -import math -from torch.nn import init - - -def patch_hf_llama_like(decoder_layer_cls, attention_cls, 
mlp_cls=None): - """ - This patch modifies a Hugging Face Llama-like model's weight initialization to match - the initialization scheme used in TorchTitan. This is crucial for ensuring - bit-for-bit reproducibility when converting checkpoints between the native - TorchTitan format and the Hugging Face format. - - The patch targets the following aspects of the model: - - `PreTrainedModel._initialize_weights`: Handles meta device initialization correctly. - - `PreTrainedModel._init_weights`: Implements TorchTitan's specific initialization - for attention, MLP, embedding, and layer norm layers. This includes depth-dependent - initialization for attention and MLP layers. - - `DecoderLayer.__init__`: Adds `layer_idx` to attention and MLP modules within - each decoder layer, which is required for the depth-dependent initialization. - """ - - _original_decoder_layer_init = decoder_layer_cls.__init__ - - def _decoder_layer_init_patched(self, config: PretrainedConfig, layer_idx: int): - _original_decoder_layer_init(self, config, layer_idx) - self.layer_idx = layer_idx - # Ensure both attention and mlp modules have layer_idx for depth-based init - if hasattr(self, "self_attn"): - self.self_attn.layer_idx = layer_idx - # some models might not have mlp in each layer - if hasattr(self, "mlp") and self.mlp is not None: - self.mlp.layer_idx = layer_idx - - def _initialize_weights_patched(self, module): - # NOTE(3outeille): monkey-patch PreTrainedModel to handle meta device initialization correctly - # The default _initialize_weights sets _is_hf_initialized = True even on a meta device, - # which prevents subsequent proper initialization. - if getattr(module, "_is_hf_initialized", False): - return - - for param in module.parameters(recurse=True): - if param.device.type == "meta": - return - - # If not on a meta device, call the original weight initialization - self._init_weights(module) - module._is_hf_initialized = True - - def _init_weights_patched(self, module): - """ - Patched version of _init_weights to match TorchTitan's initialization for Llama-like models. - `self` is a PreTrainedModel instance. 
- """ - config = self.config - - # check if layer is (resid_dropout): Dropout(p=0.1, inplace=False) - if hasattr(module, "resid_dropout"): - print() - - # Build tuple of classes to check for layer_idx-based init_std calculation - layer_idx_classes = [attention_cls] - if mlp_cls: - layer_idx_classes.append(mlp_cls) - layer_idx_classes = tuple(layer_idx_classes) - - if isinstance(module, layer_idx_classes): - if not hasattr(module, "layer_idx"): - return - layer_idx = module.layer_idx - - if hasattr(config, "depth_init") and config.depth_init: - init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 - else: - init_std = 0.02 / (2 * config.num_hidden_layers) ** 0.5 - - if isinstance(module, attention_cls): - # Initialize weights and biases for q, k, v projections - for proj_name in ["q_proj", "k_proj", "v_proj"]: - proj = getattr(module, proj_name) - nn.init.trunc_normal_(proj.weight, mean=0.0, std=0.02) - if proj.bias is not None: - fan_in, _ = init._calculate_fan_in_and_fan_out(proj.weight) - bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 - init.uniform_(proj.bias, -bound, bound) - - # Handle different names for the output projection layer - o_proj = getattr(module, "o_proj", getattr(module, "dense", None)) - if o_proj is not None: - nn.init.trunc_normal_(o_proj.weight, mean=0.0, std=init_std) - if o_proj.bias is not None: - fan_in, _ = init._calculate_fan_in_and_fan_out(o_proj.weight) - bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 - init.uniform_(o_proj.bias, -bound, bound) - - elif mlp_cls and isinstance(module, mlp_cls): - # Handle different names for MLP layers - gate_proj = getattr(module, "gate_proj", getattr(module, "fc1", None)) - up_proj = getattr(module, "up_proj", None) - down_proj = getattr(module, "down_proj", getattr(module, "fc2", None)) - - # gate_proj (or fc1) should always use std=0.02 for numerical stability. - if gate_proj is not None: - nn.init.trunc_normal_(gate_proj.weight, mean=0.0, std=0.02) - if gate_proj.bias is not None: - fan_in, _ = init._calculate_fan_in_and_fan_out(gate_proj.weight) - bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 - init.uniform_(gate_proj.bias, -bound, bound) - # up_proj and down_proj (or fc2) use the depth-dependent init_std. 
- if up_proj is not None: - nn.init.trunc_normal_(up_proj.weight, mean=0.0, std=init_std) - if up_proj.bias is not None: - fan_in, _ = init._calculate_fan_in_and_fan_out(up_proj.weight) - bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 - init.uniform_(up_proj.bias, -bound, bound) - if down_proj is not None: - nn.init.trunc_normal_(down_proj.weight, mean=0.0, std=init_std) - if down_proj.bias is not None: - fan_in, _ = init._calculate_fan_in_and_fan_out(down_proj.weight) - bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 - init.uniform_(down_proj.bias, -bound, bound) - - elif module is getattr( - self, "lm_head", None - ): # TODO(3outeille): find a better way to detect lm_head - final_out_std = config.hidden_size**-0.5 - cutoff_factor = 3 - nn.init.trunc_normal_( - module.weight, - mean=0.0, - std=final_out_std, - a=-cutoff_factor * final_out_std, - b=cutoff_factor * final_out_std, - ) - if module.bias is not None: - module.bias.data.zero_() - - elif isinstance(module, nn.Embedding): - std = config.initializer_range - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - elif ( - isinstance( - module, (nn.GroupNorm, nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d) - ) - or "LayerNorm" in module.__class__.__name__ - or "RMSNorm" in module.__class__.__name__ - ): - # Norms can exist without weights (in which case they are None from torch primitives) - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(1.0) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.zero_() - - decoder_layer_cls.__init__ = _decoder_layer_init_patched - PreTrainedModel._init_weights = _init_weights_patched - PreTrainedModel._initialize_weights = _initialize_weights_patched diff --git a/torchtitan/experiments/transformers_backend/model/hf_moe_like_patch.py b/torchtitan/experiments/transformers_backend/model/hf_moe_like_patch.py deleted file mode 100644 index dc18e0b455..0000000000 --- a/torchtitan/experiments/transformers_backend/model/hf_moe_like_patch.py +++ /dev/null @@ -1,135 +0,0 @@ -import torch.nn as nn -from transformers.configuration_utils import PretrainedConfig -from transformers.modeling_utils import PreTrainedModel - - -def patch_hf_moe_like(decoder_layer_cls, attention_cls, mlp_cls, moe_cls): - """ - This patch modifies a Hugging Face MoE (Mixture-of-Experts) model's weight - initialization to match the initialization scheme used in TorchTitan, - drawing from patterns in models like DeepseekV3. - - The patch targets: - - `PreTrainedModel._initialize_weights`: For correct meta device initialization. - - `PreTrainedModel._init_weights`: To implement TorchTitan's specific initialization - for attention, MLP, MoE, embedding, and layer norm layers. - - `DecoderLayer.__init__`: Adds `layer_idx` to attention, MLP, and MoE expert - modules, required for depth-dependent initialization. 
- """ - - _original_decoder_layer_init = decoder_layer_cls.__init__ - - def _decoder_layer_init_patched(self, config: PretrainedConfig, layer_idx: int): - _original_decoder_layer_init(self, config, layer_idx) - self.layer_idx = layer_idx - - if hasattr(self, "self_attn"): - self.self_attn.layer_idx = layer_idx - - if hasattr(self, "mlp"): - self.mlp.layer_idx = layer_idx - if hasattr(self.mlp, "experts"): - for expert in self.mlp.experts: - expert.layer_idx = layer_idx - if hasattr(self.mlp, "shared_experts"): - # Not all MoE models have shared experts - if self.mlp.shared_experts is not None: - self.mlp.shared_experts.layer_idx = layer_idx - - def _initialize_weights_patched(self, module): - if getattr(module, "_is_hf_initialized", False): - return - for param in module.parameters(recurse=True): - if param.device.type == "meta": - return - self._init_weights(module) - module._is_hf_initialized = True - - def _init_weights_patched(self, module): - """ - Patched version of _init_weights for MoE models. - """ - config = self.config - init_std = None - - if isinstance(module, (attention_cls, mlp_cls, moe_cls)): - if hasattr(module, "layer_idx"): - layer_idx = module.layer_idx - if hasattr(config, "depth_init") and config.depth_init: - init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 - else: - # Fallback for models without depth_init - init_std = 0.02 / (2 * config.num_hidden_layers) ** 0.5 - - if isinstance(module, attention_cls): - # Handle different attention projection layer names by initializing if they exist - if hasattr(module, "q_proj"): - nn.init.trunc_normal_(module.q_proj.weight, mean=0.0, std=0.02) - if hasattr(module, "k_proj"): - nn.init.trunc_normal_(module.k_proj.weight, mean=0.0, std=0.02) - if hasattr(module, "v_proj"): - nn.init.trunc_normal_(module.v_proj.weight, mean=0.0, std=0.02) - - if hasattr(module, "q_a_proj"): - nn.init.trunc_normal_(module.q_a_proj.weight, mean=0.0, std=0.02) - if hasattr(module, "q_b_proj"): - nn.init.trunc_normal_(module.q_b_proj.weight, mean=0.0, std=0.02) - - if hasattr(module, "kv_a_proj_with_mqa"): - nn.init.trunc_normal_(module.kv_a_proj_with_mqa.weight, mean=0.0, std=0.02) - if hasattr(module, "kv_b_proj"): - nn.init.trunc_normal_(module.kv_b_proj.weight, mean=0.0, std=0.02) - - if hasattr(module, "o_proj") and init_std is not None: - nn.init.trunc_normal_(module.o_proj.weight, mean=0.0, std=init_std) - - elif isinstance(module, mlp_cls): - nn.init.trunc_normal_(module.gate_proj.weight, mean=0.0, std=0.02) - # DeepseekV3 uses std=0.02 for up_proj, unlike Llama - nn.init.trunc_normal_(module.up_proj.weight, mean=0.0, std=0.02) - if init_std is not None: - nn.init.trunc_normal_(module.down_proj.weight, mean=0.0, std=init_std) - - elif isinstance(module, moe_cls): - if hasattr(module, "gate") and init_std is not None: - nn.init.trunc_normal_(module.gate.weight, mean=0.0, std=init_std) - if hasattr(module, "experts"): - for expert in module.experts: - nn.init.trunc_normal_(expert.gate_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(expert.up_proj.weight, mean=0.0, std=0.02) - if init_std is not None: - nn.init.trunc_normal_(expert.down_proj.weight, mean=0.0, std=init_std) - if hasattr(module, "shared_experts") and module.shared_experts is not None: - nn.init.trunc_normal_(module.shared_experts.gate_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(module.shared_experts.up_proj.weight, mean=0.0, std=0.02) - if init_std is not None: - nn.init.trunc_normal_(module.shared_experts.down_proj.weight, mean=0.0, std=init_std) - - elif 
module is getattr(self, "lm_head", None): - final_out_std = config.hidden_size**-0.5 - cutoff_factor = 3 - nn.init.trunc_normal_( - module.weight, - mean=0.0, - std=final_out_std, - a=-cutoff_factor * final_out_std, - b=cutoff_factor * final_out_std, - ) - if module.bias is not None: - module.bias.data.zero_() - - elif isinstance(module, nn.Embedding): - std = config.initializer_range - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - elif "LayerNorm" in module.__class__.__name__ or "RMSNorm" in module.__class__.__name__: - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(1.0) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.zero_() - - decoder_layer_cls.__init__ = _decoder_layer_init_patched - PreTrainedModel._init_weights = _init_weights_patched - PreTrainedModel._initialize_weights = _initialize_weights_patched diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 4bc65aa0d2..883c282dc0 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -8,16 +8,16 @@ from dataclasses import dataclass import torch from torch import nn +import math +from torch.nn import init from torchtitan.config import JobConfig from torchtitan.protocols import BaseModelArgs from torchtitan.tools.logging import logger from transformers import AutoConfig from transformers.utils import is_torch_deterministic from transformers.configuration_utils import PretrainedConfig -from transformers.modeling_utils import AttentionInterface +from transformers.modeling_utils import AttentionInterface, PreTrainedModel from transformers.integrations.sdpa_attention import sdpa_attention_forward -from torchtitan.experiments.transformers_backend.model.hf_llama_like_patch import patch_hf_llama_like -from torchtitan.experiments.transformers_backend.model.hf_moe_like_patch import patch_hf_moe_like @dataclass class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): @@ -305,7 +305,7 @@ def __init__(self, model_args: HFTransformerModelArgs): if all(required_classes.values()): logger.info(f"Applying MoE-like patch for {model_name_prefix}") - patch_hf_moe_like( + self._patch_hf_moe_like( decoder_layer_cls=decoder_layer_cls, attention_cls=attention_cls, mlp_cls=mlp_cls, @@ -325,7 +325,7 @@ def __init__(self, model_args: HFTransformerModelArgs): if all(required_classes.values()): logger.info(f"Applying Llama-like patch for {model_name_prefix}") - patch_hf_llama_like( + self._patch_hf_llama_like( decoder_layer_cls=decoder_layer_cls, attention_cls=attention_cls, mlp_cls=mlp_cls # mlp_cls can be None @@ -365,6 +365,287 @@ def set_tp_mesh(self, mesh): def set_pp_mesh(self, mesh): self.pp_mesh = mesh + def _patch_hf_llama_like(self, decoder_layer_cls, attention_cls, mlp_cls=None): + """ + This patch modifies a Hugging Face Llama-like model's weight initialization to match + the initialization scheme used in TorchTitan. This is crucial for ensuring + bit-for-bit reproducibility when converting checkpoints between the native + TorchTitan format and the Hugging Face format. + + The patch targets the following aspects of the model: + - `PreTrainedModel._initialize_weights`: Handles meta device initialization correctly. 
+ - `PreTrainedModel._init_weights`: Implements TorchTitan's specific initialization + for attention, MLP, embedding, and layer norm layers. This includes depth-dependent + initialization for attention and MLP layers. + - `DecoderLayer.__init__`: Adds `layer_idx` to attention and MLP modules within + each decoder layer, which is required for the depth-dependent initialization. + """ + + _original_decoder_layer_init = decoder_layer_cls.__init__ + + def _decoder_layer_init_patched(self, config: PretrainedConfig, layer_idx: int): + _original_decoder_layer_init(self, config, layer_idx) + self.layer_idx = layer_idx + # Ensure both attention and mlp modules have layer_idx for depth-based init + if hasattr(self, "self_attn"): + self.self_attn.layer_idx = layer_idx + # some models might not have mlp in each layer + if hasattr(self, "mlp") and self.mlp is not None: + self.mlp.layer_idx = layer_idx + + def _initialize_weights_patched(self, module): + # NOTE(3outeille): monkey-patch PreTrainedModel to handle meta device initialization correctly + # The default _initialize_weights sets _is_hf_initialized = True even on a meta device, + # which prevents subsequent proper initialization. + if getattr(module, "_is_hf_initialized", False): + return + + for param in module.parameters(recurse=True): + if param.device.type == "meta": + return + + # If not on a meta device, call the original weight initialization + self._init_weights(module) + module._is_hf_initialized = True + + def _init_weights_patched(self, module): + """ + Patched version of _init_weights to match TorchTitan's initialization for Llama-like models. + `self` is a PreTrainedModel instance. + """ + config = self.config + + # Build tuple of classes to check for layer_idx-based init_std calculation + layer_idx_classes = [attention_cls] + if mlp_cls: + layer_idx_classes.append(mlp_cls) + layer_idx_classes = tuple(layer_idx_classes) + + if isinstance(module, layer_idx_classes): + if not hasattr(module, "layer_idx"): + return + layer_idx = module.layer_idx + + if hasattr(config, "depth_init") and config.depth_init: + init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 + else: + init_std = 0.02 / (2 * config.num_hidden_layers) ** 0.5 + + if isinstance(module, attention_cls): + # Initialize weights and biases for q, k, v projections + for proj_name in ["q_proj", "k_proj", "v_proj"]: + proj = getattr(module, proj_name) + nn.init.trunc_normal_(proj.weight, mean=0.0, std=0.02) + if proj.bias is not None: + fan_in, _ = init._calculate_fan_in_and_fan_out(proj.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + init.uniform_(proj.bias, -bound, bound) + + # Handle different names for the output projection layer + o_proj = getattr(module, "o_proj", getattr(module, "dense", None)) + if o_proj is not None: + nn.init.trunc_normal_(o_proj.weight, mean=0.0, std=init_std) + if o_proj.bias is not None: + fan_in, _ = init._calculate_fan_in_and_fan_out(o_proj.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + init.uniform_(o_proj.bias, -bound, bound) + + elif mlp_cls and isinstance(module, mlp_cls): + # Handle different names for MLP layers + gate_proj = getattr(module, "gate_proj", getattr(module, "fc1", None)) + up_proj = getattr(module, "up_proj", None) + down_proj = getattr(module, "down_proj", getattr(module, "fc2", None)) + + # gate_proj (or fc1) should always use std=0.02 for numerical stability. 
+ if gate_proj is not None: + nn.init.trunc_normal_(gate_proj.weight, mean=0.0, std=0.02) + if gate_proj.bias is not None: + fan_in, _ = init._calculate_fan_in_and_fan_out(gate_proj.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + init.uniform_(gate_proj.bias, -bound, bound) + # up_proj and down_proj (or fc2) use the depth-dependent init_std. + if up_proj is not None: + nn.init.trunc_normal_(up_proj.weight, mean=0.0, std=init_std) + if up_proj.bias is not None: + fan_in, _ = init._calculate_fan_in_and_fan_out(up_proj.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + init.uniform_(up_proj.bias, -bound, bound) + if down_proj is not None: + nn.init.trunc_normal_(down_proj.weight, mean=0.0, std=init_std) + if down_proj.bias is not None: + fan_in, _ = init._calculate_fan_in_and_fan_out(down_proj.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + init.uniform_(down_proj.bias, -bound, bound) + + elif module is getattr( + self, "lm_head", None + ): # TODO(3outeille): find a better way to detect lm_head + final_out_std = config.hidden_size**-0.5 + cutoff_factor = 3 + nn.init.trunc_normal_( + module.weight, + mean=0.0, + std=final_out_std, + a=-cutoff_factor * final_out_std, + b=cutoff_factor * final_out_std, + ) + if module.bias is not None: + module.bias.data.zero_() + + elif isinstance(module, nn.Embedding): + std = config.initializer_range + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + elif ( + isinstance( + module, (nn.GroupNorm, nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d) + ) + or "LayerNorm" in module.__class__.__name__ + or "RMSNorm" in module.__class__.__name__ + ): + # Norms can exist without weights (in which case they are None from torch primitives) + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(1.0) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.zero_() + + decoder_layer_cls.__init__ = _decoder_layer_init_patched + PreTrainedModel._init_weights = _init_weights_patched + PreTrainedModel._initialize_weights = _initialize_weights_patched + + def _patch_hf_moe_like(self, decoder_layer_cls, attention_cls, mlp_cls, moe_cls): + """ + This patch modifies a Hugging Face MoE (Mixture-of-Experts) model's weight + initialization to match the initialization scheme used in TorchTitan, + drawing from patterns in models like DeepseekV3. + + The patch targets: + - `PreTrainedModel._initialize_weights`: For correct meta device initialization. + - `PreTrainedModel._init_weights`: To implement TorchTitan's specific initialization + for attention, MLP, MoE, embedding, and layer norm layers. + - `DecoderLayer.__init__`: Adds `layer_idx` to attention, MLP, and MoE expert + modules, required for depth-dependent initialization. 
+ """ + + _original_decoder_layer_init = decoder_layer_cls.__init__ + + def _decoder_layer_init_patched(self, config: PretrainedConfig, layer_idx: int): + _original_decoder_layer_init(self, config, layer_idx) + self.layer_idx = layer_idx + + if hasattr(self, "self_attn"): + self.self_attn.layer_idx = layer_idx + + if hasattr(self, "mlp"): + self.mlp.layer_idx = layer_idx + if hasattr(self.mlp, "experts"): + for expert in self.mlp.experts: + expert.layer_idx = layer_idx + if hasattr(self.mlp, "shared_experts"): + # Not all MoE models have shared experts + if self.mlp.shared_experts is not None: + self.mlp.shared_experts.layer_idx = layer_idx + + def _initialize_weights_patched(self, module): + if getattr(module, "_is_hf_initialized", False): + return + for param in module.parameters(recurse=True): + if param.device.type == "meta": + return + self._init_weights(module) + module._is_hf_initialized = True + + def _init_weights_patched(self, module): + """ + Patched version of _init_weights for MoE models. + """ + config = self.config + init_std = None + + if isinstance(module, (attention_cls, mlp_cls, moe_cls)): + if hasattr(module, "layer_idx"): + layer_idx = module.layer_idx + if hasattr(config, "depth_init") and config.depth_init: + init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 + else: + # Fallback for models without depth_init + init_std = 0.02 / (2 * config.num_hidden_layers) ** 0.5 + + if isinstance(module, attention_cls): + # Handle different attention projection layer names by initializing if they exist + if hasattr(module, "q_proj"): + nn.init.trunc_normal_(module.q_proj.weight, mean=0.0, std=0.02) + if hasattr(module, "k_proj"): + nn.init.trunc_normal_(module.k_proj.weight, mean=0.0, std=0.02) + if hasattr(module, "v_proj"): + nn.init.trunc_normal_(module.v_proj.weight, mean=0.0, std=0.02) + + if hasattr(module, "q_a_proj"): + nn.init.trunc_normal_(module.q_a_proj.weight, mean=0.0, std=0.02) + if hasattr(module, "q_b_proj"): + nn.init.trunc_normal_(module.q_b_proj.weight, mean=0.0, std=0.02) + + if hasattr(module, "kv_a_proj_with_mqa"): + nn.init.trunc_normal_(module.kv_a_proj_with_mqa.weight, mean=0.0, std=0.02) + if hasattr(module, "kv_b_proj"): + nn.init.trunc_normal_(module.kv_b_proj.weight, mean=0.0, std=0.02) + + if hasattr(module, "o_proj") and init_std is not None: + nn.init.trunc_normal_(module.o_proj.weight, mean=0.0, std=init_std) + + elif isinstance(module, mlp_cls): + nn.init.trunc_normal_(module.gate_proj.weight, mean=0.0, std=0.02) + # DeepseekV3 uses std=0.02 for up_proj, unlike Llama + nn.init.trunc_normal_(module.up_proj.weight, mean=0.0, std=0.02) + if init_std is not None: + nn.init.trunc_normal_(module.down_proj.weight, mean=0.0, std=init_std) + + elif isinstance(module, moe_cls): + if hasattr(module, "gate") and init_std is not None: + nn.init.trunc_normal_(module.gate.weight, mean=0.0, std=init_std) + if hasattr(module, "experts"): + for expert in module.experts: + nn.init.trunc_normal_(expert.gate_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(expert.up_proj.weight, mean=0.0, std=0.02) + if init_std is not None: + nn.init.trunc_normal_(expert.down_proj.weight, mean=0.0, std=init_std) + if hasattr(module, "shared_experts") and module.shared_experts is not None: + nn.init.trunc_normal_(module.shared_experts.gate_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(module.shared_experts.up_proj.weight, mean=0.0, std=0.02) + if init_std is not None: + nn.init.trunc_normal_(module.shared_experts.down_proj.weight, mean=0.0, std=init_std) + + elif 
module is getattr(self, "lm_head", None): + final_out_std = config.hidden_size**-0.5 + cutoff_factor = 3 + nn.init.trunc_normal_( + module.weight, + mean=0.0, + std=final_out_std, + a=-cutoff_factor * final_out_std, + b=cutoff_factor * final_out_std, + ) + if module.bias is not None: + module.bias.data.zero_() + + elif isinstance(module, nn.Embedding): + std = config.initializer_range + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + elif "LayerNorm" in module.__class__.__name__ or "RMSNorm" in module.__class__.__name__: + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(1.0) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.zero_() + + decoder_layer_cls.__init__ = _decoder_layer_init_patched + PreTrainedModel._init_weights = _init_weights_patched + PreTrainedModel._initialize_weights = _initialize_weights_patched + @property def tok_embeddings(self): """Returns the model's embed_tokens, handling different Hugging Face model structures.""" From 8d46723147543e7eb6fa4e451a65bb17ea05f0ac Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 20 Oct 2025 11:33:49 +0000 Subject: [PATCH 069/129] remove eos_id + refactor Optional[int] to comply with torchtitan convention --- .../transformers_backend/__init__.py | 46 +++++++++---------- .../transformers_backend/run_train.sh | 33 ------------- 2 files changed, 22 insertions(+), 57 deletions(-) delete mode 100755 torchtitan/experiments/transformers_backend/run_train.sh diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 34892cfcc2..0b50ce2027 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -5,7 +5,6 @@ # LICENSE file in the root directory of this source tree. 
import os from dataclasses import dataclass -from typing import Optional from torchtitan.components.loss import build_cross_entropy_loss from torchtitan.components.lr_scheduler import build_lr_schedulers @@ -34,41 +33,40 @@ class TitanModelArgs: dim: int = 4096 n_layers: int = 32 n_heads: int = 32 - n_kv_heads: Optional[int] = None - vocab_size: Optional[int] = None + n_kv_heads: int | None = None + vocab_size: int | None = None multiple_of: int = 256 - ffn_dim_multiplier: Optional[float] = None + ffn_dim_multiplier: float | None = None norm_eps: float = 1e-5 rope_theta: float = 10000 max_seq_len: int = 2048 depth_init: bool = True use_flex_attn: bool = False attn_mask_type: str = "causal" - eos_id: int = 0 @dataclass class DeepSeekV3Args: """Arguments specific to DeepSeekV3 models.""" - moe_args: Optional[MoEArgs] = None - n_group: Optional[int] = None - topk_group: Optional[int] = None - inter_dim: Optional[int] = None - moe_inter_dim: Optional[int] = None - n_dense_layers: Optional[int] = None - n_expert_groups: Optional[int] = None - n_limited_groups: Optional[int] = None - q_lora_rank: Optional[int] = None - kv_lora_rank: Optional[int] = None - qk_nope_head_dim: Optional[int] = None - qk_rope_head_dim: Optional[int] = None - v_head_dim: Optional[int] = None - original_seq_len: Optional[int] = None - rope_factor: Optional[float] = None - beta_fast: Optional[int] = None - beta_slow: Optional[int] = None - mscale: Optional[float] = None - partial_rotary_factor: Optional[float] = None + moe_args: MoEArgs | None = None + n_group: int | None = None + topk_group: int | None = None + inter_dim: int | None = None + moe_inter_dim: int | None = None + n_dense_layers: int | None = None + n_expert_groups: int | None = None + n_limited_groups: int | None = None + q_lora_rank: int | None = None + kv_lora_rank: int | None = None + qk_nope_head_dim: int | None = None + qk_rope_head_dim: int | None = None + v_head_dim: int | None = None + original_seq_len: int | None = None + rope_factor: float | None = None + beta_fast: int | None = None + beta_slow: int | None = None + mscale: float | None = None + partial_rotary_factor: float | None = None rope_interleave: bool = True diff --git a/torchtitan/experiments/transformers_backend/run_train.sh b/torchtitan/experiments/transformers_backend/run_train.sh deleted file mode 100755 index 3b82ad07f3..0000000000 --- a/torchtitan/experiments/transformers_backend/run_train.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/bash -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. - -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -set -ex - -# use envs as local overwrites for convenience -# e.g. 
-# LOG_RANK=0,1 NGPU=4 ./run_train.sh -NGPU=${NGPU:-"8"} -export LOG_RANK=${LOG_RANK:-0} - -# Option to switch between debug and train -MODE=${MODE:-"train"} # Set MODE=debug or MODE=train - -CONFIG_FILE=${CONFIG_FILE:-"configs/qwen3_fsdp2_tp2_pp2.toml"} - -if [ "$MODE" = "debug" ]; then - PYTHON_CMD="debugpy-run -m torch.distributed.run --" -else - PYTHON_CMD="torchrun" -fi - -TORCHFT_LIGHTHOUSE=${TORCHFT_LIGHTHOUSE:-"http://localhost:29510"} - -PYTORCH_ALLOC_CONF="expandable_segments:True" \ -TORCHFT_LIGHTHOUSE=${TORCHFT_LIGHTHOUSE} \ -$PYTHON_CMD --nproc_per_node=${NGPU} --rdzv_backend c10d --rdzv_endpoint="localhost:0" \ ---local-ranks-filter ${LOG_RANK} --role rank --tee 3 \ --m torchtitan.train --job.config_file ${CONFIG_FILE} "$@" \ No newline at end of file From 087f8411f5594fd1ee2bf350ff686ed30b859923 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 20 Oct 2025 11:38:05 +0000 Subject: [PATCH 070/129] move torch.utils.deterministic.fill_uninitialized_memory = False to utils + remove test_utils --- torchtitan/distributed/utils.py | 2 ++ torchtitan/train.py | 3 -- torchtitan/utils/test_utils.py | 52 --------------------------------- 3 files changed, 2 insertions(+), 55 deletions(-) delete mode 100644 torchtitan/utils/test_utils.py diff --git a/torchtitan/distributed/utils.py b/torchtitan/distributed/utils.py index 67eb41280f..ce59df57b0 100644 --- a/torchtitan/distributed/utils.py +++ b/torchtitan/distributed/utils.py @@ -100,6 +100,8 @@ def set_determinism( if deterministic: logger.info("Deterministic algorithm enabled (expect perf degradation).") torch.use_deterministic_algorithms(True) + # Otherwise, HF register buffer for ROPE (inv_freq) and this will be by default be initialized to Nan + torch.utils.deterministic.fill_uninitialized_memory = False torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False # env var for deterministic CuBLAS diff --git a/torchtitan/train.py b/torchtitan/train.py index 6bb28d4a8d..96a77caa0b 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -12,7 +12,6 @@ import torch from torch.distributed.elastic.multiprocessing.errors import record -from torchtitan.utils.test_utils import debug_structure_param import torchtitan.protocols.train_spec as train_spec_module from torchtitan.components.checkpoint import CheckpointManager from torchtitan.components.dataloader import DataloaderExhaustedError @@ -173,8 +172,6 @@ def __init__(self, job_config: JobConfig): self.metrics_processor.num_flops_per_token, ) = model_args.get_nparams_and_flops(model, job_config.training.seq_len) - debug_structure_param(model) - logger.info( f"{color.blue}Model {job_config.model.name} {job_config.model.flavor} " f"{color.red}size: {model_param_count:,} total parameters{color.reset}" diff --git a/torchtitan/utils/test_utils.py b/torchtitan/utils/test_utils.py deleted file mode 100644 index efb8ac478d..0000000000 --- a/torchtitan/utils/test_utils.py +++ /dev/null @@ -1,52 +0,0 @@ -import torch -import functools -import torch.nn as nn -from torchtitan.tools.logging import logger -from transformers.utils import is_torch_deterministic -import lovely_tensors as lt; lt.monkey_patch() - -def debug_structure_param(model: nn.Module): - """Print a breakdown of model parameters by module structure.""" - logger.info("Model Structure Parameter Breakdown:") - - if is_torch_deterministic(): - # Otherwise, HF register buffer for ROPE (inv_freq) and this will be by default be initialized to Nan - torch.utils.deterministic.fill_uninitialized_memory = False - - def 
_format_module(module: nn.Module, prefix: str = ""): - for name, sub_module in module.named_children(): - sub_module_params = sum(p.numel() for p in sub_module.parameters()) - if sub_module_params > 0: - logger.info( - f"{prefix}({name}): {sub_module.__class__.__name__} - {sub_module_params:,} params" - ) - _format_module(sub_module, prefix + " ") - - total_params = sum(p.numel() for p in model.parameters()) - logger.info(f"{model.__class__.__name__} - {total_params:,} params") - _format_module(model, " ") - -def seeded_init_decorator_for_test(seed): - """ - Decorator that adds torch.manual_seed before every nn.init.trunc_normal_ call - and prints layer weights after initialization. - """ - def decorator(func): - @functools.wraps(func) - def wrapper(*args, **kwargs): - original_trunc_normal = nn.init.trunc_normal_ - - def seeded_trunc_normal(*trunc_args, **trunc_kwargs): - torch.manual_seed(seed) - tensor = trunc_args[0] # First argument is always the tensor - result = original_trunc_normal(*trunc_args, **trunc_kwargs) - return result - - try: - nn.init.trunc_normal_ = seeded_trunc_normal - return func(*args, **kwargs) - finally: - nn.init.trunc_normal_ = original_trunc_normal - - return wrapper - return decorator From 937c68d092f8229330d3b1e9c8c6b361b91830b6 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 20 Oct 2025 11:51:36 +0000 Subject: [PATCH 071/129] remove test_template for base_config instead --- .../configs/test_template.toml | 88 ------------------- .../test_hf_integration.py | 12 ++- torchtitan/train.py | 1 - 3 files changed, 5 insertions(+), 96 deletions(-) delete mode 100644 torchtitan/experiments/transformers_backend/configs/test_template.toml diff --git a/torchtitan/experiments/transformers_backend/configs/test_template.toml b/torchtitan/experiments/transformers_backend/configs/test_template.toml deleted file mode 100644 index 0964cf640e..0000000000 --- a/torchtitan/experiments/transformers_backend/configs/test_template.toml +++ /dev/null @@ -1,88 +0,0 @@ -# torchtitan Config.toml - -[job] -dump_folder = "./outputs" -description = "Llama 3 debug training" -print_config = true - -[profiling] -enable_profiling = true -save_traces_folder = "profile_trace" -profile_freq = 5 -enable_memory_snapshot = false -save_memory_snapshot_folder = "memory_snapshot" - -[metrics] -log_freq = 1 -disable_color_printing = false -enable_tensorboard = false -save_tb_folder = "tb" -enable_wandb = false - -[model] -name = "llama3" -flavor = "debugmodel" -# test folder with tokenizer.json, for debug purpose only -hf_assets_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" -# converters = ["float8"] - -[optimizer] -name = "AdamW" -lr = 8e-4 -eps = 1e-8 - -[lr_scheduler] -warmup_steps = 2 # lr scheduler warm up, normally 20% of the train steps -decay_ratio = 0.8 # lr scheduler decay ratio, 80% of the train steps -decay_type = "linear" -min_lr_factor = 0.0 - -[training] -global_batch_size = 4 -local_batch_size = 2 -seq_len = 2048 -max_norm = 1.0 # grad norm clipping -steps = 10 -dataset = "c4_test" # supported datasets: c4_test (2K), c4 (177M) -dataset_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test" -mixed_precision_param = "float32" # force float32 for comparison -mixed_precision_reduce = "float32" - -[parallelism] -data_parallel_replicate_degree = 1 -data_parallel_shard_degree = 1 -fsdp_reshard_after_forward = "default" # default / never / always -tensor_parallel_degree = 1 -enable_async_tensor_parallel = false 
-pipeline_parallel_degree = 1 -pipeline_parallel_schedule = "1F1B" -context_parallel_degree = 1 -expert_parallel_degree = 1 -expert_tensor_parallel_degree = 1 - -[checkpoint] -enable = false -folder = "checkpoint" -interval = 10 -last_save_model_only = false -export_dtype = "float32" -async_mode = "disabled" # ["disabled", "async", "async_with_pinned_mem"] - -[activation_checkpoint] -mode = "selective" # ["none", "selective", "full"] -selective_ac_option = '2' # 'int' = ac every positive int layer or 'op', ac based on ops policy - -[compile] -enable=false -components = ["model", "loss"] - -[quantize.linear.float8] -enable_fsdp_float8_all_gather = false -precompute_float8_dynamic_scale_for_fsdp = false -filter_fqns = ["output"] - -[validation] -enable = false -dataset = "c4_validation" -freq = 5 -steps = 10 diff --git a/torchtitan/experiments/transformers_backend/test_hf_integration.py b/torchtitan/experiments/transformers_backend/test_hf_integration.py index 4838133618..46b4b3e385 100644 --- a/torchtitan/experiments/transformers_backend/test_hf_integration.py +++ b/torchtitan/experiments/transformers_backend/test_hf_integration.py @@ -144,7 +144,7 @@ def create_configs(model_name: str, out_dir: str, flavor: str): |_ llama3 #torchtitan model """ - base_config = "configs/test_template.toml" + base_config = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/test_template.toml" with open(base_config, "r") as f: config = toml.load(f) @@ -223,13 +223,11 @@ def create_configs(model_name: str, out_dir: str, flavor: str): iter_config["parallelism"]["pipeline_parallel_degree"] = pp iter_config["parallelism"]["pipeline_parallel_schedule"] = "GPipe" iter_config["job"]["dump_folder"] = str(pc_dir) - - # if pc == "fsdp1_tp1_cp1_pp2" or pc == BASELINE: - # iter_config["training"]["global_batch_size"] = 1 - # iter_config["training"]["local_batch_size"] = 1 - if pc == BASELINE or pc == "fsdp2_tp1_cp1_pp2": - iter_config["training"]["local_batch_size"] = 2 + iter_config["training"]["global_batch_size"] = 4 + iter_config["training"]["local_batch_size"] = 2 + iter_config["training"]["mixed_precision_param"] = "float32" + iter_config["training"]["mixed_precision_reduce"] = "float32" config_path = pc_dir / "config.toml" with open(config_path, "w") as f: diff --git a/torchtitan/train.py b/torchtitan/train.py index 96a77caa0b..bc7c23daee 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -171,7 +171,6 @@ def __init__(self, job_config: JobConfig): model_param_count, self.metrics_processor.num_flops_per_token, ) = model_args.get_nparams_and_flops(model, job_config.training.seq_len) - logger.info( f"{color.blue}Model {job_config.model.name} {job_config.model.flavor} " f"{color.red}size: {model_param_count:,} total parameters{color.reset}" From 4f2b357909443e0147c07867ff9ac568c7134cf1 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 20 Oct 2025 12:12:55 +0000 Subject: [PATCH 072/129] separate args &model + dont extract loss metrics -1.0 when double PP rank in tests --- .../transformers_backend/__init__.py | 6 +- .../transformers_backend/model/args.py | 268 +++++++++++++++++ .../{hf_transformers_args.py => model.py} | 273 +----------------- .../test_hf_integration.py | 7 +- 4 files changed, 283 insertions(+), 271 deletions(-) create mode 100644 torchtitan/experiments/transformers_backend/model/args.py rename torchtitan/experiments/transformers_backend/model/{hf_transformers_args.py => model.py} (69%) diff --git a/torchtitan/experiments/transformers_backend/__init__.py 
b/torchtitan/experiments/transformers_backend/__init__.py index 0b50ce2027..77afb7d29b 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -16,14 +16,14 @@ from torchtitan.protocols.train_spec import register_train_spec, TrainSpec from .infra.parallelize_hf_transformers import parallelize_hf_transformers -from .model.hf_transformers_args import HFTransformerModelArgs, HFTransformerModel - +from .model.args import HFTransformerModelArgs +from .model.model import HFTransformerModel from torchtitan.models.moe import MoEArgs + __all__ = [ "HFTransformerModelArgs", "HFTransformerModel", - "hf_transformers_configs", ] @dataclass diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py new file mode 100644 index 0000000000..b1cde8e881 --- /dev/null +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -0,0 +1,268 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import importlib +from dataclasses import dataclass +import torch +from torch import nn +import math +from torch.nn import init +from torchtitan.config import JobConfig +from torchtitan.protocols import BaseModelArgs +from torchtitan.tools.logging import logger +from transformers import AutoConfig +from transformers.utils import is_torch_deterministic +from transformers.configuration_utils import PretrainedConfig +from transformers.modeling_utils import AttentionInterface, PreTrainedModel +from transformers.integrations.sdpa_attention import sdpa_attention_forward + +@dataclass +class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): + """ + Configuration class that bridges TorchTitan and HuggingFace Transformers naming conventions. + + Uses properties to provide TorchTitan-style access while maintaining HuggingFace compatibility. + Properties are created dynamically based on which arguments are provided. 
+ """ + + # Define all possible mappings organized by argument type + _TT_TO_HF_MAPPINGS = { + "base": { + # Core TorchTitan mappings (always available) + "dim": "hidden_size", + "n_layers": "num_hidden_layers", + "n_heads": "num_attention_heads", + "n_kv_heads": "num_key_value_heads", + "norm_eps": "rms_norm_eps", + "max_seq_len": "max_position_embeddings", + "eos_id": "eos_token_id", + }, + "deepseek_v3": { + # DeepSeekV3 specific mappings (only when deepseek_v3_args provided) + "inter_dim": "intermediate_size", + "n_dense_layers": "first_k_dense_replace", + }, + } + + def __init__( + self, + titan_args, + deepseek_v3_args=None, + # HuggingFace specific args + attn_implementation: str = "sdpa_torchtitan", + **kwargs, + ): + super().__init__(attn_implementation=attn_implementation, **kwargs) + assert titan_args is not None, "titan_args is required" + + active_mappings = {} + + active_mappings.update(self._TT_TO_HF_MAPPINGS["base"]) + + if deepseek_v3_args is not None: + active_mappings.update(self._TT_TO_HF_MAPPINGS["deepseek_v3"]) + + self._active_mappings = active_mappings + + self._create_dynamic_properties() + + # Set HF attributes from titan_args based on mappings + for titan_name, hf_name in self._active_mappings.items(): + if hasattr(titan_args, titan_name): + setattr(self, hf_name, getattr(titan_args, titan_name)) + + # Fill all TorchTitan-specific args (no HF equivalent) + self.multiple_of = titan_args.multiple_of + self.ffn_dim_multiplier = titan_args.ffn_dim_multiplier + self.depth_init = titan_args.depth_init + self.use_flex_attn = titan_args.use_flex_attn + self.attn_mask_type = titan_args.attn_mask_type + + # HuggingFace specific args + self.attn_implementation = attn_implementation + #NOTE:(3outeille):This will force create_causal_mask to return None + AttentionInterface._global_mapping[attn_implementation] = sdpa_attention_forward + + # Start with passed_args as just titan_args + self._passed_args = {**titan_args.__dict__, "attn_implementation": attn_implementation} + self._passed_args.update(kwargs) + + #NOTE(3outeille): Wait for transformers uniformization of MoE args + if deepseek_v3_args is not None: + # For DeepSeekV3, setting q_lora_rank to 0 in TorchTitan is equivalent to + # setting it to None in HuggingFace. 
+ q_lora_rank = deepseek_v3_args.q_lora_rank + if q_lora_rank == 0: + q_lora_rank = None + deepseek_v3_args.q_lora_rank = q_lora_rank + + self._passed_args.update(**deepseek_v3_args.__dict__) + + self.rope_interleave = deepseek_v3_args.rope_interleave + self.partial_rotary_factor = deepseek_v3_args.partial_rotary_factor + + if deepseek_v3_args.moe_args is not None: + moe_args = deepseek_v3_args.moe_args + self.num_experts_per_tok = moe_args.top_k + self.n_routed_experts = moe_args.num_experts + self.n_shared_experts = moe_args.num_shared_experts + self.moe_intermediate_size = deepseek_v3_args.moe_inter_dim + self._passed_args.update( + dict( + num_experts_per_tok=moe_args.top_k, + n_routed_experts=moe_args.num_experts, + n_shared_experts=moe_args.num_shared_experts, + moe_intermediate_size=deepseek_v3_args.moe_inter_dim, + ) + ) + + def _create_dynamic_properties(self): + """Create properties dynamically based on active mappings.""" + def _create_property(hf_name: str) -> property: + def getter(self): + return getattr(self, hf_name) + def setter(self, value): + setattr(self, hf_name, value) + return property(getter, setter) + + for titan_name, hf_name in self._active_mappings.items(): + # Create getter/setter for attribute that don't already exist + if not hasattr(self.__class__, titan_name): + setattr(self.__class__, titan_name, _create_property(hf_name)) + + def __repr__(self) -> str: + # HFTransformerModelArgs is a dataclass that also inherits from PretrainedConfig. + # PretrainedConfig has a __repr__ that serializes the object to JSON, but it + # doesn't work well with how HFTransformerModelArgs is initialized. + # This custom __repr__ provides a dataclass-like representation that correctly + # displays the arguments passed during initialization. 
+ args_lines = [ + f"{k}={getattr(self, k)!r}" + for k in sorted(self._passed_args.keys()) + if hasattr(self, k) + ] + args_str = "\n".join(args_lines) + return f"{self.__class__.__name__}(\n{args_str}\n)" + + def update_from_config(self, job_config: JobConfig): + # Load HF config (overwrites our HF attributes) + hf_model_config = AutoConfig.from_pretrained( + job_config.model.name, + attn_implementation=self.attn_implementation, + trust_remote_code=True + ) + + # Explicitly update attributes based on mappings + for titan_name, hf_name in self._active_mappings.items(): + if hasattr(hf_model_config, hf_name): + setattr(self, titan_name, getattr(hf_model_config, hf_name)) + + # Copy any other attributes that might not be in the mapping + for key, value in hf_model_config.to_dict().items(): + setattr(self, key, value) + + # Update our attributes with the passed args from flavors + for key, value in self._passed_args.items(): + if hasattr(self, key) and value is not None: + setattr(self, key, value) + + # MoE + if hasattr(self, "qk_nope_head_dim") and hasattr(self, "qk_rope_head_dim"): + self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim + + # Configure HF-specific settings to match TorchTitan settings + self.tie_word_embeddings = False + self.attention_bias = False + self.mlp_bias = False + self.use_cache = False + self.initializer_range = 1.0 # use as std for normal init in embedding + + if not hasattr(self, "inter_dim"): # Only for llama model + ffn_hidden_size = 4 * self.dim + ffn_hidden_size = int(2 * ffn_hidden_size / 3) + if self.ffn_dim_multiplier is not None: + ffn_hidden_size = int(self.ffn_dim_multiplier * ffn_hidden_size) + self.intermediate_size = self.multiple_of * ( + (ffn_hidden_size + self.multiple_of - 1) // self.multiple_of + ) + + self.head_dim = self.dim // self.num_attention_heads + + return self + + def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: + # Check if this is a MoE model by looking for MoE attributes + is_moe = hasattr(self, 'n_routed_experts') + + if is_moe: + # MoE parameter counting (adapted from DeepSeek V3 implementation) + nparams_embedding = 0 + nparams_moe_router = 0 + nparams_shared_experts = 0 + nparams_experts = 0 + nparams_dense = 0 + + for name, p in model.named_parameters(): + if "embedding" in name: + nparams_embedding += p.numel() + nparams_dense += p.numel() + elif "moe.shared_experts" in name: + nparams_shared_experts += p.numel() + elif "moe.router" in name: + nparams_moe_router += p.numel() + elif "moe.experts" in name: + nparams_experts += p.numel() + else: + nparams_dense += p.numel() + + nparams_sparse = nparams_moe_router + nparams_shared_experts + nparams_experts + nparams = nparams_dense + nparams_sparse + nparams_sparse_active = ( + nparams_moe_router + + nparams_shared_experts + + nparams_experts * self.num_experts_per_tok // self.n_routed_experts + ) + + logger.info( + f"Total parameter count: dense {nparams_dense:,}, " + f"sparse {nparams_sparse:,}, active {nparams_dense + nparams_sparse_active:,}" + ) + + l, h, q, t = ( + self.n_layers, + self.n_heads, + self.dim // self.n_heads, + seq_len, + ) + # Use active parameters for FLOPS calculation in MoE + num_flops_per_token = ( + 6 * (nparams_dense - nparams_embedding + nparams_sparse_active) + + 12 * l * h * q * t + ) + else: + # Dense model parameter counting (original implementation) + nparams = sum(p.numel() for p in model.parameters()) + nparams_embedding = sum( + sum(p.numel() for p in m.parameters()) + for m in model.children() + 
if isinstance(m, nn.Embedding) + ) + + l, h, q, t = ( + self.n_layers, + self.n_heads, + self.dim // self.n_heads, + seq_len, + ) + # Reasoning behind the factor of 12 for the self-attention part of the formula: + # 1. each self-attention has 2 matmul in the forward and 4 in the backward (6) + # 2. the flash attention does 1 more matmul recomputation in the backward + # but recomputation should not be counted in calculating MFU (+0) + # 3. each matmul performs 1 multiplication and 1 addition (*2) + # 4. we follow the convention and do not account for sparsity in causal attention + num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t + + return nparams, num_flops_per_token \ No newline at end of file diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/model.py similarity index 69% rename from torchtitan/experiments/transformers_backend/model/hf_transformers_args.py rename to torchtitan/experiments/transformers_backend/model/model.py index 883c282dc0..1e17247bff 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/model.py @@ -1,271 +1,12 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -import importlib -from dataclasses import dataclass -import torch -from torch import nn import math +import torch from torch.nn import init -from torchtitan.config import JobConfig -from torchtitan.protocols import BaseModelArgs -from torchtitan.tools.logging import logger -from transformers import AutoConfig -from transformers.utils import is_torch_deterministic +from transformers.modeling_utils import PreTrainedModel from transformers.configuration_utils import PretrainedConfig -from transformers.modeling_utils import AttentionInterface, PreTrainedModel -from transformers.integrations.sdpa_attention import sdpa_attention_forward - -@dataclass -class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): - """ - Configuration class that bridges TorchTitan and HuggingFace Transformers naming conventions. - - Uses properties to provide TorchTitan-style access while maintaining HuggingFace compatibility. - Properties are created dynamically based on which arguments are provided. 
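# A minimal, self-contained sketch of the "dynamic properties" idea described in
# the docstring above: class-level properties are generated from a name-mapping
# table so the TorchTitan spelling reads and writes the HuggingFace attribute.
# The factory function is what freezes each hf_name; a bare closure defined in
# the loop would late-bind and point every property at the last mapping entry.
# `Demo` and the two mappings here are illustrative only, not the real classes.

def _make_alias(hf_name: str) -> property:
    def getter(self):
        return getattr(self, hf_name)

    def setter(self, value):
        setattr(self, hf_name, value)

    return property(getter, setter)

class Demo:
    pass

for tt_name, hf_name in {"dim": "hidden_size", "n_layers": "num_hidden_layers"}.items():
    setattr(Demo, tt_name, _make_alias(hf_name))

d = Demo()
d.dim, d.n_layers = 256, 6
assert (d.hidden_size, d.num_hidden_layers) == (256, 6)
assert d.dim == 256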
- """ - - # Define all possible mappings organized by argument type - _TT_TO_HF_MAPPINGS = { - "base": { - # Core TorchTitan mappings (always available) - "dim": "hidden_size", - "n_layers": "num_hidden_layers", - "n_heads": "num_attention_heads", - "n_kv_heads": "num_key_value_heads", - "norm_eps": "rms_norm_eps", - "max_seq_len": "max_position_embeddings", - "eos_id": "eos_token_id", - }, - "deepseek_v3": { - # DeepSeekV3 specific mappings (only when deepseek_v3_args provided) - "inter_dim": "intermediate_size", - "n_dense_layers": "first_k_dense_replace", - }, - } - - def __init__( - self, - titan_args, - deepseek_v3_args=None, - # HuggingFace specific args - attn_implementation: str = "sdpa_torchtitan", - **kwargs, - ): - super().__init__(attn_implementation=attn_implementation, **kwargs) - assert titan_args is not None, "titan_args is required" - - active_mappings = {} - - active_mappings.update(self._TT_TO_HF_MAPPINGS["base"]) - - if deepseek_v3_args is not None: - active_mappings.update(self._TT_TO_HF_MAPPINGS["deepseek_v3"]) - - self._active_mappings = active_mappings - - self._create_dynamic_properties() - - # Set HF attributes from titan_args based on mappings - for titan_name, hf_name in self._active_mappings.items(): - if hasattr(titan_args, titan_name): - setattr(self, hf_name, getattr(titan_args, titan_name)) - - # Fill all TorchTitan-specific args (no HF equivalent) - self.multiple_of = titan_args.multiple_of - self.ffn_dim_multiplier = titan_args.ffn_dim_multiplier - self.depth_init = titan_args.depth_init - self.use_flex_attn = titan_args.use_flex_attn - self.attn_mask_type = titan_args.attn_mask_type - - # HuggingFace specific args - self.attn_implementation = attn_implementation - #NOTE:(3outeille):This will force create_causal_mask to return None - AttentionInterface._global_mapping[attn_implementation] = sdpa_attention_forward - - # Start with passed_args as just titan_args - self._passed_args = {**titan_args.__dict__, "attn_implementation": attn_implementation} - self._passed_args.update(kwargs) - - #NOTE(3outeille): Wait for transformers uniformization of MoE args - if deepseek_v3_args is not None: - # For DeepSeekV3, setting q_lora_rank to 0 in TorchTitan is equivalent to - # setting it to None in HuggingFace. 
- q_lora_rank = deepseek_v3_args.q_lora_rank - if q_lora_rank == 0: - q_lora_rank = None - deepseek_v3_args.q_lora_rank = q_lora_rank - - self._passed_args.update(**deepseek_v3_args.__dict__) - - self.rope_interleave = deepseek_v3_args.rope_interleave - self.partial_rotary_factor = deepseek_v3_args.partial_rotary_factor - - if deepseek_v3_args.moe_args is not None: - moe_args = deepseek_v3_args.moe_args - self.num_experts_per_tok = moe_args.top_k - self.n_routed_experts = moe_args.num_experts - self.n_shared_experts = moe_args.num_shared_experts - self.moe_intermediate_size = deepseek_v3_args.moe_inter_dim - self._passed_args.update( - dict( - num_experts_per_tok=moe_args.top_k, - n_routed_experts=moe_args.num_experts, - n_shared_experts=moe_args.num_shared_experts, - moe_intermediate_size=deepseek_v3_args.moe_inter_dim, - ) - ) - - def _create_dynamic_properties(self): - """Create properties dynamically based on active mappings.""" - def _create_property(hf_name: str) -> property: - def getter(self): - return getattr(self, hf_name) - def setter(self, value): - setattr(self, hf_name, value) - return property(getter, setter) - - for titan_name, hf_name in self._active_mappings.items(): - # Create getter/setter for attribute that don't already exist - if not hasattr(self.__class__, titan_name): - setattr(self.__class__, titan_name, _create_property(hf_name)) - - def __repr__(self) -> str: - # HFTransformerModelArgs is a dataclass that also inherits from PretrainedConfig. - # PretrainedConfig has a __repr__ that serializes the object to JSON, but it - # doesn't work well with how HFTransformerModelArgs is initialized. - # This custom __repr__ provides a dataclass-like representation that correctly - # displays the arguments passed during initialization. 
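# Small stand-alone version of the dataclass-style repr described above: only
# the arguments that were explicitly passed are rendered, one per line, sorted
# by name. `FakeConfig` and its fields are illustrative, not the real config.

class FakeConfig:
    def __init__(self, **kwargs):
        self._passed_args = dict(kwargs)
        for k, v in kwargs.items():
            setattr(self, k, v)

    def __repr__(self) -> str:
        lines = "\n".join(
            f"{k}={getattr(self, k)!r}" for k in sorted(self._passed_args)
        )
        return f"{type(self).__name__}(\n{lines}\n)"

print(FakeConfig(dim=256, n_layers=6))
# FakeConfig(
# dim=256
# n_layers=6
# )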
- args_lines = [ - f"{k}={getattr(self, k)!r}" - for k in sorted(self._passed_args.keys()) - if hasattr(self, k) - ] - args_str = "\n".join(args_lines) - return f"{self.__class__.__name__}(\n{args_str}\n)" - - def update_from_config(self, job_config: JobConfig): - # Load HF config (overwrites our HF attributes) - hf_model_config = AutoConfig.from_pretrained( - job_config.model.name, - attn_implementation=self.attn_implementation, - trust_remote_code=True - ) - - # Explicitly update attributes based on mappings - for titan_name, hf_name in self._active_mappings.items(): - if hasattr(hf_model_config, hf_name): - setattr(self, titan_name, getattr(hf_model_config, hf_name)) - - # Copy any other attributes that might not be in the mapping - for key, value in hf_model_config.to_dict().items(): - setattr(self, key, value) - - # Update our attributes with the passed args from flavors - for key, value in self._passed_args.items(): - if hasattr(self, key) and value is not None: - setattr(self, key, value) - - # MoE - if hasattr(self, "qk_nope_head_dim") and hasattr(self, "qk_rope_head_dim"): - self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim - - # Configure HF-specific settings to match TorchTitan settings - self.tie_word_embeddings = False - self.attention_bias = False - self.mlp_bias = False - self.use_cache = False - self.initializer_range = 1.0 # use as std for normal init in embedding - - if not hasattr(self, "inter_dim"): # Only for llama model - ffn_hidden_size = 4 * self.dim - ffn_hidden_size = int(2 * ffn_hidden_size / 3) - if self.ffn_dim_multiplier is not None: - ffn_hidden_size = int(self.ffn_dim_multiplier * ffn_hidden_size) - self.intermediate_size = self.multiple_of * ( - (ffn_hidden_size + self.multiple_of - 1) // self.multiple_of - ) - - self.head_dim = self.dim // self.num_attention_heads - - return self - - def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: - # Check if this is a MoE model by looking for MoE attributes - is_moe = hasattr(self, 'n_routed_experts') - - if is_moe: - # MoE parameter counting (adapted from DeepSeek V3 implementation) - nparams_embedding = 0 - nparams_moe_router = 0 - nparams_shared_experts = 0 - nparams_experts = 0 - nparams_dense = 0 - - for name, p in model.named_parameters(): - if "embedding" in name: - nparams_embedding += p.numel() - nparams_dense += p.numel() - elif "moe.shared_experts" in name: - nparams_shared_experts += p.numel() - elif "moe.router" in name: - nparams_moe_router += p.numel() - elif "moe.experts" in name: - nparams_experts += p.numel() - else: - nparams_dense += p.numel() - - nparams_sparse = nparams_moe_router + nparams_shared_experts + nparams_experts - nparams = nparams_dense + nparams_sparse - nparams_sparse_active = ( - nparams_moe_router - + nparams_shared_experts - + nparams_experts * self.num_experts_per_tok // self.n_routed_experts - ) - - logger.info( - f"Total parameter count: dense {nparams_dense:,}, " - f"sparse {nparams_sparse:,}, active {nparams_dense + nparams_sparse_active:,}" - ) - - l, h, q, t = ( - self.n_layers, - self.n_heads, - self.dim // self.n_heads, - seq_len, - ) - # Use active parameters for FLOPS calculation in MoE - num_flops_per_token = ( - 6 * (nparams_dense - nparams_embedding + nparams_sparse_active) - + 12 * l * h * q * t - ) - else: - # Dense model parameter counting (original implementation) - nparams = sum(p.numel() for p in model.parameters()) - nparams_embedding = sum( - sum(p.numel() for p in m.parameters()) - for m in model.children() - 
if isinstance(m, nn.Embedding) - ) - - l, h, q, t = ( - self.n_layers, - self.n_heads, - self.dim // self.n_heads, - seq_len, - ) - # Reasoning behind the factor of 12 for the self-attention part of the formula: - # 1. each self-attention has 2 matmul in the forward and 4 in the backward (6) - # 2. the flash attention does 1 more matmul recomputation in the backward - # but recomputation should not be counted in calculating MFU (+0) - # 3. each matmul performs 1 multiplication and 1 addition (*2) - # 4. we follow the convention and do not account for sparsity in causal attention - num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t - - return nparams, num_flops_per_token +import importlib +from torch import nn +from .args import HFTransformerModelArgs +from torchtitan.tools.logging import logger class HFTransformerModel(nn.Module): def __init__(self, model_args: HFTransformerModelArgs): @@ -779,4 +520,4 @@ def __setattr__(self, name, value): return # Otherwise, fall back to the default nn.Module behavior. - super().__setattr__(name, value) \ No newline at end of file + super().__setattr__(name, value) diff --git a/torchtitan/experiments/transformers_backend/test_hf_integration.py b/torchtitan/experiments/transformers_backend/test_hf_integration.py index 46b4b3e385..6a1f5c1852 100644 --- a/torchtitan/experiments/transformers_backend/test_hf_integration.py +++ b/torchtitan/experiments/transformers_backend/test_hf_integration.py @@ -421,13 +421,16 @@ def _extract_metrics(log_file: Path) -> TrainingMetrics: # Regex to capture all metrics from a log line, ignoring ANSI color codes pattern = re.compile( r"step:\s*(\d+)\s*" - r".*?loss:\s*([0-9]+\.?[0-9]*)\s*" + r".*?loss:\s*(-?[0-9]+\.?[0-9]*)\s*" r".*?grad_norm:\s*([0-9]+\.?[0-9]*)\s*" ) for match in pattern.finditer(content): + loss = float(match.group(2)) + if loss == -1.0: + continue metrics.steps.append(int(match.group(1))) - metrics.loss.append(float(match.group(2))) + metrics.loss.append(loss) metrics.grad_norm.append(float(match.group(3))) except Exception as e: From 154289d040a4624dd635d2e805aa39107376bd82 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 21 Oct 2025 08:37:29 +0000 Subject: [PATCH 073/129] use recent refactoring for flops computation for dense and moe model --- .../transformers_backend/model/args.py | 74 +------------------ 1 file changed, 4 insertions(+), 70 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index b1cde8e881..c49109aa0b 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -13,10 +13,10 @@ from torchtitan.config import JobConfig from torchtitan.protocols import BaseModelArgs from torchtitan.tools.logging import logger +from torchtitan.models.utils import get_dense_model_nparams_and_flops, get_moe_model_nparams_and_flops from transformers import AutoConfig -from transformers.utils import is_torch_deterministic from transformers.configuration_utils import PretrainedConfig -from transformers.modeling_utils import AttentionInterface, PreTrainedModel +from transformers.modeling_utils import AttentionInterface from transformers.integrations.sdpa_attention import sdpa_attention_forward @dataclass @@ -194,75 +194,9 @@ def update_from_config(self, job_config: JobConfig): return self def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: - # Check if this is a MoE model by looking 
for MoE attributes is_moe = hasattr(self, 'n_routed_experts') if is_moe: - # MoE parameter counting (adapted from DeepSeek V3 implementation) - nparams_embedding = 0 - nparams_moe_router = 0 - nparams_shared_experts = 0 - nparams_experts = 0 - nparams_dense = 0 - - for name, p in model.named_parameters(): - if "embedding" in name: - nparams_embedding += p.numel() - nparams_dense += p.numel() - elif "moe.shared_experts" in name: - nparams_shared_experts += p.numel() - elif "moe.router" in name: - nparams_moe_router += p.numel() - elif "moe.experts" in name: - nparams_experts += p.numel() - else: - nparams_dense += p.numel() - - nparams_sparse = nparams_moe_router + nparams_shared_experts + nparams_experts - nparams = nparams_dense + nparams_sparse - nparams_sparse_active = ( - nparams_moe_router - + nparams_shared_experts - + nparams_experts * self.num_experts_per_tok // self.n_routed_experts - ) - - logger.info( - f"Total parameter count: dense {nparams_dense:,}, " - f"sparse {nparams_sparse:,}, active {nparams_dense + nparams_sparse_active:,}" - ) - - l, h, q, t = ( - self.n_layers, - self.n_heads, - self.dim // self.n_heads, - seq_len, - ) - # Use active parameters for FLOPS calculation in MoE - num_flops_per_token = ( - 6 * (nparams_dense - nparams_embedding + nparams_sparse_active) - + 12 * l * h * q * t - ) + return get_moe_model_nparams_and_flops(self, model, seq_len) else: - # Dense model parameter counting (original implementation) - nparams = sum(p.numel() for p in model.parameters()) - nparams_embedding = sum( - sum(p.numel() for p in m.parameters()) - for m in model.children() - if isinstance(m, nn.Embedding) - ) - - l, h, q, t = ( - self.n_layers, - self.n_heads, - self.dim // self.n_heads, - seq_len, - ) - # Reasoning behind the factor of 12 for the self-attention part of the formula: - # 1. each self-attention has 2 matmul in the forward and 4 in the backward (6) - # 2. the flash attention does 1 more matmul recomputation in the backward - # but recomputation should not be counted in calculating MFU (+0) - # 3. each matmul performs 1 multiplication and 1 addition (*2) - # 4. 
we follow the convention and do not account for sparsity in causal attention - num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t - - return nparams, num_flops_per_token \ No newline at end of file + return get_dense_model_nparams_and_flops(self, model, seq_len) \ No newline at end of file From 1b2cfd792e63c9a91f3bffe9283760018001fb23 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 21 Oct 2025 13:35:43 +0000 Subject: [PATCH 074/129] fix tie_embedding --- .../infra/parallelize_hf_transformers.py | 4 -- .../transformers_backend/model/args.py | 1 - .../transformers_backend/model/model.py | 48 +++++++++++++------ 3 files changed, 34 insertions(+), 19 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py index 32e122ab75..3d729f3afb 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -195,7 +195,6 @@ def parallelize_hf_transformers( logger.warning("CP support for FlexAttention is still in progress.") if parallel_dims.tp_enabled: - model.set_tp_mesh(world_mesh["tp"]) enable_float8_linear = "float8" in job_config.model.converters float8_is_rowwise = job_config.quantize.linear.float8.recipe_name in ( "rowwise", @@ -297,9 +296,6 @@ def parallelize_hf_transformers( enable_compiled_autograd=job_config.parallelism.enable_compiled_autograd, ) - if parallel_dims.pp_enabled: - model.set_pp_mesh(world_mesh["pp"]) - return model diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index c49109aa0b..6bd805fff4 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -174,7 +174,6 @@ def update_from_config(self, job_config: JobConfig): self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim # Configure HF-specific settings to match TorchTitan settings - self.tie_word_embeddings = False self.attention_bias = False self.mlp_bias = False self.use_cache = False diff --git a/torchtitan/experiments/transformers_backend/model/model.py b/torchtitan/experiments/transformers_backend/model/model.py index 1e17247bff..0a8c000d0e 100644 --- a/torchtitan/experiments/transformers_backend/model/model.py +++ b/torchtitan/experiments/transformers_backend/model/model.py @@ -94,17 +94,9 @@ def __init__(self, model_args: HFTransformerModelArgs): layer.moe_enabled = False self.cp_mesh = None - self.tp_mesh = None - self.pp_mesh = None def set_cp_mesh(self, mesh): self.cp_mesh = mesh - - def set_tp_mesh(self, mesh): - self.tp_mesh = mesh - - def set_pp_mesh(self, mesh): - self.pp_mesh = mesh def _patch_hf_llama_like(self, decoder_layer_cls, attention_cls, mlp_cls=None): """ @@ -155,7 +147,6 @@ def _init_weights_patched(self, module): `self` is a PreTrainedModel instance. 
""" config = self.config - # Build tuple of classes to check for layer_idx-based init_std calculation layer_idx_classes = [attention_cls] if mlp_cls: @@ -234,8 +225,21 @@ def _init_weights_patched(self, module): module.bias.data.zero_() elif isinstance(module, nn.Embedding): - std = config.initializer_range - module.weight.data.normal_(mean=0.0, std=std) + # When tie_word_embeddings is True, use lm_head initialization + if hasattr(config, "tie_word_embeddings") and config.tie_word_embeddings: + final_out_std = config.hidden_size**-0.5 + cutoff_factor = 3 + nn.init.trunc_normal_( + module.weight, + mean=0.0, + std=final_out_std, + a=-cutoff_factor * final_out_std, + b=cutoff_factor * final_out_std, + ) + else: + std = config.initializer_range + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() @@ -372,8 +376,21 @@ def _init_weights_patched(self, module): module.bias.data.zero_() elif isinstance(module, nn.Embedding): - std = config.initializer_range - module.weight.data.normal_(mean=0.0, std=std) + # When tie_word_embeddings is True, use lm_head initialization + if hasattr(config, "tie_word_embeddings") and config.tie_word_embeddings: + final_out_std = config.hidden_size**-0.5 + cutoff_factor = 3 + nn.init.trunc_normal_( + module.weight, + mean=0.0, + std=final_out_std, + a=-cutoff_factor * final_out_std, + b=cutoff_factor * final_out_std, + ) + else: + std = config.initializer_range + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() @@ -495,7 +512,10 @@ def selective_init(module): self.model.apply(selective_init) - self.model.tie_weights() + #TODO(3outeille): For pipeline parallel, only tie weights if both input and output embeddings are on the same device + # Maybe better way of handling this? 
+ if not isinstance(self.tok_embeddings, nn.Identity) and not isinstance(self.output, nn.Identity): + self.model.tie_weights() def named_children(self): """ From 0f2c51e026f228e38eee6a2cd151d58124f393cd Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 21 Oct 2025 13:46:39 +0000 Subject: [PATCH 075/129] remove pad_token_id=None --- torchtitan/experiments/transformers_backend/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 77afb7d29b..d315c05271 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -78,7 +78,6 @@ class DeepSeekV3Args: n_heads=16, n_kv_heads=16, ), - pad_token_id=None, #TODO(3outeille): use os.environ to switch between models deepseek_v3_args=DeepSeekV3Args( partial_rotary_factor=4.0, From 4c8b4b7f3895867de1ce1e52739deb68b9b3eb8c Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 21 Oct 2025 14:39:51 +0000 Subject: [PATCH 076/129] make it clearer about args --- .../transformers_backend/__init__.py | 12 ++-- .../transformers_backend/model/args.py | 55 +++++++++---------- 2 files changed, 31 insertions(+), 36 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index d315c05271..110a376642 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -27,9 +27,8 @@ ] @dataclass -class TitanModelArgs: +class TitanDenseModelArgs: """Arguments for the base TorchTitan model.""" - dim: int = 4096 n_layers: int = 32 n_heads: int = 32 @@ -46,7 +45,7 @@ class TitanModelArgs: @dataclass -class DeepSeekV3Args: +class TitanMoeModelArgs: """Arguments specific to DeepSeekV3 models.""" moe_args: MoEArgs | None = None n_group: int | None = None @@ -72,14 +71,13 @@ class DeepSeekV3Args: flavors = { "debugmodel": HFTransformerModelArgs( - titan_args=TitanModelArgs( + titan_dense_args=TitanDenseModelArgs( dim=256, n_layers=6, n_heads=16, n_kv_heads=16, ), - #TODO(3outeille): use os.environ to switch between models - deepseek_v3_args=DeepSeekV3Args( + titan_moe_args=TitanMoeModelArgs( partial_rotary_factor=4.0, inter_dim=1024, moe_inter_dim=256, @@ -103,7 +101,7 @@ class DeepSeekV3Args: ) if os.environ.get("USE_MOE", "0") == "1" else None, ), "full": HFTransformerModelArgs( - titan_args=TitanModelArgs(), + titan_dense_args=TitanDenseModelArgs(), ), } diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index 6bd805fff4..e02a04e136 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -30,8 +30,8 @@ class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): # Define all possible mappings organized by argument type _TT_TO_HF_MAPPINGS = { - "base": { - # Core TorchTitan mappings (always available) + "dense": { + # TorchTitan dense model mappings (always available) "dim": "hidden_size", "n_layers": "num_hidden_layers", "n_heads": "num_attention_heads", @@ -40,8 +40,8 @@ class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): "max_seq_len": "max_position_embeddings", "eos_id": "eos_token_id", }, - "deepseek_v3": { - # DeepSeekV3 specific mappings (only when deepseek_v3_args provided) + "moe": { + # TorchTitan moe model specific mappings (only when titan_moe_args 
provided) "inter_dim": "intermediate_size", "n_dense_layers": "first_k_dense_replace", }, @@ -49,21 +49,21 @@ class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): def __init__( self, - titan_args, - deepseek_v3_args=None, + titan_dense_args, + titan_moe_args=None, # HuggingFace specific args attn_implementation: str = "sdpa_torchtitan", **kwargs, ): super().__init__(attn_implementation=attn_implementation, **kwargs) - assert titan_args is not None, "titan_args is required" + assert titan_dense_args is not None, "titan_dense_args is required" active_mappings = {} - active_mappings.update(self._TT_TO_HF_MAPPINGS["base"]) + active_mappings.update(self._TT_TO_HF_MAPPINGS["dense"]) - if deepseek_v3_args is not None: - active_mappings.update(self._TT_TO_HF_MAPPINGS["deepseek_v3"]) + if titan_moe_args is not None: + active_mappings.update(self._TT_TO_HF_MAPPINGS["moe"]) self._active_mappings = active_mappings @@ -71,15 +71,15 @@ def __init__( # Set HF attributes from titan_args based on mappings for titan_name, hf_name in self._active_mappings.items(): - if hasattr(titan_args, titan_name): - setattr(self, hf_name, getattr(titan_args, titan_name)) + if hasattr(titan_dense_args, titan_name): + setattr(self, hf_name, getattr(titan_dense_args, titan_name)) # Fill all TorchTitan-specific args (no HF equivalent) - self.multiple_of = titan_args.multiple_of - self.ffn_dim_multiplier = titan_args.ffn_dim_multiplier - self.depth_init = titan_args.depth_init - self.use_flex_attn = titan_args.use_flex_attn - self.attn_mask_type = titan_args.attn_mask_type + self.multiple_of = titan_dense_args.multiple_of + self.ffn_dim_multiplier = titan_dense_args.ffn_dim_multiplier + self.depth_init = titan_dense_args.depth_init + self.use_flex_attn = titan_dense_args.use_flex_attn + self.attn_mask_type = titan_dense_args.attn_mask_type # HuggingFace specific args self.attn_implementation = attn_implementation @@ -87,35 +87,32 @@ def __init__( AttentionInterface._global_mapping[attn_implementation] = sdpa_attention_forward # Start with passed_args as just titan_args - self._passed_args = {**titan_args.__dict__, "attn_implementation": attn_implementation} + self._passed_args = {**titan_dense_args.__dict__, "attn_implementation": attn_implementation} self._passed_args.update(kwargs) #NOTE(3outeille): Wait for transformers uniformization of MoE args - if deepseek_v3_args is not None: + if titan_moe_args is not None: # For DeepSeekV3, setting q_lora_rank to 0 in TorchTitan is equivalent to # setting it to None in HuggingFace. 
- q_lora_rank = deepseek_v3_args.q_lora_rank + q_lora_rank = titan_moe_args.q_lora_rank if q_lora_rank == 0: q_lora_rank = None - deepseek_v3_args.q_lora_rank = q_lora_rank + titan_moe_args.q_lora_rank = q_lora_rank - self._passed_args.update(**deepseek_v3_args.__dict__) + self._passed_args.update(**titan_moe_args.__dict__) - self.rope_interleave = deepseek_v3_args.rope_interleave - self.partial_rotary_factor = deepseek_v3_args.partial_rotary_factor - - if deepseek_v3_args.moe_args is not None: - moe_args = deepseek_v3_args.moe_args + if titan_moe_args.moe_args is not None: + moe_args = titan_moe_args.moe_args self.num_experts_per_tok = moe_args.top_k self.n_routed_experts = moe_args.num_experts self.n_shared_experts = moe_args.num_shared_experts - self.moe_intermediate_size = deepseek_v3_args.moe_inter_dim + self.moe_intermediate_size = titan_moe_args.moe_inter_dim self._passed_args.update( dict( num_experts_per_tok=moe_args.top_k, n_routed_experts=moe_args.num_experts, n_shared_experts=moe_args.num_shared_experts, - moe_intermediate_size=deepseek_v3_args.moe_inter_dim, + moe_intermediate_size=titan_moe_args.moe_inter_dim, ) ) From c61271e3a75d1962a73aa10b30d4c5e63839538f Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 21 Oct 2025 14:44:12 +0000 Subject: [PATCH 077/129] remove local testing scripts --- .../configs/template.slurm | 115 --- .../test_hf_integration.py | 775 ------------------ 2 files changed, 890 deletions(-) delete mode 100644 torchtitan/experiments/transformers_backend/configs/template.slurm delete mode 100644 torchtitan/experiments/transformers_backend/test_hf_integration.py diff --git a/torchtitan/experiments/transformers_backend/configs/template.slurm b/torchtitan/experiments/transformers_backend/configs/template.slurm deleted file mode 100644 index 493b569e95..0000000000 --- a/torchtitan/experiments/transformers_backend/configs/template.slurm +++ /dev/null @@ -1,115 +0,0 @@ -#!/bin/bash -#SBATCH --job-name={{ name }} -#SBATCH --output={{ root_path }}/slurm_%j.out -#SBATCH --error={{ root_path }}/slurm_%j.out -#SBATCH --nodes={{ nodes }} -#SBATCH --gres=gpu:{{ n_proc_per_node }} -#SBATCH --ntasks-per-node=1 -#SBATCH --qos={{ qos }} -#SBATCH --cpus-per-task=12 - -# Misc initializations. 
-echo "========================" -echo "START TIME: $(date)" -source /etc/profile.d/modules.sh -source /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/env_torchtitan_official/bin/activate -echo python3 version = $(python3 --version) -echo "===========" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export TORCH_HOME="/fsx/ferdinandmom/cache/torch" -export HF_HOME="/fsx/ferdinandmom/cache/huggingface" -export HF_DATASETS_CACHE="/fsx/ferdinandmom/cache/huggingface/datasets" -export TRANSFORMERS_CACHE="/fsx/ferdinandmom/cache/huggingface/transformers" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" -export UV_CACHE_DIR="/fsx/ferdinandmom/.cache/uv" - -# EFA settings -export FI_PROVIDER=efa -export FI_EFA_FORK_SAFE=1 -export FI_EFA_ENABLE_SHM_TRANSFER=1 -export NCCL_PROTO=simple -export NCCL_SOCKET_IFNAME=enp - -module load cuda/12.4 - -echo "Running training job: {{ name }}" -echo "Config file: {{ config_path }}" - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Update status to "pending" or "running" in the background -update_status $job_id {{ root_path }}/status.txt & - -# LOG_DIR="{{ root_path }}/logs" -# mkdir -p ${LOG_DIR} - -# CMD="torchrun \ -# --nproc_per_node {{ n_proc_per_node }} \ -# --nnodes {{ nodes }} \ -# --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ -# --rdzv_backend c10d \ -# --max_restarts 0 \ -# --log-dir ${LOG_DIR} \ -# --role rank \ -# --tee 3 \ -# -m torchtitan.train \ -# --checkpoint.enable \ -# {% if name == "seed_checkpoint" %} --checkpoint.create_seed_checkpoint {% else %} --checkpoint.initial_load_path {{ initial_load_path }} {% endif %} \ -# --training.seed 42 \ -# --training.deterministic \ -# --training.steps 1 \ -# --job.config_file {{ config_path }}" - - -CMD="torchrun \ - --nproc_per_node {{ n_proc_per_node }} \ - --nnodes {{ nodes }} \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --role rank \ - --local_ranks_filter {{ n_proc_per_node - 1 }} \ - --tee 3 \ - -m torchtitan.train \ - --checkpoint.enable \ - {% if name == "seed_checkpoint" %} --checkpoint.create_seed_checkpoint {% else %} --checkpoint.initial_load_path {{ initial_load_path }} {% endif %} \ - --training.seed 42 \ - --training.deterministic \ - --job.config_file {{ config_path }}" - -# Run the main command -echo "Running command: srun -u $CMD" -srun -u $CMD -exit_status=$? 
- - -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > {{ root_path }}/status.txt -else - printf "fail" > {{ root_path }}/status.txt -fi diff --git a/torchtitan/experiments/transformers_backend/test_hf_integration.py b/torchtitan/experiments/transformers_backend/test_hf_integration.py deleted file mode 100644 index 6a1f5c1852..0000000000 --- a/torchtitan/experiments/transformers_backend/test_hf_integration.py +++ /dev/null @@ -1,775 +0,0 @@ -import toml -from argparse import ArgumentParser -from pathlib import Path -import re -import os -import subprocess -from enum import Enum -from jinja2 import Template -from rich.console import Console -from rich.panel import Panel -from rich.table import Table -from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn - -# BASELINE = "fsdp2_tp1_cp1_pp1" -BASELINE = "fsdp1_tp1_cp1_pp1" - -console = Console() - -class LogLevel(Enum): - INFO = "INFO" - SUCCESS = "SUCCESS" - WARNING = "WARNING" - ERROR = "ERROR" - TEST_PASS = "TEST_PASS" - TEST_FAIL = "TEST_FAIL" - -def log_message(level: LogLevel, message: str, indent: int = 0, dim: bool = False) -> None: - """Log a message with appropriate color coding.""" - style_map = { - LogLevel.INFO: "blue", - LogLevel.SUCCESS: "green", - LogLevel.WARNING: "yellow", - LogLevel.ERROR: "bold red", - LogLevel.TEST_PASS: "green", - LogLevel.TEST_FAIL: "bold red", - } - - prefix_map = { - LogLevel.INFO: "[INFO]", - LogLevel.SUCCESS: "[SUCCESS]", - LogLevel.WARNING: "[WARNING]", - LogLevel.ERROR: "[ERROR]", - LogLevel.TEST_PASS: "✅ TEST PASS", - LogLevel.TEST_FAIL: "❌ TEST FAIL", - } - - style = style_map[level] - prefix = prefix_map[level] - if indent > 0: - indent_str = " " * (indent - 1) + "└─ " - else: - indent_str = "" - - output = f"{indent_str}[{style}]{prefix}[/] {message}" - - if dim: - console.print(f"[dim]{output}[/dim]") - else: - console.print(output) - - -def _create_slurm_script( - config: dict, - config_path: Path, - script_path: Path, - job_name: str, - initial_load_path: str = None, - repo_id: str = None, -): - with open(config_path, "r") as file: - config = toml.load(file) - - pp = config["parallelism"]["pipeline_parallel_degree"] - dp = config["parallelism"]["data_parallel_shard_degree"] - tp = config["parallelism"]["tensor_parallel_degree"] - cp = config["parallelism"]["context_parallel_degree"] - world_size = pp * dp * tp * cp - - nodes = max(1, world_size // 8) - n_proc_per_node = min(8, world_size // nodes) - - print(f"world_size: {world_size}, nodes: {nodes}, n_proc_per_node: {n_proc_per_node}") - - # Read the SLURM script template from the file - template_path = Path(__file__).parent / "configs/template.slurm" - with open(template_path, "r") as f: - slurm_script_template = f.read() - base_bench_template = Template(slurm_script_template) - - context_bench = { - "name": job_name, - "nodes": nodes, - "n_proc_per_node": n_proc_per_node, - "root_path": script_path.parent, - "config_path": config_path, - "initial_load_path": initial_load_path, - "repo_id": repo_id, - "qos": "high" if nodes > 1 else "normal", # Example logic for qos - } - - with open(script_path, "w") as file: - file.write(base_bench_template.render(context_bench)) - - print(f"Slurm script created at {script_path}") - - -def create_configs(model_name: str, out_dir: str, flavor: str): - """ - results/ - |_ meta-llama - |_ Llama-3.2-1B - |_ debugmodel/ - |_ seed_checkpoint/ - |_ config.toml - |_ seed.slurm - |_ step-0/ - |_ .... 
- |_ fsdp2_tp1_cp1_pp1/ - |_ config.toml - |_ nd_parallelism.slurm - |_ nd_parallelism.log - |_ fsdp2_tp2_cp1_pp1/ - |_ config.toml - |_ nd_parallelism.slurm - |_ nd_parallelism.log - |_ diff_baseline_vs_nd_parallelism.log - |_ fsdp2_tp1_cp1_pp2/ - |_ config.toml - |_ nd_parallelism.slurm - |_ nd_parallelism.log - |_ diff_baseline_vs_nd_parallelism.log - |_ fsdp2_tp1_cp2_pp1/ - |_ config.toml - |_ nd_parallelism.slurm - |_ nd_parallelism.log - |_ diff_baseline_vs_nd_parallelism.log - |_ fsdp2_tp1_cp2_pp2/ - |_ config.toml - |_ nd_parallelism.slurm - |_ nd_parallelism.log - |_ diff_baseline_vs_nd_parallelism.log` - |_ full/ - ... - |_ llama3 #torchtitan model - """ - - base_config = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/test_template.toml" - with open(base_config, "r") as f: - config = toml.load(f) - - config["model"]["name"] = model_name - config["model"]["flavor"] = flavor - - # parallelism_configs = [ - # BASELINE, # baseline - # "fsdp2_tp2_cp1_pp1", - # # "fsdp2_tp1_cp1_pp2", - # # "fsdp2_tp1_cp2_pp1", - # # "fsdp2_tp1_cp2_pp2", - # # "fsdp2_tp2_cp2_pp1", - # # "fsdp2_tp2_cp1_pp2", - # # "fsdp2_tp2_cp2_pp2", - # ] - - # parallelism_configs = [ - # BASELINE, # baseline - # # "fsdp1_tp2_cp1_pp1", - # # "fsdp1_tp1_cp1_pp2", - # # "fsdp1_tp1_cp2_pp1", - # # "fsdp1_tp1_cp2_pp2", - # # "fsdp1_tp2_cp2_pp1", - # # "fsdp1_tp2_cp1_pp2", - # # "fsdp1_tp2_cp2_pp2", - # ] - - parallelism_configs = [ - BASELINE, # baseline - "fsdp1_tp2_cp1_pp1", - ] - - out_path = Path(out_dir) / model_name / flavor - out_path.mkdir(parents=True, exist_ok=True) - - # Create seed checkpoint - seed_config = toml.loads(toml.dumps(config)) - seed_config["parallelism"]["data_parallel_shard_degree"] = 1 - seed_config["parallelism"]["tensor_parallel_degree"] = 1 - seed_config["parallelism"]["pipeline_parallel_degree"] = 1 - seed_config["parallelism"]["context_parallel_degree"] = 1 - seed_checkpoint_dir = out_path / "seed_checkpoint" - seed_checkpoint_dir.mkdir(exist_ok=True) - seed_config["job"]["dump_folder"] = str(seed_checkpoint_dir) - seed_config_path = seed_checkpoint_dir / "config.toml" - with open(seed_config_path, "w") as f: - toml.dump(seed_config, f) - print(f"Created {seed_config_path}") - _create_slurm_script( - seed_config, - seed_config_path, - seed_checkpoint_dir / "seed.slurm", - "seed_checkpoint", - repo_id=model_name, - ) - - # Create parallelism configs - for pc in parallelism_configs: - - iter_config = toml.loads(toml.dumps(config)) - - m = re.match(r"fsdp(\d+)_tp(\d+)_cp(\d+)_pp(\d+)", pc) - if not m: - print(f"Skipping invalid config string: {pc}") - continue - - fsdp, tp, cp, pp = map(int, m.groups()) - - pc_dir = out_path / pc - pc_dir.mkdir(exist_ok=True) - - iter_config["parallelism"]["data_parallel_shard_degree"] = fsdp - iter_config["parallelism"]["tensor_parallel_degree"] = tp - iter_config["parallelism"]["context_parallel_degree"] = cp - iter_config["parallelism"]["pipeline_parallel_degree"] = pp - iter_config["parallelism"]["pipeline_parallel_schedule"] = "GPipe" - iter_config["job"]["dump_folder"] = str(pc_dir) - - iter_config["training"]["global_batch_size"] = 4 - iter_config["training"]["local_batch_size"] = 2 - iter_config["training"]["mixed_precision_param"] = "float32" - iter_config["training"]["mixed_precision_reduce"] = "float32" - - config_path = pc_dir / "config.toml" - with open(config_path, "w") as f: - toml.dump(iter_config, f) - print(f"Created {config_path}") - _create_slurm_script( - iter_config, - config_path, - pc_dir / "nd_parallelism.slurm", - pc, - 
initial_load_path=str(seed_checkpoint_dir / "checkpoint/step-0"), - repo_id=model_name, - ) - -class Status(Enum): - # INIT -> PENDING -> [RUNNING | FAIL] -> COMPLETED - INIT = "init" # Job is created - PENDING = "pending" # Job is waiting for ressources - RUNNING = "running" # Job is running - FAIL = "fail" # Job failed - COMPLETED = "completed" # Job is completed - -class Job: - def __init__(self, root_path: str, qos: str, inp_dir: str = None) -> None: - self.root_path = root_path - self.name = os.path.basename(root_path) - - self.config = os.path.join(root_path, "config.toml") - seed_slurm = os.path.join(root_path, "seed.slurm") - if os.path.exists(seed_slurm): - self.slurm_script = seed_slurm - else: - self.slurm_script = os.path.join(root_path, "nd_parallelism.slurm") - - self.qos = qos - - # Check if the status.txt file exists - status_file_path = os.path.join(self.root_path, "status.txt") - if not os.path.exists(status_file_path): - # Create the status.txt file with INIT status - with open(status_file_path, "w") as f: - f.write(Status.INIT.value) - self.status = self.get_status() - - def get_status(self) -> Status: - """ - Read the status of the job from `status.txt` and return it - """ - is_existing = lambda value_to_check: any( - value.value == value_to_check for value in Status.__members__.values() - ) - - status_file_path = os.path.join(self.root_path, "status.txt") - with open(status_file_path, "r") as f: - status = f.read().strip() - if not is_existing(status): - raise ValueError(f"Invalid status: {status}") - return Status(status) - - def set_status(self, status: Status) -> Status: - """ - Update the status of the job in `status.txt` and return the new status - """ - status_file_path = os.path.join(self.root_path, "status.txt") - with open(status_file_path, "w") as f: - f.write(status.value) - return status - -class Scheduler: - def __init__(self, inp_dir: str, qos: str) -> None: - # Find all leaf directories, and the top-level directory if it contains a config. 
- jobs_directory_paths = [] - for root, dirs, files in os.walk(inp_dir): - is_job_dir = any(f.endswith(".toml") for f in files) - if is_job_dir: - if not dirs: # leaf node - jobs_directory_paths.append(os.path.abspath(root)) - # also capture baseline job in root - elif root == inp_dir: - jobs_directory_paths.append(os.path.abspath(root)) - - self.job_lists = [Job(job_path, qos, inp_dir) for job_path in jobs_directory_paths] - - def keep_only_jobs(self, status: Status): - return [job for job in self.job_lists if job.status == status] - - def filter_out_jobs(self, status: Status): - return [job for job in self.job_lists if job.status != status] - - -def submit_jobs(inp_dir, qos, only: str = None): - scheduler = Scheduler(inp_dir, qos) - - env_vars = os.environ.copy() - total_jobs = len(scheduler.job_lists) - - if only: - try: - status_to_filter = Status(only) - scheduler.job_lists = scheduler.keep_only_jobs(status_to_filter) - except ValueError: - print(f"Invalid status for --only: {only}") - return - - if only is not None: - filtered_jobs = len(scheduler.job_lists) - if filtered_jobs == 0: - print(f"No '{only}' jobs to resubmit") - return - print( - f"Only {filtered_jobs}/{total_jobs} jobs with status '{only}' will be resubmitted" - ) - - scheduler.job_lists = scheduler.filter_out_jobs(Status.COMPLETED) - - for job in scheduler.job_lists: - subprocess.run(["sbatch", job.slurm_script], env=env_vars) - job.set_status(Status.PENDING) - - -def check_status(inp_dir: str): - """ - Display a table showing the count of jobs in each status. - Reads status.txt from all job directories found in inp_dir. - """ - # Find all directories with status.txt files - jobs_directory_paths = [] - for root, dirs, files in os.walk(inp_dir): - if "status.txt" in files: - jobs_directory_paths.append(os.path.abspath(root)) - - if not jobs_directory_paths: - print(f"No jobs found in {inp_dir}") - return - - # Count jobs by status - status_counts = {status: 0 for status in Status} - for job_path in jobs_directory_paths: - job = Job(job_path, qos="N/A") - status_counts[job.status] += 1 - - total = len(jobs_directory_paths) - - # Print table - print("\nJob Status Summary") - print("=" * 30) - print(f"{'Status':<12} | {'Count':>5}") - print("-" * 30) - print(f"{'Init':<12} | {status_counts[Status.INIT]:>5}") - print(f"{'Pending':<12} | {status_counts[Status.PENDING]:>5}") - print(f"{'Running':<12} | {status_counts[Status.RUNNING]:>5}") - print(f"{'Fail':<12} | {status_counts[Status.FAIL]:>5}") - print(f"{'Completed':<12} | {status_counts[Status.COMPLETED]:>5}") - print("-" * 30) - print(f"{'Total':<12} | {total:>5}") - print("=" * 30) - - -def report(inp_dir: str, only: str = None): - """ - Generate diff reports between baseline (fsdp2_tp1_cp1_pp1) and all other parallelism configs. - Creates diff_baseline_vs_nd_parallelism.log in each non-baseline config directory. - Automatically discovers all model/flavor combinations under inp_dir. 
- """ - # Add imports - import torch - from dataclasses import dataclass, field - from typing import List - - @dataclass - class TrainingMetrics: - """Training metrics extracted from logs.""" - steps: List[int] = field(default_factory=list) - loss: List[float] = field(default_factory=list) - grad_norm: List[float] = field(default_factory=list) - - # Default tolerance values (matching compare_distributed_run.py) - DEFAULT_LOSS_ATOL = 5e-2 - DEFAULT_LOSS_RTOL = 1e-5 - DEFAULT_GRAD_NORM_ATOL = 7e-1 - DEFAULT_GRAD_NORM_RTOL = 1e-5 - - def _extract_metrics(log_file: Path) -> TrainingMetrics: - """Extract metrics from log file.""" - metrics = TrainingMetrics() - - try: - with open(log_file, 'r') as f: - content = f.read() - - # Regex to capture all metrics from a log line, ignoring ANSI color codes - pattern = re.compile( - r"step:\s*(\d+)\s*" - r".*?loss:\s*(-?[0-9]+\.?[0-9]*)\s*" - r".*?grad_norm:\s*([0-9]+\.?[0-9]*)\s*" - ) - - for match in pattern.finditer(content): - loss = float(match.group(2)) - if loss == -1.0: - continue - metrics.steps.append(int(match.group(1))) - metrics.loss.append(loss) - metrics.grad_norm.append(float(match.group(3))) - - except Exception as e: - log_message(LogLevel.WARNING, f"Could not extract metrics: {e}", indent=3, dim=True) - - return metrics - - def _compare_metrics(baseline_metrics: TrainingMetrics, test_metrics: TrainingMetrics, - config_name: str) -> tuple[bool, str]: - """Compare metrics between baseline and test configuration. - - Returns: - tuple[bool, str]: (passed, summary_message) - """ - if not baseline_metrics.loss or not test_metrics.loss: - return False, f"Unable to extract metrics" - - # Convert to tensors - baseline_loss = torch.tensor(baseline_metrics.loss) - test_loss = torch.tensor(test_metrics.loss) - baseline_grad_norm = torch.tensor(baseline_metrics.grad_norm) - test_grad_norm = torch.tensor(test_metrics.grad_norm) - - # Check if tensors are close - loss_pass = torch.allclose(baseline_loss, test_loss, atol=DEFAULT_LOSS_ATOL, rtol=DEFAULT_LOSS_RTOL) - grad_pass = torch.allclose(baseline_grad_norm, test_grad_norm, atol=DEFAULT_GRAD_NORM_ATOL, rtol=DEFAULT_GRAD_NORM_RTOL) - - # Calculate max absolute differences for logging - loss_max_diff = torch.max(torch.abs(baseline_loss - test_loss)).item() if baseline_loss.numel() > 0 and test_loss.numel() > 0 else 0.0 - grad_norm_diff = torch.max(torch.abs(baseline_grad_norm - test_grad_norm)).item() if baseline_grad_norm.numel() > 0 and test_grad_norm.numel() > 0 else 0.0 - - # Calculate min absolute differences for logging - loss_min_diff = torch.min(torch.abs(baseline_loss - test_loss)).item() if baseline_loss.numel() > 0 and test_loss.numel() > 0 else 0.0 - grad_norm_min_diff = torch.min(torch.abs(baseline_grad_norm - test_grad_norm)).item() if baseline_grad_norm.numel() > 0 and test_grad_norm.numel() > 0 else 0.0 - - summary = (f"Max loss diff: {loss_max_diff:.2e}, " - f"Min loss diff: {loss_min_diff:.2e}, " - f"Max grad norm diff: {grad_norm_diff:.2e}, " - f"Min grad norm diff: {grad_norm_min_diff:.2e}") - - return (loss_pass and grad_pass), summary - - def _filter_log(log_file: Path) -> Path: - """Filter log file to normalize volatile information (timestamps, PIDs, ports).""" - filtered_file = log_file.with_suffix(log_file.suffix + '.filtered') - - with open(log_file, 'r') as infile, open(filtered_file, 'w') as outfile: - for line in infile: - # Apply filtering patterns to remove volatile information - line = re.sub(r'([0-9]{4}-[0-9]{2}-[0-9]{2} )?[0-9]{2}:[0-9]{2}:[0-9]{2}(,[0-9]+)?', - 
'TIMESTAMP', line) - line = re.sub(r'torchrun.*--master_port[= ]([0-9]+)', - 'torchrun ... --master_port=XXXX', line) - line = re.sub(r'PID [0-9]+', 'PID XXXX', line) - line = re.sub(r'localhost:[0-9]+', 'localhost:XXXX', line) - outfile.write(line) - - return filtered_file - - def _generate_diff(baseline_log: Path, test_log: Path, diff_file: Path) -> tuple[bool, str]: - """Generate diff between baseline and test logs using git diff. - - Returns: - tuple[bool, str]: (success, diff_output or error_message) - """ - # Filter logs to remove timestamps and volatile information - baseline_filtered = _filter_log(baseline_log) - test_filtered = _filter_log(test_log) - - try: - # Generate colored diff using git diff - cmd = ["git", "diff", "--no-index", "--color=always", "--word-diff=color", - str(baseline_filtered), str(test_filtered)] - - result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) - - # git diff returns exit code 1 when files differ (which is expected), not an error - if result.returncode not in [0, 1]: - error_msg = f"git diff failed with code {result.returncode}\n{result.stderr}" - return False, error_msg - - # Write diff to file - with open(diff_file, 'w') as f: - f.write(result.stdout) - - return True, result.stdout - - finally: - # Clean up filtered files - if baseline_filtered.exists(): - baseline_filtered.unlink() - if test_filtered.exists(): - test_filtered.unlink() - - def _process_flavor_dir(flavor_dir: Path) -> tuple[int, int]: - """Process a single model/flavor directory. - - Returns: - tuple[int, int]: (passed_count, failed_count) - """ - # Find baseline directory - baseline_dir = flavor_dir / BASELINE - if not baseline_dir.exists(): - log_message(LogLevel.WARNING, f"No baseline directory found in {flavor_dir.relative_to(inp_path)}, skipping", indent=1) - return 0, 0 - - # Find baseline .out file - baseline_out_files = list(baseline_dir.glob("*.out")) - if not baseline_out_files: - log_message(LogLevel.WARNING, f"No .out file found in baseline {baseline_dir.relative_to(inp_path)}, skipping", indent=1) - return 0, 0 - baseline_out = baseline_out_files[0] - - # Extract baseline metrics - log_message(LogLevel.INFO, f"Extracting baseline metrics from {baseline_out.name}...", indent=1) - baseline_metrics = _extract_metrics(baseline_out) - if not baseline_metrics.loss or not baseline_metrics.grad_norm: - log_message(LogLevel.WARNING, "Could not extract baseline metrics, skipping comparisons", indent=1) - return 0, 0 - - # Find all parallelism config directories (excluding seed_checkpoint and baseline) - config_dirs = [] - for item in flavor_dir.iterdir(): - if item.is_dir() and item.name not in {BASELINE, "seed_checkpoint"}: - config_dirs.append(item) - - if not config_dirs: - log_message(LogLevel.INFO, f"No test configurations found in {flavor_dir.relative_to(inp_path)}", indent=1) - return 0, 0 - - console.print() - console.print( - Panel( - f"[cyan]Baseline:[/cyan] {baseline_out.relative_to(flavor_dir)}\n" - f"[cyan]Configurations to compare:[/cyan] {len(config_dirs)}", - title=f"[bold cyan]Processing {flavor_dir.relative_to(inp_path)}[/bold cyan]", - expand=False, - border_style="cyan", - padding=(0, 2), - ) - ) - - # Track results for summary - results = [] - - # Generate diffs for each config - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), - TimeElapsedColumn(), - console=console, - ) as progress: - task = 
progress.add_task("[cyan]Processing configurations...", total=len(config_dirs)) - - for i, config_dir in enumerate(sorted(config_dirs)): - if i > 0: - console.rule(style="dim") - - progress.update(task, description=f"[cyan]Testing [bold]{config_dir.name}[/bold]") - - # Find .out file in config directory - test_out_files = list(config_dir.glob("*.out")) - if not test_out_files: - log_message(LogLevel.WARNING, f"{config_dir.name}: No .out file found, skipping", indent=1) - results.append((config_dir.name, False, "No .out file found")) - progress.advance(task) - continue - - test_out = test_out_files[0] - diff_file = config_dir / "diff_baseline_vs_nd_parallelism.log" - - # Extract test metrics - test_metrics = _extract_metrics(test_out) - - # Compare metrics - if test_metrics.loss and test_metrics.grad_norm: - test_passed, metrics_summary = _compare_metrics(baseline_metrics, test_metrics, config_dir.name) - - if test_passed: - log_message(LogLevel.TEST_PASS, f"{config_dir.name} - {metrics_summary}", indent=1) - results.append((config_dir.name, True, metrics_summary)) - else: - log_message(LogLevel.TEST_FAIL, f"{config_dir.name} - {metrics_summary}", indent=1) - results.append((config_dir.name, False, metrics_summary)) - else: - log_message(LogLevel.TEST_FAIL, f"{config_dir.name} - Unable to extract metrics", indent=1) - results.append((config_dir.name, False, "Unable to extract metrics")) - - # Generate diff - try: - success, output = _generate_diff(baseline_out, test_out, diff_file) - - if success: - log_message(LogLevel.INFO, f"Diff between baseline vs HF nd-parallel saved to:", indent=5, dim=True) - console.print(f" [dim]{diff_file}[/dim]") - else: - log_message(LogLevel.WARNING, f"Failed to generate diff: {output}", indent=5, dim=True) - - except Exception as e: - log_message(LogLevel.WARNING, f"Failed to generate diff - {e}", indent=5, dim=True) - - progress.advance(task) - - console.print() - # Create summary table - summary_table = Table( - title=f"[bold]Summary for {flavor_dir.relative_to(inp_path)}[/bold]", - show_header=True, - header_style="bold magenta" - ) - summary_table.add_column("Configuration", style="cyan") - summary_table.add_column("Status", justify="center") - summary_table.add_column("Metrics", style="dim") - - for name, passed, summary in results: - status = "[bold green]✅ PASS[/bold green]" if passed else "[bold red]❌ FAIL[/bold red]" - # Truncate summary if too long - display_summary = summary if len(summary) < 60 else summary[:57] + "..." 
- summary_table.add_row(name, status, display_summary) - - console.print(summary_table) - console.print() - - passed_count = sum(1 for _, passed, _ in results if passed) - failed_count = len(results) - passed_count - - return passed_count, failed_count - - inp_path = Path(inp_dir) - - if not inp_path.exists(): - console.print(f"[bold red]Error:[/bold red] Directory not found: {inp_path}") - return - - console.print( - Panel( - "[bold cyan]HuggingFace Integration Test Report Generator[/bold cyan]", - expand=False, - border_style="blue", - padding=(1, 2), - ) - ) - console.print() - - # Find all directories that contain a baseline (fsdp2_tp1_cp1_pp1) subdirectory - flavor_dirs = [] - for root, dirs, files in os.walk(inp_path): - if BASELINE in dirs: - flavor_dirs.append(Path(root)) - - # Filter by --only if provided - if only: - original_count = len(flavor_dirs) - flavor_dirs = [ - d for d in flavor_dirs if only in str(d.relative_to(inp_path)) - ] - log_message( - LogLevel.INFO, - f"Filtered from {original_count} to {len(flavor_dirs)} director{'ies' if len(flavor_dirs) != 1 else 'y'} matching '[bold]{only}[/bold]'", - ) - - if not flavor_dirs: - log_message(LogLevel.ERROR, f"No directories with baseline configuration found under {inp_path}") - console.print("[yellow]Expected to find directories containing 'fsdp2_tp1_cp1' subdirectory[/yellow]") - return - - log_message(LogLevel.INFO, f"Found {len(flavor_dirs)} model/flavor combination(s) to process:") - for flavor_dir in flavor_dirs: - console.print(f" [cyan]•[/cyan] {flavor_dir.relative_to(inp_path)}") - - # Process each flavor directory - total_passed = 0 - total_failed = 0 - - for flavor_dir in flavor_dirs: - passed, failed = _process_flavor_dir(flavor_dir) - total_passed += passed - total_failed += failed - - # Final summary - console.print() - console.print( - Panel( - "[bold cyan]Overall Summary[/bold cyan]", - expand=False, - border_style="blue", - padding=(0, 2), - ) - ) - - overall_table = Table(show_header=True, header_style="bold magenta") - overall_table.add_column("Metric", style="cyan") - overall_table.add_column("Value", justify="right") - - total_tests = total_passed + total_failed - overall_table.add_row("Total Configurations Tested", str(total_tests)) - overall_table.add_row("[green]Passed[/green]", str(total_passed)) - overall_table.add_row("[red]Failed[/red]", str(total_failed)) - - console.print(overall_table) - console.print() - - if total_failed == 0 and total_tests > 0: - log_message(LogLevel.SUCCESS, "All tests passed! 
🎉") - elif total_tests > 0: - log_message(LogLevel.WARNING, f"{total_failed} configuration(s) had test failures") - - log_message(LogLevel.SUCCESS, "Diff generation complete!") - -if __name__ == "__main__": - parser = ArgumentParser() - subparsers = parser.add_subparsers(dest="action") - - create_configs_parser = subparsers.add_parser("create_configs") - create_configs_parser.add_argument("--model_name", type=str, required=True) - create_configs_parser.add_argument("--out_dir", type=str, required=True) - create_configs_parser.add_argument("--flavor", type=str, required=True) - - submit_jobs_parser = subparsers.add_parser("submit_jobs") - submit_jobs_parser.add_argument("--inp_dir", type=str, required=True) - submit_jobs_parser.add_argument("--qos", type=str, required=True, choices=["low", "normal", "high", "prod"]) - submit_jobs_parser.add_argument("--only", type=str, default=None, choices=[s.value for s in Status]) - - report_parser = subparsers.add_parser("report") - report_parser.add_argument("--inp_dir", type=str, required=True) - report_parser.add_argument("--only", type=str, default=None) - - check_status_parser = subparsers.add_parser("check_status") - check_status_parser.add_argument("--inp_dir", type=str, required=True) - - args = parser.parse_args() - - if args.action == "create_configs": - create_configs(args.model_name, args.out_dir, args.flavor) - elif args.action == "submit_jobs": - submit_jobs(args.inp_dir, args.qos, args.only) - elif args.action == "report": - report(args.inp_dir, args.only) - elif args.action == "check_status": - check_status(args.inp_dir) \ No newline at end of file From a84854568c55b78d95e8f3bf3808b24619fb23c5 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 21 Oct 2025 15:22:27 +0000 Subject: [PATCH 078/129] fix linting --- torchtitan/experiments/__init__.py | 8 +- .../transformers_backend/__init__.py | 19 +- .../configs/qwen3_fsdp2_tp2_pp2.toml | 5 +- .../infra/parallelize_hf_transformers.py | 89 ++++--- .../transformers_backend/infra/pipeline_hf.py | 25 +- .../transformers_backend/model/args.py | 58 +++-- .../transformers_backend/model/model.py | 231 +++++++++++++----- torchtitan/protocols/train_spec.py | 4 +- torchtitan/train.py | 6 +- 9 files changed, 284 insertions(+), 161 deletions(-) diff --git a/torchtitan/experiments/__init__.py b/torchtitan/experiments/__init__.py index 75d22e58e6..6c1465c14a 100644 --- a/torchtitan/experiments/__init__.py +++ b/torchtitan/experiments/__init__.py @@ -5,5 +5,11 @@ # LICENSE file in the root directory of this source tree. 
_supported_experiments = frozenset( - ["flux", "simple_fsdp.llama3", "simple_fsdp.deepseek_v3", "vlm", "transformers_backend"] + [ + "flux", + "simple_fsdp.llama3", + "simple_fsdp.deepseek_v3", + "vlm", + "transformers_backend", + ] ) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 110a376642..11bd36bc81 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -9,16 +9,16 @@ from torchtitan.components.loss import build_cross_entropy_loss from torchtitan.components.lr_scheduler import build_lr_schedulers from torchtitan.components.optimizer import build_optimizers -from torchtitan.datasets.hf_datasets import build_hf_dataloader from torchtitan.components.tokenizer import build_hf_tokenizer - -from .infra.pipeline_hf import pipeline_hf_transformers +from torchtitan.datasets.hf_datasets import build_hf_dataloader +from torchtitan.models.moe import MoEArgs from torchtitan.protocols.train_spec import register_train_spec, TrainSpec from .infra.parallelize_hf_transformers import parallelize_hf_transformers + +from .infra.pipeline_hf import pipeline_hf_transformers from .model.args import HFTransformerModelArgs from .model.model import HFTransformerModel -from torchtitan.models.moe import MoEArgs __all__ = [ @@ -26,9 +26,11 @@ "HFTransformerModel", ] + @dataclass class TitanDenseModelArgs: """Arguments for the base TorchTitan model.""" + dim: int = 4096 n_layers: int = 32 n_heads: int = 32 @@ -47,6 +49,7 @@ class TitanDenseModelArgs: @dataclass class TitanMoeModelArgs: """Arguments specific to DeepSeekV3 models.""" + moe_args: MoEArgs | None = None n_group: int | None = None topk_group: int | None = None @@ -97,8 +100,10 @@ class TitanMoeModelArgs: score_func="softmax", route_norm=True, score_before_experts=False, - ) - ) if os.environ.get("USE_MOE", "0") == "1" else None, + ), + ) + if os.environ.get("USE_MOE", "0") == "1" + else None, ), "full": HFTransformerModelArgs( titan_dense_args=TitanDenseModelArgs(), @@ -117,4 +122,4 @@ class TitanMoeModelArgs: build_loss_fn=build_cross_entropy_loss, ) -register_train_spec("hf_placeholder_name", hf_train_spec) \ No newline at end of file +register_train_spec("hf_placeholder_name", hf_train_spec) diff --git a/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml b/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml index 5f40ec41b3..4e216baa77 100644 --- a/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml +++ b/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml @@ -3,8 +3,7 @@ [job] dump_folder = "./outputs" description = "Qwen 3 debug training" -print_args = false -use_for_integration_test = false +print_config = true [profiling] enable_profiling = true @@ -77,7 +76,7 @@ selective_ac_option = '2' # 'int' = ac every positive int layer or 'op', ac bas enable=false components = ["model", "loss"] -[float8] +[quantize.linear.float8] enable_fsdp_float8_all_gather = false precompute_float8_dynamic_scale_for_fsdp = false filter_fqns = ["output"] diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py index 3d729f3afb..d1d8d4c480 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py +++ 
b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -25,16 +25,15 @@ SequenceParallel, ) from torchtitan.config import JobConfig, TORCH_DTYPE_MAP -from torchtitan.distributed import ParallelDims, NoParallel +from torchtitan.config.job_config import ActivationCheckpoint as ACConfig +from torchtitan.distributed import NoParallel, ParallelDims from torchtitan.distributed.expert_parallel import ( ExpertParallel, ExpertTensorParallel, ReordererSequenceParallel, - TensorParallel, ) from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp -from torchtitan.config.job_config import ActivationCheckpoint as ACConfig from torchtitan.tools.logging import logger # for selective op activation checkpointing @@ -50,6 +49,7 @@ torch.ops.aten.max.default, } + def _apply_ac_to_transformer_block( module: nn.Module, ac_config: ACConfig, *, base_fqn: Optional[str] = None ): @@ -137,6 +137,7 @@ def selective_checkpointing_context_fn(): else: return module + def apply_ac(model: nn.Module, ac_config: ACConfig): """Apply activation checkpointing to the model.""" for layer_id, transformer_block in model.layers.named_children(): @@ -147,6 +148,7 @@ def apply_ac(model: nn.Module, ac_config: ACConfig): logger.info(f"Applied {ac_config.mode} activation checkpointing to the model") + def apply_ddp( model: nn.Module, dp_mesh: DeviceMesh, @@ -189,9 +191,7 @@ def parallelize_hf_transformers( ({parallel_dims.tp}) and 2 * CP degree ({parallel_dims.cp}). """ - if ( - job_config.parallelism.context_parallel_degree > 1 - ): + if job_config.parallelism.context_parallel_degree > 1: logger.warning("CP support for FlexAttention is still in progress.") if parallel_dims.tp_enabled: @@ -310,11 +310,11 @@ def apply_non_moe_tp( # transformer block's inputs) # 2. Parallelize the root norm layer over the sequence dim # 3. 
Parallelize the final linear output layer - + # skipping nn.Identity modules (which are added by pipeline parallelism for unused modules) root_plan = {} - - if hasattr(model, 'tok_embeddings'): + + if hasattr(model, "tok_embeddings"): if isinstance(model.tok_embeddings, nn.Identity): root_plan["tok_embeddings"] = NoParallel() else: @@ -322,14 +322,14 @@ def apply_non_moe_tp( input_layouts=Replicate(), output_layouts=Shard(1), ) - - if hasattr(model, 'norm'): + + if hasattr(model, "norm"): if isinstance(model.norm, nn.Identity): root_plan["norm"] = NoParallel() else: root_plan["norm"] = SequenceParallel() - - if hasattr(model, 'output'): + + if hasattr(model, "output"): if isinstance(model.output, nn.Identity): root_plan["output"] = NoParallel() else: @@ -375,25 +375,33 @@ def apply_non_moe_tp( } if getattr(transformer_block.self_attn, "q_lora_rank", None) is None: - layer_plan.update({ - "self_attn.q_proj": colwise_parallel(), - "self_attn.k_proj": colwise_parallel(), - "self_attn.v_proj": colwise_parallel(), - }) + layer_plan.update( + { + "self_attn.q_proj": colwise_parallel(), + "self_attn.k_proj": colwise_parallel(), + "self_attn.v_proj": colwise_parallel(), + } + ) else: - layer_plan.update({ - "self_attn.q_a_proj": NoParallel(), - "self_attn.q_a_layernorm": NoParallel(), - "self_attn.q_b_proj": colwise_parallel(), - "self_attn.kv_a_proj_with_mqa": NoParallel(), - "self_attn.kv_a_layernorm": NoParallel(), - "self_attn.kv_b_proj": colwise_parallel(), - }) + layer_plan.update( + { + "self_attn.q_a_proj": NoParallel(), + "self_attn.q_a_layernorm": NoParallel(), + "self_attn.q_b_proj": colwise_parallel(), + "self_attn.kv_a_proj_with_mqa": NoParallel(), + "self_attn.kv_a_layernorm": NoParallel(), + "self_attn.kv_b_proj": colwise_parallel(), + } + ) # Handle different names for the output projection layer, e.g. o_proj vs dense - o_proj_name = "o_proj" if hasattr(transformer_block.self_attn, "o_proj") else "dense" - layer_plan[f"self_attn.{o_proj_name}"] = rowwise_parallel(output_layouts=Shard(1)) - + o_proj_name = ( + "o_proj" if hasattr(transformer_block.self_attn, "o_proj") else "dense" + ) + layer_plan[f"self_attn.{o_proj_name}"] = rowwise_parallel( + output_layouts=Shard(1) + ) + # For Qwen3 RMSNorm on Q and K # TODO(3outeille): we should probably shard(1) then replicate => then use SequenceParallel but for now I am fed up if hasattr(transformer_block.self_attn, "q_norm"): @@ -409,14 +417,20 @@ def apply_non_moe_tp( ), } # Handle different names for MLP layers, e.g. 
gate_proj vs fc1 - gate_proj_name = "gate_proj" if hasattr(transformer_block.mlp, "gate_proj") else "fc1" + gate_proj_name = ( + "gate_proj" if hasattr(transformer_block.mlp, "gate_proj") else "fc1" + ) mlp_plan[f"mlp.{gate_proj_name}"] = colwise_parallel() if hasattr(transformer_block.mlp, "up_proj"): mlp_plan["mlp.up_proj"] = colwise_parallel() - down_proj_name = "down_proj" if hasattr(transformer_block.mlp, "down_proj") else "fc2" - mlp_plan[f"mlp.{down_proj_name}"] = rowwise_parallel(output_layouts=Shard(1)) + down_proj_name = ( + "down_proj" if hasattr(transformer_block.mlp, "down_proj") else "fc2" + ) + mlp_plan[f"mlp.{down_proj_name}"] = rowwise_parallel( + output_layouts=Shard(1) + ) layer_plan.update(mlp_plan) # Some models like Phi-2 don't have post_attention_layernorm @@ -494,7 +508,11 @@ def apply_fsdp( # NOTE: When EP is enabled, In an MoE layer, we use the following FSDP wrapping # - the router and the shared experts are sharded together with the TransformerBlock # - the routed experts are sharded with the remaining dp_mod_ep_mesh - if hasattr(transformer_block, "moe_enabled") and transformer_block.moe_enabled and ep_degree > 1: + if ( + hasattr(transformer_block, "moe_enabled") + and transformer_block.moe_enabled + and ep_degree > 1 + ): fsdp_mod_ep_config = fsdp_config.copy() fsdp_mod_ep_config["mesh"] = dp_mod_ep_mesh moe_block = transformer_block.mlp @@ -506,10 +524,7 @@ def apply_fsdp( # shard_placement_fn on the outer TransformerBlock-level FSDP. _experts_shard_placement_fn = None assert dp_mod_ep_mesh is not None - if ( - dp_mod_ep_mesh.size() * ep_degree - > moe_block.experts.num_experts - ): + if dp_mod_ep_mesh.size() * ep_degree > moe_block.experts.num_experts: _experts_shard_placement_fn = lambda param: Shard(1) fully_shard( diff --git a/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py b/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py index cd599ac2a5..ee7b268f9d 100644 --- a/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py +++ b/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py @@ -8,30 +8,26 @@ import torch import torch.nn as nn +from torch.distributed.device_mesh import DeviceMesh +from torch.distributed.pipelining import PipelineStage from torch.distributed.pipelining.schedules import ( _PipelineSchedule, get_schedule_class, PipelineScheduleSingle, + ScheduleDualPipeV, + ScheduleZBVZeroBubble, ) from torchtitan.components.loss import LossFunction from torchtitan.config import JobConfig from torchtitan.distributed import ParallelDims -from torchtitan.distributed.pipeline_parallel import ( - build_pipeline_schedule, - pipeline_module_split -) -from torch.distributed.device_mesh import DeviceMesh -from torch.distributed.pipelining import PipelineStage +from torchtitan.distributed.pipeline_parallel import build_pipeline_schedule from torchtitan.protocols.train_spec import BaseModelArgs, ParallelizeFunction from torchtitan.tools.logging import logger -from torch.distributed.pipelining.schedules import ( - ScheduleDualPipeV, - ScheduleZBVZeroBubble, -) # NOTE(3outeille): the only modifications comes from replacing None to nn.Identity and adding rotary_emb per model_part + def generate_llm_fqn_per_model_part( num_stages: int, num_layers: int, @@ -57,11 +53,7 @@ def generate_llm_fqn_per_model_part( if num_stages == 1: # Single stage gets everything layer_names = [f"layers.{i}" for i in range(num_layers)] - return [ - ["tok_embeddings"] - + layer_names - + ["norm", "output", "rotary_emb"] - ] + return 
[["tok_embeddings"] + layer_names + ["norm", "output", "rotary_emb"]] # Calculate effective layers including weights num_effective_layers = num_layers + input_weight + output_weight @@ -285,6 +277,7 @@ def _get_stage_indices() -> tuple[int]: return stages, models + def pipeline_hf_transformers( model: nn.Module, parallel_dims: ParallelDims, @@ -397,4 +390,4 @@ def pipeline_hf_transformers( if stage.is_last: has_last_stage = True - return pp_schedule, model_parts, has_first_stage, has_last_stage \ No newline at end of file + return pp_schedule, model_parts, has_first_stage, has_last_stage diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index e02a04e136..7181cb570a 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -4,30 +4,30 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import importlib from dataclasses import dataclass -import torch + from torch import nn -import math -from torch.nn import init from torchtitan.config import JobConfig +from torchtitan.models.utils import ( + get_dense_model_nparams_and_flops, + get_moe_model_nparams_and_flops, +) from torchtitan.protocols import BaseModelArgs -from torchtitan.tools.logging import logger -from torchtitan.models.utils import get_dense_model_nparams_and_flops, get_moe_model_nparams_and_flops from transformers import AutoConfig from transformers.configuration_utils import PretrainedConfig -from transformers.modeling_utils import AttentionInterface from transformers.integrations.sdpa_attention import sdpa_attention_forward +from transformers.modeling_utils import AttentionInterface + @dataclass class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): """ Configuration class that bridges TorchTitan and HuggingFace Transformers naming conventions. - + Uses properties to provide TorchTitan-style access while maintaining HuggingFace compatibility. Properties are created dynamically based on which arguments are provided. 
""" - + # Define all possible mappings organized by argument type _TT_TO_HF_MAPPINGS = { "dense": { @@ -59,14 +59,14 @@ def __init__( assert titan_dense_args is not None, "titan_dense_args is required" active_mappings = {} - + active_mappings.update(self._TT_TO_HF_MAPPINGS["dense"]) - + if titan_moe_args is not None: active_mappings.update(self._TT_TO_HF_MAPPINGS["moe"]) - + self._active_mappings = active_mappings - + self._create_dynamic_properties() # Set HF attributes from titan_args based on mappings @@ -83,14 +83,17 @@ def __init__( # HuggingFace specific args self.attn_implementation = attn_implementation - #NOTE:(3outeille):This will force create_causal_mask to return None + # NOTE:(3outeille):This will force create_causal_mask to return None AttentionInterface._global_mapping[attn_implementation] = sdpa_attention_forward # Start with passed_args as just titan_args - self._passed_args = {**titan_dense_args.__dict__, "attn_implementation": attn_implementation} + self._passed_args = { + **titan_dense_args.__dict__, + "attn_implementation": attn_implementation, + } self._passed_args.update(kwargs) - #NOTE(3outeille): Wait for transformers uniformization of MoE args + # NOTE(3outeille): Wait for transformers uniformization of MoE args if titan_moe_args is not None: # For DeepSeekV3, setting q_lora_rank to 0 in TorchTitan is equivalent to # setting it to None in HuggingFace. @@ -118,13 +121,16 @@ def __init__( def _create_dynamic_properties(self): """Create properties dynamically based on active mappings.""" + def _create_property(hf_name: str) -> property: def getter(self): return getattr(self, hf_name) + def setter(self, value): setattr(self, hf_name, value) + return property(getter, setter) - + for titan_name, hf_name in self._active_mappings.items(): # Create getter/setter for attribute that don't already exist if not hasattr(self.__class__, titan_name): @@ -149,7 +155,7 @@ def update_from_config(self, job_config: JobConfig): hf_model_config = AutoConfig.from_pretrained( job_config.model.name, attn_implementation=self.attn_implementation, - trust_remote_code=True + trust_remote_code=True, ) # Explicitly update attributes based on mappings @@ -169,14 +175,14 @@ def update_from_config(self, job_config: JobConfig): # MoE if hasattr(self, "qk_nope_head_dim") and hasattr(self, "qk_rope_head_dim"): self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim - + # Configure HF-specific settings to match TorchTitan settings self.attention_bias = False self.mlp_bias = False self.use_cache = False self.initializer_range = 1.0 # use as std for normal init in embedding - - if not hasattr(self, "inter_dim"): # Only for llama model + + if not hasattr(self, "inter_dim"): # Only for llama model ffn_hidden_size = 4 * self.dim ffn_hidden_size = int(2 * ffn_hidden_size / 3) if self.ffn_dim_multiplier is not None: @@ -184,15 +190,15 @@ def update_from_config(self, job_config: JobConfig): self.intermediate_size = self.multiple_of * ( (ffn_hidden_size + self.multiple_of - 1) // self.multiple_of ) - + self.head_dim = self.dim // self.num_attention_heads - + return self def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: - is_moe = hasattr(self, 'n_routed_experts') - + is_moe = hasattr(self, "n_routed_experts") + if is_moe: return get_moe_model_nparams_and_flops(self, model, seq_len) else: - return get_dense_model_nparams_and_flops(self, model, seq_len) \ No newline at end of file + return get_dense_model_nparams_and_flops(self, model, seq_len) diff --git 
a/torchtitan/experiments/transformers_backend/model/model.py b/torchtitan/experiments/transformers_backend/model/model.py index 0a8c000d0e..fd7561611e 100644 --- a/torchtitan/experiments/transformers_backend/model/model.py +++ b/torchtitan/experiments/transformers_backend/model/model.py @@ -1,17 +1,26 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import importlib import math + import torch +from torch import nn from torch.nn import init -from transformers.modeling_utils import PreTrainedModel +from torchtitan.tools.logging import logger from transformers.configuration_utils import PretrainedConfig -import importlib -from torch import nn +from transformers.modeling_utils import PreTrainedModel + from .args import HFTransformerModelArgs -from torchtitan.tools.logging import logger + class HFTransformerModel(nn.Module): def __init__(self, model_args: HFTransformerModelArgs): super().__init__() - + # Try to import the model class dynamically from the transformers library if not found in globals model_class_name = model_args.architectures[0] model_cls = globals().get(model_class_name, None) @@ -23,8 +32,8 @@ def __init__(self, model_args: HFTransformerModelArgs): raise ImportError( f"Could not find model class '{model_class_name}' in globals or transformers. " f"Make sure the class is available. Original error: {e}" - ) - + ) from e + # Attempt to patch model weight initialization based on architecture type try: model_name_prefix = model_class_name.replace("ForCausalLM", "") @@ -32,28 +41,34 @@ def __init__(self, model_args: HFTransformerModelArgs): attention_cls = getattr(model_module, f"{model_name_prefix}Attention", None) mlp_cls = getattr(model_module, f"{model_name_prefix}MLP", None) - decoder_layer_cls = getattr(model_module, f"{model_name_prefix}DecoderLayer", None) + decoder_layer_cls = getattr( + model_module, f"{model_name_prefix}DecoderLayer", None + ) - is_moe = hasattr(model_args, "n_routed_experts") #TODO(3outeille): check if this is the most reliable to detect a moe model + is_moe = hasattr( + model_args, "n_routed_experts" + ) # TODO(3outeille): check if this is the most reliable to detect a moe model if is_moe: moe_cls = getattr(model_module, f"{model_name_prefix}MoE", None) required_classes = { "Attention": attention_cls, - "MLP": mlp_cls, + "MLP": mlp_cls, "DecoderLayer": decoder_layer_cls, - "MoE": moe_cls + "MoE": moe_cls, } - + if all(required_classes.values()): logger.info(f"Applying MoE-like patch for {model_name_prefix}") self._patch_hf_moe_like( decoder_layer_cls=decoder_layer_cls, attention_cls=attention_cls, mlp_cls=mlp_cls, - moe_cls=moe_cls + moe_cls=moe_cls, ) else: - missing = [name for name, cls in required_classes.items() if not cls] + missing = [ + name for name, cls in required_classes.items() if not cls + ] logger.warning( f"Could not find required classes ({', '.join(missing)}) for MoE patching of {model_name_prefix}. " "Skipping MoE-like patch." 
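# A minimal, self-contained illustration of the dynamic lookup performed above: resolve
# "<Prefix>ForCausalLM" from transformers, then fetch the sibling Attention/MLP/DecoderLayer
# classes from the same module so their initialization can be patched. The default
# architecture string below is only an example; whatever `model_args.architectures[0]`
# carries is handled the same way.
import importlib


def _resolve_hf_classes(architecture: str = "Qwen3ForCausalLM"):
    transformers_mod = importlib.import_module("transformers")
    model_cls = getattr(transformers_mod, architecture)
    prefix = architecture.replace("ForCausalLM", "")  # e.g. "Qwen3"
    model_module = importlib.import_module(model_cls.__module__)
    return (
        model_cls,
        getattr(model_module, f"{prefix}Attention", None),
        getattr(model_module, f"{prefix}MLP", None),
        getattr(model_module, f"{prefix}DecoderLayer", None),
    )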
@@ -61,18 +76,20 @@ def __init__(self, model_args: HFTransformerModelArgs): else: required_classes = { "Attention": attention_cls, - "DecoderLayer": decoder_layer_cls + "DecoderLayer": decoder_layer_cls, } - + if all(required_classes.values()): logger.info(f"Applying Llama-like patch for {model_name_prefix}") self._patch_hf_llama_like( decoder_layer_cls=decoder_layer_cls, attention_cls=attention_cls, - mlp_cls=mlp_cls # mlp_cls can be None + mlp_cls=mlp_cls, # mlp_cls can be None ) else: - missing = [name for name, cls in required_classes.items() if not cls] + missing = [ + name for name, cls in required_classes.items() if not cls + ] logger.warning( f"Could not find required classes ({', '.join(missing)}) for {model_name_prefix}. " "Skipping Llama-like patch." @@ -86,9 +103,12 @@ def __init__(self, model_args: HFTransformerModelArgs): self.model = model_cls(config=model_args) self.max_seq_len = model_args.max_seq_len - + for layer in self.model.model.layers: - if hasattr(model_args, "first_k_dense_replace") and layer.layer_idx >= model_args.first_k_dense_replace: + if ( + hasattr(model_args, "first_k_dense_replace") + and layer.layer_idx >= model_args.first_k_dense_replace + ): layer.moe_enabled = True else: layer.moe_enabled = False @@ -226,7 +246,10 @@ def _init_weights_patched(self, module): elif isinstance(module, nn.Embedding): # When tie_word_embeddings is True, use lm_head initialization - if hasattr(config, "tie_word_embeddings") and config.tie_word_embeddings: + if ( + hasattr(config, "tie_word_embeddings") + and config.tie_word_embeddings + ): final_out_std = config.hidden_size**-0.5 cutoff_factor = 3 nn.init.trunc_normal_( @@ -239,13 +262,14 @@ def _init_weights_patched(self, module): else: std = config.initializer_range module.weight.data.normal_(mean=0.0, std=std) - + if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() elif ( isinstance( - module, (nn.GroupNorm, nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d) + module, + (nn.GroupNorm, nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d), ) or "LayerNorm" in module.__class__.__name__ or "RMSNorm" in module.__class__.__name__ @@ -331,12 +355,14 @@ def _init_weights_patched(self, module): nn.init.trunc_normal_(module.q_a_proj.weight, mean=0.0, std=0.02) if hasattr(module, "q_b_proj"): nn.init.trunc_normal_(module.q_b_proj.weight, mean=0.0, std=0.02) - + if hasattr(module, "kv_a_proj_with_mqa"): - nn.init.trunc_normal_(module.kv_a_proj_with_mqa.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_( + module.kv_a_proj_with_mqa.weight, mean=0.0, std=0.02 + ) if hasattr(module, "kv_b_proj"): nn.init.trunc_normal_(module.kv_b_proj.weight, mean=0.0, std=0.02) - + if hasattr(module, "o_proj") and init_std is not None: nn.init.trunc_normal_(module.o_proj.weight, mean=0.0, std=init_std) @@ -345,22 +371,39 @@ def _init_weights_patched(self, module): # DeepseekV3 uses std=0.02 for up_proj, unlike Llama nn.init.trunc_normal_(module.up_proj.weight, mean=0.0, std=0.02) if init_std is not None: - nn.init.trunc_normal_(module.down_proj.weight, mean=0.0, std=init_std) + nn.init.trunc_normal_( + module.down_proj.weight, mean=0.0, std=init_std + ) elif isinstance(module, moe_cls): if hasattr(module, "gate") and init_std is not None: nn.init.trunc_normal_(module.gate.weight, mean=0.0, std=init_std) if hasattr(module, "experts"): for expert in module.experts: - nn.init.trunc_normal_(expert.gate_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_( + expert.gate_proj.weight, mean=0.0, std=0.02 + ) 
nn.init.trunc_normal_(expert.up_proj.weight, mean=0.0, std=0.02) if init_std is not None: - nn.init.trunc_normal_(expert.down_proj.weight, mean=0.0, std=init_std) - if hasattr(module, "shared_experts") and module.shared_experts is not None: - nn.init.trunc_normal_(module.shared_experts.gate_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(module.shared_experts.up_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_( + expert.down_proj.weight, mean=0.0, std=init_std + ) + if ( + hasattr(module, "shared_experts") + and module.shared_experts is not None + ): + nn.init.trunc_normal_( + module.shared_experts.gate_proj.weight, mean=0.0, std=0.02 + ) + nn.init.trunc_normal_( + module.shared_experts.up_proj.weight, mean=0.0, std=0.02 + ) if init_std is not None: - nn.init.trunc_normal_(module.shared_experts.down_proj.weight, mean=0.0, std=init_std) + nn.init.trunc_normal_( + module.shared_experts.down_proj.weight, + mean=0.0, + std=init_std, + ) elif module is getattr(self, "lm_head", None): final_out_std = config.hidden_size**-0.5 @@ -377,7 +420,10 @@ def _init_weights_patched(self, module): elif isinstance(module, nn.Embedding): # When tie_word_embeddings is True, use lm_head initialization - if hasattr(config, "tie_word_embeddings") and config.tie_word_embeddings: + if ( + hasattr(config, "tie_word_embeddings") + and config.tie_word_embeddings + ): final_out_std = config.hidden_size**-0.5 cutoff_factor = 3 nn.init.trunc_normal_( @@ -390,11 +436,14 @@ def _init_weights_patched(self, module): else: std = config.initializer_range module.weight.data.normal_(mean=0.0, std=std) - + if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() - elif "LayerNorm" in module.__class__.__name__ or "RMSNorm" in module.__class__.__name__: + elif ( + "LayerNorm" in module.__class__.__name__ + or "RMSNorm" in module.__class__.__name__ + ): if hasattr(module, "weight") and module.weight is not None: module.weight.data.fill_(1.0) if hasattr(module, "bias") and module.bias is not None: @@ -407,52 +456,80 @@ def _init_weights_patched(self, module): @property def tok_embeddings(self): """Returns the model's embed_tokens, handling different Hugging Face model structures.""" - if hasattr(self.model, "model") and hasattr(self.model.model, "embed_tokens"): # Llama-like + if hasattr(self.model, "model") and hasattr( + self.model.model, "embed_tokens" + ): # Llama-like return self.model.model.embed_tokens else: - raise AttributeError("Could not find embed_tokens in the model. Please check the model structure.") + raise AttributeError( + "Could not find embed_tokens in the model. Please check the model structure." + ) @tok_embeddings.setter def tok_embeddings(self, value): - if hasattr(self.model, "model") and hasattr(self.model.model, "embed_tokens"): # Llama-like - setattr(self.model.model, "embed_tokens", value) + if hasattr(self.model, "model") and hasattr( + self.model.model, "embed_tokens" + ): # Llama-like + self.model.model.embed_tokens = value else: - raise AttributeError("Could not find embed_tokens in the model. Please check the model structure.") + raise AttributeError( + "Could not find embed_tokens in the model. Please check the model structure." 
+ ) @property def layers(self): """Returns the model's layers, handling different Hugging Face model structures.""" - if hasattr(self.model, "model") and hasattr(self.model.model, "layers"): # Llama-like + if hasattr(self.model, "model") and hasattr( + self.model.model, "layers" + ): # Llama-like return self.model.model.layers else: # Add more cases here if needed for other model architectures - raise AttributeError("Could not find layers in the model. Please check the model structure.") + raise AttributeError( + "Could not find layers in the model. Please check the model structure." + ) @layers.setter def layers(self, value): - if hasattr(self.model, "model") and hasattr(self.model.model, "layers"): # Llama-like - setattr(self.model.model, "layers", value) + if hasattr(self.model, "model") and hasattr( + self.model.model, "layers" + ): # Llama-like + self.model.model.layers = value else: - raise AttributeError("Could not find layers in the model. Please check the model structure.") + raise AttributeError( + "Could not find layers in the model. Please check the model structure." + ) @property def norm(self): """Returns the model's norm, handling different Hugging Face model structures.""" - if hasattr(self.model, "model") and hasattr(self.model.model, "norm"): # Llama-like + if hasattr(self.model, "model") and hasattr( + self.model.model, "norm" + ): # Llama-like return self.model.model.norm - elif hasattr(self.model, "model") and hasattr(self.model.model, "final_layernorm"): # Phi-like + elif hasattr(self.model, "model") and hasattr( + self.model.model, "final_layernorm" + ): # Phi-like return self.model.model.final_layernorm else: - raise AttributeError("Could not find norm in the model. Please check the model structure.") + raise AttributeError( + "Could not find norm in the model. Please check the model structure." + ) @norm.setter def norm(self, value): - if hasattr(self.model, "model") and hasattr(self.model.model, "norm"): # Llama-like - setattr(self.model.model, "norm", value) - elif hasattr(self.model, "model") and hasattr(self.model.model, "final_layernorm"): # Phi-like - setattr(self.model.model, "final_layernorm", value) + if hasattr(self.model, "model") and hasattr( + self.model.model, "norm" + ): # Llama-like + self.model.model.norm = value + elif hasattr(self.model, "model") and hasattr( + self.model.model, "final_layernorm" + ): # Phi-like + self.model.model.final_layernorm = value else: - raise AttributeError("Could not find norm in the model. Please check the model structure.") + raise AttributeError( + "Could not find norm in the model. Please check the model structure." + ) @property def output(self): @@ -461,34 +538,52 @@ def output(self): return self.model.lm_head else: # Add more cases here if needed for other model architectures - raise AttributeError("Could not find output (lm_head) in the model. Please check the model structure.") + raise AttributeError( + "Could not find output (lm_head) in the model. Please check the model structure." + ) @output.setter def output(self, value): if hasattr(self.model, "lm_head"): # For models like LlamaForCausalLM - setattr(self.model, "lm_head", value) + self.model.lm_head = value else: - raise AttributeError("Could not find output (lm_head) in the model. Please check the model structure.") + raise AttributeError( + "Could not find output (lm_head) in the model. Please check the model structure." 
+ ) @property def rotary_emb(self): """Returns the model's rotary_emb, handling different Hugging Face model structures.""" - if hasattr(self.model, "model") and hasattr(self.model.model, "rotary_emb"): # Llama-like + if hasattr(self.model, "model") and hasattr( + self.model.model, "rotary_emb" + ): # Llama-like return self.model.model.rotary_emb else: - raise AttributeError("Could not find rotary_emb in the model. Please check the model structure.") + raise AttributeError( + "Could not find rotary_emb in the model. Please check the model structure." + ) @rotary_emb.setter def rotary_emb(self, value): - if hasattr(self.model, "model") and hasattr(self.model.model, "rotary_emb"): # Llama-like - setattr(self.model.model, "rotary_emb", value) + if hasattr(self.model, "model") and hasattr( + self.model.model, "rotary_emb" + ): # Llama-like + self.model.model.rotary_emb = value else: - raise AttributeError("Could not find rotary_emb in the model. Please check the model structure.") + raise AttributeError( + "Could not find rotary_emb in the model. Please check the model structure." + ) def forward(self, *args, **kwargs): local_seq_len = self.max_seq_len - local_seq_len //= self.cp_mesh.size() if self.cp_mesh is not None and self.cp_mesh.size() > 1 else 1 - kwargs["position_ids"] = torch.arange(local_seq_len, device=args[0].device).unsqueeze(0) + local_seq_len //= ( + self.cp_mesh.size() + if self.cp_mesh is not None and self.cp_mesh.size() > 1 + else 1 + ) + kwargs["position_ids"] = torch.arange( + local_seq_len, device=args[0].device + ).unsqueeze(0) output = self.model.model(*args, **kwargs) output = self.model.lm_head(output.last_hidden_state) return output @@ -512,11 +607,13 @@ def selective_init(module): self.model.apply(selective_init) - #TODO(3outeille): For pipeline parallel, only tie weights if both input and output embeddings are on the same device + # TODO(3outeille): For pipeline parallel, only tie weights if both input and output embeddings are on the same device # Maybe better way of handling this? - if not isinstance(self.tok_embeddings, nn.Identity) and not isinstance(self.output, nn.Identity): + if not isinstance(self.tok_embeddings, nn.Identity) and not isinstance( + self.output, nn.Identity + ): self.model.tie_weights() - + def named_children(self): """ Provides a flattened view of the model's main components, diff --git a/torchtitan/protocols/train_spec.py b/torchtitan/protocols/train_spec.py index f04d6ac269..81933604bd 100644 --- a/torchtitan/protocols/train_spec.py +++ b/torchtitan/protocols/train_spec.py @@ -4,8 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-from collections.abc import Callable import dataclasses +from collections.abc import Callable from dataclasses import dataclass from importlib import import_module from typing import Mapping, TypeAlias @@ -73,7 +73,7 @@ def register_train_spec(name: str, train_spec: TrainSpec) -> None: def get_train_spec(name: str) -> TrainSpec: # user-defined TrainSpec has higher priority global _extra_train_specs - if "/" in name: # HF model (dynamic loading) + if "/" in name: # HF model (dynamic loading) hf_spec = _extra_train_specs["hf_placeholder_name"] return dataclasses.replace(hf_spec, name=name) elif name in _extra_train_specs: diff --git a/torchtitan/train.py b/torchtitan/train.py index bc7c23daee..ed4c11298e 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -12,6 +12,8 @@ import torch from torch.distributed.elastic.multiprocessing.errors import record + +import torchtitan.experiments.transformers_backend # noqa: F401 # noqa: F401 import torchtitan.protocols.train_spec as train_spec_module from torchtitan.components.checkpoint import CheckpointManager from torchtitan.components.dataloader import DataloaderExhaustedError @@ -30,7 +32,7 @@ maybe_enable_memory_snapshot, maybe_enable_profiling, ) -import torchtitan.experiments.transformers_backend # noqa: F401 + class Trainer(torch.distributed.checkpoint.stateful.Stateful): # core configs @@ -432,7 +434,7 @@ def forward_backward_step( # apply context parallelism if cp is enabled # ensure CP handles the separate freqs_cis buffer for each pp stage cp_buffers = [inputs, labels] - cp_seq_dims = [1, 1] + cp_seq_dims = [1, 1] if hasattr(model_parts[0], "freqs_cis"): cp_buffers += [m.freqs_cis for m in model_parts] cp_seq_dims += [0 for _ in model_parts] From 9488a165e87bd9161603d847c344cd19ef3620af Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 21 Oct 2025 15:47:16 +0000 Subject: [PATCH 079/129] create CI jobs to guard --- .../integration_test_8gpu_huggingface.yaml | 55 ++++++++++++++ .../tests/integration_tests.py | 71 +++++++++++++++++++ 2 files changed, 126 insertions(+) create mode 100644 .github/workflows/integration_test_8gpu_huggingface.yaml create mode 100644 torchtitan/experiments/transformers_backend/tests/integration_tests.py diff --git a/.github/workflows/integration_test_8gpu_huggingface.yaml b/.github/workflows/integration_test_8gpu_huggingface.yaml new file mode 100644 index 0000000000..cde7959510 --- /dev/null +++ b/.github/workflows/integration_test_8gpu_huggingface.yaml @@ -0,0 +1,55 @@ +name: Transformers Backend 8 GPU Integration Tests + +on: + push: + branches: [ main ] + paths: + - 'torchtitan/experiments/transformers_backend/**' + pull_request: + paths: + - 'torchtitan/experiments/transformers_backend/**' + schedule: + # Runs every 12 hours + - cron: '0 */12 * * *' + +concurrency: + group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} + cancel-in-progress: true + +defaults: + run: + shell: bash -l -eo pipefail {0} + +jobs: + build-test: + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + with: + runner: linux.g5.48xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.6" + # This image is faster to clone than the default, but it lacks CC needed by triton + # (1m25s vs 2m37s). 
+ docker-image: torchtitan-ubuntu-20.04-clang12 + repository: pytorch/torchtitan + upload-artifact: outputs + script: | + set -eux + + # The generic Linux job chooses to use base env, not the one setup by the image + CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + conda activate "${CONDA_ENV}" + + # Log CUDA driver version for debugging. + DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true) + echo "CUDA driver version: ${DRIVER_VERSION}" + + pip config --user set global.progress_bar off + + python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126 + + USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 + + python -m pip install transformers==4.55.4 + + mkdir artifacts-to-be-uploaded + python -m torchtitan.experiments.transformers_backend.tests.integration_tests artifacts-to-be-uploaded --ngpu 8 diff --git a/torchtitan/experiments/transformers_backend/tests/integration_tests.py b/torchtitan/experiments/transformers_backend/tests/integration_tests.py new file mode 100644 index 0000000000..1f2a38d322 --- /dev/null +++ b/torchtitan/experiments/transformers_backend/tests/integration_tests.py @@ -0,0 +1,71 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import os + +from tests.integration_tests import OverrideDefinitions +from tests.integration_tests.run_tests import run_tests + + +def build_transformers_backend_test_list() -> list[OverrideDefinitions]: + """ + key is the config file name and value is a list of OverrideDefinitions + that is used to generate variations of integration tests based on the + same root config file. + """ + integration_tests_flavors = [ + OverrideDefinitions( + [ + [ + "--model.name meta-llama/Llama-3.2-1B", + "--training.dataset wikitext2-test", + "--parallelism.data_parallel_shard_degree 2", + "--parallelism.tensor_parallel_degree 2", + "--parallelism.pipeline_parallel_degree 2", + "--parallelism.pipeline_parallel_schedule 1F1B", + ], + ], + "Transformers Backend FSDP+TP+PP", + "transformers_backend_fsdp+tp+pp", + ngpu=8, + ), + ] + return integration_tests_flavors + + +_TEST_SUITES_FUNCTION = { + "transformers_backend": build_transformers_backend_test_list, +} + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("output_dir") + parser.add_argument( + "--config_path", + default="./tests/integration_tests/base_config.toml", + help="Base config path for integration tests. 
This is the config that will be used as a base for all tests.", + ) + parser.add_argument( + "--test_name", + default="all", + help="test to run, acceptable values: `test_name` in `build_test_list` (default: all)", + ) + parser.add_argument("--ngpu", default=8, type=int) + args = parser.parse_args() + + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + if os.listdir(args.output_dir): + raise RuntimeError("Please provide an empty output directory.") + + test_list = _TEST_SUITES_FUNCTION["transformers_backend"]()() + run_tests(args, test_list) + + +if __name__ == "__main__": + main() From e8a17577e30ec5b143509ea9f6454ad1539e7310 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 29 Oct 2025 13:40:08 +0000 Subject: [PATCH 080/129] update the way we register_train_spec --- torchtitan/config/job_config.py | 7 + .../transformers_backend/__init__.py | 29 +- .../configs/qwen3_fsdp2_tp2_pp2.toml | 5 +- .../transformers_backend/model/args.py | 2 +- .../model/hf_transformers_args.py | 782 ++++++++++++++++++ torchtitan/protocols/train_spec.py | 5 +- torchtitan/train.py | 1 - 7 files changed, 809 insertions(+), 22 deletions(-) create mode 100644 torchtitan/experiments/transformers_backend/model/hf_transformers_args.py diff --git a/torchtitan/config/job_config.py b/torchtitan/config/job_config.py index 7fe6802374..ee89d13627 100644 --- a/torchtitan/config/job_config.py +++ b/torchtitan/config/job_config.py @@ -131,6 +131,12 @@ class Model: """ +@dataclass +class HFTransformers: + model: str = "" + """HuggingFace model ID (e.g., 'Qwen/Qwen3-4B-Instruct-2507')""" + + @dataclass class Optimizer: name: str = "AdamW" @@ -897,6 +903,7 @@ class JobConfig: profiling: Profiling = field(default_factory=Profiling) metrics: Metrics = field(default_factory=Metrics) model: Model = field(default_factory=Model) + hf_transformers: HFTransformers = field(default_factory=HFTransformers) optimizer: Optimizer = field(default_factory=Optimizer) lr_scheduler: LRScheduler = field(default_factory=LRScheduler) training: Training = field(default_factory=Training) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 11bd36bc81..453cb338da 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -10,9 +10,9 @@ from torchtitan.components.lr_scheduler import build_lr_schedulers from torchtitan.components.optimizer import build_optimizers from torchtitan.components.tokenizer import build_hf_tokenizer -from torchtitan.datasets.hf_datasets import build_hf_dataloader +from torchtitan.hf_datasets.text_datasets import build_text_dataloader from torchtitan.models.moe import MoEArgs -from torchtitan.protocols.train_spec import register_train_spec, TrainSpec +from torchtitan.protocols.train_spec import TrainSpec from .infra.parallelize_hf_transformers import parallelize_hf_transformers @@ -110,16 +110,15 @@ class TitanMoeModelArgs: ), } -hf_train_spec = TrainSpec( - model_cls=HFTransformerModel, - model_args=flavors, - parallelize_fn=parallelize_hf_transformers, - pipelining_fn=pipeline_hf_transformers, - build_optimizers_fn=build_optimizers, - build_lr_schedulers_fn=build_lr_schedulers, - build_dataloader_fn=build_hf_dataloader, - build_tokenizer_fn=build_hf_tokenizer, - build_loss_fn=build_cross_entropy_loss, -) - -register_train_spec("hf_placeholder_name", hf_train_spec) +def get_train_spec() -> TrainSpec: + return TrainSpec( + model_cls=HFTransformerModel, + 
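# The section added below is consumed later in this patch by the transformers backend
# (model/args.py reads `job_config.hf_transformers.model` when calling
# AutoConfig.from_pretrained). A minimal TOML excerpt of how it is wired up, taken from
# the qwen3 config updated further down:
#
#   [model]
#   name = "transformers_backend"
#   flavor = "debugmodel"
#
#   [hf_transformers]
#   model = "Qwen/Qwen3-4B-Instruct-2507"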
model_args=flavors, + parallelize_fn=parallelize_hf_transformers, + pipelining_fn=pipeline_hf_transformers, + build_optimizers_fn=build_optimizers, + build_lr_schedulers_fn=build_lr_schedulers, + build_dataloader_fn=build_text_dataloader, + build_tokenizer_fn=build_hf_tokenizer, + build_loss_fn=build_cross_entropy_loss, + ) diff --git a/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml b/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml index 4e216baa77..2832304900 100644 --- a/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml +++ b/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml @@ -20,12 +20,15 @@ save_tb_folder = "tb" enable_wandb = false [model] -name = "Qwen/Qwen3-4B-Instruct-2507" +name = "transformers_backend" flavor = "debugmodel" # test folder with tokenizer.json, for debug purpose only hf_assets_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" # converters = ["float8"] +[hf_transformers] +model = "Qwen/Qwen3-4B-Instruct-2507" + [optimizer] name = "AdamW" lr = 8e-4 diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index 7181cb570a..bc150820ab 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -153,7 +153,7 @@ def __repr__(self) -> str: def update_from_config(self, job_config: JobConfig): # Load HF config (overwrites our HF attributes) hf_model_config = AutoConfig.from_pretrained( - job_config.model.name, + job_config.hf_transformers.model, attn_implementation=self.attn_implementation, trust_remote_code=True, ) diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py new file mode 100644 index 0000000000..5cda5b3b5d --- /dev/null +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -0,0 +1,782 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import importlib +from dataclasses import dataclass +import torch +from torch import nn +import math +from torch.nn import init +from torchtitan.config import JobConfig +from torchtitan.protocols import BaseModelArgs +from torchtitan.tools.logging import logger +from transformers import AutoConfig +from transformers.utils import is_torch_deterministic +from transformers.configuration_utils import PretrainedConfig +from transformers.modeling_utils import AttentionInterface, PreTrainedModel +from transformers.integrations.sdpa_attention import sdpa_attention_forward + +@dataclass +class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): + """ + Configuration class that bridges TorchTitan and HuggingFace Transformers naming conventions. + + Uses properties to provide TorchTitan-style access while maintaining HuggingFace compatibility. + Properties are created dynamically based on which arguments are provided. 
+ """ + + # Define all possible mappings organized by argument type + _TT_TO_HF_MAPPINGS = { + "base": { + # Core TorchTitan mappings (always available) + "dim": "hidden_size", + "n_layers": "num_hidden_layers", + "n_heads": "num_attention_heads", + "n_kv_heads": "num_key_value_heads", + "norm_eps": "rms_norm_eps", + "max_seq_len": "max_position_embeddings", + "eos_id": "eos_token_id", + }, + "deepseek_v3": { + # DeepSeekV3 specific mappings (only when deepseek_v3_args provided) + "inter_dim": "intermediate_size", + "n_dense_layers": "first_k_dense_replace", + }, + } + + def __init__( + self, + titan_args, + deepseek_v3_args=None, + # HuggingFace specific args + attn_implementation: str = "sdpa_torchtitan", + **kwargs, + ): + super().__init__(attn_implementation=attn_implementation, **kwargs) + assert titan_args is not None, "titan_args is required" + + active_mappings = {} + + active_mappings.update(self._TT_TO_HF_MAPPINGS["base"]) + + if deepseek_v3_args is not None: + active_mappings.update(self._TT_TO_HF_MAPPINGS["deepseek_v3"]) + + self._active_mappings = active_mappings + + self._create_dynamic_properties() + + # Set HF attributes from titan_args based on mappings + for titan_name, hf_name in self._active_mappings.items(): + if hasattr(titan_args, titan_name): + setattr(self, hf_name, getattr(titan_args, titan_name)) + + # Fill all TorchTitan-specific args (no HF equivalent) + self.multiple_of = titan_args.multiple_of + self.ffn_dim_multiplier = titan_args.ffn_dim_multiplier + self.depth_init = titan_args.depth_init + self.use_flex_attn = titan_args.use_flex_attn + self.attn_mask_type = titan_args.attn_mask_type + + # HuggingFace specific args + self.attn_implementation = attn_implementation + #NOTE:(3outeille):This will force create_causal_mask to return None + AttentionInterface._global_mapping[attn_implementation] = sdpa_attention_forward + + # Start with passed_args as just titan_args + self._passed_args = {**titan_args.__dict__, "attn_implementation": attn_implementation} + self._passed_args.update(kwargs) + + #NOTE(3outeille): Wait for transformers uniformization of MoE args + if deepseek_v3_args is not None: + # For DeepSeekV3, setting q_lora_rank to 0 in TorchTitan is equivalent to + # setting it to None in HuggingFace. 
+ q_lora_rank = deepseek_v3_args.q_lora_rank + if q_lora_rank == 0: + q_lora_rank = None + deepseek_v3_args.q_lora_rank = q_lora_rank + + self._passed_args.update(**deepseek_v3_args.__dict__) + + self.rope_interleave = deepseek_v3_args.rope_interleave + self.partial_rotary_factor = deepseek_v3_args.partial_rotary_factor + + if deepseek_v3_args.moe_args is not None: + moe_args = deepseek_v3_args.moe_args + self.num_experts_per_tok = moe_args.top_k + self.n_routed_experts = moe_args.num_experts + self.n_shared_experts = moe_args.num_shared_experts + self.moe_intermediate_size = deepseek_v3_args.moe_inter_dim + self._passed_args.update( + dict( + num_experts_per_tok=moe_args.top_k, + n_routed_experts=moe_args.num_experts, + n_shared_experts=moe_args.num_shared_experts, + moe_intermediate_size=deepseek_v3_args.moe_inter_dim, + ) + ) + + def _create_dynamic_properties(self): + """Create properties dynamically based on active mappings.""" + def _create_property(hf_name: str) -> property: + def getter(self): + return getattr(self, hf_name) + def setter(self, value): + setattr(self, hf_name, value) + return property(getter, setter) + + for titan_name, hf_name in self._active_mappings.items(): + # Create getter/setter for attribute that don't already exist + if not hasattr(self.__class__, titan_name): + setattr(self.__class__, titan_name, _create_property(hf_name)) + + def __repr__(self) -> str: + # HFTransformerModelArgs is a dataclass that also inherits from PretrainedConfig. + # PretrainedConfig has a __repr__ that serializes the object to JSON, but it + # doesn't work well with how HFTransformerModelArgs is initialized. + # This custom __repr__ provides a dataclass-like representation that correctly + # displays the arguments passed during initialization. 
+ args_lines = [ + f"{k}={getattr(self, k)!r}" + for k in sorted(self._passed_args.keys()) + if hasattr(self, k) + ] + args_str = "\n".join(args_lines) + return f"{self.__class__.__name__}(\n{args_str}\n)" + + def update_from_config(self, job_config: JobConfig): + # Load HF config (overwrites our HF attributes) + hf_model_config = AutoConfig.from_pretrained( + job_config.hf_transformers.model, + attn_implementation=self.attn_implementation, + trust_remote_code=True + ) + + # Explicitly update attributes based on mappings + for titan_name, hf_name in self._active_mappings.items(): + if hasattr(hf_model_config, hf_name): + setattr(self, titan_name, getattr(hf_model_config, hf_name)) + + # Copy any other attributes that might not be in the mapping + for key, value in hf_model_config.to_dict().items(): + setattr(self, key, value) + + # Update our attributes with the passed args from flavors + for key, value in self._passed_args.items(): + if hasattr(self, key) and value is not None: + setattr(self, key, value) + + # MoE + if hasattr(self, "qk_nope_head_dim") and hasattr(self, "qk_rope_head_dim"): + self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim + + # Configure HF-specific settings to match TorchTitan settings + self.tie_word_embeddings = False + self.attention_bias = False + self.mlp_bias = False + self.use_cache = False + self.initializer_range = 1.0 # use as std for normal init in embedding + + if not hasattr(self, "inter_dim"): # Only for llama model + ffn_hidden_size = 4 * self.dim + ffn_hidden_size = int(2 * ffn_hidden_size / 3) + if self.ffn_dim_multiplier is not None: + ffn_hidden_size = int(self.ffn_dim_multiplier * ffn_hidden_size) + self.intermediate_size = self.multiple_of * ( + (ffn_hidden_size + self.multiple_of - 1) // self.multiple_of + ) + + self.head_dim = self.dim // self.num_attention_heads + + return self + + def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: + # Check if this is a MoE model by looking for MoE attributes + is_moe = hasattr(self, 'n_routed_experts') + + if is_moe: + # MoE parameter counting (adapted from DeepSeek V3 implementation) + nparams_embedding = 0 + nparams_moe_router = 0 + nparams_shared_experts = 0 + nparams_experts = 0 + nparams_dense = 0 + + for name, p in model.named_parameters(): + if "embedding" in name: + nparams_embedding += p.numel() + nparams_dense += p.numel() + elif "moe.shared_experts" in name: + nparams_shared_experts += p.numel() + elif "moe.router" in name: + nparams_moe_router += p.numel() + elif "moe.experts" in name: + nparams_experts += p.numel() + else: + nparams_dense += p.numel() + + nparams_sparse = nparams_moe_router + nparams_shared_experts + nparams_experts + nparams = nparams_dense + nparams_sparse + nparams_sparse_active = ( + nparams_moe_router + + nparams_shared_experts + + nparams_experts * self.num_experts_per_tok // self.n_routed_experts + ) + + logger.info( + f"Total parameter count: dense {nparams_dense:,}, " + f"sparse {nparams_sparse:,}, active {nparams_dense + nparams_sparse_active:,}" + ) + + l, h, q, t = ( + self.n_layers, + self.n_heads, + self.dim // self.n_heads, + seq_len, + ) + # Use active parameters for FLOPS calculation in MoE + num_flops_per_token = ( + 6 * (nparams_dense - nparams_embedding + nparams_sparse_active) + + 12 * l * h * q * t + ) + else: + # Dense model parameter counting (original implementation) + nparams = sum(p.numel() for p in model.parameters()) + nparams_embedding = sum( + sum(p.numel() for p in m.parameters()) + for m in 
model.children() + if isinstance(m, nn.Embedding) + ) + + l, h, q, t = ( + self.n_layers, + self.n_heads, + self.dim // self.n_heads, + seq_len, + ) + # Reasoning behind the factor of 12 for the self-attention part of the formula: + # 1. each self-attention has 2 matmul in the forward and 4 in the backward (6) + # 2. the flash attention does 1 more matmul recomputation in the backward + # but recomputation should not be counted in calculating MFU (+0) + # 3. each matmul performs 1 multiplication and 1 addition (*2) + # 4. we follow the convention and do not account for sparsity in causal attention + num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t + + return nparams, num_flops_per_token + +class HFTransformerModel(nn.Module): + def __init__(self, model_args: HFTransformerModelArgs): + super().__init__() + + # Try to import the model class dynamically from the transformers library if not found in globals + model_class_name = model_args.architectures[0] + model_cls = globals().get(model_class_name, None) + if model_cls is None: + try: + transformers_mod = importlib.import_module("transformers") + model_cls = getattr(transformers_mod, model_class_name) + except (ImportError, AttributeError) as e: + raise ImportError( + f"Could not find model class '{model_class_name}' in globals or transformers. " + f"Make sure the class is available. Original error: {e}" + ) + + # Attempt to patch model weight initialization based on architecture type + try: + model_name_prefix = model_class_name.replace("ForCausalLM", "") + model_module = importlib.import_module(model_cls.__module__) + + attention_cls = getattr(model_module, f"{model_name_prefix}Attention", None) + mlp_cls = getattr(model_module, f"{model_name_prefix}MLP", None) + decoder_layer_cls = getattr(model_module, f"{model_name_prefix}DecoderLayer", None) + + is_moe = hasattr(model_args, "n_routed_experts") #TODO(3outeille): check if this is the most reliable to detect a moe model + if is_moe: + moe_cls = getattr(model_module, f"{model_name_prefix}MoE", None) + required_classes = { + "Attention": attention_cls, + "MLP": mlp_cls, + "DecoderLayer": decoder_layer_cls, + "MoE": moe_cls + } + + if all(required_classes.values()): + logger.info(f"Applying MoE-like patch for {model_name_prefix}") + self._patch_hf_moe_like( + decoder_layer_cls=decoder_layer_cls, + attention_cls=attention_cls, + mlp_cls=mlp_cls, + moe_cls=moe_cls + ) + else: + missing = [name for name, cls in required_classes.items() if not cls] + logger.warning( + f"Could not find required classes ({', '.join(missing)}) for MoE patching of {model_name_prefix}. " + "Skipping MoE-like patch." + ) + else: + required_classes = { + "Attention": attention_cls, + "DecoderLayer": decoder_layer_cls + } + + if all(required_classes.values()): + logger.info(f"Applying Llama-like patch for {model_name_prefix}") + self._patch_hf_llama_like( + decoder_layer_cls=decoder_layer_cls, + attention_cls=attention_cls, + mlp_cls=mlp_cls # mlp_cls can be None + ) + else: + missing = [name for name, cls in required_classes.items() if not cls] + logger.warning( + f"Could not find required classes ({', '.join(missing)}) for {model_name_prefix}. " + "Skipping Llama-like patch." + ) + + except Exception as e: + logger.warning( + f"Failed to apply agnostic patch for {model_class_name} due to: {e}. " + "Weight initialization might not match TorchTitan." 
+ ) + + self.model = model_cls(config=model_args) + self.max_seq_len = model_args.max_seq_len + + for layer in self.model.model.layers: + if hasattr(model_args, "first_k_dense_replace") and layer.layer_idx >= model_args.first_k_dense_replace: + layer.moe_enabled = True + else: + layer.moe_enabled = False + + self.cp_mesh = None + self.tp_mesh = None + self.pp_mesh = None + + def set_cp_mesh(self, mesh): + self.cp_mesh = mesh + + def set_tp_mesh(self, mesh): + self.tp_mesh = mesh + + def set_pp_mesh(self, mesh): + self.pp_mesh = mesh + + def _patch_hf_llama_like(self, decoder_layer_cls, attention_cls, mlp_cls=None): + """ + This patch modifies a Hugging Face Llama-like model's weight initialization to match + the initialization scheme used in TorchTitan. This is crucial for ensuring + bit-for-bit reproducibility when converting checkpoints between the native + TorchTitan format and the Hugging Face format. + + The patch targets the following aspects of the model: + - `PreTrainedModel._initialize_weights`: Handles meta device initialization correctly. + - `PreTrainedModel._init_weights`: Implements TorchTitan's specific initialization + for attention, MLP, embedding, and layer norm layers. This includes depth-dependent + initialization for attention and MLP layers. + - `DecoderLayer.__init__`: Adds `layer_idx` to attention and MLP modules within + each decoder layer, which is required for the depth-dependent initialization. + """ + + _original_decoder_layer_init = decoder_layer_cls.__init__ + + def _decoder_layer_init_patched(self, config: PretrainedConfig, layer_idx: int): + _original_decoder_layer_init(self, config, layer_idx) + self.layer_idx = layer_idx + # Ensure both attention and mlp modules have layer_idx for depth-based init + if hasattr(self, "self_attn"): + self.self_attn.layer_idx = layer_idx + # some models might not have mlp in each layer + if hasattr(self, "mlp") and self.mlp is not None: + self.mlp.layer_idx = layer_idx + + def _initialize_weights_patched(self, module): + # NOTE(3outeille): monkey-patch PreTrainedModel to handle meta device initialization correctly + # The default _initialize_weights sets _is_hf_initialized = True even on a meta device, + # which prevents subsequent proper initialization. + if getattr(module, "_is_hf_initialized", False): + return + + for param in module.parameters(recurse=True): + if param.device.type == "meta": + return + + # If not on a meta device, call the original weight initialization + self._init_weights(module) + module._is_hf_initialized = True + + def _init_weights_patched(self, module): + """ + Patched version of _init_weights to match TorchTitan's initialization for Llama-like models. + `self` is a PreTrainedModel instance. 
+ """ + config = self.config + + # Build tuple of classes to check for layer_idx-based init_std calculation + layer_idx_classes = [attention_cls] + if mlp_cls: + layer_idx_classes.append(mlp_cls) + layer_idx_classes = tuple(layer_idx_classes) + + if isinstance(module, layer_idx_classes): + if not hasattr(module, "layer_idx"): + return + layer_idx = module.layer_idx + + if hasattr(config, "depth_init") and config.depth_init: + init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 + else: + init_std = 0.02 / (2 * config.num_hidden_layers) ** 0.5 + + if isinstance(module, attention_cls): + # Initialize weights and biases for q, k, v projections + for proj_name in ["q_proj", "k_proj", "v_proj"]: + proj = getattr(module, proj_name) + nn.init.trunc_normal_(proj.weight, mean=0.0, std=0.02) + if proj.bias is not None: + fan_in, _ = init._calculate_fan_in_and_fan_out(proj.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + init.uniform_(proj.bias, -bound, bound) + + # Handle different names for the output projection layer + o_proj = getattr(module, "o_proj", getattr(module, "dense", None)) + if o_proj is not None: + nn.init.trunc_normal_(o_proj.weight, mean=0.0, std=init_std) + if o_proj.bias is not None: + fan_in, _ = init._calculate_fan_in_and_fan_out(o_proj.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + init.uniform_(o_proj.bias, -bound, bound) + + elif mlp_cls and isinstance(module, mlp_cls): + # Handle different names for MLP layers + gate_proj = getattr(module, "gate_proj", getattr(module, "fc1", None)) + up_proj = getattr(module, "up_proj", None) + down_proj = getattr(module, "down_proj", getattr(module, "fc2", None)) + + # gate_proj (or fc1) should always use std=0.02 for numerical stability. + if gate_proj is not None: + nn.init.trunc_normal_(gate_proj.weight, mean=0.0, std=0.02) + if gate_proj.bias is not None: + fan_in, _ = init._calculate_fan_in_and_fan_out(gate_proj.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + init.uniform_(gate_proj.bias, -bound, bound) + # up_proj and down_proj (or fc2) use the depth-dependent init_std. 
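+                # [Editor's note, not part of the original patch] Illustrative numbers for the
+                # depth-dependent std computed above: with depth_init=True and layer_idx=3,
+                # init_std = 0.02 / (2 * 4) ** 0.5 ~ 0.0071; with depth_init=False and
+                # num_hidden_layers=6, init_std = 0.02 / 12 ** 0.5 ~ 0.0058.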
+ if up_proj is not None: + nn.init.trunc_normal_(up_proj.weight, mean=0.0, std=init_std) + if up_proj.bias is not None: + fan_in, _ = init._calculate_fan_in_and_fan_out(up_proj.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + init.uniform_(up_proj.bias, -bound, bound) + if down_proj is not None: + nn.init.trunc_normal_(down_proj.weight, mean=0.0, std=init_std) + if down_proj.bias is not None: + fan_in, _ = init._calculate_fan_in_and_fan_out(down_proj.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + init.uniform_(down_proj.bias, -bound, bound) + + elif module is getattr( + self, "lm_head", None + ): # TODO(3outeille): find a better way to detect lm_head + final_out_std = config.hidden_size**-0.5 + cutoff_factor = 3 + nn.init.trunc_normal_( + module.weight, + mean=0.0, + std=final_out_std, + a=-cutoff_factor * final_out_std, + b=cutoff_factor * final_out_std, + ) + if module.bias is not None: + module.bias.data.zero_() + + elif isinstance(module, nn.Embedding): + std = config.initializer_range + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + elif ( + isinstance( + module, (nn.GroupNorm, nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d) + ) + or "LayerNorm" in module.__class__.__name__ + or "RMSNorm" in module.__class__.__name__ + ): + # Norms can exist without weights (in which case they are None from torch primitives) + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(1.0) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.zero_() + + decoder_layer_cls.__init__ = _decoder_layer_init_patched + PreTrainedModel._init_weights = _init_weights_patched + PreTrainedModel._initialize_weights = _initialize_weights_patched + + def _patch_hf_moe_like(self, decoder_layer_cls, attention_cls, mlp_cls, moe_cls): + """ + This patch modifies a Hugging Face MoE (Mixture-of-Experts) model's weight + initialization to match the initialization scheme used in TorchTitan, + drawing from patterns in models like DeepseekV3. + + The patch targets: + - `PreTrainedModel._initialize_weights`: For correct meta device initialization. + - `PreTrainedModel._init_weights`: To implement TorchTitan's specific initialization + for attention, MLP, MoE, embedding, and layer norm layers. + - `DecoderLayer.__init__`: Adds `layer_idx` to attention, MLP, and MoE expert + modules, required for depth-dependent initialization. + """ + + _original_decoder_layer_init = decoder_layer_cls.__init__ + + def _decoder_layer_init_patched(self, config: PretrainedConfig, layer_idx: int): + _original_decoder_layer_init(self, config, layer_idx) + self.layer_idx = layer_idx + + if hasattr(self, "self_attn"): + self.self_attn.layer_idx = layer_idx + + if hasattr(self, "mlp"): + self.mlp.layer_idx = layer_idx + if hasattr(self.mlp, "experts"): + for expert in self.mlp.experts: + expert.layer_idx = layer_idx + if hasattr(self.mlp, "shared_experts"): + # Not all MoE models have shared experts + if self.mlp.shared_experts is not None: + self.mlp.shared_experts.layer_idx = layer_idx + + def _initialize_weights_patched(self, module): + if getattr(module, "_is_hf_initialized", False): + return + for param in module.parameters(recurse=True): + if param.device.type == "meta": + return + self._init_weights(module) + module._is_hf_initialized = True + + def _init_weights_patched(self, module): + """ + Patched version of _init_weights for MoE models. 
+ """ + config = self.config + init_std = None + + if isinstance(module, (attention_cls, mlp_cls, moe_cls)): + if hasattr(module, "layer_idx"): + layer_idx = module.layer_idx + if hasattr(config, "depth_init") and config.depth_init: + init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 + else: + # Fallback for models without depth_init + init_std = 0.02 / (2 * config.num_hidden_layers) ** 0.5 + + if isinstance(module, attention_cls): + # Handle different attention projection layer names by initializing if they exist + if hasattr(module, "q_proj"): + nn.init.trunc_normal_(module.q_proj.weight, mean=0.0, std=0.02) + if hasattr(module, "k_proj"): + nn.init.trunc_normal_(module.k_proj.weight, mean=0.0, std=0.02) + if hasattr(module, "v_proj"): + nn.init.trunc_normal_(module.v_proj.weight, mean=0.0, std=0.02) + + if hasattr(module, "q_a_proj"): + nn.init.trunc_normal_(module.q_a_proj.weight, mean=0.0, std=0.02) + if hasattr(module, "q_b_proj"): + nn.init.trunc_normal_(module.q_b_proj.weight, mean=0.0, std=0.02) + + if hasattr(module, "kv_a_proj_with_mqa"): + nn.init.trunc_normal_(module.kv_a_proj_with_mqa.weight, mean=0.0, std=0.02) + if hasattr(module, "kv_b_proj"): + nn.init.trunc_normal_(module.kv_b_proj.weight, mean=0.0, std=0.02) + + if hasattr(module, "o_proj") and init_std is not None: + nn.init.trunc_normal_(module.o_proj.weight, mean=0.0, std=init_std) + + elif isinstance(module, mlp_cls): + nn.init.trunc_normal_(module.gate_proj.weight, mean=0.0, std=0.02) + # DeepseekV3 uses std=0.02 for up_proj, unlike Llama + nn.init.trunc_normal_(module.up_proj.weight, mean=0.0, std=0.02) + if init_std is not None: + nn.init.trunc_normal_(module.down_proj.weight, mean=0.0, std=init_std) + + elif isinstance(module, moe_cls): + if hasattr(module, "gate") and init_std is not None: + nn.init.trunc_normal_(module.gate.weight, mean=0.0, std=init_std) + if hasattr(module, "experts"): + for expert in module.experts: + nn.init.trunc_normal_(expert.gate_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(expert.up_proj.weight, mean=0.0, std=0.02) + if init_std is not None: + nn.init.trunc_normal_(expert.down_proj.weight, mean=0.0, std=init_std) + if hasattr(module, "shared_experts") and module.shared_experts is not None: + nn.init.trunc_normal_(module.shared_experts.gate_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(module.shared_experts.up_proj.weight, mean=0.0, std=0.02) + if init_std is not None: + nn.init.trunc_normal_(module.shared_experts.down_proj.weight, mean=0.0, std=init_std) + + elif module is getattr(self, "lm_head", None): + final_out_std = config.hidden_size**-0.5 + cutoff_factor = 3 + nn.init.trunc_normal_( + module.weight, + mean=0.0, + std=final_out_std, + a=-cutoff_factor * final_out_std, + b=cutoff_factor * final_out_std, + ) + if module.bias is not None: + module.bias.data.zero_() + + elif isinstance(module, nn.Embedding): + std = config.initializer_range + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + elif "LayerNorm" in module.__class__.__name__ or "RMSNorm" in module.__class__.__name__: + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(1.0) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.zero_() + + decoder_layer_cls.__init__ = _decoder_layer_init_patched + PreTrainedModel._init_weights = _init_weights_patched + PreTrainedModel._initialize_weights = _initialize_weights_patched + + @property + def tok_embeddings(self): 
+ """Returns the model's embed_tokens, handling different Hugging Face model structures.""" + if hasattr(self.model, "model") and hasattr(self.model.model, "embed_tokens"): # Llama-like + return self.model.model.embed_tokens + else: + raise AttributeError("Could not find embed_tokens in the model. Please check the model structure.") + + @tok_embeddings.setter + def tok_embeddings(self, value): + if hasattr(self.model, "model") and hasattr(self.model.model, "embed_tokens"): # Llama-like + setattr(self.model.model, "embed_tokens", value) + else: + raise AttributeError("Could not find embed_tokens in the model. Please check the model structure.") + + @property + def layers(self): + """Returns the model's layers, handling different Hugging Face model structures.""" + if hasattr(self.model, "model") and hasattr(self.model.model, "layers"): # Llama-like + return self.model.model.layers + else: + # Add more cases here if needed for other model architectures + raise AttributeError("Could not find layers in the model. Please check the model structure.") + + @layers.setter + def layers(self, value): + if hasattr(self.model, "model") and hasattr(self.model.model, "layers"): # Llama-like + setattr(self.model.model, "layers", value) + else: + raise AttributeError("Could not find layers in the model. Please check the model structure.") + + @property + def norm(self): + """Returns the model's norm, handling different Hugging Face model structures.""" + if hasattr(self.model, "model") and hasattr(self.model.model, "norm"): # Llama-like + return self.model.model.norm + elif hasattr(self.model, "model") and hasattr(self.model.model, "final_layernorm"): # Phi-like + return self.model.model.final_layernorm + else: + raise AttributeError("Could not find norm in the model. Please check the model structure.") + + @norm.setter + def norm(self, value): + if hasattr(self.model, "model") and hasattr(self.model.model, "norm"): # Llama-like + setattr(self.model.model, "norm", value) + elif hasattr(self.model, "model") and hasattr(self.model.model, "final_layernorm"): # Phi-like + setattr(self.model.model, "final_layernorm", value) + else: + raise AttributeError("Could not find norm in the model. Please check the model structure.") + + @property + def output(self): + """Returns the model's output layer, handling different Hugging Face model structures.""" + if hasattr(self.model, "lm_head"): # For models like LlamaForCausalLM + return self.model.lm_head + else: + # Add more cases here if needed for other model architectures + raise AttributeError("Could not find output (lm_head) in the model. Please check the model structure.") + + @output.setter + def output(self, value): + if hasattr(self.model, "lm_head"): # For models like LlamaForCausalLM + setattr(self.model, "lm_head", value) + else: + raise AttributeError("Could not find output (lm_head) in the model. Please check the model structure.") + + @property + def rotary_emb(self): + """Returns the model's rotary_emb, handling different Hugging Face model structures.""" + if hasattr(self.model, "model") and hasattr(self.model.model, "rotary_emb"): # Llama-like + return self.model.model.rotary_emb + else: + raise AttributeError("Could not find rotary_emb in the model. Please check the model structure.") + + @rotary_emb.setter + def rotary_emb(self, value): + if hasattr(self.model, "model") and hasattr(self.model.model, "rotary_emb"): # Llama-like + setattr(self.model.model, "rotary_emb", value) + else: + raise AttributeError("Could not find rotary_emb in the model. 
Please check the model structure.") + + def forward(self, *args, **kwargs): + # local_seq_len = self.max_seq_len + # local_seq_len //= self.cp_mesh.size() if self.cp_mesh is not None and self.cp_mesh.size() > 1 else 1 + # kwargs["position_ids"] = torch.arange(local_seq_len, device=args[0].device).unsqueeze(0) + output = self.model.model(*args, **kwargs) + output = self.model.lm_head(output.last_hidden_state) + return output + + def init_weights(self, *args, **kwargs): + # This method replicates the behavior of the original PreTrainedModel.init_weights, + # but with a custom weight initialization function that skips nn.Identity modules (when PP is enabled) + + if self.model.config.pruned_heads: + logger.info("Pruning heads as per model configuration.") + self.model.prune_heads(self.model.config.pruned_heads) + + original_init_weights_fn = self.model._init_weights + + def selective_init(module): + # For pipeline parallel, we need to skip nn.Identity modules + if not isinstance(module, nn.Identity): + original_init_weights_fn(module) + else: + logger.info("Skipping nn.Identity module during weight initialization.") + + self.model.apply(selective_init) + + self.model.tie_weights() + + def named_children(self): + """ + Provides a flattened view of the model's main components, + making it compatible with TorchTitan's expectations. + """ + yield "tok_embeddings", self.tok_embeddings + yield "layers", self.layers + yield "norm", self.norm + yield "output", self.output + yield "rotary_emb", self.rotary_emb + + def __setattr__(self, name, value): + # If a property with a setter exists for this name, use it. + # This is to bypass the nn.Module.__setattr__ logic that + # directly registers modules and skips property setters. + cls = self.__class__ + if hasattr(cls, name): + prop = getattr(cls, name) + if isinstance(prop, property) and prop.fset is not None: + prop.fset(self, value) + return + + # Otherwise, fall back to the default nn.Module behavior. 
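+        # [Editor's note, not part of the original patch] Attributes with no property defined
+        # on the class (e.g. self.model, self.cp_mesh, self.max_seq_len set in __init__) take
+        # this default path, so self.model is still registered as a submodule by nn.Module.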
+ super().__setattr__(name, value) \ No newline at end of file diff --git a/torchtitan/protocols/train_spec.py b/torchtitan/protocols/train_spec.py index 81933604bd..c5bd62793b 100644 --- a/torchtitan/protocols/train_spec.py +++ b/torchtitan/protocols/train_spec.py @@ -73,10 +73,7 @@ def register_train_spec(name: str, train_spec: TrainSpec) -> None: def get_train_spec(name: str) -> TrainSpec: # user-defined TrainSpec has higher priority global _extra_train_specs - if "/" in name: # HF model (dynamic loading) - hf_spec = _extra_train_specs["hf_placeholder_name"] - return dataclasses.replace(hf_spec, name=name) - elif name in _extra_train_specs: + if name in _extra_train_specs: return _extra_train_specs[name] from torchtitan.experiments import _supported_experiments diff --git a/torchtitan/train.py b/torchtitan/train.py index 59813638fe..d4de8bc5d4 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -13,7 +13,6 @@ import torch from torch.distributed.elastic.multiprocessing.errors import record -import torchtitan.experiments.transformers_backend # noqa: F401 # noqa: F401 import torchtitan.protocols.train_spec as train_spec_module from torchtitan.components.checkpoint import CheckpointManager from torchtitan.components.dataloader import DataloaderExhaustedError From 141c377c75cd6b4c2f12ad9b335ffb53bff0b656 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 29 Oct 2025 13:44:38 +0000 Subject: [PATCH 081/129] relative path for qwen3_fsdp2_tp2_pp2.toml --- .../transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml b/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml index 2832304900..d1433bb7ed 100644 --- a/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml +++ b/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml @@ -23,7 +23,7 @@ enable_wandb = false name = "transformers_backend" flavor = "debugmodel" # test folder with tokenizer.json, for debug purpose only -hf_assets_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" +hf_assets_path = "./tests/assets/tokenizer" # converters = ["float8"] [hf_transformers] @@ -47,7 +47,7 @@ seq_len = 2048 max_norm = 1.0 # grad norm clipping steps = 10 dataset = "c4_test" # supported datasets: c4_test (2K), c4 (177M) -dataset_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test" +dataset_path = "./tests/assets/c4_test" mixed_precision_param = "float32" # force float32 for comparison mixed_precision_reduce = "float32" From a67e971d7250747f656d8e9aa143cb4c51eaf713 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 29 Oct 2025 14:25:51 +0000 Subject: [PATCH 082/129] dont use os.environ, use debugmodel or debugmodel_moe --- .../transformers_backend/__init__.py | 13 +- .../transformers_backend/model/args.py | 34 +- .../model/hf_transformers_args.py | 782 ------------------ 3 files changed, 42 insertions(+), 787 deletions(-) delete mode 100644 torchtitan/experiments/transformers_backend/model/hf_transformers_args.py diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 453cb338da..1c44b9684c 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -3,7 +3,6 @@ # # This source code is licensed under the BSD-style license 
found in the # LICENSE file in the root directory of this source tree. -import os from dataclasses import dataclass from torchtitan.components.loss import build_cross_entropy_loss @@ -80,6 +79,14 @@ class TitanMoeModelArgs: n_heads=16, n_kv_heads=16, ), + ), + "debugmodel_moe": HFTransformerModelArgs( + titan_dense_args=TitanDenseModelArgs( + dim=256, + n_layers=6, + n_heads=16, + n_kv_heads=16, + ), titan_moe_args=TitanMoeModelArgs( partial_rotary_factor=4.0, inter_dim=1024, @@ -101,9 +108,7 @@ class TitanMoeModelArgs: route_norm=True, score_before_experts=False, ), - ) - if os.environ.get("USE_MOE", "0") == "1" - else None, + ), ), "full": HFTransformerModelArgs( titan_dense_args=TitanDenseModelArgs(), diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index bc150820ab..4837e9527a 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -103,19 +103,51 @@ def __init__( titan_moe_args.q_lora_rank = q_lora_rank self._passed_args.update(**titan_moe_args.__dict__) - + if titan_moe_args.moe_args is not None: moe_args = titan_moe_args.moe_args + + # Store moe_args for nparams/flops calculation + self.moe_args = moe_args self.num_experts_per_tok = moe_args.top_k self.n_routed_experts = moe_args.num_experts self.n_shared_experts = moe_args.num_shared_experts self.moe_intermediate_size = titan_moe_args.moe_inter_dim + + # Set MoE-specific attributes directly on config for model access + if hasattr(titan_moe_args, 'rope_interleave'): + self.rope_interleave = titan_moe_args.rope_interleave + if hasattr(titan_moe_args, 'partial_rotary_factor'): + self.partial_rotary_factor = titan_moe_args.partial_rotary_factor + if hasattr(titan_moe_args, 'n_group'): + self.n_group = titan_moe_args.n_group + if hasattr(titan_moe_args, 'topk_group'): + self.topk_group = titan_moe_args.topk_group + if hasattr(titan_moe_args, 'kv_lora_rank'): + self.kv_lora_rank = titan_moe_args.kv_lora_rank + if hasattr(titan_moe_args, 'q_lora_rank'): + self.q_lora_rank = q_lora_rank # Use the modified version (0 -> None) + if hasattr(titan_moe_args, 'qk_nope_head_dim'): + self.qk_nope_head_dim = titan_moe_args.qk_nope_head_dim + if hasattr(titan_moe_args, 'qk_rope_head_dim'): + self.qk_rope_head_dim = titan_moe_args.qk_rope_head_dim + if hasattr(titan_moe_args, 'v_head_dim'): + self.v_head_dim = titan_moe_args.v_head_dim + self._passed_args.update( dict( num_experts_per_tok=moe_args.top_k, n_routed_experts=moe_args.num_experts, n_shared_experts=moe_args.num_shared_experts, moe_intermediate_size=titan_moe_args.moe_inter_dim, + rope_interleave=titan_moe_args.rope_interleave, + partial_rotary_factor=titan_moe_args.partial_rotary_factor, + n_group=titan_moe_args.n_group, + topk_group=titan_moe_args.topk_group, + kv_lora_rank=titan_moe_args.kv_lora_rank, + qk_nope_head_dim=titan_moe_args.qk_nope_head_dim, + qk_rope_head_dim=titan_moe_args.qk_rope_head_dim, + v_head_dim=titan_moe_args.v_head_dim, ) ) diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py deleted file mode 100644 index 5cda5b3b5d..0000000000 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ /dev/null @@ -1,782 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. 
-# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -import importlib -from dataclasses import dataclass -import torch -from torch import nn -import math -from torch.nn import init -from torchtitan.config import JobConfig -from torchtitan.protocols import BaseModelArgs -from torchtitan.tools.logging import logger -from transformers import AutoConfig -from transformers.utils import is_torch_deterministic -from transformers.configuration_utils import PretrainedConfig -from transformers.modeling_utils import AttentionInterface, PreTrainedModel -from transformers.integrations.sdpa_attention import sdpa_attention_forward - -@dataclass -class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): - """ - Configuration class that bridges TorchTitan and HuggingFace Transformers naming conventions. - - Uses properties to provide TorchTitan-style access while maintaining HuggingFace compatibility. - Properties are created dynamically based on which arguments are provided. - """ - - # Define all possible mappings organized by argument type - _TT_TO_HF_MAPPINGS = { - "base": { - # Core TorchTitan mappings (always available) - "dim": "hidden_size", - "n_layers": "num_hidden_layers", - "n_heads": "num_attention_heads", - "n_kv_heads": "num_key_value_heads", - "norm_eps": "rms_norm_eps", - "max_seq_len": "max_position_embeddings", - "eos_id": "eos_token_id", - }, - "deepseek_v3": { - # DeepSeekV3 specific mappings (only when deepseek_v3_args provided) - "inter_dim": "intermediate_size", - "n_dense_layers": "first_k_dense_replace", - }, - } - - def __init__( - self, - titan_args, - deepseek_v3_args=None, - # HuggingFace specific args - attn_implementation: str = "sdpa_torchtitan", - **kwargs, - ): - super().__init__(attn_implementation=attn_implementation, **kwargs) - assert titan_args is not None, "titan_args is required" - - active_mappings = {} - - active_mappings.update(self._TT_TO_HF_MAPPINGS["base"]) - - if deepseek_v3_args is not None: - active_mappings.update(self._TT_TO_HF_MAPPINGS["deepseek_v3"]) - - self._active_mappings = active_mappings - - self._create_dynamic_properties() - - # Set HF attributes from titan_args based on mappings - for titan_name, hf_name in self._active_mappings.items(): - if hasattr(titan_args, titan_name): - setattr(self, hf_name, getattr(titan_args, titan_name)) - - # Fill all TorchTitan-specific args (no HF equivalent) - self.multiple_of = titan_args.multiple_of - self.ffn_dim_multiplier = titan_args.ffn_dim_multiplier - self.depth_init = titan_args.depth_init - self.use_flex_attn = titan_args.use_flex_attn - self.attn_mask_type = titan_args.attn_mask_type - - # HuggingFace specific args - self.attn_implementation = attn_implementation - #NOTE:(3outeille):This will force create_causal_mask to return None - AttentionInterface._global_mapping[attn_implementation] = sdpa_attention_forward - - # Start with passed_args as just titan_args - self._passed_args = {**titan_args.__dict__, "attn_implementation": attn_implementation} - self._passed_args.update(kwargs) - - #NOTE(3outeille): Wait for transformers uniformization of MoE args - if deepseek_v3_args is not None: - # For DeepSeekV3, setting q_lora_rank to 0 in TorchTitan is equivalent to - # setting it to None in HuggingFace. 
- q_lora_rank = deepseek_v3_args.q_lora_rank - if q_lora_rank == 0: - q_lora_rank = None - deepseek_v3_args.q_lora_rank = q_lora_rank - - self._passed_args.update(**deepseek_v3_args.__dict__) - - self.rope_interleave = deepseek_v3_args.rope_interleave - self.partial_rotary_factor = deepseek_v3_args.partial_rotary_factor - - if deepseek_v3_args.moe_args is not None: - moe_args = deepseek_v3_args.moe_args - self.num_experts_per_tok = moe_args.top_k - self.n_routed_experts = moe_args.num_experts - self.n_shared_experts = moe_args.num_shared_experts - self.moe_intermediate_size = deepseek_v3_args.moe_inter_dim - self._passed_args.update( - dict( - num_experts_per_tok=moe_args.top_k, - n_routed_experts=moe_args.num_experts, - n_shared_experts=moe_args.num_shared_experts, - moe_intermediate_size=deepseek_v3_args.moe_inter_dim, - ) - ) - - def _create_dynamic_properties(self): - """Create properties dynamically based on active mappings.""" - def _create_property(hf_name: str) -> property: - def getter(self): - return getattr(self, hf_name) - def setter(self, value): - setattr(self, hf_name, value) - return property(getter, setter) - - for titan_name, hf_name in self._active_mappings.items(): - # Create getter/setter for attribute that don't already exist - if not hasattr(self.__class__, titan_name): - setattr(self.__class__, titan_name, _create_property(hf_name)) - - def __repr__(self) -> str: - # HFTransformerModelArgs is a dataclass that also inherits from PretrainedConfig. - # PretrainedConfig has a __repr__ that serializes the object to JSON, but it - # doesn't work well with how HFTransformerModelArgs is initialized. - # This custom __repr__ provides a dataclass-like representation that correctly - # displays the arguments passed during initialization. 
- args_lines = [ - f"{k}={getattr(self, k)!r}" - for k in sorted(self._passed_args.keys()) - if hasattr(self, k) - ] - args_str = "\n".join(args_lines) - return f"{self.__class__.__name__}(\n{args_str}\n)" - - def update_from_config(self, job_config: JobConfig): - # Load HF config (overwrites our HF attributes) - hf_model_config = AutoConfig.from_pretrained( - job_config.hf_transformers.model, - attn_implementation=self.attn_implementation, - trust_remote_code=True - ) - - # Explicitly update attributes based on mappings - for titan_name, hf_name in self._active_mappings.items(): - if hasattr(hf_model_config, hf_name): - setattr(self, titan_name, getattr(hf_model_config, hf_name)) - - # Copy any other attributes that might not be in the mapping - for key, value in hf_model_config.to_dict().items(): - setattr(self, key, value) - - # Update our attributes with the passed args from flavors - for key, value in self._passed_args.items(): - if hasattr(self, key) and value is not None: - setattr(self, key, value) - - # MoE - if hasattr(self, "qk_nope_head_dim") and hasattr(self, "qk_rope_head_dim"): - self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim - - # Configure HF-specific settings to match TorchTitan settings - self.tie_word_embeddings = False - self.attention_bias = False - self.mlp_bias = False - self.use_cache = False - self.initializer_range = 1.0 # use as std for normal init in embedding - - if not hasattr(self, "inter_dim"): # Only for llama model - ffn_hidden_size = 4 * self.dim - ffn_hidden_size = int(2 * ffn_hidden_size / 3) - if self.ffn_dim_multiplier is not None: - ffn_hidden_size = int(self.ffn_dim_multiplier * ffn_hidden_size) - self.intermediate_size = self.multiple_of * ( - (ffn_hidden_size + self.multiple_of - 1) // self.multiple_of - ) - - self.head_dim = self.dim // self.num_attention_heads - - return self - - def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: - # Check if this is a MoE model by looking for MoE attributes - is_moe = hasattr(self, 'n_routed_experts') - - if is_moe: - # MoE parameter counting (adapted from DeepSeek V3 implementation) - nparams_embedding = 0 - nparams_moe_router = 0 - nparams_shared_experts = 0 - nparams_experts = 0 - nparams_dense = 0 - - for name, p in model.named_parameters(): - if "embedding" in name: - nparams_embedding += p.numel() - nparams_dense += p.numel() - elif "moe.shared_experts" in name: - nparams_shared_experts += p.numel() - elif "moe.router" in name: - nparams_moe_router += p.numel() - elif "moe.experts" in name: - nparams_experts += p.numel() - else: - nparams_dense += p.numel() - - nparams_sparse = nparams_moe_router + nparams_shared_experts + nparams_experts - nparams = nparams_dense + nparams_sparse - nparams_sparse_active = ( - nparams_moe_router - + nparams_shared_experts - + nparams_experts * self.num_experts_per_tok // self.n_routed_experts - ) - - logger.info( - f"Total parameter count: dense {nparams_dense:,}, " - f"sparse {nparams_sparse:,}, active {nparams_dense + nparams_sparse_active:,}" - ) - - l, h, q, t = ( - self.n_layers, - self.n_heads, - self.dim // self.n_heads, - seq_len, - ) - # Use active parameters for FLOPS calculation in MoE - num_flops_per_token = ( - 6 * (nparams_dense - nparams_embedding + nparams_sparse_active) - + 12 * l * h * q * t - ) - else: - # Dense model parameter counting (original implementation) - nparams = sum(p.numel() for p in model.parameters()) - nparams_embedding = sum( - sum(p.numel() for p in m.parameters()) - for m in 
model.children() - if isinstance(m, nn.Embedding) - ) - - l, h, q, t = ( - self.n_layers, - self.n_heads, - self.dim // self.n_heads, - seq_len, - ) - # Reasoning behind the factor of 12 for the self-attention part of the formula: - # 1. each self-attention has 2 matmul in the forward and 4 in the backward (6) - # 2. the flash attention does 1 more matmul recomputation in the backward - # but recomputation should not be counted in calculating MFU (+0) - # 3. each matmul performs 1 multiplication and 1 addition (*2) - # 4. we follow the convention and do not account for sparsity in causal attention - num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t - - return nparams, num_flops_per_token - -class HFTransformerModel(nn.Module): - def __init__(self, model_args: HFTransformerModelArgs): - super().__init__() - - # Try to import the model class dynamically from the transformers library if not found in globals - model_class_name = model_args.architectures[0] - model_cls = globals().get(model_class_name, None) - if model_cls is None: - try: - transformers_mod = importlib.import_module("transformers") - model_cls = getattr(transformers_mod, model_class_name) - except (ImportError, AttributeError) as e: - raise ImportError( - f"Could not find model class '{model_class_name}' in globals or transformers. " - f"Make sure the class is available. Original error: {e}" - ) - - # Attempt to patch model weight initialization based on architecture type - try: - model_name_prefix = model_class_name.replace("ForCausalLM", "") - model_module = importlib.import_module(model_cls.__module__) - - attention_cls = getattr(model_module, f"{model_name_prefix}Attention", None) - mlp_cls = getattr(model_module, f"{model_name_prefix}MLP", None) - decoder_layer_cls = getattr(model_module, f"{model_name_prefix}DecoderLayer", None) - - is_moe = hasattr(model_args, "n_routed_experts") #TODO(3outeille): check if this is the most reliable to detect a moe model - if is_moe: - moe_cls = getattr(model_module, f"{model_name_prefix}MoE", None) - required_classes = { - "Attention": attention_cls, - "MLP": mlp_cls, - "DecoderLayer": decoder_layer_cls, - "MoE": moe_cls - } - - if all(required_classes.values()): - logger.info(f"Applying MoE-like patch for {model_name_prefix}") - self._patch_hf_moe_like( - decoder_layer_cls=decoder_layer_cls, - attention_cls=attention_cls, - mlp_cls=mlp_cls, - moe_cls=moe_cls - ) - else: - missing = [name for name, cls in required_classes.items() if not cls] - logger.warning( - f"Could not find required classes ({', '.join(missing)}) for MoE patching of {model_name_prefix}. " - "Skipping MoE-like patch." - ) - else: - required_classes = { - "Attention": attention_cls, - "DecoderLayer": decoder_layer_cls - } - - if all(required_classes.values()): - logger.info(f"Applying Llama-like patch for {model_name_prefix}") - self._patch_hf_llama_like( - decoder_layer_cls=decoder_layer_cls, - attention_cls=attention_cls, - mlp_cls=mlp_cls # mlp_cls can be None - ) - else: - missing = [name for name, cls in required_classes.items() if not cls] - logger.warning( - f"Could not find required classes ({', '.join(missing)}) for {model_name_prefix}. " - "Skipping Llama-like patch." - ) - - except Exception as e: - logger.warning( - f"Failed to apply agnostic patch for {model_class_name} due to: {e}. " - "Weight initialization might not match TorchTitan." 
- ) - - self.model = model_cls(config=model_args) - self.max_seq_len = model_args.max_seq_len - - for layer in self.model.model.layers: - if hasattr(model_args, "first_k_dense_replace") and layer.layer_idx >= model_args.first_k_dense_replace: - layer.moe_enabled = True - else: - layer.moe_enabled = False - - self.cp_mesh = None - self.tp_mesh = None - self.pp_mesh = None - - def set_cp_mesh(self, mesh): - self.cp_mesh = mesh - - def set_tp_mesh(self, mesh): - self.tp_mesh = mesh - - def set_pp_mesh(self, mesh): - self.pp_mesh = mesh - - def _patch_hf_llama_like(self, decoder_layer_cls, attention_cls, mlp_cls=None): - """ - This patch modifies a Hugging Face Llama-like model's weight initialization to match - the initialization scheme used in TorchTitan. This is crucial for ensuring - bit-for-bit reproducibility when converting checkpoints between the native - TorchTitan format and the Hugging Face format. - - The patch targets the following aspects of the model: - - `PreTrainedModel._initialize_weights`: Handles meta device initialization correctly. - - `PreTrainedModel._init_weights`: Implements TorchTitan's specific initialization - for attention, MLP, embedding, and layer norm layers. This includes depth-dependent - initialization for attention and MLP layers. - - `DecoderLayer.__init__`: Adds `layer_idx` to attention and MLP modules within - each decoder layer, which is required for the depth-dependent initialization. - """ - - _original_decoder_layer_init = decoder_layer_cls.__init__ - - def _decoder_layer_init_patched(self, config: PretrainedConfig, layer_idx: int): - _original_decoder_layer_init(self, config, layer_idx) - self.layer_idx = layer_idx - # Ensure both attention and mlp modules have layer_idx for depth-based init - if hasattr(self, "self_attn"): - self.self_attn.layer_idx = layer_idx - # some models might not have mlp in each layer - if hasattr(self, "mlp") and self.mlp is not None: - self.mlp.layer_idx = layer_idx - - def _initialize_weights_patched(self, module): - # NOTE(3outeille): monkey-patch PreTrainedModel to handle meta device initialization correctly - # The default _initialize_weights sets _is_hf_initialized = True even on a meta device, - # which prevents subsequent proper initialization. - if getattr(module, "_is_hf_initialized", False): - return - - for param in module.parameters(recurse=True): - if param.device.type == "meta": - return - - # If not on a meta device, call the original weight initialization - self._init_weights(module) - module._is_hf_initialized = True - - def _init_weights_patched(self, module): - """ - Patched version of _init_weights to match TorchTitan's initialization for Llama-like models. - `self` is a PreTrainedModel instance. 
- """ - config = self.config - - # Build tuple of classes to check for layer_idx-based init_std calculation - layer_idx_classes = [attention_cls] - if mlp_cls: - layer_idx_classes.append(mlp_cls) - layer_idx_classes = tuple(layer_idx_classes) - - if isinstance(module, layer_idx_classes): - if not hasattr(module, "layer_idx"): - return - layer_idx = module.layer_idx - - if hasattr(config, "depth_init") and config.depth_init: - init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 - else: - init_std = 0.02 / (2 * config.num_hidden_layers) ** 0.5 - - if isinstance(module, attention_cls): - # Initialize weights and biases for q, k, v projections - for proj_name in ["q_proj", "k_proj", "v_proj"]: - proj = getattr(module, proj_name) - nn.init.trunc_normal_(proj.weight, mean=0.0, std=0.02) - if proj.bias is not None: - fan_in, _ = init._calculate_fan_in_and_fan_out(proj.weight) - bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 - init.uniform_(proj.bias, -bound, bound) - - # Handle different names for the output projection layer - o_proj = getattr(module, "o_proj", getattr(module, "dense", None)) - if o_proj is not None: - nn.init.trunc_normal_(o_proj.weight, mean=0.0, std=init_std) - if o_proj.bias is not None: - fan_in, _ = init._calculate_fan_in_and_fan_out(o_proj.weight) - bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 - init.uniform_(o_proj.bias, -bound, bound) - - elif mlp_cls and isinstance(module, mlp_cls): - # Handle different names for MLP layers - gate_proj = getattr(module, "gate_proj", getattr(module, "fc1", None)) - up_proj = getattr(module, "up_proj", None) - down_proj = getattr(module, "down_proj", getattr(module, "fc2", None)) - - # gate_proj (or fc1) should always use std=0.02 for numerical stability. - if gate_proj is not None: - nn.init.trunc_normal_(gate_proj.weight, mean=0.0, std=0.02) - if gate_proj.bias is not None: - fan_in, _ = init._calculate_fan_in_and_fan_out(gate_proj.weight) - bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 - init.uniform_(gate_proj.bias, -bound, bound) - # up_proj and down_proj (or fc2) use the depth-dependent init_std. 
- if up_proj is not None: - nn.init.trunc_normal_(up_proj.weight, mean=0.0, std=init_std) - if up_proj.bias is not None: - fan_in, _ = init._calculate_fan_in_and_fan_out(up_proj.weight) - bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 - init.uniform_(up_proj.bias, -bound, bound) - if down_proj is not None: - nn.init.trunc_normal_(down_proj.weight, mean=0.0, std=init_std) - if down_proj.bias is not None: - fan_in, _ = init._calculate_fan_in_and_fan_out(down_proj.weight) - bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 - init.uniform_(down_proj.bias, -bound, bound) - - elif module is getattr( - self, "lm_head", None - ): # TODO(3outeille): find a better way to detect lm_head - final_out_std = config.hidden_size**-0.5 - cutoff_factor = 3 - nn.init.trunc_normal_( - module.weight, - mean=0.0, - std=final_out_std, - a=-cutoff_factor * final_out_std, - b=cutoff_factor * final_out_std, - ) - if module.bias is not None: - module.bias.data.zero_() - - elif isinstance(module, nn.Embedding): - std = config.initializer_range - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - elif ( - isinstance( - module, (nn.GroupNorm, nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d) - ) - or "LayerNorm" in module.__class__.__name__ - or "RMSNorm" in module.__class__.__name__ - ): - # Norms can exist without weights (in which case they are None from torch primitives) - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(1.0) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.zero_() - - decoder_layer_cls.__init__ = _decoder_layer_init_patched - PreTrainedModel._init_weights = _init_weights_patched - PreTrainedModel._initialize_weights = _initialize_weights_patched - - def _patch_hf_moe_like(self, decoder_layer_cls, attention_cls, mlp_cls, moe_cls): - """ - This patch modifies a Hugging Face MoE (Mixture-of-Experts) model's weight - initialization to match the initialization scheme used in TorchTitan, - drawing from patterns in models like DeepseekV3. - - The patch targets: - - `PreTrainedModel._initialize_weights`: For correct meta device initialization. - - `PreTrainedModel._init_weights`: To implement TorchTitan's specific initialization - for attention, MLP, MoE, embedding, and layer norm layers. - - `DecoderLayer.__init__`: Adds `layer_idx` to attention, MLP, and MoE expert - modules, required for depth-dependent initialization. - """ - - _original_decoder_layer_init = decoder_layer_cls.__init__ - - def _decoder_layer_init_patched(self, config: PretrainedConfig, layer_idx: int): - _original_decoder_layer_init(self, config, layer_idx) - self.layer_idx = layer_idx - - if hasattr(self, "self_attn"): - self.self_attn.layer_idx = layer_idx - - if hasattr(self, "mlp"): - self.mlp.layer_idx = layer_idx - if hasattr(self.mlp, "experts"): - for expert in self.mlp.experts: - expert.layer_idx = layer_idx - if hasattr(self.mlp, "shared_experts"): - # Not all MoE models have shared experts - if self.mlp.shared_experts is not None: - self.mlp.shared_experts.layer_idx = layer_idx - - def _initialize_weights_patched(self, module): - if getattr(module, "_is_hf_initialized", False): - return - for param in module.parameters(recurse=True): - if param.device.type == "meta": - return - self._init_weights(module) - module._is_hf_initialized = True - - def _init_weights_patched(self, module): - """ - Patched version of _init_weights for MoE models. 
- """ - config = self.config - init_std = None - - if isinstance(module, (attention_cls, mlp_cls, moe_cls)): - if hasattr(module, "layer_idx"): - layer_idx = module.layer_idx - if hasattr(config, "depth_init") and config.depth_init: - init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 - else: - # Fallback for models without depth_init - init_std = 0.02 / (2 * config.num_hidden_layers) ** 0.5 - - if isinstance(module, attention_cls): - # Handle different attention projection layer names by initializing if they exist - if hasattr(module, "q_proj"): - nn.init.trunc_normal_(module.q_proj.weight, mean=0.0, std=0.02) - if hasattr(module, "k_proj"): - nn.init.trunc_normal_(module.k_proj.weight, mean=0.0, std=0.02) - if hasattr(module, "v_proj"): - nn.init.trunc_normal_(module.v_proj.weight, mean=0.0, std=0.02) - - if hasattr(module, "q_a_proj"): - nn.init.trunc_normal_(module.q_a_proj.weight, mean=0.0, std=0.02) - if hasattr(module, "q_b_proj"): - nn.init.trunc_normal_(module.q_b_proj.weight, mean=0.0, std=0.02) - - if hasattr(module, "kv_a_proj_with_mqa"): - nn.init.trunc_normal_(module.kv_a_proj_with_mqa.weight, mean=0.0, std=0.02) - if hasattr(module, "kv_b_proj"): - nn.init.trunc_normal_(module.kv_b_proj.weight, mean=0.0, std=0.02) - - if hasattr(module, "o_proj") and init_std is not None: - nn.init.trunc_normal_(module.o_proj.weight, mean=0.0, std=init_std) - - elif isinstance(module, mlp_cls): - nn.init.trunc_normal_(module.gate_proj.weight, mean=0.0, std=0.02) - # DeepseekV3 uses std=0.02 for up_proj, unlike Llama - nn.init.trunc_normal_(module.up_proj.weight, mean=0.0, std=0.02) - if init_std is not None: - nn.init.trunc_normal_(module.down_proj.weight, mean=0.0, std=init_std) - - elif isinstance(module, moe_cls): - if hasattr(module, "gate") and init_std is not None: - nn.init.trunc_normal_(module.gate.weight, mean=0.0, std=init_std) - if hasattr(module, "experts"): - for expert in module.experts: - nn.init.trunc_normal_(expert.gate_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(expert.up_proj.weight, mean=0.0, std=0.02) - if init_std is not None: - nn.init.trunc_normal_(expert.down_proj.weight, mean=0.0, std=init_std) - if hasattr(module, "shared_experts") and module.shared_experts is not None: - nn.init.trunc_normal_(module.shared_experts.gate_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(module.shared_experts.up_proj.weight, mean=0.0, std=0.02) - if init_std is not None: - nn.init.trunc_normal_(module.shared_experts.down_proj.weight, mean=0.0, std=init_std) - - elif module is getattr(self, "lm_head", None): - final_out_std = config.hidden_size**-0.5 - cutoff_factor = 3 - nn.init.trunc_normal_( - module.weight, - mean=0.0, - std=final_out_std, - a=-cutoff_factor * final_out_std, - b=cutoff_factor * final_out_std, - ) - if module.bias is not None: - module.bias.data.zero_() - - elif isinstance(module, nn.Embedding): - std = config.initializer_range - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - elif "LayerNorm" in module.__class__.__name__ or "RMSNorm" in module.__class__.__name__: - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(1.0) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.zero_() - - decoder_layer_cls.__init__ = _decoder_layer_init_patched - PreTrainedModel._init_weights = _init_weights_patched - PreTrainedModel._initialize_weights = _initialize_weights_patched - - @property - def tok_embeddings(self): 
- """Returns the model's embed_tokens, handling different Hugging Face model structures.""" - if hasattr(self.model, "model") and hasattr(self.model.model, "embed_tokens"): # Llama-like - return self.model.model.embed_tokens - else: - raise AttributeError("Could not find embed_tokens in the model. Please check the model structure.") - - @tok_embeddings.setter - def tok_embeddings(self, value): - if hasattr(self.model, "model") and hasattr(self.model.model, "embed_tokens"): # Llama-like - setattr(self.model.model, "embed_tokens", value) - else: - raise AttributeError("Could not find embed_tokens in the model. Please check the model structure.") - - @property - def layers(self): - """Returns the model's layers, handling different Hugging Face model structures.""" - if hasattr(self.model, "model") and hasattr(self.model.model, "layers"): # Llama-like - return self.model.model.layers - else: - # Add more cases here if needed for other model architectures - raise AttributeError("Could not find layers in the model. Please check the model structure.") - - @layers.setter - def layers(self, value): - if hasattr(self.model, "model") and hasattr(self.model.model, "layers"): # Llama-like - setattr(self.model.model, "layers", value) - else: - raise AttributeError("Could not find layers in the model. Please check the model structure.") - - @property - def norm(self): - """Returns the model's norm, handling different Hugging Face model structures.""" - if hasattr(self.model, "model") and hasattr(self.model.model, "norm"): # Llama-like - return self.model.model.norm - elif hasattr(self.model, "model") and hasattr(self.model.model, "final_layernorm"): # Phi-like - return self.model.model.final_layernorm - else: - raise AttributeError("Could not find norm in the model. Please check the model structure.") - - @norm.setter - def norm(self, value): - if hasattr(self.model, "model") and hasattr(self.model.model, "norm"): # Llama-like - setattr(self.model.model, "norm", value) - elif hasattr(self.model, "model") and hasattr(self.model.model, "final_layernorm"): # Phi-like - setattr(self.model.model, "final_layernorm", value) - else: - raise AttributeError("Could not find norm in the model. Please check the model structure.") - - @property - def output(self): - """Returns the model's output layer, handling different Hugging Face model structures.""" - if hasattr(self.model, "lm_head"): # For models like LlamaForCausalLM - return self.model.lm_head - else: - # Add more cases here if needed for other model architectures - raise AttributeError("Could not find output (lm_head) in the model. Please check the model structure.") - - @output.setter - def output(self, value): - if hasattr(self.model, "lm_head"): # For models like LlamaForCausalLM - setattr(self.model, "lm_head", value) - else: - raise AttributeError("Could not find output (lm_head) in the model. Please check the model structure.") - - @property - def rotary_emb(self): - """Returns the model's rotary_emb, handling different Hugging Face model structures.""" - if hasattr(self.model, "model") and hasattr(self.model.model, "rotary_emb"): # Llama-like - return self.model.model.rotary_emb - else: - raise AttributeError("Could not find rotary_emb in the model. Please check the model structure.") - - @rotary_emb.setter - def rotary_emb(self, value): - if hasattr(self.model, "model") and hasattr(self.model.model, "rotary_emb"): # Llama-like - setattr(self.model.model, "rotary_emb", value) - else: - raise AttributeError("Could not find rotary_emb in the model. 
Please check the model structure.") - - def forward(self, *args, **kwargs): - # local_seq_len = self.max_seq_len - # local_seq_len //= self.cp_mesh.size() if self.cp_mesh is not None and self.cp_mesh.size() > 1 else 1 - # kwargs["position_ids"] = torch.arange(local_seq_len, device=args[0].device).unsqueeze(0) - output = self.model.model(*args, **kwargs) - output = self.model.lm_head(output.last_hidden_state) - return output - - def init_weights(self, *args, **kwargs): - # This method replicates the behavior of the original PreTrainedModel.init_weights, - # but with a custom weight initialization function that skips nn.Identity modules (when PP is enabled) - - if self.model.config.pruned_heads: - logger.info("Pruning heads as per model configuration.") - self.model.prune_heads(self.model.config.pruned_heads) - - original_init_weights_fn = self.model._init_weights - - def selective_init(module): - # For pipeline parallel, we need to skip nn.Identity modules - if not isinstance(module, nn.Identity): - original_init_weights_fn(module) - else: - logger.info("Skipping nn.Identity module during weight initialization.") - - self.model.apply(selective_init) - - self.model.tie_weights() - - def named_children(self): - """ - Provides a flattened view of the model's main components, - making it compatible with TorchTitan's expectations. - """ - yield "tok_embeddings", self.tok_embeddings - yield "layers", self.layers - yield "norm", self.norm - yield "output", self.output - yield "rotary_emb", self.rotary_emb - - def __setattr__(self, name, value): - # If a property with a setter exists for this name, use it. - # This is to bypass the nn.Module.__setattr__ logic that - # directly registers modules and skips property setters. - cls = self.__class__ - if hasattr(cls, name): - prop = getattr(cls, name) - if isinstance(prop, property) and prop.fset is not None: - prop.fset(self, value) - return - - # Otherwise, fall back to the default nn.Module behavior. 
- super().__setattr__(name, value) \ No newline at end of file From 060befe7ae48026d806e82c60d1c26f0b2f4382a Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 30 Oct 2025 10:20:29 +0000 Subject: [PATCH 083/129] refactor args to make it clearer --- .../transformers_backend/model/args.py | 191 +++++++++--------- 1 file changed, 99 insertions(+), 92 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index 4837e9527a..d281d68b3c 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -46,6 +46,28 @@ class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): "n_dense_layers": "first_k_dense_replace", }, } + + # Declarative list of TorchTitan-only attributes (no HF equivalent) + _TT_SPECIFIC_ATTRIBUTES = [ + "multiple_of", + "ffn_dim_multiplier", + "depth_init", + "use_flex_attn", + "attn_mask_type", + ] + + # MoE attributes that should be copied directly + _MOE_SHARED_ATTRIBUTES = [ + "rope_interleave", + "partial_rotary_factor", + "n_group", + "topk_group", + "kv_lora_rank", + "q_lora_rank", + "qk_nope_head_dim", + "qk_rope_head_dim", + "v_head_dim", + ] def __init__( self, @@ -58,101 +80,81 @@ def __init__( super().__init__(attn_implementation=attn_implementation, **kwargs) assert titan_dense_args is not None, "titan_dense_args is required" - active_mappings = {} - - active_mappings.update(self._TT_TO_HF_MAPPINGS["dense"]) + # Create getter/setter dynamically for TT <-> HF attribute mappings + self._create_getter_setter_dynamically(titan_moe_args is not None) + + self._titan_injected_model_args = {} + self._titan_injected_model_args.update(kwargs) + self._configure_hf_attention(attn_implementation) + self._initialize_dense_attributes(titan_dense_args) + if titan_moe_args is not None: - active_mappings.update(self._TT_TO_HF_MAPPINGS["moe"]) - - self._active_mappings = active_mappings - - self._create_dynamic_properties() - - # Set HF attributes from titan_args based on mappings - for titan_name, hf_name in self._active_mappings.items(): + self._initialize_moe_attributes(titan_moe_args) + + def _initialize_dense_attributes(self, titan_dense_args): + """Initialize all dense model attributes.""" + # Set mapped attributes (TorchTitan <-> HuggingFace) + for titan_name, hf_name in self._tt_to_hf_attribute_map.items(): if hasattr(titan_dense_args, titan_name): - setattr(self, hf_name, getattr(titan_dense_args, titan_name)) - - # Fill all TorchTitan-specific args (no HF equivalent) - self.multiple_of = titan_dense_args.multiple_of - self.ffn_dim_multiplier = titan_dense_args.ffn_dim_multiplier - self.depth_init = titan_dense_args.depth_init - self.use_flex_attn = titan_dense_args.use_flex_attn - self.attn_mask_type = titan_dense_args.attn_mask_type - - # HuggingFace specific args + value = getattr(titan_dense_args, titan_name) + setattr(self, hf_name, value) + + # Set TorchTitan-only attributes + for attr_name in self._TT_SPECIFIC_ATTRIBUTES: + if hasattr(titan_dense_args, attr_name): + setattr(self, attr_name, getattr(titan_dense_args, attr_name)) + + # Update passed_args + self._titan_injected_model_args.update(titan_dense_args.__dict__) + + def _initialize_moe_attributes(self, titan_moe_args): + """Initialize all MoE-specific attributes.""" + if titan_moe_args.moe_args is None: + self._titan_injected_model_args.update(titan_moe_args.__dict__) + return + + moe_args = titan_moe_args.moe_args + + # Convert q_lora_rank (0 -> 
None for HuggingFace compatibility) + self.q_lora_rank = None if titan_moe_args.q_lora_rank == 0 else titan_moe_args.q_lora_rank + + # Set core MoE attributes + self.moe_args = moe_args + self.num_experts_per_tok = moe_args.top_k + self.n_routed_experts = moe_args.num_experts + self.n_shared_experts = moe_args.num_shared_experts + self.moe_intermediate_size = titan_moe_args.moe_inter_dim + + # Set remaining architecture-specific MoE attributes + for attr in self._MOE_SHARED_ATTRIBUTES: + if attr == "q_lora_rank": + continue # Already set above + if hasattr(titan_moe_args, attr): + setattr(self, attr, getattr(titan_moe_args, attr)) + + # Track all MoE arguments + self._titan_injected_model_args.update(titan_moe_args.__dict__) + self._titan_injected_model_args.update({ + "num_experts_per_tok": moe_args.top_k, + "n_routed_experts": moe_args.num_experts, + "n_shared_experts": moe_args.num_shared_experts, + "moe_intermediate_size": titan_moe_args.moe_inter_dim, + "q_lora_rank": self.q_lora_rank, + }) + + def _configure_hf_attention(self, attn_implementation: str): + """Configure HuggingFace attention settings.""" + self._titan_injected_model_args["attn_implementation"] = attn_implementation self.attn_implementation = attn_implementation # NOTE:(3outeille):This will force create_causal_mask to return None AttentionInterface._global_mapping[attn_implementation] = sdpa_attention_forward - # Start with passed_args as just titan_args - self._passed_args = { - **titan_dense_args.__dict__, - "attn_implementation": attn_implementation, - } - self._passed_args.update(kwargs) - - # NOTE(3outeille): Wait for transformers uniformization of MoE args - if titan_moe_args is not None: - # For DeepSeekV3, setting q_lora_rank to 0 in TorchTitan is equivalent to - # setting it to None in HuggingFace. 
- q_lora_rank = titan_moe_args.q_lora_rank - if q_lora_rank == 0: - q_lora_rank = None - titan_moe_args.q_lora_rank = q_lora_rank - - self._passed_args.update(**titan_moe_args.__dict__) - - if titan_moe_args.moe_args is not None: - moe_args = titan_moe_args.moe_args - - # Store moe_args for nparams/flops calculation - self.moe_args = moe_args - self.num_experts_per_tok = moe_args.top_k - self.n_routed_experts = moe_args.num_experts - self.n_shared_experts = moe_args.num_shared_experts - self.moe_intermediate_size = titan_moe_args.moe_inter_dim - - # Set MoE-specific attributes directly on config for model access - if hasattr(titan_moe_args, 'rope_interleave'): - self.rope_interleave = titan_moe_args.rope_interleave - if hasattr(titan_moe_args, 'partial_rotary_factor'): - self.partial_rotary_factor = titan_moe_args.partial_rotary_factor - if hasattr(titan_moe_args, 'n_group'): - self.n_group = titan_moe_args.n_group - if hasattr(titan_moe_args, 'topk_group'): - self.topk_group = titan_moe_args.topk_group - if hasattr(titan_moe_args, 'kv_lora_rank'): - self.kv_lora_rank = titan_moe_args.kv_lora_rank - if hasattr(titan_moe_args, 'q_lora_rank'): - self.q_lora_rank = q_lora_rank # Use the modified version (0 -> None) - if hasattr(titan_moe_args, 'qk_nope_head_dim'): - self.qk_nope_head_dim = titan_moe_args.qk_nope_head_dim - if hasattr(titan_moe_args, 'qk_rope_head_dim'): - self.qk_rope_head_dim = titan_moe_args.qk_rope_head_dim - if hasattr(titan_moe_args, 'v_head_dim'): - self.v_head_dim = titan_moe_args.v_head_dim - - self._passed_args.update( - dict( - num_experts_per_tok=moe_args.top_k, - n_routed_experts=moe_args.num_experts, - n_shared_experts=moe_args.num_shared_experts, - moe_intermediate_size=titan_moe_args.moe_inter_dim, - rope_interleave=titan_moe_args.rope_interleave, - partial_rotary_factor=titan_moe_args.partial_rotary_factor, - n_group=titan_moe_args.n_group, - topk_group=titan_moe_args.topk_group, - kv_lora_rank=titan_moe_args.kv_lora_rank, - qk_nope_head_dim=titan_moe_args.qk_nope_head_dim, - qk_rope_head_dim=titan_moe_args.qk_rope_head_dim, - v_head_dim=titan_moe_args.v_head_dim, - ) - ) - - def _create_dynamic_properties(self): - """Create properties dynamically based on active mappings.""" + def _create_getter_setter_dynamically(self, has_moe: bool): + """ + Create properties dynamically based on tt and hf attribute mappings. + For example, creates a property 'dim' that reads/writes to 'hidden_size'. + """ def _create_property(hf_name: str) -> property: def getter(self): @@ -162,8 +164,13 @@ def setter(self, value): setattr(self, hf_name, value) return property(getter, setter) + + # Setup attribute mappings + self._tt_to_hf_attribute_map = dict(self._TT_TO_HF_MAPPINGS["dense"]) + if has_moe: + self._tt_to_hf_attribute_map.update(self._TT_TO_HF_MAPPINGS["moe"]) - for titan_name, hf_name in self._active_mappings.items(): + for titan_name, hf_name in self._tt_to_hf_attribute_map.items(): # Create getter/setter for attribute that don't already exist if not hasattr(self.__class__, titan_name): setattr(self.__class__, titan_name, _create_property(hf_name)) @@ -176,7 +183,7 @@ def __repr__(self) -> str: # displays the arguments passed during initialization. 
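The `_create_property` factory above is what lets the TorchTitan-style names (`dim`, `n_layers`, ...) keep working as aliases for the HuggingFace config fields they map to. A self-contained sketch of the same pattern, using toy class and attribute names rather than the real `HFTransformerModelArgs`:

```python
class AliasedConfig:
    """Toy config: legacy names alias the canonical attributes."""

    _alias_map = {"dim": "hidden_size", "n_layers": "num_hidden_layers"}

    def __init__(self, hidden_size: int, num_hidden_layers: int):
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers


def _make_alias(target_name: str) -> property:
    # The factory captures target_name per property, avoiding the classic
    # late-binding pitfall of closures defined directly inside a loop.
    def getter(self):
        return getattr(self, target_name)

    def setter(self, value):
        setattr(self, target_name, value)

    return property(getter, setter)


for alias, target in AliasedConfig._alias_map.items():
    if not hasattr(AliasedConfig, alias):
        setattr(AliasedConfig, alias, _make_alias(target))

cfg = AliasedConfig(hidden_size=256, num_hidden_layers=6)
assert cfg.dim == 256            # reads hidden_size
cfg.n_layers = 8                 # writes num_hidden_layers
assert cfg.num_hidden_layers == 8
```

Binding `target_name` through a factory function, rather than a closure defined inline in the loop, is what keeps each alias pointing at its own HF attribute.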
args_lines = [ f"{k}={getattr(self, k)!r}" - for k in sorted(self._passed_args.keys()) + for k in sorted(self._titan_injected_model_args.keys()) if hasattr(self, k) ] args_str = "\n".join(args_lines) @@ -191,7 +198,7 @@ def update_from_config(self, job_config: JobConfig): ) # Explicitly update attributes based on mappings - for titan_name, hf_name in self._active_mappings.items(): + for titan_name, hf_name in self._tt_to_hf_attribute_map.items(): if hasattr(hf_model_config, hf_name): setattr(self, titan_name, getattr(hf_model_config, hf_name)) @@ -200,7 +207,7 @@ def update_from_config(self, job_config: JobConfig): setattr(self, key, value) # Update our attributes with the passed args from flavors - for key, value in self._passed_args.items(): + for key, value in self._titan_injected_model_args.items(): if hasattr(self, key) and value is not None: setattr(self, key, value) From 3425b12bf797cf87f99da2482256962020d291f0 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Fri, 31 Oct 2025 09:37:54 +0000 Subject: [PATCH 084/129] add README --- .../transformers_backend/README.md | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 torchtitan/experiments/transformers_backend/README.md diff --git a/torchtitan/experiments/transformers_backend/README.md b/torchtitan/experiments/transformers_backend/README.md new file mode 100644 index 0000000000..650855f28d --- /dev/null +++ b/torchtitan/experiments/transformers_backend/README.md @@ -0,0 +1,51 @@ +# Huggingface Transformers backend + +## Quick start + +- Requirements `transformers==4.55.4` + +- Config: `torchtitan/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml` +```diff +... +[model] +- name = "llama3" ++ name = "transformers_backend" +flavor = "debugmodel" +hf_assets_path = "./tests/assets/tokenizer" + ++[hf_transformers] ++model = "Qwen/Qwen3-4B-Instruct-2507" +... +``` +- Train: `LOG_RANK=7 CONFIG_FILE= + +## Supported Features + +- The following models were tested: + - Dense (FSDP/CP/TP/PP) + - `meta-llama/Llama-3.2-1B` + - `microsoft/phi-2` + - `Qwen/Qwen2.5-7B` + - `mistralai/Mistral-7B-v0.1` + - `ByteDance-Seed/Seed-Coder-8B-Instruct` + - `Qwen/Qwen3-4B-Instruct-2507` + - `arcee-ai/AFM-4.5B` + - `ibm-granite/granite-3b-code-base-2k` + - `baidu/ERNIE-4.5-0.3B-Base-PT` + - `kyutai/helium-1-preview-2b` + - `allenai/OLMo-7B-hf` + - `mistralai/Ministral-8B-Instruct-2410` + - MoE (upcoming) + +## Known issues to address later + +- When using HF modeling, the test `FSDP=2 vs FSDP=2 + PP=2`, the `loss` and `grad_norm` not bitwise matching (but converging) while it is the case with Torchtitan modeling. 
This will be addressed in another PR but the culprit is probably `register_buffer` when loading `seed_checkpoint` +- the HF modeling has lower MFU than Torchtitan MFU + +## Further work + +- Missing `build_optimizers_with_moe_load_balancing` support for MoE +- Missing TP/PP/EP supports for MoE +- Load HF weights +- Add LORA support \ No newline at end of file From 7b0ee5d5d72cac310104936a55ab857277e0d2b7 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Fri, 31 Oct 2025 09:38:47 +0000 Subject: [PATCH 085/129] add requirements.txt --- torchtitan/experiments/transformers_backend/requirements.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 torchtitan/experiments/transformers_backend/requirements.txt diff --git a/torchtitan/experiments/transformers_backend/requirements.txt b/torchtitan/experiments/transformers_backend/requirements.txt new file mode 100644 index 0000000000..6b0cc637db --- /dev/null +++ b/torchtitan/experiments/transformers_backend/requirements.txt @@ -0,0 +1 @@ +transformers==4.55.4 From 3e2222c702bddaf02d0554c68374b984cd05d1d3 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Fri, 31 Oct 2025 09:51:24 +0000 Subject: [PATCH 086/129] fix linting --- .../transformers_backend/README.md | 6 +-- .../transformers_backend/__init__.py | 1 + .../transformers_backend/model/args.py | 46 ++++++++++--------- torchtitan/protocols/train_spec.py | 1 - 4 files changed, 29 insertions(+), 25 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/README.md b/torchtitan/experiments/transformers_backend/README.md index 650855f28d..ce4d7ff7c8 100644 --- a/torchtitan/experiments/transformers_backend/README.md +++ b/torchtitan/experiments/transformers_backend/README.md @@ -4,12 +4,12 @@ - Requirements `transformers==4.55.4` -- Config: `torchtitan/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml` +- Config: `torchtitan/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml` ```diff ... 
[model] - name = "llama3" -+ name = "transformers_backend" ++ name = "transformers_backend" flavor = "debugmodel" hf_assets_path = "./tests/assets/tokenizer" @@ -48,4 +48,4 @@ hf_assets_path = "./tests/assets/tokenizer" - Missing `build_optimizers_with_moe_load_balancing` support for MoE - Missing TP/PP/EP supports for MoE - Load HF weights -- Add LORA support \ No newline at end of file +- Add LORA support diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 1c44b9684c..50e8119b15 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -115,6 +115,7 @@ class TitanMoeModelArgs: ), } + def get_train_spec() -> TrainSpec: return TrainSpec( model_cls=HFTransformerModel, diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index d281d68b3c..b9b79bda04 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -46,7 +46,7 @@ class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): "n_dense_layers": "first_k_dense_replace", }, } - + # Declarative list of TorchTitan-only attributes (no HF equivalent) _TT_SPECIFIC_ATTRIBUTES = [ "multiple_of", @@ -55,7 +55,7 @@ class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): "use_flex_attn", "attn_mask_type", ] - + # MoE attributes that should be copied directly _MOE_SHARED_ATTRIBUTES = [ "rope_interleave", @@ -82,16 +82,16 @@ def __init__( # Create getter/setter dynamically for TT <-> HF attribute mappings self._create_getter_setter_dynamically(titan_moe_args is not None) - + self._titan_injected_model_args = {} self._titan_injected_model_args.update(kwargs) self._configure_hf_attention(attn_implementation) self._initialize_dense_attributes(titan_dense_args) - + if titan_moe_args is not None: self._initialize_moe_attributes(titan_moe_args) - + def _initialize_dense_attributes(self, titan_dense_args): """Initialize all dense model attributes.""" # Set mapped attributes (TorchTitan <-> HuggingFace) @@ -99,12 +99,12 @@ def _initialize_dense_attributes(self, titan_dense_args): if hasattr(titan_dense_args, titan_name): value = getattr(titan_dense_args, titan_name) setattr(self, hf_name, value) - + # Set TorchTitan-only attributes for attr_name in self._TT_SPECIFIC_ATTRIBUTES: if hasattr(titan_dense_args, attr_name): setattr(self, attr_name, getattr(titan_dense_args, attr_name)) - + # Update passed_args self._titan_injected_model_args.update(titan_dense_args.__dict__) @@ -113,35 +113,39 @@ def _initialize_moe_attributes(self, titan_moe_args): if titan_moe_args.moe_args is None: self._titan_injected_model_args.update(titan_moe_args.__dict__) return - + moe_args = titan_moe_args.moe_args - + # Convert q_lora_rank (0 -> None for HuggingFace compatibility) - self.q_lora_rank = None if titan_moe_args.q_lora_rank == 0 else titan_moe_args.q_lora_rank - + self.q_lora_rank = ( + None if titan_moe_args.q_lora_rank == 0 else titan_moe_args.q_lora_rank + ) + # Set core MoE attributes self.moe_args = moe_args self.num_experts_per_tok = moe_args.top_k self.n_routed_experts = moe_args.num_experts self.n_shared_experts = moe_args.num_shared_experts self.moe_intermediate_size = titan_moe_args.moe_inter_dim - + # Set remaining architecture-specific MoE attributes for attr in self._MOE_SHARED_ATTRIBUTES: if attr == "q_lora_rank": continue # Already set above 
if hasattr(titan_moe_args, attr): setattr(self, attr, getattr(titan_moe_args, attr)) - + # Track all MoE arguments self._titan_injected_model_args.update(titan_moe_args.__dict__) - self._titan_injected_model_args.update({ - "num_experts_per_tok": moe_args.top_k, - "n_routed_experts": moe_args.num_experts, - "n_shared_experts": moe_args.num_shared_experts, - "moe_intermediate_size": titan_moe_args.moe_inter_dim, - "q_lora_rank": self.q_lora_rank, - }) + self._titan_injected_model_args.update( + { + "num_experts_per_tok": moe_args.top_k, + "n_routed_experts": moe_args.num_experts, + "n_shared_experts": moe_args.num_shared_experts, + "moe_intermediate_size": titan_moe_args.moe_inter_dim, + "q_lora_rank": self.q_lora_rank, + } + ) def _configure_hf_attention(self, attn_implementation: str): """Configure HuggingFace attention settings.""" @@ -164,7 +168,7 @@ def setter(self, value): setattr(self, hf_name, value) return property(getter, setter) - + # Setup attribute mappings self._tt_to_hf_attribute_map = dict(self._TT_TO_HF_MAPPINGS["dense"]) if has_moe: diff --git a/torchtitan/protocols/train_spec.py b/torchtitan/protocols/train_spec.py index c5bd62793b..1f7899e965 100644 --- a/torchtitan/protocols/train_spec.py +++ b/torchtitan/protocols/train_spec.py @@ -4,7 +4,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import dataclasses from collections.abc import Callable from dataclasses import dataclass from importlib import import_module From 70c348d1409e1f3ed566270a16329baccbe33585 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Sat, 1 Nov 2025 12:48:14 +0000 Subject: [PATCH 087/129] fix bug related to training with different seq_len than max_seq_len --- torchtitan/experiments/transformers_backend/model/args.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index b9b79bda04..285a82c5a8 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -215,6 +215,9 @@ def update_from_config(self, job_config: JobConfig): if hasattr(self, key) and value is not None: setattr(self, key, value) + if hasattr(job_config.training, 'seq_len') and job_config.training.seq_len != self.max_seq_len: + self.max_seq_len = job_config.training.seq_len + # MoE if hasattr(self, "qk_nope_head_dim") and hasattr(self, "qk_rope_head_dim"): self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim From af0a1cb76ed494adf847d8915d4dc38cf52c5497 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Sat, 1 Nov 2025 16:45:52 +0000 Subject: [PATCH 088/129] decouple MoE logic to another PR --- .../transformers_backend/__init__.py | 58 ---- .../transformers_backend/model/args.py | 82 +----- .../transformers_backend/model/model.py | 247 ++---------------- 3 files changed, 25 insertions(+), 362 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 50e8119b15..c4343b8cb7 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -10,7 +10,6 @@ from torchtitan.components.optimizer import build_optimizers from torchtitan.components.tokenizer import build_hf_tokenizer from torchtitan.hf_datasets.text_datasets import build_text_dataloader -from torchtitan.models.moe import MoEArgs from 
torchtitan.protocols.train_spec import TrainSpec from .infra.parallelize_hf_transformers import parallelize_hf_transformers @@ -44,33 +43,6 @@ class TitanDenseModelArgs: use_flex_attn: bool = False attn_mask_type: str = "causal" - -@dataclass -class TitanMoeModelArgs: - """Arguments specific to DeepSeekV3 models.""" - - moe_args: MoEArgs | None = None - n_group: int | None = None - topk_group: int | None = None - inter_dim: int | None = None - moe_inter_dim: int | None = None - n_dense_layers: int | None = None - n_expert_groups: int | None = None - n_limited_groups: int | None = None - q_lora_rank: int | None = None - kv_lora_rank: int | None = None - qk_nope_head_dim: int | None = None - qk_rope_head_dim: int | None = None - v_head_dim: int | None = None - original_seq_len: int | None = None - rope_factor: float | None = None - beta_fast: int | None = None - beta_slow: int | None = None - mscale: float | None = None - partial_rotary_factor: float | None = None - rope_interleave: bool = True - - flavors = { "debugmodel": HFTransformerModelArgs( titan_dense_args=TitanDenseModelArgs( @@ -80,36 +52,6 @@ class TitanMoeModelArgs: n_kv_heads=16, ), ), - "debugmodel_moe": HFTransformerModelArgs( - titan_dense_args=TitanDenseModelArgs( - dim=256, - n_layers=6, - n_heads=16, - n_kv_heads=16, - ), - titan_moe_args=TitanMoeModelArgs( - partial_rotary_factor=4.0, - inter_dim=1024, - moe_inter_dim=256, - n_dense_layers=1, - n_group=2, - topk_group=1, - kv_lora_rank=512, - q_lora_rank=0, - qk_nope_head_dim=128, - qk_rope_head_dim=64, - v_head_dim=128, - mscale=0.70, - moe_args=MoEArgs( - num_experts=8, - num_shared_experts=2, - top_k=3, - score_func="softmax", - route_norm=True, - score_before_experts=False, - ), - ), - ), "full": HFTransformerModelArgs( titan_dense_args=TitanDenseModelArgs(), ), diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index 285a82c5a8..2e90eea854 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -8,10 +8,7 @@ from torch import nn from torchtitan.config import JobConfig -from torchtitan.models.utils import ( - get_dense_model_nparams_and_flops, - get_moe_model_nparams_and_flops, -) +from torchtitan.models.utils import get_dense_model_nparams_and_flops from torchtitan.protocols import BaseModelArgs from transformers import AutoConfig from transformers.configuration_utils import PretrainedConfig @@ -39,12 +36,7 @@ class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): "norm_eps": "rms_norm_eps", "max_seq_len": "max_position_embeddings", "eos_id": "eos_token_id", - }, - "moe": { - # TorchTitan moe model specific mappings (only when titan_moe_args provided) - "inter_dim": "intermediate_size", - "n_dense_layers": "first_k_dense_replace", - }, + } } # Declarative list of TorchTitan-only attributes (no HF equivalent) @@ -56,23 +48,9 @@ class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): "attn_mask_type", ] - # MoE attributes that should be copied directly - _MOE_SHARED_ATTRIBUTES = [ - "rope_interleave", - "partial_rotary_factor", - "n_group", - "topk_group", - "kv_lora_rank", - "q_lora_rank", - "qk_nope_head_dim", - "qk_rope_head_dim", - "v_head_dim", - ] - def __init__( self, titan_dense_args, - titan_moe_args=None, # HuggingFace specific args attn_implementation: str = "sdpa_torchtitan", **kwargs, @@ -81,7 +59,7 @@ def __init__( assert titan_dense_args is not None, "titan_dense_args is required" 
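With the MoE path removed, a flavor entry in the `flavors` dict shown earlier is just an `HFTransformerModelArgs` wrapping a `TitanDenseModelArgs`. As a hedged illustration, assuming the `flavors`, `HFTransformerModelArgs`, and `TitanDenseModelArgs` names from that `__init__.py` are in scope, registering one more debug-sized flavor could look like this (the sizes are made up for the example):

```python
# Hypothetical extra entry next to "debugmodel"; field values are illustrative only.
flavors["debugmodel_tiny"] = HFTransformerModelArgs(
    titan_dense_args=TitanDenseModelArgs(
        dim=128,
        n_layers=2,
        n_heads=8,
        n_kv_heads=8,
    ),
)
```

A training config would then select it with `flavor = "debugmodel_tiny"` under `[model]`, the same way the existing configs select `debugmodel`.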
# Create getter/setter dynamically for TT <-> HF attribute mappings - self._create_getter_setter_dynamically(titan_moe_args is not None) + self._create_getter_setter_dynamically(has_moe=False) self._titan_injected_model_args = {} self._titan_injected_model_args.update(kwargs) @@ -89,9 +67,6 @@ def __init__( self._initialize_dense_attributes(titan_dense_args) - if titan_moe_args is not None: - self._initialize_moe_attributes(titan_moe_args) - def _initialize_dense_attributes(self, titan_dense_args): """Initialize all dense model attributes.""" # Set mapped attributes (TorchTitan <-> HuggingFace) @@ -107,46 +82,6 @@ def _initialize_dense_attributes(self, titan_dense_args): # Update passed_args self._titan_injected_model_args.update(titan_dense_args.__dict__) - - def _initialize_moe_attributes(self, titan_moe_args): - """Initialize all MoE-specific attributes.""" - if titan_moe_args.moe_args is None: - self._titan_injected_model_args.update(titan_moe_args.__dict__) - return - - moe_args = titan_moe_args.moe_args - - # Convert q_lora_rank (0 -> None for HuggingFace compatibility) - self.q_lora_rank = ( - None if titan_moe_args.q_lora_rank == 0 else titan_moe_args.q_lora_rank - ) - - # Set core MoE attributes - self.moe_args = moe_args - self.num_experts_per_tok = moe_args.top_k - self.n_routed_experts = moe_args.num_experts - self.n_shared_experts = moe_args.num_shared_experts - self.moe_intermediate_size = titan_moe_args.moe_inter_dim - - # Set remaining architecture-specific MoE attributes - for attr in self._MOE_SHARED_ATTRIBUTES: - if attr == "q_lora_rank": - continue # Already set above - if hasattr(titan_moe_args, attr): - setattr(self, attr, getattr(titan_moe_args, attr)) - - # Track all MoE arguments - self._titan_injected_model_args.update(titan_moe_args.__dict__) - self._titan_injected_model_args.update( - { - "num_experts_per_tok": moe_args.top_k, - "n_routed_experts": moe_args.num_experts, - "n_shared_experts": moe_args.num_shared_experts, - "moe_intermediate_size": titan_moe_args.moe_inter_dim, - "q_lora_rank": self.q_lora_rank, - } - ) - def _configure_hf_attention(self, attn_implementation: str): """Configure HuggingFace attention settings.""" self._titan_injected_model_args["attn_implementation"] = attn_implementation @@ -217,10 +152,6 @@ def update_from_config(self, job_config: JobConfig): if hasattr(job_config.training, 'seq_len') and job_config.training.seq_len != self.max_seq_len: self.max_seq_len = job_config.training.seq_len - - # MoE - if hasattr(self, "qk_nope_head_dim") and hasattr(self, "qk_rope_head_dim"): - self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim # Configure HF-specific settings to match TorchTitan settings self.attention_bias = False @@ -242,9 +173,4 @@ def update_from_config(self, job_config: JobConfig): return self def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: - is_moe = hasattr(self, "n_routed_experts") - - if is_moe: - return get_moe_model_nparams_and_flops(self, model, seq_len) - else: - return get_dense_model_nparams_and_flops(self, model, seq_len) + return get_dense_model_nparams_and_flops(self, model, seq_len) diff --git a/torchtitan/experiments/transformers_backend/model/model.py b/torchtitan/experiments/transformers_backend/model/model.py index fd7561611e..8041e54f70 100644 --- a/torchtitan/experiments/transformers_backend/model/model.py +++ b/torchtitan/experiments/transformers_backend/model/model.py @@ -45,55 +45,26 @@ def __init__(self, model_args: HFTransformerModelArgs): model_module, 
f"{model_name_prefix}DecoderLayer", None ) - is_moe = hasattr( - model_args, "n_routed_experts" - ) # TODO(3outeille): check if this is the most reliable to detect a moe model - if is_moe: - moe_cls = getattr(model_module, f"{model_name_prefix}MoE", None) - required_classes = { - "Attention": attention_cls, - "MLP": mlp_cls, - "DecoderLayer": decoder_layer_cls, - "MoE": moe_cls, - } - - if all(required_classes.values()): - logger.info(f"Applying MoE-like patch for {model_name_prefix}") - self._patch_hf_moe_like( - decoder_layer_cls=decoder_layer_cls, - attention_cls=attention_cls, - mlp_cls=mlp_cls, - moe_cls=moe_cls, - ) - else: - missing = [ - name for name, cls in required_classes.items() if not cls - ] - logger.warning( - f"Could not find required classes ({', '.join(missing)}) for MoE patching of {model_name_prefix}. " - "Skipping MoE-like patch." - ) + required_classes = { + "Attention": attention_cls, + "DecoderLayer": decoder_layer_cls, + } + + if all(required_classes.values()): + logger.info(f"Applying Llama-like patch for {model_name_prefix}") + self._patch_hf_llama_like( + decoder_layer_cls=decoder_layer_cls, + attention_cls=attention_cls, + mlp_cls=mlp_cls, # mlp_cls can be None + ) else: - required_classes = { - "Attention": attention_cls, - "DecoderLayer": decoder_layer_cls, - } - - if all(required_classes.values()): - logger.info(f"Applying Llama-like patch for {model_name_prefix}") - self._patch_hf_llama_like( - decoder_layer_cls=decoder_layer_cls, - attention_cls=attention_cls, - mlp_cls=mlp_cls, # mlp_cls can be None - ) - else: - missing = [ - name for name, cls in required_classes.items() if not cls - ] - logger.warning( - f"Could not find required classes ({', '.join(missing)}) for {model_name_prefix}. " - "Skipping Llama-like patch." - ) + missing = [ + name for name, cls in required_classes.items() if not cls + ] + logger.warning( + f"Could not find required classes ({', '.join(missing)}) for {model_name_prefix}. " + "Skipping Llama-like patch." + ) except Exception as e: logger.warning( @@ -103,17 +74,10 @@ def __init__(self, model_args: HFTransformerModelArgs): self.model = model_cls(config=model_args) self.max_seq_len = model_args.max_seq_len + self.cp_mesh = None for layer in self.model.model.layers: - if ( - hasattr(model_args, "first_k_dense_replace") - and layer.layer_idx >= model_args.first_k_dense_replace - ): - layer.moe_enabled = True - else: - layer.moe_enabled = False - - self.cp_mesh = None + layer.moe_enabled = False def set_cp_mesh(self, mesh): self.cp_mesh = mesh @@ -284,175 +248,6 @@ def _init_weights_patched(self, module): PreTrainedModel._init_weights = _init_weights_patched PreTrainedModel._initialize_weights = _initialize_weights_patched - def _patch_hf_moe_like(self, decoder_layer_cls, attention_cls, mlp_cls, moe_cls): - """ - This patch modifies a Hugging Face MoE (Mixture-of-Experts) model's weight - initialization to match the initialization scheme used in TorchTitan, - drawing from patterns in models like DeepseekV3. - - The patch targets: - - `PreTrainedModel._initialize_weights`: For correct meta device initialization. - - `PreTrainedModel._init_weights`: To implement TorchTitan's specific initialization - for attention, MLP, MoE, embedding, and layer norm layers. - - `DecoderLayer.__init__`: Adds `layer_idx` to attention, MLP, and MoE expert - modules, required for depth-dependent initialization. 
- """ - - _original_decoder_layer_init = decoder_layer_cls.__init__ - - def _decoder_layer_init_patched(self, config: PretrainedConfig, layer_idx: int): - _original_decoder_layer_init(self, config, layer_idx) - self.layer_idx = layer_idx - - if hasattr(self, "self_attn"): - self.self_attn.layer_idx = layer_idx - - if hasattr(self, "mlp"): - self.mlp.layer_idx = layer_idx - if hasattr(self.mlp, "experts"): - for expert in self.mlp.experts: - expert.layer_idx = layer_idx - if hasattr(self.mlp, "shared_experts"): - # Not all MoE models have shared experts - if self.mlp.shared_experts is not None: - self.mlp.shared_experts.layer_idx = layer_idx - - def _initialize_weights_patched(self, module): - if getattr(module, "_is_hf_initialized", False): - return - for param in module.parameters(recurse=True): - if param.device.type == "meta": - return - self._init_weights(module) - module._is_hf_initialized = True - - def _init_weights_patched(self, module): - """ - Patched version of _init_weights for MoE models. - """ - config = self.config - init_std = None - - if isinstance(module, (attention_cls, mlp_cls, moe_cls)): - if hasattr(module, "layer_idx"): - layer_idx = module.layer_idx - if hasattr(config, "depth_init") and config.depth_init: - init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 - else: - # Fallback for models without depth_init - init_std = 0.02 / (2 * config.num_hidden_layers) ** 0.5 - - if isinstance(module, attention_cls): - # Handle different attention projection layer names by initializing if they exist - if hasattr(module, "q_proj"): - nn.init.trunc_normal_(module.q_proj.weight, mean=0.0, std=0.02) - if hasattr(module, "k_proj"): - nn.init.trunc_normal_(module.k_proj.weight, mean=0.0, std=0.02) - if hasattr(module, "v_proj"): - nn.init.trunc_normal_(module.v_proj.weight, mean=0.0, std=0.02) - - if hasattr(module, "q_a_proj"): - nn.init.trunc_normal_(module.q_a_proj.weight, mean=0.0, std=0.02) - if hasattr(module, "q_b_proj"): - nn.init.trunc_normal_(module.q_b_proj.weight, mean=0.0, std=0.02) - - if hasattr(module, "kv_a_proj_with_mqa"): - nn.init.trunc_normal_( - module.kv_a_proj_with_mqa.weight, mean=0.0, std=0.02 - ) - if hasattr(module, "kv_b_proj"): - nn.init.trunc_normal_(module.kv_b_proj.weight, mean=0.0, std=0.02) - - if hasattr(module, "o_proj") and init_std is not None: - nn.init.trunc_normal_(module.o_proj.weight, mean=0.0, std=init_std) - - elif isinstance(module, mlp_cls): - nn.init.trunc_normal_(module.gate_proj.weight, mean=0.0, std=0.02) - # DeepseekV3 uses std=0.02 for up_proj, unlike Llama - nn.init.trunc_normal_(module.up_proj.weight, mean=0.0, std=0.02) - if init_std is not None: - nn.init.trunc_normal_( - module.down_proj.weight, mean=0.0, std=init_std - ) - - elif isinstance(module, moe_cls): - if hasattr(module, "gate") and init_std is not None: - nn.init.trunc_normal_(module.gate.weight, mean=0.0, std=init_std) - if hasattr(module, "experts"): - for expert in module.experts: - nn.init.trunc_normal_( - expert.gate_proj.weight, mean=0.0, std=0.02 - ) - nn.init.trunc_normal_(expert.up_proj.weight, mean=0.0, std=0.02) - if init_std is not None: - nn.init.trunc_normal_( - expert.down_proj.weight, mean=0.0, std=init_std - ) - if ( - hasattr(module, "shared_experts") - and module.shared_experts is not None - ): - nn.init.trunc_normal_( - module.shared_experts.gate_proj.weight, mean=0.0, std=0.02 - ) - nn.init.trunc_normal_( - module.shared_experts.up_proj.weight, mean=0.0, std=0.02 - ) - if init_std is not None: - nn.init.trunc_normal_( - 
module.shared_experts.down_proj.weight, - mean=0.0, - std=init_std, - ) - - elif module is getattr(self, "lm_head", None): - final_out_std = config.hidden_size**-0.5 - cutoff_factor = 3 - nn.init.trunc_normal_( - module.weight, - mean=0.0, - std=final_out_std, - a=-cutoff_factor * final_out_std, - b=cutoff_factor * final_out_std, - ) - if module.bias is not None: - module.bias.data.zero_() - - elif isinstance(module, nn.Embedding): - # When tie_word_embeddings is True, use lm_head initialization - if ( - hasattr(config, "tie_word_embeddings") - and config.tie_word_embeddings - ): - final_out_std = config.hidden_size**-0.5 - cutoff_factor = 3 - nn.init.trunc_normal_( - module.weight, - mean=0.0, - std=final_out_std, - a=-cutoff_factor * final_out_std, - b=cutoff_factor * final_out_std, - ) - else: - std = config.initializer_range - module.weight.data.normal_(mean=0.0, std=std) - - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - elif ( - "LayerNorm" in module.__class__.__name__ - or "RMSNorm" in module.__class__.__name__ - ): - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(1.0) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.zero_() - - decoder_layer_cls.__init__ = _decoder_layer_init_patched - PreTrainedModel._init_weights = _init_weights_patched - PreTrainedModel._initialize_weights = _initialize_weights_patched - @property def tok_embeddings(self): """Returns the model's embed_tokens, handling different Hugging Face model structures.""" From 980a92b9997a61ded55163f7049a303b779bfd00 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 3 Nov 2025 12:07:36 +0000 Subject: [PATCH 089/129] update experiments README --- torchtitan/experiments/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/torchtitan/experiments/README.md b/torchtitan/experiments/README.md index ad1e3ee79c..5db88af3d8 100644 --- a/torchtitan/experiments/README.md +++ b/torchtitan/experiments/README.md @@ -31,3 +31,4 @@ We provide this `experiments/` folder to host experiments that add significant v | [moe_symm_mem_kernels](./moe_symm_mem_kernels/) | TBA | [@kwen2501](https://github.com/kwen2501) | | [gpt_oss](./gpt_oss/) | TBA | [@jianiw](https://github.com/jianiw) | | [compiler_toolkit](./compiler_tookit/) | TBA | [@SherlockNoMad](https://github.com/SherlockNoMad) [@yiming0416](https://github.com/yiming0416) | +| [transformers_backend](./transformers_backend/) | TBA | [@3outeille](https://github.com/3outeille) | From 06b6f24cb5b24bbb6ff6acb1828bbeee5cc606a4 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 3 Nov 2025 12:22:01 +0000 Subject: [PATCH 090/129] update README to confirm torch.compile support --- torchtitan/experiments/transformers_backend/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchtitan/experiments/transformers_backend/README.md b/torchtitan/experiments/transformers_backend/README.md index ce4d7ff7c8..be819e223a 100644 --- a/torchtitan/experiments/transformers_backend/README.md +++ b/torchtitan/experiments/transformers_backend/README.md @@ -23,7 +23,7 @@ hf_assets_path = "./tests/assets/tokenizer" ## Supported Features - The following models were tested: - - Dense (FSDP/CP/TP/PP) + - Dense (FSDP/CP/TP/PP/`torch.compile`) - `meta-llama/Llama-3.2-1B` - `microsoft/phi-2` - `Qwen/Qwen2.5-7B` From a70c4c4e36310fbd90cf5095ceee4f83cbb31742 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 4 Nov 2025 10:07:20 +0000 Subject: [PATCH 091/129] custom job_config --- 
torchtitan/config/job_config.py | 7 ------- torchtitan/experiments/transformers_backend/README.md | 3 ++- .../infra/parallelize_hf_transformers.py | 3 ++- .../transformers_backend/infra/pipeline_hf.py | 2 +- .../experiments/transformers_backend/job_config.py | 10 ++++++++++ .../experiments/transformers_backend/model/args.py | 2 +- 6 files changed, 16 insertions(+), 11 deletions(-) create mode 100644 torchtitan/experiments/transformers_backend/job_config.py diff --git a/torchtitan/config/job_config.py b/torchtitan/config/job_config.py index ee89d13627..7fe6802374 100644 --- a/torchtitan/config/job_config.py +++ b/torchtitan/config/job_config.py @@ -131,12 +131,6 @@ class Model: """ -@dataclass -class HFTransformers: - model: str = "" - """HuggingFace model ID (e.g., 'Qwen/Qwen3-4B-Instruct-2507')""" - - @dataclass class Optimizer: name: str = "AdamW" @@ -903,7 +897,6 @@ class JobConfig: profiling: Profiling = field(default_factory=Profiling) metrics: Metrics = field(default_factory=Metrics) model: Model = field(default_factory=Model) - hf_transformers: HFTransformers = field(default_factory=HFTransformers) optimizer: Optimizer = field(default_factory=Optimizer) lr_scheduler: LRScheduler = field(default_factory=LRScheduler) training: Training = field(default_factory=Training) diff --git a/torchtitan/experiments/transformers_backend/README.md b/torchtitan/experiments/transformers_backend/README.md index be819e223a..8fbd19f0e8 100644 --- a/torchtitan/experiments/transformers_backend/README.md +++ b/torchtitan/experiments/transformers_backend/README.md @@ -17,7 +17,8 @@ hf_assets_path = "./tests/assets/tokenizer" +model = "Qwen/Qwen3-4B-Instruct-2507" ... ``` -- Train: `LOG_RANK=7 CONFIG_FILE= ## Supported Features diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py index d1d8d4c480..27730a5914 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -24,7 +24,8 @@ RowwiseParallel, SequenceParallel, ) -from torchtitan.config import JobConfig, TORCH_DTYPE_MAP +from torchtitan.experiments.transformers_backend.job_config import JobConfig +from torchtitan.config import TORCH_DTYPE_MAP from torchtitan.config.job_config import ActivationCheckpoint as ACConfig from torchtitan.distributed import NoParallel, ParallelDims diff --git a/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py b/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py index ee7b268f9d..088cc05642 100644 --- a/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py +++ b/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py @@ -19,7 +19,7 @@ ) from torchtitan.components.loss import LossFunction -from torchtitan.config import JobConfig +from torchtitan.experiments.transformers_backend.job_config import JobConfig from torchtitan.distributed import ParallelDims from torchtitan.distributed.pipeline_parallel import build_pipeline_schedule from torchtitan.protocols.train_spec import BaseModelArgs, ParallelizeFunction diff --git a/torchtitan/experiments/transformers_backend/job_config.py b/torchtitan/experiments/transformers_backend/job_config.py new file mode 100644 index 0000000000..6344529d20 --- /dev/null +++ b/torchtitan/experiments/transformers_backend/job_config.py @@ -0,0 +1,10 @@ +from dataclasses import dataclass, field + +@dataclass 
+class HFTransformers: + model: str = "" + """HuggingFace model ID (e.g., 'Qwen/Qwen3-4B-Instruct-2507')""" + +@dataclass +class JobConfig: + hf_transformers: HFTransformers = field(default_factory=HFTransformers) \ No newline at end of file diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index 2e90eea854..668fa48aeb 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -7,7 +7,7 @@ from dataclasses import dataclass from torch import nn -from torchtitan.config import JobConfig +from torchtitan.experiments.transformers_backend.job_config import JobConfig from torchtitan.models.utils import get_dense_model_nparams_and_flops from torchtitan.protocols import BaseModelArgs from transformers import AutoConfig From 42884cda72e5ffab1f7e216ac6b789a93f353e36 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 4 Nov 2025 10:18:27 +0000 Subject: [PATCH 092/129] remove unecessary change in train_spec --- torchtitan/protocols/train_spec.py | 1 - 1 file changed, 1 deletion(-) diff --git a/torchtitan/protocols/train_spec.py b/torchtitan/protocols/train_spec.py index 1f7899e965..22bfa7df9b 100644 --- a/torchtitan/protocols/train_spec.py +++ b/torchtitan/protocols/train_spec.py @@ -51,7 +51,6 @@ class TrainSpec: build_dataloader_fn: DataLoaderBuilder build_tokenizer_fn: TokenizerBuilder | None build_loss_fn: LossFunctionBuilder - name: str | None = None build_validator_fn: ValidatorBuilder | None = None build_metrics_processor_fn: MetricsProcessorBuilder | None = None state_dict_adapter: type[BaseStateDictAdapter] | None = None From 4fa0874f4ea4ab79735f71760695be745bf1f247 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 4 Nov 2025 10:26:41 +0000 Subject: [PATCH 093/129] rename file to comply with torchtitan style --- torchtitan/experiments/transformers_backend/__init__.py | 4 ++-- .../infra/{parallelize_hf_transformers.py => parallelize.py} | 0 .../infra/{pipeline_hf.py => pipeline.py} | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename torchtitan/experiments/transformers_backend/infra/{parallelize_hf_transformers.py => parallelize.py} (100%) rename torchtitan/experiments/transformers_backend/infra/{pipeline_hf.py => pipeline.py} (100%) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index c4343b8cb7..b72b77760c 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -12,9 +12,9 @@ from torchtitan.hf_datasets.text_datasets import build_text_dataloader from torchtitan.protocols.train_spec import TrainSpec -from .infra.parallelize_hf_transformers import parallelize_hf_transformers +from .infra.parallelize import parallelize_hf_transformers -from .infra.pipeline_hf import pipeline_hf_transformers +from .infra.pipeline import pipeline_hf_transformers from .model.args import HFTransformerModelArgs from .model.model import HFTransformerModel diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize.py similarity index 100% rename from torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py rename to torchtitan/experiments/transformers_backend/infra/parallelize.py diff --git a/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py 
b/torchtitan/experiments/transformers_backend/infra/pipeline.py similarity index 100% rename from torchtitan/experiments/transformers_backend/infra/pipeline_hf.py rename to torchtitan/experiments/transformers_backend/infra/pipeline.py From 8ffa7f4dc731b5c1e29c7c650540ede0b4cd456f Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 4 Nov 2025 10:37:53 +0000 Subject: [PATCH 094/129] reuse ac form torchtitan --- .../transformers_backend/infra/parallelize.py | 112 +----------------- 1 file changed, 1 insertion(+), 111 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize.py b/torchtitan/experiments/transformers_backend/infra/parallelize.py index 27730a5914..27ff2718be 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize.py @@ -37,117 +37,7 @@ from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp from torchtitan.tools.logging import logger -# for selective op activation checkpointing -_save_list = { - torch.ops.aten.mm.default, - torch.ops.aten._scaled_dot_product_efficient_attention.default, - torch.ops.aten._scaled_dot_product_flash_attention.default, - torch._higher_order_ops.flex_attention, - torch.ops._c10d_functional.reduce_scatter_tensor.default, - # for low precision training, it's useful to always save - # the result of max, since the absolute maximum is - # used to compute the scaling factor for quantization. - torch.ops.aten.max.default, -} - - -def _apply_ac_to_transformer_block( - module: nn.Module, ac_config: ACConfig, *, base_fqn: Optional[str] = None -): - valid_ac_modes = ("full", "selective") - if ac_config.mode not in valid_ac_modes: - raise ValueError( - f"Invalid AC mode: {ac_config.mode}. Valid modes: {valid_ac_modes}" - ) - - if ac_config.mode == "full": - return ptd_checkpoint_wrapper(module, preserve_rng_state=False) - - assert ac_config.mode == "selective", f"{ac_config.mode}" - use_op_sac = ac_config.selective_ac_option == "op" - use_layer_sac = ac_config.selective_ac_option.isdigit() - if not use_op_sac and not use_layer_sac: - raise ValueError( - f"Invalid selective AC option: {ac_config.selective_ac_option}. 
" - f"Valid options: 'op' or a positive int representing layer frequency" - ) - if use_op_sac: - from torch.utils.checkpoint import ( - CheckpointPolicy, - create_selective_checkpoint_contexts, - ) - - mm_recompute_shapes = set() - if len(ac_config.per_op_sac_force_recompute_mm_shapes_by_fqns) > 0: - for module_fqn, submod in module.named_modules(): - fqn = module_fqn - if base_fqn is not None: - fqn = f"{base_fqn}.{module_fqn}" - if not any( - filter_fqn in fqn - for filter_fqn in ac_config.per_op_sac_force_recompute_mm_shapes_by_fqns - ): - continue - if not isinstance(submod, nn.Linear): - raise ValueError( - "per_op_sac_force_recompute_mm_shapes_by_fqns expected to match " - f"a nn.Linear, but got: {submod}" - ) - out_f, in_f = submod.weight.shape - mm_recompute_shapes.add((in_f, out_f)) - logger.debug( - f"Selective op AC force recomputing mms with rhs shapes {mm_recompute_shapes}" - ) - - def _get_custom_policy(meta): - def _custom_policy(ctx, func, *args, **kwargs): - mode = "recompute" if ctx.is_recompute else "forward" - mm_count_key = f"{mode}_mm_count" - if func == torch.ops.aten.mm.default: - if args[1].shape in mm_recompute_shapes: - return CheckpointPolicy.PREFER_RECOMPUTE - meta[mm_count_key] += 1 - # Saves output of all compute ops, except every second mm - to_save = func in _save_list and not ( - func == torch.ops.aten.mm.default and meta[mm_count_key] % 2 == 0 - ) - return ( - CheckpointPolicy.MUST_SAVE - if to_save - else CheckpointPolicy.PREFER_RECOMPUTE - ) - - return _custom_policy - - def selective_checkpointing_context_fn(): - meta = defaultdict(int) - return create_selective_checkpoint_contexts(_get_custom_policy(meta)) - - return ptd_checkpoint_wrapper( - module, - context_fn=selective_checkpointing_context_fn, - preserve_rng_state=False, - ) - elif use_layer_sac: - # Checkpoint every `ac_freq` of the modules passed to this function - ac_freq = int(ac_config.selective_ac_option) - ptd_checkpoint_wrapper.__dict__.setdefault("_count", 0) - ptd_checkpoint_wrapper._count += 1 - if not ac_freq or ptd_checkpoint_wrapper._count % ac_freq == 0: - return ptd_checkpoint_wrapper(module, preserve_rng_state=False) - else: - return module - - -def apply_ac(model: nn.Module, ac_config: ACConfig): - """Apply activation checkpointing to the model.""" - for layer_id, transformer_block in model.layers.named_children(): - transformer_block = _apply_ac_to_transformer_block( - transformer_block, ac_config, base_fqn=f"layers.{layer_id}" - ) - model.layers.register_module(layer_id, transformer_block) - - logger.info(f"Applied {ac_config.mode} activation checkpointing to the model") +from torchtitan.distributed.activation_checkpoint import apply_ac def apply_ddp( From ff21c2be00de5c5a9134baca6dd11bf3df5b6322 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 4 Nov 2025 10:38:58 +0000 Subject: [PATCH 095/129] reuse ddp from torchtitan --- .../transformers_backend/infra/parallelize.py | 20 +------------------ 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize.py b/torchtitan/experiments/transformers_backend/infra/parallelize.py index 27ff2718be..5a8cf94791 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize.py @@ -38,25 +38,7 @@ from torchtitan.tools.logging import logger from torchtitan.distributed.activation_checkpoint import apply_ac - - -def apply_ddp( - model: nn.Module, - dp_mesh: DeviceMesh, - enable_compile: bool, - 
enable_compiled_autograd: bool, -): - if enable_compile: - if enable_compiled_autograd: - torch._dynamo.config.optimize_ddp = ( - "python_reducer_without_compiled_forward" - ) - else: - torch._dynamo.config.optimize_ddp = "ddp_optimizer" - - replicate(model, device_mesh=dp_mesh, bucket_cap_mb=100) - - logger.info("Applied DDP to the model") +from torchtitan.models.llama3.infra.parallelize import apply_ddp def parallelize_hf_transformers( From 0a43a8a96a69bd77bff758b0d29e19166ecc5080 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 4 Nov 2025 10:40:13 +0000 Subject: [PATCH 096/129] reuse compile from torchtitan llama3 --- .../transformers_backend/infra/parallelize.py | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize.py b/torchtitan/experiments/transformers_backend/infra/parallelize.py index 5a8cf94791..2aca64fdcf 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize.py @@ -39,6 +39,7 @@ from torchtitan.distributed.activation_checkpoint import apply_ac from torchtitan.models.llama3.infra.parallelize import apply_ddp +from torchtitan.models.llama3.infra.parallelize import apply_compile def parallelize_hf_transformers( @@ -564,20 +565,4 @@ def apply_moe_ep_tp( module=moe_block.experts, device_mesh=experts_mesh, parallelize_plan=experts_plan, - ) - - -def apply_compile(model: nn.Module): - """ - Apply torch.compile to each TransformerBlock, which makes compilation efficient due to - repeated structure. Alternatively one can compile the whole model (after applying DP). - """ - for layer_id, transformer_block in model.layers.named_children(): - # TODO: remove when torch.compile supports fullgraph=True for MoE - fullgraph = True - if transformer_block.moe_enabled: - fullgraph = False - transformer_block = torch.compile(transformer_block, fullgraph=fullgraph) - model.layers.register_module(layer_id, transformer_block) - - logger.info("Compiling each TransformerBlock with torch.compile") + ) \ No newline at end of file From 8026bc7898e8880a14e8dfa0392fa62b569d7633 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 4 Nov 2025 10:49:53 +0000 Subject: [PATCH 097/129] reuse compile from torchtitan --- .../transformers_backend/infra/parallelize.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize.py b/torchtitan/experiments/transformers_backend/infra/parallelize.py index 2aca64fdcf..4eac61b74e 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize.py @@ -4,15 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
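These last few patches replace the local copies of the activation-checkpointing, DDP, and compile helpers with the shared torchtitan implementations. A simplified sketch of how the parallelize entry point ends up calling them once this series of changes lands (tensor parallelism and the FSDP/HSDP branch are omitted for brevity; argument names follow the shared helpers as used in these diffs):

```python
from torchtitan.distributed.activation_checkpoint import apply_ac
from torchtitan.models.llama3.infra.parallelize import apply_compile, apply_ddp


def parallelize_sketch(model, world_mesh, parallel_dims, job_config):
    # AC wrapping first, then per-block compile, then data parallelism,
    # mirroring the order used in parallelize_hf_transformers.
    if job_config.activation_checkpoint.mode != "none":
        apply_ac(model, job_config.activation_checkpoint)

    model_compile_enabled = (
        job_config.compile.enable and "model" in job_config.compile.components
    )
    if model_compile_enabled:
        apply_compile(model, job_config.compile)

    # DDP path only; FSDP/HSDP handling is elided in this sketch.
    if parallel_dims.dp_replicate_enabled and not parallel_dims.fsdp_enabled:
        apply_ddp(model, world_mesh, enable_compile=model_compile_enabled)

    return model
```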
-from collections import defaultdict -from typing import Optional - import torch import torch.nn as nn -from torch.distributed._composable.replicate import replicate -from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( - checkpoint_wrapper as ptd_checkpoint_wrapper, -) from torch.distributed.device_mesh import DeviceMesh from torch.distributed.fsdp import CPUOffloadPolicy, fully_shard, MixedPrecisionPolicy from torch.distributed.tensor import Partial, Replicate, Shard @@ -26,7 +19,6 @@ ) from torchtitan.experiments.transformers_backend.job_config import JobConfig from torchtitan.config import TORCH_DTYPE_MAP -from torchtitan.config.job_config import ActivationCheckpoint as ACConfig from torchtitan.distributed import NoParallel, ParallelDims from torchtitan.distributed.expert_parallel import ( @@ -113,7 +105,7 @@ def parallelize_hf_transformers( if model_compile_enabled: # NOTE: needed for torch.compile to work with dynamic shapes in token-choice MoE torch._dynamo.config.capture_scalar_outputs = True - apply_compile(model) + apply_compile(model, job_config.compile) dp_mesh: DeviceMesh | None = None if parallel_dims.fsdp_enabled or parallel_dims.ep_enabled: From cd4042fa6cac6c1b31ff23fc1885c45f8261899e Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 4 Nov 2025 11:26:27 +0000 Subject: [PATCH 098/129] update parallelize with main --- .../transformers_backend/infra/parallelize.py | 49 +++---------------- 1 file changed, 6 insertions(+), 43 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize.py b/torchtitan/experiments/transformers_backend/infra/parallelize.py index 4eac61b74e..276cf94bcd 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize.py @@ -57,9 +57,6 @@ def parallelize_hf_transformers( ({parallel_dims.tp}) and 2 * CP degree ({parallel_dims.cp}). 
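The divisibility requirement stated just above is easy to validate up front. A tiny, hypothetical pre-flight check (not part of this module) that mirrors it:

```python
def check_seq_len(seq_len: int, tp_degree: int, cp_degree: int) -> None:
    # Mirrors the docstring: seq_len must divide evenly by TP and by 2 * CP.
    if tp_degree > 1 and seq_len % tp_degree != 0:
        raise ValueError(f"seq_len={seq_len} is not divisible by TP degree {tp_degree}")
    if cp_degree > 1 and seq_len % (2 * cp_degree) != 0:
        raise ValueError(f"seq_len={seq_len} is not divisible by 2 * CP degree ({2 * cp_degree})")


check_seq_len(2048, tp_degree=2, cp_degree=2)  # passes: 2048 % 2 == 0 and 2048 % 4 == 0
```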
""" - if job_config.parallelism.context_parallel_degree > 1: - logger.warning("CP support for FlexAttention is still in progress.") - if parallel_dims.tp_enabled: enable_float8_linear = "float8" in job_config.model.converters float8_is_rowwise = job_config.quantize.linear.float8.recipe_name in ( @@ -80,64 +77,32 @@ def parallelize_hf_transformers( ) maybe_enable_async_tp(job_config, world_mesh["tp"]) - if parallel_dims.tp_enabled or parallel_dims.ep_enabled: - apply_moe_ep_tp( - model, - tp_mesh=world_mesh["tp"] if parallel_dims.tp_enabled else None, - ep_mesh=world_mesh["ep"] if parallel_dims.ep_enabled else None, - ep_tp_mesh=( - world_mesh["ep", "tp"] - if parallel_dims.tp_enabled - and parallel_dims.ep_enabled - and parallel_dims.etp_enabled - else None - ), - etp_enabled=parallel_dims.etp_enabled, - ) + model_compile_enabled = ( + job_config.compile.enable and "model" in job_config.compile.components + ) if job_config.activation_checkpoint.mode != "none": apply_ac(model, job_config.activation_checkpoint) - model_compile_enabled = ( - job_config.compile.enable and "model" in job_config.compile.components - ) # turn on per-TransformerBlock compile after AC wrapping and before FSDP if model_compile_enabled: - # NOTE: needed for torch.compile to work with dynamic shapes in token-choice MoE - torch._dynamo.config.capture_scalar_outputs = True apply_compile(model, job_config.compile) - dp_mesh: DeviceMesh | None = None - if parallel_dims.fsdp_enabled or parallel_dims.ep_enabled: + if parallel_dims.fsdp_enabled: # apply FSDP or HSDP, potentially with Context Parallel if parallel_dims.dp_replicate_enabled: dp_mesh_dim_names = ("dp_replicate", "dp_shard_cp") else: dp_mesh_dim_names = ("dp_shard_cp",) - dp_mesh = world_mesh[tuple(dp_mesh_dim_names)] - - # the mesh dim names of which the MoE params are sharded on via FSDP/HSDP - dp_mod_ep_mesh_dim_names = [] - if parallel_dims.ep_enabled: - if parallel_dims.dp_replicate_enabled: - dp_mod_ep_mesh_dim_names.append("dp_replicate") - dp_mod_ep_mesh_dim_names.append("dp_shard_mod_ep") apply_fsdp( model, - dp_mesh, + world_mesh[tuple(dp_mesh_dim_names)], param_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_param], reduce_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_reduce], pp_enabled=parallel_dims.pp_enabled, cpu_offload=job_config.training.enable_cpu_offload, reshard_after_forward_policy=job_config.parallelism.fsdp_reshard_after_forward, - ep_degree=parallel_dims.ep, - dp_mod_ep_mesh=( - world_mesh[tuple(dp_mod_ep_mesh_dim_names)] - if parallel_dims.ep_enabled - else None - ), - gradient_divide_factor=parallel_dims.fsdp_gradient_divide_factor, ) if parallel_dims.dp_replicate_enabled: @@ -154,12 +119,10 @@ def parallelize_hf_transformers( elif parallel_dims.dp_replicate_enabled: if world_mesh.ndim > 1: raise RuntimeError("DDP has not supported > 1D parallelism") - dp_mesh = world_mesh apply_ddp( model, - dp_mesh, + world_mesh, enable_compile=model_compile_enabled, - enable_compiled_autograd=job_config.parallelism.enable_compiled_autograd, ) return model From 0700bdbe46ef83b31dac0d3315e9433dc87f2702 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 4 Nov 2025 12:16:22 +0000 Subject: [PATCH 099/129] remove moe ep tp for now --- .../transformers_backend/infra/parallelize.py | 84 ------------------- 1 file changed, 84 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize.py b/torchtitan/experiments/transformers_backend/infra/parallelize.py index 276cf94bcd..87099f883a 100644 --- 
a/torchtitan/experiments/transformers_backend/infra/parallelize.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize.py @@ -437,87 +437,3 @@ def apply_fsdp( ) elif model.tok_embeddings is not None: transformer_block.set_modules_to_backward_prefetch([model.tok_embeddings]) - - -def apply_moe_ep_tp( - model: nn.Module, - tp_mesh: DeviceMesh | None, - ep_mesh: DeviceMesh | None, - ep_tp_mesh: DeviceMesh | None, - etp_enabled: bool, -): - for transformer_block in model.layers: - if not transformer_block.moe_enabled: - continue - - moe_block = transformer_block.mlp - if tp_mesh is not None: - moe_layer_plan = { - # input / output sharding on the seqlen dim - # all-gather for input, reduce-scatter for output - "mlp": PrepareModuleInputOutput( - input_layouts=(Shard(1),), - desired_input_layouts=(Replicate(),), - use_local_input=True, - output_layouts=(Partial(),), - desired_output_layouts=(Shard(1),), - ), - # replicate computation for the router - "mlp.gate": NoParallel(), - } - if ep_mesh is not None and not etp_enabled: - # If TP is borrowed for EP, then split the tokens across TP ranks so that - # the reorderer, the all-to-all comms, and routed experts computation - # are effectively running Sequence Parallel (split along the folded bs*slen dim) - moe_layer_plan.update({"mlp.reorderer": ReordererSequenceParallel()}) - if moe_block.shared_experts is not None: - # input Replicate, output Partial - moe_layer_plan.update( - { - "mlp.shared_experts.gate_proj": ColwiseParallel(), - "mlp.shared_experts.up_proj": ColwiseParallel(), - "mlp.shared_experts.down_proj": RowwiseParallel( - output_layouts=Partial() - ), - } - ) - parallelize_module( - module=transformer_block, - device_mesh=tp_mesh, - parallelize_plan=moe_layer_plan, - ) - - if ep_mesh is None: # This is the TP-only case for experts - experts_mesh = tp_mesh - expert_tp_plan = {} - for i in range(len(moe_block.experts)): - expert_tp_plan.update( - { - f"{i}.gate_proj": ColwiseParallel(), - f"{i}.up_proj": ColwiseParallel(), - f"{i}.down_proj": RowwiseParallel(output_layouts=Partial()), - } - ) - parallelize_module( - module=moe_block.experts, - device_mesh=experts_mesh, - parallelize_plan=expert_tp_plan, - ) - else: # EP or ETP enabled - experts_mesh, experts_plan = None, None - if tp_mesh is None: - experts_mesh = ep_mesh - # input / output sharding on the batch / tokens dim - experts_plan = ExpertParallel() - elif etp_enabled: - experts_mesh = ep_tp_mesh - experts_plan = ExpertTensorParallel(tp_mesh=tp_mesh, ep_mesh=ep_mesh) - else: - experts_mesh = ep_mesh - experts_plan = ExpertParallel() - - parallelize_module( - module=moe_block.experts, - device_mesh=experts_mesh, - parallelize_plan=experts_plan, - ) \ No newline at end of file From 767f71d610d16e25bdd0f6498cb5a4b2c683bc8f Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 5 Nov 2025 12:11:11 +0000 Subject: [PATCH 100/129] fix SequenceParallel for q and k norm --- .../transformers_backend/infra/parallelize.py | 20 +++++-------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize.py b/torchtitan/experiments/transformers_backend/infra/parallelize.py index 87099f883a..db78f7ea24 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize.py @@ -8,12 +8,11 @@ import torch.nn as nn from torch.distributed.device_mesh import DeviceMesh from torch.distributed.fsdp import CPUOffloadPolicy, fully_shard, 
MixedPrecisionPolicy -from torch.distributed.tensor import Partial, Replicate, Shard +from torch.distributed.tensor import Replicate, Shard from torch.distributed.tensor.parallel import ( ColwiseParallel, parallelize_module, PrepareModuleInput, - PrepareModuleInputOutput, RowwiseParallel, SequenceParallel, ) @@ -21,11 +20,6 @@ from torchtitan.config import TORCH_DTYPE_MAP from torchtitan.distributed import NoParallel, ParallelDims -from torchtitan.distributed.expert_parallel import ( - ExpertParallel, - ExpertTensorParallel, - ReordererSequenceParallel, -) from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp from torchtitan.tools.logging import logger @@ -33,7 +27,6 @@ from torchtitan.models.llama3.infra.parallelize import apply_ddp from torchtitan.models.llama3.infra.parallelize import apply_compile - def parallelize_hf_transformers( model: nn.Module, parallel_dims: ParallelDims, @@ -230,13 +223,10 @@ def apply_non_moe_tp( layer_plan[f"self_attn.{o_proj_name}"] = rowwise_parallel( output_layouts=Shard(1) ) - - # For Qwen3 RMSNorm on Q and K - # TODO(3outeille): we should probably shard(1) then replicate => then use SequenceParallel but for now I am fed up - if hasattr(transformer_block.self_attn, "q_norm"): - layer_plan["self_attn.q_norm"] = NoParallel() - if hasattr(transformer_block.self_attn, "k_norm"): - layer_plan["self_attn.k_norm"] = NoParallel() + #For model that uses RMSNorm on Q and K (i.e. Qwen3) + if hasattr(transformer_block.self_attn, "q_norm") and hasattr(transformer_block.self_attn, "k_norm"): + layer_plan["self_attn.q_norm"] = SequenceParallel(sequence_dim=2, use_local_output=True) + layer_plan["self_attn.k_norm"] = SequenceParallel(sequence_dim=2, use_local_output=True) if not transformer_block.moe_enabled: mlp_plan = { From 7f71f885eb76322d7ea03955a6e619be0e23921c Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 5 Nov 2025 12:24:29 +0000 Subject: [PATCH 101/129] job_config.training will always have seq_len --- torchtitan/experiments/transformers_backend/model/args.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index 668fa48aeb..4c9ffcae72 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -150,8 +150,7 @@ def update_from_config(self, job_config: JobConfig): if hasattr(self, key) and value is not None: setattr(self, key, value) - if hasattr(job_config.training, 'seq_len') and job_config.training.seq_len != self.max_seq_len: - self.max_seq_len = job_config.training.seq_len + self.max_seq_len = job_config.training.seq_len # Configure HF-specific settings to match TorchTitan settings self.attention_bias = False From 7e63a82541b226fa275716522e72371c412610ba Mon Sep 17 00:00:00 2001 From: 3outeille Date: Fri, 7 Nov 2025 16:06:44 +0000 Subject: [PATCH 102/129] fix loading weights in PP by using Module Dict --- .../transformers_backend/model/model.py | 34 ++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/torchtitan/experiments/transformers_backend/model/model.py b/torchtitan/experiments/transformers_backend/model/model.py index 8041e54f70..e0d5628f1a 100644 --- a/torchtitan/experiments/transformers_backend/model/model.py +++ b/torchtitan/experiments/transformers_backend/model/model.py @@ -16,6 +16,30 @@ from .args import HFTransformerModelArgs +class SlicableModuleDict(nn.ModuleDict): + """ + A 
ModuleDict that supports slicing like ModuleList. + Keys are expected to be string representations of integers (e.g., "0", "1", "2"). + """ + + def __getitem__(self, key): + if isinstance(key, slice): + # Handle slicing: convert slice to list of keys + keys = sorted(self.keys(), key=lambda x: int(x) if x.isdigit() else float('inf')) + sliced_keys = keys[key] + # Return a new SlicableModuleDict with the sliced items + return SlicableModuleDict({k: self[k] for k in sliced_keys}) + return super().__getitem__(key) + + def __iter__(self): + # Iterate over values in sorted order by key (as integers) + keys = sorted(self.keys(), key=lambda x: int(x) if x.isdigit() else float('inf')) + for key in keys: + yield self[key] + + def __len__(self): + return len(self._modules) + class HFTransformerModel(nn.Module): def __init__(self, model_args: HFTransformerModelArgs): @@ -76,7 +100,15 @@ def __init__(self, model_args: HFTransformerModelArgs): self.max_seq_len = model_args.max_seq_len self.cp_mesh = None - for layer in self.model.model.layers: + # Convert ModuleList to ModuleDict to preserve original indices + # This ensures state dict keys match checkpoint keys + if isinstance(self.model.model.layers, nn.ModuleList): + self.model.model.layers = SlicableModuleDict({ + str(i): layer + for i, layer in enumerate(self.model.model.layers) + }) + + for layer in self.model.model.layers.values(): layer.moe_enabled = False def set_cp_mesh(self, mesh): From 04fb8eb9c1ab022e8cc9d75a0363c66689c71b89 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 13 Nov 2025 09:21:54 +0000 Subject: [PATCH 103/129] clean reference qwen config --- .../transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml b/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml index d1433bb7ed..13e3f4ddf0 100644 --- a/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml +++ b/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml @@ -6,7 +6,7 @@ description = "Qwen 3 debug training" print_config = true [profiling] -enable_profiling = true +enable_profiling = false save_traces_folder = "profile_trace" profile_freq = 5 enable_memory_snapshot = false @@ -41,7 +41,6 @@ decay_type = "linear" min_lr_factor = 0.0 [training] -global_batch_size = 4 local_batch_size = 2 seq_len = 2048 max_norm = 1.0 # grad norm clipping From 0d80f62c64f3cfe2dbe7d7441cd8cceee464bb67 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 13 Nov 2025 09:27:03 +0000 Subject: [PATCH 104/129] error out if no layer_idx --- torchtitan/experiments/transformers_backend/model/model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torchtitan/experiments/transformers_backend/model/model.py b/torchtitan/experiments/transformers_backend/model/model.py index e0d5628f1a..bb50fd466c 100644 --- a/torchtitan/experiments/transformers_backend/model/model.py +++ b/torchtitan/experiments/transformers_backend/model/model.py @@ -171,7 +171,8 @@ def _init_weights_patched(self, module): if isinstance(module, layer_idx_classes): if not hasattr(module, "layer_idx"): - return + raise ValueError(f"Module {module} does not have a layer_idx attribute") + layer_idx = module.layer_idx if hasattr(config, "depth_init") and config.depth_init: From 09f0c94790a5817eb9c2f5d40f5d11236f7c79b9 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 13 Nov 2025 10:06:01 +0000 Subject: [PATCH 
105/129] reuse pipeline from torchtitan --- torchtitan/distributed/pipeline_parallel.py | 17 +- .../transformers_backend/infra/pipeline.py | 270 +----------------- 2 files changed, 25 insertions(+), 262 deletions(-) diff --git a/torchtitan/distributed/pipeline_parallel.py b/torchtitan/distributed/pipeline_parallel.py index 06dba40d6f..0c0eb89dcc 100644 --- a/torchtitan/distributed/pipeline_parallel.py +++ b/torchtitan/distributed/pipeline_parallel.py @@ -228,6 +228,7 @@ def generate_llm_fqn_per_model_part( num_layers: int, input_weight: int = 1, output_weight: int = 1, + include_rotary_emb: bool = False, ) -> list[list[str]]: """ Programmatically generates module names model part, focused on LLMs models. @@ -237,6 +238,7 @@ def generate_llm_fqn_per_model_part( num_layers: Total number of transformer layers in the model input_weight: Weight for input modules (tok_embeddings) in layer calculation output_weight: Weight for output modules (norm + output) in layer calculation + include_rotary_emb: Whether to include rotary_emb in each model part Returns: List of lists containing module names for each model part @@ -251,7 +253,10 @@ def generate_llm_fqn_per_model_part( if num_stages == 1: # Single stage gets everything layer_names = [f"layers.{i}" for i in range(num_layers)] - return [["tok_embeddings"] + layer_names + ["norm", "output"]] + result = [["tok_embeddings"] + layer_names + ["norm", "output"]] + if include_rotary_emb: + result[0].append("rotary_emb") + return result # Calculate effective layers including weights num_effective_layers = num_layers + input_weight + output_weight @@ -329,6 +334,8 @@ def generate_llm_fqn_per_model_part( stage_modules.append(f"layers.{current_layer}") current_layer += 1 + if include_rotary_emb: + stage_modules.append("rotary_emb") module_names_per_stage.append(stage_modules) return module_names_per_stage @@ -340,6 +347,7 @@ def pipeline_module_split( pp_schedule: str, device: torch.device, module_names_per_stage: list[list[str]], + use_identity_for_missing_modules: bool = False, ) -> tuple[list[PipelineStage], list[nn.Module]]: """ This API creates pipeline stages based on specified module names for each stage. @@ -361,6 +369,8 @@ def pipeline_module_split( - "layers.0", "layers.1" for specific transformer layers - "norm" for the final normalization layer - "output" for the output projection layer + use_identity_for_missing_modules: If True, replace missing modules with nn.Identity(), + otherwise replace with None Returns: Tuple of (stages, models) where stages are PipelineStage objects and models are the @@ -417,8 +427,9 @@ def _build_stage_from_modules( setattr(model, module_name, nn.ModuleList()) # Handle simple module attributes (e.g., "linear", "norm") elif module_name not in modules_to_keep: - # Replace with None - setattr(model, module_name, None) + # Replace with Identity or None based on configuration + replacement = nn.Identity() if use_identity_for_missing_modules else None + setattr(model, module_name, replacement) stage = PipelineStage( model, diff --git a/torchtitan/experiments/transformers_backend/infra/pipeline.py b/torchtitan/experiments/transformers_backend/infra/pipeline.py index 088cc05642..bfb876e911 100644 --- a/torchtitan/experiments/transformers_backend/infra/pipeline.py +++ b/torchtitan/experiments/transformers_backend/infra/pipeline.py @@ -3,280 +3,27 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
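A minimal usage sketch of the extended helper, assuming a torchtitan checkout with this series applied; the exact per-stage lists depend on the chosen weights, so the comments are indicative rather than authoritative:

```python
from torchtitan.distributed.pipeline_parallel import generate_llm_fqn_per_model_part

# Split an 8-layer LLM over 4 virtual stages; with include_rotary_emb=True every
# stage also keeps "rotary_emb", which HF models hold on the top-level module.
stage_fqns = generate_llm_fqn_per_model_part(
    num_stages=4,
    num_layers=8,
    input_weight=1,
    output_weight=1,
    include_rotary_emb=True,
)
for stage_idx, fqns in enumerate(stage_fqns):
    print(stage_idx, fqns)
# Expected shape of the result: stage 0 starts with "tok_embeddings", the last
# stage contains "norm" and "output", and every stage list ends with "rotary_emb".
```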
-import copy import math import torch import torch.nn as nn -from torch.distributed.device_mesh import DeviceMesh -from torch.distributed.pipelining import PipelineStage from torch.distributed.pipelining.schedules import ( _PipelineSchedule, get_schedule_class, PipelineScheduleSingle, - ScheduleDualPipeV, - ScheduleZBVZeroBubble, ) from torchtitan.components.loss import LossFunction from torchtitan.experiments.transformers_backend.job_config import JobConfig from torchtitan.distributed import ParallelDims -from torchtitan.distributed.pipeline_parallel import build_pipeline_schedule +from torchtitan.distributed.pipeline_parallel import ( + build_pipeline_schedule, + generate_llm_fqn_per_model_part, + pipeline_module_split, +) from torchtitan.protocols.train_spec import BaseModelArgs, ParallelizeFunction from torchtitan.tools.logging import logger -# NOTE(3outeille): the only modifications comes from replacing None to nn.Identity and adding rotary_emb per model_part - - -def generate_llm_fqn_per_model_part( - num_stages: int, - num_layers: int, - input_weight: int = 1, - output_weight: int = 1, -) -> list[list[str]]: - """ - Programmatically generates module names model part, focused on LLMs models. - Args: - num_stages: Number of pipeline stages - num_layers: Total number of transformer layers in the model - input_weight: Weight for input modules (embed_tokens) in layer calculation - output_weight: Weight for output modules (norm + output) in layer calculation - Returns: - List of lists containing module names for each model part - Example: - generate_llm_fqn_per_model_part(2, 3, input_weight=2, output_weight=2) - treats embeddings as 2 layers and norm+output as 2 layers for distribution - """ - if num_stages < 1: - raise ValueError("Number of stages must be at least 1") - - if num_stages == 1: - # Single stage gets everything - layer_names = [f"layers.{i}" for i in range(num_layers)] - return [["tok_embeddings"] + layer_names + ["norm", "output", "rotary_emb"]] - - # Calculate effective layers including weights - num_effective_layers = num_layers + input_weight + output_weight - - if num_stages > num_effective_layers: - raise ValueError( - f"Number of stages ({num_stages}) cannot be greater than effective layers ({num_effective_layers})" - ) - - # Calculate layers per stage (distribute evenly) - layers_per_stage = num_effective_layers // num_stages - extra_layers = num_effective_layers % num_stages - - # Feasibility check: Ensure at least 1 layer in each PP stage - if layers_per_stage == 0: - raise ValueError( - f"Configuration would result in empty stages. " - f"With {num_stages} stages and {num_effective_layers} effective layers " - f"(num_layers={num_layers} + input_weight={input_weight} + output_weight={output_weight}), " - f"each stage would get {layers_per_stage} layers on average. " - f"Reduce num_stages or increase num_layers/weights." - ) - - # Balance check: Ensure weights don't exceed minimum layers per stage - if input_weight > layers_per_stage: - raise ValueError( - f"input_weight ({input_weight}) exceeds minimum layers per stage ({layers_per_stage})." - ) - if output_weight > layers_per_stage: - raise ValueError( - f"output_weight ({output_weight}) exceeds minimum layers per stage ({layers_per_stage})." 
- ) - - module_names_per_stage = [] - current_layer = 0 - - for stage_idx in range(num_stages): - stage_modules = [] - - # Calculate effective layers for this stage - effective_layers_for_stage = layers_per_stage - if stage_idx < extra_layers: - effective_layers_for_stage += 1 - - # First stage: handle input modules with weighting - if stage_idx == 0: - stage_modules.append("tok_embeddings") - # Account for input weight in layer distribution - remaining_layers_for_stage = effective_layers_for_stage - input_weight - - # Add transformer layers - for _ in range(remaining_layers_for_stage): - if current_layer < num_layers: - stage_modules.append(f"layers.{current_layer}") - current_layer += 1 - - # Last stage: handle output modules with weighting - elif stage_idx == num_stages - 1: - # Account for output weight in layer distribution - remaining_layers_for_stage = effective_layers_for_stage - output_weight - - # Add transformer layers - for _ in range(remaining_layers_for_stage): - if current_layer < num_layers: - stage_modules.append(f"layers.{current_layer}") - current_layer += 1 - - # Add output modules - stage_modules.extend(["norm", "output"]) - - # Middle stages: only transformer layers - else: - for _ in range(effective_layers_for_stage): - if current_layer < num_layers: - stage_modules.append(f"layers.{current_layer}") - current_layer += 1 - - stage_modules.append("rotary_emb") - module_names_per_stage.append(stage_modules) - - return module_names_per_stage - - -def pipeline_module_split( - whole_model: nn.Module, - pp_mesh: DeviceMesh, - pp_schedule: str, - device: torch.device, - module_names_per_stage: list[list[str]], -) -> tuple[list[PipelineStage], list[nn.Module]]: - """ - This API creates pipeline stages based on specified module names for each stage. - - Some model restrictions include: - - forward() method should tolerate deleted layers - - weight initialization methods should tolerate deleted layers - - Does not support nested moduledict and modulelist structures - - Args: - whole_model: The complete model to be split - pp_mesh: Pipeline parallel device mesh - pp_schedule: Name of pipeline parallelism schedule - device: Device - module_names_per_stage: List of lists, where each inner list contains the module names - that should be included in that stage. Module names should be - dot-separated paths. 
Examples: - - "tok_embeddings" for token embeddings - - "layers.0", "layers.1" for specific transformer layers - - "norm" for the final normalization layer - - "output" for the output projection layer - - Returns: - Tuple of (stages, models) where stages are PipelineStage objects and models are the - corresponding model chunks - - Example usage: - module_names_per_stage = [ - ["tok_embeddings", "layers.0"], # Stage 0: embeddings + first layer - ["layers.1", "layers.2"], # Stage 1: middle layers - ["norm", "output"] # Stage 2: final norm + output - ] - """ - pp_rank = pp_mesh.get_local_rank() - pp_degree = pp_mesh.size() - - def _build_stage_from_modules( - stage_idx: int, module_names: list[str], num_stages: int - ) -> tuple[PipelineStage, nn.Module]: - model = copy.deepcopy(whole_model) - - # Create a set of modules to keep for faster lookup - modules_to_keep = set(module_names) - for module_name, module_value in model.named_children(): - # Handle layer-like structures (e.g., "layers.0", "layers.1") - if isinstance(module_value, (nn.ModuleDict, nn.ModuleList)): - layers_to_keep = { - name.split(".", 1)[1] - for name in modules_to_keep - if name.startswith(f"{module_name}.") - } - if layers_to_keep: - # Keep only specified layers - if isinstance(module_value, nn.ModuleDict): - for layer_name in list(module_value.keys()): - if layer_name not in layers_to_keep: - del module_value[layer_name] - elif isinstance(module_value, nn.ModuleList): - indices_to_keep = { - int(idx) for idx in layers_to_keep if idx.isdigit() - } - new_layers = nn.ModuleList( - [ - layer - for i, layer in enumerate(module_value) - if i in indices_to_keep - ] - ) - setattr(model, module_name, new_layers) - else: - # No layers from this structure needed, set to empty structure - if isinstance(module_value, nn.ModuleDict): - setattr(model, module_name, nn.ModuleDict()) - elif isinstance(module_value, nn.ModuleList): - setattr(model, module_name, nn.ModuleList()) - # Handle simple module attributes (e.g., "linear", "norm") - elif module_name not in modules_to_keep: - # Replace with Identity - setattr(model, module_name, nn.Identity()) - - stage = PipelineStage( - model, - stage_idx, - num_stages, - device, - group=pp_mesh.get_group("pp"), - ) - return stage, model - - num_stages = len(module_names_per_stage) - stages = [] - models = [] - - schedule_class = get_schedule_class(pp_schedule) - style = ( - "v" if schedule_class in (ScheduleZBVZeroBubble, ScheduleDualPipeV) else "loop" - ) - - def _get_stage_indices() -> tuple[int]: - """ - Compute the stage ids for the stages that will run on this pp rank - for either a looped or V style schedule - """ - assert ( - num_stages % pp_degree == 0 - ), f"num_stages {num_stages} must be evenly divisible by pp_degree {pp_degree}" - stages_per_rank = num_stages // pp_degree - if style == "loop": - return tuple(pp_rank + s * pp_degree for s in range(stages_per_rank)) - elif style == "v": - assert ( - stages_per_rank == 2 - ), f"v schedules assume 2 stages per rank, got {stages_per_rank}" - stage_v_pairs = list( - zip(range(pp_degree), range(num_stages - 1, pp_degree - 1, -1)) - ) - return stage_v_pairs[pp_rank] - - for stage_idx in _get_stage_indices(): - module_names = module_names_per_stage[stage_idx] - stage, model_chunk = _build_stage_from_modules( - stage_idx, - module_names, - num_stages, - ) - logger.info( - f"PP rank {pp_rank} is building stage_idx {stage_idx} " - f"with modules {module_names}" - ) - stages.append(stage) - models.append(model_chunk) - - return stages, models - def 
pipeline_hf_transformers( model: nn.Module, @@ -355,7 +102,11 @@ def pipeline_hf_transformers( module_names_per_stage = job_config.parallelism.module_fqns_per_model_part if module_names_per_stage is None: module_names_per_stage = generate_llm_fqn_per_model_part( - num_virtual_stages, num_layers, input_weight, output_weight + num_virtual_stages, + num_layers, + input_weight, + output_weight, + include_rotary_emb=True, ) for i, stage_ms in enumerate(module_names_per_stage): logger.debug(f"Stage {i}: {stage_ms}") @@ -366,6 +117,7 @@ def pipeline_hf_transformers( job_config.parallelism.pipeline_parallel_schedule, device, module_names_per_stage, + use_identity_for_missing_modules=True, ) # For PP with looped schedules, each item in model_parts is one stage-model-chunk. From 78d26ff7429fa807e9d6d4a31ae17fba6a7f3285 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 13 Nov 2025 10:09:31 +0000 Subject: [PATCH 106/129] use c4 test for integration_tests --- .../experiments/transformers_backend/tests/integration_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchtitan/experiments/transformers_backend/tests/integration_tests.py b/torchtitan/experiments/transformers_backend/tests/integration_tests.py index 1f2a38d322..8bc8a63a31 100644 --- a/torchtitan/experiments/transformers_backend/tests/integration_tests.py +++ b/torchtitan/experiments/transformers_backend/tests/integration_tests.py @@ -22,7 +22,7 @@ def build_transformers_backend_test_list() -> list[OverrideDefinitions]: [ [ "--model.name meta-llama/Llama-3.2-1B", - "--training.dataset wikitext2-test", + "--training.dataset c4-test", "--parallelism.data_parallel_shard_degree 2", "--parallelism.tensor_parallel_degree 2", "--parallelism.pipeline_parallel_degree 2", From 524379546edbe48275eba94f4a33e56e3b01f449 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 13 Nov 2025 10:22:21 +0000 Subject: [PATCH 107/129] fix ci --- .ci/docker/common/install_conda.sh | 1 + .../docker/requirements-transformers-backend.txt | 0 .ci/docker/ubuntu/Dockerfile | 1 + .github/workflows/integration_test_8gpu_huggingface.yaml | 2 -- 4 files changed, 2 insertions(+), 2 deletions(-) rename torchtitan/experiments/transformers_backend/requirements.txt => .ci/docker/requirements-transformers-backend.txt (100%) diff --git a/.ci/docker/common/install_conda.sh b/.ci/docker/common/install_conda.sh index c2f316b04b..d3cb20e7a3 100755 --- a/.ci/docker/common/install_conda.sh +++ b/.ci/docker/common/install_conda.sh @@ -43,6 +43,7 @@ install_pip_dependencies() { pip_install -r /opt/conda/requirements.txt pip_install -r /opt/conda/requirements-flux.txt pip_install -r /opt/conda/requirements-vlm.txt + pip_install -r /opt/conda/requirements-transformers-backend.txt popd } diff --git a/torchtitan/experiments/transformers_backend/requirements.txt b/.ci/docker/requirements-transformers-backend.txt similarity index 100% rename from torchtitan/experiments/transformers_backend/requirements.txt rename to .ci/docker/requirements-transformers-backend.txt diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile index 8f3bb9789f..7c53d3f1a1 100644 --- a/.ci/docker/ubuntu/Dockerfile +++ b/.ci/docker/ubuntu/Dockerfile @@ -33,6 +33,7 @@ COPY requirements-dev.txt /opt/conda/ COPY requirements.txt /opt/conda/ COPY requirements-flux.txt /opt/conda/ COPY requirements-vlm.txt /opt/conda/ +COPY requirements-transformers-backend.txt /opt/conda/ COPY conda-env-ci.txt /opt/conda/ COPY ./common/install_conda.sh install_conda.sh COPY ./common/utils.sh utils.sh 
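To see why the HF backend asks `pipeline_module_split` for `nn.Identity()` placeholders rather than `None`, here is a small self-contained sketch; the module below is hypothetical and only mimics a middle pipeline stage whose embedding, norm, and output were stripped:

```python
import torch
import torch.nn as nn


class TinyStage(nn.Module):
    """Hypothetical stand-in for a middle pipeline stage of an HF model."""

    def __init__(self):
        super().__init__()
        self.tok_embeddings = nn.Identity()  # owned by the first stage only
        self.layers = nn.ModuleDict({"2": nn.Linear(16, 16)})  # this stage's block
        self.norm = nn.Identity()  # owned by the last stage only
        self.output = nn.Identity()  # owned by the last stage only

    def forward(self, x):
        # The unmodified forward path still runs because the missing modules
        # are no-ops instead of None.
        x = self.tok_embeddings(x)
        for layer in self.layers.values():
            x = layer(x)
        return self.output(self.norm(x))


print(TinyStage()(torch.randn(2, 16)).shape)  # torch.Size([2, 16])
```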
diff --git a/.github/workflows/integration_test_8gpu_huggingface.yaml b/.github/workflows/integration_test_8gpu_huggingface.yaml index cde7959510..aea5189d81 100644 --- a/.github/workflows/integration_test_8gpu_huggingface.yaml +++ b/.github/workflows/integration_test_8gpu_huggingface.yaml @@ -49,7 +49,5 @@ jobs: USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 - python -m pip install transformers==4.55.4 - mkdir artifacts-to-be-uploaded python -m torchtitan.experiments.transformers_backend.tests.integration_tests artifacts-to-be-uploaded --ngpu 8 From fe691b892825c249bf43b435e29adfd0b87e7310 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 13 Nov 2025 10:51:48 +0000 Subject: [PATCH 108/129] fix linting --- torchtitan/distributed/pipeline_parallel.py | 4 ++- .../transformers_backend/README.md | 2 +- .../transformers_backend/__init__.py | 3 +- .../transformers_backend/infra/parallelize.py | 22 +++++++++----- .../transformers_backend/infra/pipeline.py | 2 +- .../transformers_backend/job_config.py | 10 ++++++- .../transformers_backend/model/args.py | 2 ++ .../transformers_backend/model/model.py | 30 +++++++++++-------- 8 files changed, 49 insertions(+), 26 deletions(-) diff --git a/torchtitan/distributed/pipeline_parallel.py b/torchtitan/distributed/pipeline_parallel.py index 0c0eb89dcc..b954d32c19 100644 --- a/torchtitan/distributed/pipeline_parallel.py +++ b/torchtitan/distributed/pipeline_parallel.py @@ -428,7 +428,9 @@ def _build_stage_from_modules( # Handle simple module attributes (e.g., "linear", "norm") elif module_name not in modules_to_keep: # Replace with Identity or None based on configuration - replacement = nn.Identity() if use_identity_for_missing_modules else None + replacement = ( + nn.Identity() if use_identity_for_missing_modules else None + ) setattr(model, module_name, replacement) stage = PipelineStage( diff --git a/torchtitan/experiments/transformers_backend/README.md b/torchtitan/experiments/transformers_backend/README.md index 8fbd19f0e8..a5b4059c07 100644 --- a/torchtitan/experiments/transformers_backend/README.md +++ b/torchtitan/experiments/transformers_backend/README.md @@ -18,7 +18,7 @@ hf_assets_path = "./tests/assets/tokenizer" ... 
``` - Train: `LOG_RANK=7 CONFIG_FILE= ## Supported Features diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index b72b77760c..fd0cd9b689 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -43,11 +43,12 @@ class TitanDenseModelArgs: use_flex_attn: bool = False attn_mask_type: str = "causal" + flavors = { "debugmodel": HFTransformerModelArgs( titan_dense_args=TitanDenseModelArgs( dim=256, - n_layers=6, + n_layers=2, n_heads=16, n_kv_heads=16, ), diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize.py b/torchtitan/experiments/transformers_backend/infra/parallelize.py index db78f7ea24..b2ae3f02a1 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize.py @@ -16,16 +16,16 @@ RowwiseParallel, SequenceParallel, ) -from torchtitan.experiments.transformers_backend.job_config import JobConfig from torchtitan.config import TORCH_DTYPE_MAP from torchtitan.distributed import NoParallel, ParallelDims +from torchtitan.distributed.activation_checkpoint import apply_ac + from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp +from torchtitan.experiments.transformers_backend.job_config import JobConfig +from torchtitan.models.llama3.infra.parallelize import apply_compile, apply_ddp from torchtitan.tools.logging import logger -from torchtitan.distributed.activation_checkpoint import apply_ac -from torchtitan.models.llama3.infra.parallelize import apply_ddp -from torchtitan.models.llama3.infra.parallelize import apply_compile def parallelize_hf_transformers( model: nn.Module, @@ -223,10 +223,16 @@ def apply_non_moe_tp( layer_plan[f"self_attn.{o_proj_name}"] = rowwise_parallel( output_layouts=Shard(1) ) - #For model that uses RMSNorm on Q and K (i.e. Qwen3) - if hasattr(transformer_block.self_attn, "q_norm") and hasattr(transformer_block.self_attn, "k_norm"): - layer_plan["self_attn.q_norm"] = SequenceParallel(sequence_dim=2, use_local_output=True) - layer_plan["self_attn.k_norm"] = SequenceParallel(sequence_dim=2, use_local_output=True) + # For model that uses RMSNorm on Q and K (i.e. 
Qwen3) + if hasattr(transformer_block.self_attn, "q_norm") and hasattr( + transformer_block.self_attn, "k_norm" + ): + layer_plan["self_attn.q_norm"] = SequenceParallel( + sequence_dim=2, use_local_output=True + ) + layer_plan["self_attn.k_norm"] = SequenceParallel( + sequence_dim=2, use_local_output=True + ) if not transformer_block.moe_enabled: mlp_plan = { diff --git a/torchtitan/experiments/transformers_backend/infra/pipeline.py b/torchtitan/experiments/transformers_backend/infra/pipeline.py index bfb876e911..6a891bb271 100644 --- a/torchtitan/experiments/transformers_backend/infra/pipeline.py +++ b/torchtitan/experiments/transformers_backend/infra/pipeline.py @@ -14,13 +14,13 @@ ) from torchtitan.components.loss import LossFunction -from torchtitan.experiments.transformers_backend.job_config import JobConfig from torchtitan.distributed import ParallelDims from torchtitan.distributed.pipeline_parallel import ( build_pipeline_schedule, generate_llm_fqn_per_model_part, pipeline_module_split, ) +from torchtitan.experiments.transformers_backend.job_config import JobConfig from torchtitan.protocols.train_spec import BaseModelArgs, ParallelizeFunction from torchtitan.tools.logging import logger diff --git a/torchtitan/experiments/transformers_backend/job_config.py b/torchtitan/experiments/transformers_backend/job_config.py index 6344529d20..f3b1667798 100644 --- a/torchtitan/experiments/transformers_backend/job_config.py +++ b/torchtitan/experiments/transformers_backend/job_config.py @@ -1,10 +1,18 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + from dataclasses import dataclass, field + @dataclass class HFTransformers: model: str = "" """HuggingFace model ID (e.g., 'Qwen/Qwen3-4B-Instruct-2507')""" + @dataclass class JobConfig: - hf_transformers: HFTransformers = field(default_factory=HFTransformers) \ No newline at end of file + hf_transformers: HFTransformers = field(default_factory=HFTransformers) diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index 4c9ffcae72..5a22edd386 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -82,6 +82,7 @@ def _initialize_dense_attributes(self, titan_dense_args): # Update passed_args self._titan_injected_model_args.update(titan_dense_args.__dict__) + def _configure_hf_attention(self, attn_implementation: str): """Configure HuggingFace attention settings.""" self._titan_injected_model_args["attn_implementation"] = attn_implementation @@ -153,6 +154,7 @@ def update_from_config(self, job_config: JobConfig): self.max_seq_len = job_config.training.seq_len # Configure HF-specific settings to match TorchTitan settings + # TODO: false ? self.attention_bias = False self.mlp_bias = False self.use_cache = False diff --git a/torchtitan/experiments/transformers_backend/model/model.py b/torchtitan/experiments/transformers_backend/model/model.py index bb50fd466c..8c35ac4e94 100644 --- a/torchtitan/experiments/transformers_backend/model/model.py +++ b/torchtitan/experiments/transformers_backend/model/model.py @@ -16,27 +16,32 @@ from .args import HFTransformerModelArgs + class SlicableModuleDict(nn.ModuleDict): """ A ModuleDict that supports slicing like ModuleList. 
Keys are expected to be string representations of integers (e.g., "0", "1", "2"). """ - + def __getitem__(self, key): if isinstance(key, slice): # Handle slicing: convert slice to list of keys - keys = sorted(self.keys(), key=lambda x: int(x) if x.isdigit() else float('inf')) + keys = sorted( + self.keys(), key=lambda x: int(x) if x.isdigit() else float("inf") + ) sliced_keys = keys[key] # Return a new SlicableModuleDict with the sliced items return SlicableModuleDict({k: self[k] for k in sliced_keys}) return super().__getitem__(key) - + def __iter__(self): # Iterate over values in sorted order by key (as integers) - keys = sorted(self.keys(), key=lambda x: int(x) if x.isdigit() else float('inf')) + keys = sorted( + self.keys(), key=lambda x: int(x) if x.isdigit() else float("inf") + ) for key in keys: yield self[key] - + def __len__(self): return len(self._modules) @@ -82,9 +87,7 @@ def __init__(self, model_args: HFTransformerModelArgs): mlp_cls=mlp_cls, # mlp_cls can be None ) else: - missing = [ - name for name, cls in required_classes.items() if not cls - ] + missing = [name for name, cls in required_classes.items() if not cls] logger.warning( f"Could not find required classes ({', '.join(missing)}) for {model_name_prefix}. " "Skipping Llama-like patch." @@ -103,10 +106,9 @@ def __init__(self, model_args: HFTransformerModelArgs): # Convert ModuleList to ModuleDict to preserve original indices # This ensures state dict keys match checkpoint keys if isinstance(self.model.model.layers, nn.ModuleList): - self.model.model.layers = SlicableModuleDict({ - str(i): layer - for i, layer in enumerate(self.model.model.layers) - }) + self.model.model.layers = SlicableModuleDict( + {str(i): layer for i, layer in enumerate(self.model.model.layers)} + ) for layer in self.model.model.layers.values(): layer.moe_enabled = False @@ -171,7 +173,9 @@ def _init_weights_patched(self, module): if isinstance(module, layer_idx_classes): if not hasattr(module, "layer_idx"): - raise ValueError(f"Module {module} does not have a layer_idx attribute") + raise ValueError( + f"Module {module} does not have a layer_idx attribute" + ) layer_idx = module.layer_idx From 5d5ce2b8d8bab215e6acecf1225dc69668627083 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Fri, 14 Nov 2025 11:01:33 +0000 Subject: [PATCH 109/129] fix head dims in flops counting --- torchtitan/experiments/transformers_backend/model/args.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index 5a22edd386..db1696e7a5 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -7,7 +7,7 @@ from dataclasses import dataclass from torch import nn -from torchtitan.experiments.transformers_backend.job_config import JobConfig +from torchtitan.config.job_config import JobConfig from torchtitan.models.utils import get_dense_model_nparams_and_flops from torchtitan.protocols import BaseModelArgs from transformers import AutoConfig @@ -132,7 +132,7 @@ def __repr__(self) -> str: def update_from_config(self, job_config: JobConfig): # Load HF config (overwrites our HF attributes) hf_model_config = AutoConfig.from_pretrained( - job_config.hf_transformers.model, + job_config.model.name, attn_implementation=self.attn_implementation, trust_remote_code=True, ) @@ -174,4 +174,4 @@ def update_from_config(self, job_config: JobConfig): return self def 
get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: - return get_dense_model_nparams_and_flops(self, model, seq_len) + return get_dense_model_nparams_and_flops(self, model, head_dims=self.head_dim, seq_len=seq_len) From 6ace9f43c83fb85afb2335800861b7400915392e Mon Sep 17 00:00:00 2001 From: 3outeille Date: Fri, 14 Nov 2025 11:05:36 +0000 Subject: [PATCH 110/129] propose an alternative to passing name --- .../experiments/transformers_backend/README.md | 9 ++++----- .../configs/qwen3_fsdp2_tp2_pp2.toml | 5 +---- .../transformers_backend/infra/parallelize.py | 2 +- .../transformers_backend/infra/pipeline.py | 2 +- .../transformers_backend/job_config.py | 18 ------------------ torchtitan/protocols/train_spec.py | 4 ++++ 6 files changed, 11 insertions(+), 29 deletions(-) delete mode 100644 torchtitan/experiments/transformers_backend/job_config.py diff --git a/torchtitan/experiments/transformers_backend/README.md b/torchtitan/experiments/transformers_backend/README.md index a5b4059c07..4ecbbe8c6f 100644 --- a/torchtitan/experiments/transformers_backend/README.md +++ b/torchtitan/experiments/transformers_backend/README.md @@ -9,15 +9,14 @@ ... [model] - name = "llama3" -+ name = "transformers_backend" ++ name = "Qwen/Qwen3-4B-Instruct-2507" flavor = "debugmodel" hf_assets_path = "./tests/assets/tokenizer" - -+[hf_transformers] -+model = "Qwen/Qwen3-4B-Instruct-2507" ... ``` -- Train: `LOG_RANK=7 CONFIG_FILE=/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml ./run_train.sh --compile.enable` - Make sure you have created the tokenizers beforehand image diff --git a/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml b/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml index 13e3f4ddf0..b0e294ccbe 100644 --- a/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml +++ b/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml @@ -20,15 +20,12 @@ save_tb_folder = "tb" enable_wandb = false [model] -name = "transformers_backend" +name = "Qwen/Qwen3-4B-Instruct-2507" flavor = "debugmodel" # test folder with tokenizer.json, for debug purpose only hf_assets_path = "./tests/assets/tokenizer" # converters = ["float8"] -[hf_transformers] -model = "Qwen/Qwen3-4B-Instruct-2507" - [optimizer] name = "AdamW" lr = 8e-4 diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize.py b/torchtitan/experiments/transformers_backend/infra/parallelize.py index b2ae3f02a1..987fae6049 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize.py @@ -22,7 +22,7 @@ from torchtitan.distributed.activation_checkpoint import apply_ac from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp -from torchtitan.experiments.transformers_backend.job_config import JobConfig +from torchtitan.config.job_config import JobConfig from torchtitan.models.llama3.infra.parallelize import apply_compile, apply_ddp from torchtitan.tools.logging import logger diff --git a/torchtitan/experiments/transformers_backend/infra/pipeline.py b/torchtitan/experiments/transformers_backend/infra/pipeline.py index 6a891bb271..511297ad7c 100644 --- a/torchtitan/experiments/transformers_backend/infra/pipeline.py +++ b/torchtitan/experiments/transformers_backend/infra/pipeline.py @@ -20,7 +20,7 @@ generate_llm_fqn_per_model_part, pipeline_module_split, ) -from 
torchtitan.experiments.transformers_backend.job_config import JobConfig +from torchtitan.config.job_config import JobConfig from torchtitan.protocols.train_spec import BaseModelArgs, ParallelizeFunction from torchtitan.tools.logging import logger diff --git a/torchtitan/experiments/transformers_backend/job_config.py b/torchtitan/experiments/transformers_backend/job_config.py deleted file mode 100644 index f3b1667798..0000000000 --- a/torchtitan/experiments/transformers_backend/job_config.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -from dataclasses import dataclass, field - - -@dataclass -class HFTransformers: - model: str = "" - """HuggingFace model ID (e.g., 'Qwen/Qwen3-4B-Instruct-2507')""" - - -@dataclass -class JobConfig: - hf_transformers: HFTransformers = field(default_factory=HFTransformers) diff --git a/torchtitan/protocols/train_spec.py b/torchtitan/protocols/train_spec.py index 22bfa7df9b..3eed6ddd2f 100644 --- a/torchtitan/protocols/train_spec.py +++ b/torchtitan/protocols/train_spec.py @@ -77,6 +77,10 @@ def get_train_spec(name: str) -> TrainSpec: from torchtitan.experiments import _supported_experiments from torchtitan.models import _supported_models + if "/" in name: + module = import_module("torchtitan.experiments.transformers_backend") + return module.get_train_spec() + if name in _supported_models: module = import_module(f"torchtitan.models.{name}") return module.get_train_spec() From 97cd6fe0ee601789f719ec92e9be880506a06646 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Fri, 14 Nov 2025 11:22:08 +0000 Subject: [PATCH 111/129] fix linting --- .../experiments/transformers_backend/infra/parallelize.py | 2 +- torchtitan/experiments/transformers_backend/infra/pipeline.py | 2 +- torchtitan/experiments/transformers_backend/model/args.py | 4 +++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize.py b/torchtitan/experiments/transformers_backend/infra/parallelize.py index 987fae6049..a4b7e66ad8 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize.py @@ -17,12 +17,12 @@ SequenceParallel, ) from torchtitan.config import TORCH_DTYPE_MAP +from torchtitan.config.job_config import JobConfig from torchtitan.distributed import NoParallel, ParallelDims from torchtitan.distributed.activation_checkpoint import apply_ac from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp -from torchtitan.config.job_config import JobConfig from torchtitan.models.llama3.infra.parallelize import apply_compile, apply_ddp from torchtitan.tools.logging import logger diff --git a/torchtitan/experiments/transformers_backend/infra/pipeline.py b/torchtitan/experiments/transformers_backend/infra/pipeline.py index 511297ad7c..b813225fe6 100644 --- a/torchtitan/experiments/transformers_backend/infra/pipeline.py +++ b/torchtitan/experiments/transformers_backend/infra/pipeline.py @@ -14,13 +14,13 @@ ) from torchtitan.components.loss import LossFunction +from torchtitan.config.job_config import JobConfig from torchtitan.distributed import ParallelDims from torchtitan.distributed.pipeline_parallel import ( build_pipeline_schedule, generate_llm_fqn_per_model_part, pipeline_module_split, ) -from torchtitan.config.job_config import JobConfig 
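The dispatch rule introduced in `get_train_spec` can be restated in a few lines; this is an illustrative condensation only, and the helper name below is made up:

```python
def pick_train_spec_module(name: str) -> str:
    # A "/" marks a Hugging Face model ID, e.g. "Qwen/Qwen3-4B-Instruct-2507",
    # which routes to the transformers backend; anything else stays on the
    # built-in model lookup.
    if "/" in name:
        return "torchtitan.experiments.transformers_backend"
    return f"torchtitan.models.{name}"


assert pick_train_spec_module("Qwen/Qwen3-4B-Instruct-2507") == (
    "torchtitan.experiments.transformers_backend"
)
assert pick_train_spec_module("llama3") == "torchtitan.models.llama3"
# In torchtitan itself, the chosen module is then imported with importlib and
# its get_train_spec() is returned.
```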
from torchtitan.protocols.train_spec import BaseModelArgs, ParallelizeFunction from torchtitan.tools.logging import logger diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index db1696e7a5..9a6271980b 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -174,4 +174,6 @@ def update_from_config(self, job_config: JobConfig): return self def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: - return get_dense_model_nparams_and_flops(self, model, head_dims=self.head_dim, seq_len=seq_len) + return get_dense_model_nparams_and_flops( + self, model, head_dims=self.head_dim, seq_len=seq_len + ) From 5f1695f0a013e573c1ddea8c6cafbc537769a9e7 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Fri, 14 Nov 2025 13:22:16 +0000 Subject: [PATCH 112/129] bump transformers version from 4.55.4 to 4.57.1 --- .ci/docker/requirements-transformers-backend.txt | 2 +- torchtitan/experiments/transformers_backend/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.ci/docker/requirements-transformers-backend.txt b/.ci/docker/requirements-transformers-backend.txt index 6b0cc637db..76e8886ed0 100644 --- a/.ci/docker/requirements-transformers-backend.txt +++ b/.ci/docker/requirements-transformers-backend.txt @@ -1 +1 @@ -transformers==4.55.4 +transformers==4.57.1 diff --git a/torchtitan/experiments/transformers_backend/README.md b/torchtitan/experiments/transformers_backend/README.md index 4ecbbe8c6f..3d1a2dcf0d 100644 --- a/torchtitan/experiments/transformers_backend/README.md +++ b/torchtitan/experiments/transformers_backend/README.md @@ -2,7 +2,7 @@ ## Quick start -- Requirements `transformers==4.55.4` +- Requirements `transformers==4.57.1` - Config: `torchtitan/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml` ```diff From 2d2b6122a249a69c71f9647ec9c034965204c8a1 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 18 Nov 2025 10:11:46 +0000 Subject: [PATCH 113/129] change qwen3 config name --- .../configs/{qwen3_fsdp2_tp2_pp2.toml => qwen3.toml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename torchtitan/experiments/transformers_backend/configs/{qwen3_fsdp2_tp2_pp2.toml => qwen3.toml} (100%) diff --git a/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml b/torchtitan/experiments/transformers_backend/configs/qwen3.toml similarity index 100% rename from torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml rename to torchtitan/experiments/transformers_backend/configs/qwen3.toml From a2ea2ef430d6d50742e3adb8bed1c9bde4a24cac Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 18 Nov 2025 10:37:30 +0000 Subject: [PATCH 114/129] reuse fsdp from llama3. Moe will be handle in another PR --- .../transformers_backend/infra/parallelize.py | 167 +----------------- 1 file changed, 2 insertions(+), 165 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize.py b/torchtitan/experiments/transformers_backend/infra/parallelize.py index a4b7e66ad8..163249e04b 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize.py @@ -4,10 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
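The local `apply_fsdp` dropped below is replaced by the llama3 implementation; as a rough reminder of the wrapping pattern it provides, here is a sketch in which the dtypes, the `dp_mesh`, and the `.layers` container are assumptions rather than values taken from this diff:

```python
import torch
import torch.nn as nn
from torch.distributed.fsdp import fully_shard, MixedPrecisionPolicy


def shard_model_sketch(model: nn.Module, dp_mesh) -> None:
    # Shard each transformer block first, then the root module, with a
    # bf16-compute / fp32-reduce mixed-precision policy.
    mp_policy = MixedPrecisionPolicy(
        param_dtype=torch.bfloat16, reduce_dtype=torch.float32
    )
    fsdp_config = {"mesh": dp_mesh, "mp_policy": mp_policy}
    for block in model.layers.values():
        fully_shard(block, **fsdp_config, reshard_after_forward=True)
    fully_shard(model, **fsdp_config)
```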
-import torch import torch.nn as nn from torch.distributed.device_mesh import DeviceMesh -from torch.distributed.fsdp import CPUOffloadPolicy, fully_shard, MixedPrecisionPolicy from torch.distributed.tensor import Replicate, Shard from torch.distributed.tensor.parallel import ( ColwiseParallel, @@ -23,7 +21,7 @@ from torchtitan.distributed.activation_checkpoint import apply_ac from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp -from torchtitan.models.llama3.infra.parallelize import apply_compile, apply_ddp +from torchtitan.models.llama3.infra.parallelize import apply_compile, apply_ddp, apply_fsdp from torchtitan.tools.logging import logger @@ -271,165 +269,4 @@ def apply_non_moe_tp( logger.info( f"Applied {'Float8 tensorwise ' if enable_float8_tensorwise_tp else ''}" "Tensor Parallelism to the model" - ) - - -def apply_fsdp( - model: nn.Module, - dp_mesh: DeviceMesh, - param_dtype: torch.dtype, - reduce_dtype: torch.dtype, - pp_enabled: bool, - cpu_offload: bool = False, - reshard_after_forward_policy: str = "default", - ep_degree: int = 1, - dp_mod_ep_mesh: DeviceMesh | None = None, - gradient_divide_factor: int | None = None, -): - """ - Apply data parallelism (via FSDP2) to the model. - - Args: - model (nn.Module): The model to apply data parallelism to. - dp_mesh (DeviceMesh): The device mesh to use for data parallelism. - param_dtype (torch.dtype): The data type to use for model parameters. - reduce_dtype (torch.dtype): The data type to use for reduction operations. - pp_enabled (bool): Whether pipeline parallelism is enabled. - cpu_offload (bool, optional): Whether to offload model parameters to CPU. Defaults to False. - reshard_after_forward_policy (str, optional): The policy to use for resharding after forward pass. Defaults to "default". - Other options: "never", "always". - - "default" applies default resharding behavior, implementing "smart defaults" for known optimal scenarios. - - "always" will enable `reshard_after_forward` for all forward passes. - - "never" will disable `reshard_after_forward` for all forward passes. - - """ - mp_policy = MixedPrecisionPolicy(param_dtype=param_dtype, reduce_dtype=reduce_dtype) - fsdp_config = {"mesh": dp_mesh, "mp_policy": mp_policy} - if cpu_offload: - fsdp_config["offload_policy"] = CPUOffloadPolicy() - - match reshard_after_forward_policy: - case "always": - reshard_after_forward = True - case "never": - reshard_after_forward = False - case "default": - # For PP, by default do not reshard after forward to avoid per-microbatch - # all-gathers, which can be expensive and non-overlapped - reshard_after_forward = not pp_enabled - case _: - raise ValueError( - f"Invalid reshard_after_forward_policy: {reshard_after_forward_policy}." - ) - - if model.tok_embeddings is not None: - fully_shard( - model.tok_embeddings, - **fsdp_config, - reshard_after_forward=reshard_after_forward, - ) - - for transformer_block in model.layers: - # NOTE: When EP is enabled, In an MoE layer, we use the following FSDP wrapping - # - the router and the shared experts are sharded together with the TransformerBlock - # - the routed experts are sharded with the remaining dp_mod_ep_mesh - if ( - hasattr(transformer_block, "moe_enabled") - and transformer_block.moe_enabled - and ep_degree > 1 - ): - fsdp_mod_ep_config = fsdp_config.copy() - fsdp_mod_ep_config["mesh"] = dp_mod_ep_mesh - moe_block = transformer_block.mlp - # NOTE: EP alreadys shards the routed experts on dim 0 (num_experts). 
- # When dp_mod_ep * ep > num_experts, FSDP default dim-0 sharding - # causes inefficiency, so we choose to do FSDP sharding on dim-1. - # Even when EP is not used, we may still want to shard the experts - # on non-0 dim. For now it may not be worth the complexity to support - # shard_placement_fn on the outer TransformerBlock-level FSDP. - _experts_shard_placement_fn = None - assert dp_mod_ep_mesh is not None - if dp_mod_ep_mesh.size() * ep_degree > moe_block.experts.num_experts: - _experts_shard_placement_fn = lambda param: Shard(1) - - fully_shard( - moe_block.experts, - **fsdp_mod_ep_config, - reshard_after_forward=reshard_after_forward, - shard_placement_fn=_experts_shard_placement_fn, - ) - - # NOTE: # Although the FSDP sharding of experts is done on a mesh of - # a different size than other parameters, the gradient division - # factor should be consistent with data. - moe_block.experts.set_gradient_divide_factor( - gradient_divide_factor, - ) - - fully_shard( - transformer_block, - **fsdp_config, - reshard_after_forward=reshard_after_forward, - ) - - # As an optimization, do not reshard_after_forward the last layers by default - # since FSDP would prefetch them immediately after the forward pass - if model.norm is not None and model.output is not None: - fully_shard( - [model.norm, model.output], - **fsdp_config, - reshard_after_forward=reshard_after_forward_policy == "always", - ) - - fully_shard(model, **fsdp_config) - - # NOTE: set up explicit prefetching when EP is enabled, as D2H syncs - # in EP could interfere with implicit prefetching in FSDP - if ep_degree == 1: - return - - # forward - transformer_blocks = list(model.layers.values()) - next_transformer_blocks = transformer_blocks[1:] + [None] - - if model.tok_embeddings is not None and model.layers is not None: - model.tok_embeddings.set_modules_to_forward_prefetch([transformer_blocks[0]]) - - for transformer_block, next_transformer_block in zip( - transformer_blocks, next_transformer_blocks - ): - if next_transformer_block is not None: - if next_transformer_block.moe_enabled: - transformer_block.set_modules_to_forward_prefetch( - [next_transformer_block, next_transformer_block.mlp.experts] - ) - else: - transformer_block.set_modules_to_forward_prefetch( - [next_transformer_block] - ) - elif model.norm is not None and model.output is not None: - transformer_block.set_modules_to_forward_prefetch( - [model.norm, model.output] - ) - - # backward - reversed_transformer_blocks = list(reversed(model.layers.values())) - prev_transformer_blocks = reversed_transformer_blocks[1:] + [None] - - if model.norm is not None and model.output is not None and model.layers is not None: - model.output.set_modules_to_backward_prefetch([reversed_transformer_blocks[0]]) - - for transformer_block, prev_transformer_block in zip( - reversed_transformer_blocks, prev_transformer_blocks - ): - if prev_transformer_block is not None: - if prev_transformer_block.moe_enabled: - transformer_block.set_modules_to_backward_prefetch( - [prev_transformer_block, prev_transformer_block.mlp.experts] - ) - else: - transformer_block.set_modules_to_backward_prefetch( - [prev_transformer_block] - ) - elif model.tok_embeddings is not None: - transformer_block.set_modules_to_backward_prefetch([model.tok_embeddings]) + ) \ No newline at end of file From 47fb2eab0e01cdebf79544caea04fba84aad6bc1 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 18 Nov 2025 10:41:16 +0000 Subject: [PATCH 115/129] clean logging --- .../experiments/transformers_backend/infra/parallelize.py | 
8 ++++++-- .../experiments/transformers_backend/infra/pipeline.py | 3 --- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize.py b/torchtitan/experiments/transformers_backend/infra/parallelize.py index 163249e04b..cb68826e87 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize.py @@ -21,7 +21,11 @@ from torchtitan.distributed.activation_checkpoint import apply_ac from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp -from torchtitan.models.llama3.infra.parallelize import apply_compile, apply_ddp, apply_fsdp +from torchtitan.models.llama3.infra.parallelize import ( + apply_compile, + apply_ddp, + apply_fsdp, +) from torchtitan.tools.logging import logger @@ -269,4 +273,4 @@ def apply_non_moe_tp( logger.info( f"Applied {'Float8 tensorwise ' if enable_float8_tensorwise_tp else ''}" "Tensor Parallelism to the model" - ) \ No newline at end of file + ) diff --git a/torchtitan/experiments/transformers_backend/infra/pipeline.py b/torchtitan/experiments/transformers_backend/infra/pipeline.py index b813225fe6..c8904f4352 100644 --- a/torchtitan/experiments/transformers_backend/infra/pipeline.py +++ b/torchtitan/experiments/transformers_backend/infra/pipeline.py @@ -22,7 +22,6 @@ pipeline_module_split, ) from torchtitan.protocols.train_spec import BaseModelArgs, ParallelizeFunction -from torchtitan.tools.logging import logger def pipeline_hf_transformers( @@ -108,8 +107,6 @@ def pipeline_hf_transformers( output_weight, include_rotary_emb=True, ) - for i, stage_ms in enumerate(module_names_per_stage): - logger.debug(f"Stage {i}: {stage_ms}") stages, model_parts = pipeline_module_split( model, From 20308d31fe0719463fb93b4464d509c8c7d79172 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 18 Nov 2025 10:46:49 +0000 Subject: [PATCH 116/129] move TitanDenseModelArgs to args --- .../transformers_backend/__init__.py | 20 +------------------ .../transformers_backend/model/args.py | 17 ++++++++++++++++ 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index fd0cd9b689..6d74050608 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -15,33 +15,15 @@ from .infra.parallelize import parallelize_hf_transformers from .infra.pipeline import pipeline_hf_transformers -from .model.args import HFTransformerModelArgs +from .model.args import HFTransformerModelArgs, TitanDenseModelArgs from .model.model import HFTransformerModel - __all__ = [ "HFTransformerModelArgs", "HFTransformerModel", ] -@dataclass -class TitanDenseModelArgs: - """Arguments for the base TorchTitan model.""" - - dim: int = 4096 - n_layers: int = 32 - n_heads: int = 32 - n_kv_heads: int | None = None - vocab_size: int | None = None - multiple_of: int = 256 - ffn_dim_multiplier: float | None = None - norm_eps: float = 1e-5 - rope_theta: float = 10000 - max_seq_len: int = 2048 - depth_init: bool = True - use_flex_attn: bool = False - attn_mask_type: str = "causal" flavors = { diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index 9a6271980b..69f4ebc9bc 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py 
@@ -15,6 +15,23 @@ from transformers.integrations.sdpa_attention import sdpa_attention_forward from transformers.modeling_utils import AttentionInterface +@dataclass +class TitanDenseModelArgs: + """Arguments for the base TorchTitan model.""" + + dim: int = 4096 + n_layers: int = 32 + n_heads: int = 32 + n_kv_heads: int | None = None + vocab_size: int | None = None + multiple_of: int = 256 + ffn_dim_multiplier: float | None = None + norm_eps: float = 1e-5 + rope_theta: float = 10000 + max_seq_len: int = 2048 + depth_init: bool = True + use_flex_attn: bool = False + attn_mask_type: str = "causal" @dataclass class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): From 019f2cc557110bd2158a5b1971d44cd2e8f92cc8 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 18 Nov 2025 12:58:00 +0000 Subject: [PATCH 117/129] clean --- torchtitan/experiments/transformers_backend/model/args.py | 1 - 1 file changed, 1 deletion(-) diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index 69f4ebc9bc..b3d9daa723 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -79,7 +79,6 @@ def __init__( self._create_getter_setter_dynamically(has_moe=False) self._titan_injected_model_args = {} - self._titan_injected_model_args.update(kwargs) self._configure_hf_attention(attn_implementation) self._initialize_dense_attributes(titan_dense_args) From fc93b4f4866ef805750a1e1760a310bf3e8ef171 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 18 Nov 2025 13:02:32 +0000 Subject: [PATCH 118/129] fix integration tests --- torchtitan/experiments/transformers_backend/__init__.py | 4 ---- torchtitan/experiments/transformers_backend/model/args.py | 2 ++ .../transformers_backend/tests/integration_tests.py | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 6d74050608..aec28a0bdd 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -3,8 +3,6 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
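A small sketch of how the relocated dataclass is consumed, assuming torchtitan and transformers are installed; the values mirror the `debugmodel` flavor:

```python
from torchtitan.experiments.transformers_backend.model.args import TitanDenseModelArgs

# Same hyperparameters as the "debugmodel" flavor in transformers_backend/__init__.py;
# anything not passed keeps the dataclass defaults.
debug_args = TitanDenseModelArgs(dim=256, n_layers=2, n_heads=16, n_kv_heads=16)
print(debug_args.max_seq_len)  # 2048 by default; later overridden by training.seq_len
```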
-from dataclasses import dataclass - from torchtitan.components.loss import build_cross_entropy_loss from torchtitan.components.lr_scheduler import build_lr_schedulers from torchtitan.components.optimizer import build_optimizers @@ -24,8 +22,6 @@ ] - - flavors = { "debugmodel": HFTransformerModelArgs( titan_dense_args=TitanDenseModelArgs( diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index b3d9daa723..d261dcd5e4 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -15,6 +15,7 @@ from transformers.integrations.sdpa_attention import sdpa_attention_forward from transformers.modeling_utils import AttentionInterface + @dataclass class TitanDenseModelArgs: """Arguments for the base TorchTitan model.""" @@ -33,6 +34,7 @@ class TitanDenseModelArgs: use_flex_attn: bool = False attn_mask_type: str = "causal" + @dataclass class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): """ diff --git a/torchtitan/experiments/transformers_backend/tests/integration_tests.py b/torchtitan/experiments/transformers_backend/tests/integration_tests.py index 8bc8a63a31..5629b45f5c 100644 --- a/torchtitan/experiments/transformers_backend/tests/integration_tests.py +++ b/torchtitan/experiments/transformers_backend/tests/integration_tests.py @@ -22,7 +22,6 @@ def build_transformers_backend_test_list() -> list[OverrideDefinitions]: [ [ "--model.name meta-llama/Llama-3.2-1B", - "--training.dataset c4-test", "--parallelism.data_parallel_shard_degree 2", "--parallelism.tensor_parallel_degree 2", "--parallelism.pipeline_parallel_degree 2", @@ -63,7 +62,7 @@ def main(): if os.listdir(args.output_dir): raise RuntimeError("Please provide an empty output directory.") - test_list = _TEST_SUITES_FUNCTION["transformers_backend"]()() + test_list = _TEST_SUITES_FUNCTION["transformers_backend"]() run_tests(args, test_list) From f9e8e11d23160ff16d59fada7664eee6fdc8bcf1 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 18 Nov 2025 13:08:48 +0000 Subject: [PATCH 119/129] rename integration test file --- ...gface.yaml => integration_test_8gpu_transformers_backend.yaml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/workflows/{integration_test_8gpu_huggingface.yaml => integration_test_8gpu_transformers_backend.yaml} (100%) diff --git a/.github/workflows/integration_test_8gpu_huggingface.yaml b/.github/workflows/integration_test_8gpu_transformers_backend.yaml similarity index 100% rename from .github/workflows/integration_test_8gpu_huggingface.yaml rename to .github/workflows/integration_test_8gpu_transformers_backend.yaml From 83b0437aeed5fa1d6a84dd6a3306f003e822a8c1 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 18 Nov 2025 13:15:54 +0000 Subject: [PATCH 120/129] update README --- torchtitan/experiments/README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/torchtitan/experiments/README.md b/torchtitan/experiments/README.md index 5a2c0b28e5..02372dbe16 100644 --- a/torchtitan/experiments/README.md +++ b/torchtitan/experiments/README.md @@ -30,6 +30,5 @@ We provide this `experiments/` folder to host experiments that add significant v | [torchcomms](./torchcomms/) | [![TorchComms 8 GPU Integration 
Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_torchcomms.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_torchcomms.yaml?query=branch%3Amain) | [@d4l3k](https://https://github.com/d4l3k) [@fduwjj](https://github.com/fduwjj) [@mori360 ](https://github.com/mori360) | | [moe_symm_mem_kernels](./moe_symm_mem_kernels/) | TBA | [@kwen2501](https://github.com/kwen2501) | | [gpt_oss](./gpt_oss/) | TBA | [@jianiw](https://github.com/jianiw) | -| [compiler_toolkit](./compiler_tookit/) | TBA | [@SherlockNoMad](https://github.com/SherlockNoMad) [@yiming0416](https://github.com/yiming0416) | -| [transformers_backend](./transformers_backend/) | TBA | [@3outeille](https://github.com/3outeille) | | [compiler_toolkit](./compiler_toolkit/) | [![Compiler Toolkit 8 GPU Integration Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_compiler_toolkit.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_compiler_toolkit.yaml?query=branch%3Amain) | [@SherlockNoMad](https://github.com/SherlockNoMad) [@yiming0416](https://github.com/yiming0416) | +| [transformers_backend](./transformers_backend/) | ![Transformers Backend 8 GPU Integration Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_transformers_backend.yaml/badge.svg?branch=main) | [@3outeille](https://github.com/3outeille) | \ No newline at end of file From fb978ddbf095249a8beb2fa083794df276fec747 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 18 Nov 2025 13:20:06 +0000 Subject: [PATCH 121/129] revert accidental changes linting --- torchtitan/experiments/README.md | 2 +- torchtitan/train.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/torchtitan/experiments/README.md b/torchtitan/experiments/README.md index 02372dbe16..9b25cdc7a6 100644 --- a/torchtitan/experiments/README.md +++ b/torchtitan/experiments/README.md @@ -31,4 +31,4 @@ We provide this `experiments/` folder to host experiments that add significant v | [moe_symm_mem_kernels](./moe_symm_mem_kernels/) | TBA | [@kwen2501](https://github.com/kwen2501) | | [gpt_oss](./gpt_oss/) | TBA | [@jianiw](https://github.com/jianiw) | | [compiler_toolkit](./compiler_toolkit/) | [![Compiler Toolkit 8 GPU Integration Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_compiler_toolkit.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_compiler_toolkit.yaml?query=branch%3Amain) | [@SherlockNoMad](https://github.com/SherlockNoMad) [@yiming0416](https://github.com/yiming0416) | -| [transformers_backend](./transformers_backend/) | ![Transformers Backend 8 GPU Integration Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_transformers_backend.yaml/badge.svg?branch=main) | [@3outeille](https://github.com/3outeille) | \ No newline at end of file +| [transformers_backend](./transformers_backend/) | ![Transformers Backend 8 GPU Integration Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_transformers_backend.yaml/badge.svg?branch=main) | [@3outeille](https://github.com/3outeille) | diff --git a/torchtitan/train.py b/torchtitan/train.py index a8dca7efd7..d157a3a307 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -163,6 +163,7 @@ def __init__(self, job_config: JobConfig): model_param_count, 
self.metrics_processor.num_flops_per_token, ) = model_args.get_nparams_and_flops(model, job_config.training.seq_len) + logger.info( f"{color.blue}Model {job_config.model.name} {job_config.model.flavor} " f"{color.red}size: {model_param_count:,} total parameters{color.reset}" @@ -245,6 +246,7 @@ def __init__(self, job_config: JobConfig): else: # apply PT-D Tensor Parallel, activation checkpointing, torch.compile, Data Parallel model = self.train_spec.parallelize_fn(model, parallel_dims, job_config) + model.to_empty(device=init_device) with torch.no_grad(): model.init_weights(buffer_device=buffer_device) From 71ff098cb3bb5786015238b0b8d1543cf60ba006 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 18 Nov 2025 13:27:21 +0000 Subject: [PATCH 122/129] typo in naming --- .../experiments/transformers_backend/model/model.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/model/model.py b/torchtitan/experiments/transformers_backend/model/model.py index 8c35ac4e94..3b589b4d43 100644 --- a/torchtitan/experiments/transformers_backend/model/model.py +++ b/torchtitan/experiments/transformers_backend/model/model.py @@ -17,7 +17,7 @@ from .args import HFTransformerModelArgs -class SlicableModuleDict(nn.ModuleDict): +class SliceableModuleDict(nn.ModuleDict): """ A ModuleDict that supports slicing like ModuleList. Keys are expected to be string representations of integers (e.g., "0", "1", "2"). @@ -30,8 +30,8 @@ def __getitem__(self, key): self.keys(), key=lambda x: int(x) if x.isdigit() else float("inf") ) sliced_keys = keys[key] - # Return a new SlicableModuleDict with the sliced items - return SlicableModuleDict({k: self[k] for k in sliced_keys}) + # Return a new SliceableModuleDict with the sliced items + return SliceableModuleDict({k: self[k] for k in sliced_keys}) return super().__getitem__(key) def __iter__(self): @@ -106,7 +106,7 @@ def __init__(self, model_args: HFTransformerModelArgs): # Convert ModuleList to ModuleDict to preserve original indices # This ensures state dict keys match checkpoint keys if isinstance(self.model.model.layers, nn.ModuleList): - self.model.model.layers = SlicableModuleDict( + self.model.model.layers = SliceableModuleDict( {str(i): layer for i, layer in enumerate(self.model.model.layers)} ) From 663a4157727949d76f6683a67a6dffec891a944e Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 18 Nov 2025 13:43:34 +0000 Subject: [PATCH 123/129] refactor --- torchtitan/distributed/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchtitan/distributed/utils.py b/torchtitan/distributed/utils.py index 790d84a5ed..60c05f1612 100644 --- a/torchtitan/distributed/utils.py +++ b/torchtitan/distributed/utils.py @@ -106,13 +106,13 @@ def set_determinism( if debug_config.deterministic: logger.info("Deterministic algorithm enabled (expect perf degradation).") torch.use_deterministic_algorithms(True) - # Otherwise, HF register buffer for ROPE (inv_freq) and this will be by default be initialized to Nan - torch.utils.deterministic.fill_uninitialized_memory = False torch.use_deterministic_algorithms( True, warn_only=debug_config.deterministic_warn_only ) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False + # Otherwise, Huggignface modeling register buffer for ROPE (inv_freq) and this will be by default be initialized to Nan + torch.utils.deterministic.fill_uninitialized_memory = False # env var for deterministic CuBLAS # 
https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" From 3dbe6fab79c503a440f82a535fd7d997a844ad49 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 18 Nov 2025 14:03:09 +0000 Subject: [PATCH 124/129] revert the way we select HF modeling in config --- .../transformers_backend/README.md | 11 +- .../transformers_backend/configs/qwen3.toml | 5 +- .../transformers_backend/infra/parallelize.py | 171 +++++++++++++++++- .../transformers_backend/infra/pipeline.py | 2 +- .../transformers_backend/job_config.py | 18 ++ .../transformers_backend/model/args.py | 2 +- torchtitan/protocols/train_spec.py | 4 - 7 files changed, 195 insertions(+), 18 deletions(-) create mode 100644 torchtitan/experiments/transformers_backend/job_config.py diff --git a/torchtitan/experiments/transformers_backend/README.md b/torchtitan/experiments/transformers_backend/README.md index 3d1a2dcf0d..805afb9ab9 100644 --- a/torchtitan/experiments/transformers_backend/README.md +++ b/torchtitan/experiments/transformers_backend/README.md @@ -4,19 +4,20 @@ - Requirements `transformers==4.57.1` -- Config: `torchtitan/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml` +- Config: `torchtitan/torchtitan/experiments/transformers_backend/configs/qwen3.toml` ```diff ... [model] - name = "llama3" -+ name = "Qwen/Qwen3-4B-Instruct-2507" ++ name = "transformers_backend" flavor = "debugmodel" hf_assets_path = "./tests/assets/tokenizer" + ++[hf_transformers] ++model = "Qwen/Qwen3-4B-Instruct-2507" ... ``` -**Note:** Any model name containing "/" is automatically recognized as a HuggingFace model ID and will use the `transformers_backend`. - -- Train: `LOG_RANK=7 CONFIG_FILE=/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml ./run_train.sh --compile.enable` +- Train: `LOG_RANK=7 CONFIG_FILE=/torchtitan/experiments/transformers_backend/configs/qwen3.toml ./run_train.sh --job.custom_config_module=torchtitan.experiments.transformers_backend.job_config --compile.enable` - Make sure you have created the tokenizers beforehand image diff --git a/torchtitan/experiments/transformers_backend/configs/qwen3.toml b/torchtitan/experiments/transformers_backend/configs/qwen3.toml index b0e294ccbe..13e3f4ddf0 100644 --- a/torchtitan/experiments/transformers_backend/configs/qwen3.toml +++ b/torchtitan/experiments/transformers_backend/configs/qwen3.toml @@ -20,12 +20,15 @@ save_tb_folder = "tb" enable_wandb = false [model] -name = "Qwen/Qwen3-4B-Instruct-2507" +name = "transformers_backend" flavor = "debugmodel" # test folder with tokenizer.json, for debug purpose only hf_assets_path = "./tests/assets/tokenizer" # converters = ["float8"] +[hf_transformers] +model = "Qwen/Qwen3-4B-Instruct-2507" + [optimizer] name = "AdamW" lr = 8e-4 diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize.py b/torchtitan/experiments/transformers_backend/infra/parallelize.py index cb68826e87..b2ae3f02a1 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize.py @@ -4,8 +4,10 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
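+# NOTE: the apply_fsdp added at the end of this file is a local copy of the
+# FSDP2 sharding path (fully_shard + MixedPrecisionPolicy, optional CPU offload,
+# a reshard_after_forward policy, and explicit forward/backward prefetch when
+# expert parallelism is enabled), kept here instead of importing it from
+# torchtitan.models.llama3.infra.parallelize.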
+import torch import torch.nn as nn from torch.distributed.device_mesh import DeviceMesh +from torch.distributed.fsdp import CPUOffloadPolicy, fully_shard, MixedPrecisionPolicy from torch.distributed.tensor import Replicate, Shard from torch.distributed.tensor.parallel import ( ColwiseParallel, @@ -15,17 +17,13 @@ SequenceParallel, ) from torchtitan.config import TORCH_DTYPE_MAP -from torchtitan.config.job_config import JobConfig from torchtitan.distributed import NoParallel, ParallelDims from torchtitan.distributed.activation_checkpoint import apply_ac from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp -from torchtitan.models.llama3.infra.parallelize import ( - apply_compile, - apply_ddp, - apply_fsdp, -) +from torchtitan.experiments.transformers_backend.job_config import JobConfig +from torchtitan.models.llama3.infra.parallelize import apply_compile, apply_ddp from torchtitan.tools.logging import logger @@ -274,3 +272,164 @@ def apply_non_moe_tp( f"Applied {'Float8 tensorwise ' if enable_float8_tensorwise_tp else ''}" "Tensor Parallelism to the model" ) + + +def apply_fsdp( + model: nn.Module, + dp_mesh: DeviceMesh, + param_dtype: torch.dtype, + reduce_dtype: torch.dtype, + pp_enabled: bool, + cpu_offload: bool = False, + reshard_after_forward_policy: str = "default", + ep_degree: int = 1, + dp_mod_ep_mesh: DeviceMesh | None = None, + gradient_divide_factor: int | None = None, +): + """ + Apply data parallelism (via FSDP2) to the model. + + Args: + model (nn.Module): The model to apply data parallelism to. + dp_mesh (DeviceMesh): The device mesh to use for data parallelism. + param_dtype (torch.dtype): The data type to use for model parameters. + reduce_dtype (torch.dtype): The data type to use for reduction operations. + pp_enabled (bool): Whether pipeline parallelism is enabled. + cpu_offload (bool, optional): Whether to offload model parameters to CPU. Defaults to False. + reshard_after_forward_policy (str, optional): The policy to use for resharding after forward pass. Defaults to "default". + Other options: "never", "always". + - "default" applies default resharding behavior, implementing "smart defaults" for known optimal scenarios. + - "always" will enable `reshard_after_forward` for all forward passes. + - "never" will disable `reshard_after_forward` for all forward passes. + + """ + mp_policy = MixedPrecisionPolicy(param_dtype=param_dtype, reduce_dtype=reduce_dtype) + fsdp_config = {"mesh": dp_mesh, "mp_policy": mp_policy} + if cpu_offload: + fsdp_config["offload_policy"] = CPUOffloadPolicy() + + match reshard_after_forward_policy: + case "always": + reshard_after_forward = True + case "never": + reshard_after_forward = False + case "default": + # For PP, by default do not reshard after forward to avoid per-microbatch + # all-gathers, which can be expensive and non-overlapped + reshard_after_forward = not pp_enabled + case _: + raise ValueError( + f"Invalid reshard_after_forward_policy: {reshard_after_forward_policy}." 
+ ) + + if model.tok_embeddings is not None: + fully_shard( + model.tok_embeddings, + **fsdp_config, + reshard_after_forward=reshard_after_forward, + ) + + for transformer_block in model.layers: + # NOTE: When EP is enabled, In an MoE layer, we use the following FSDP wrapping + # - the router and the shared experts are sharded together with the TransformerBlock + # - the routed experts are sharded with the remaining dp_mod_ep_mesh + if ( + hasattr(transformer_block, "moe_enabled") + and transformer_block.moe_enabled + and ep_degree > 1 + ): + fsdp_mod_ep_config = fsdp_config.copy() + fsdp_mod_ep_config["mesh"] = dp_mod_ep_mesh + moe_block = transformer_block.mlp + # NOTE: EP alreadys shards the routed experts on dim 0 (num_experts). + # When dp_mod_ep * ep > num_experts, FSDP default dim-0 sharding + # causes inefficiency, so we choose to do FSDP sharding on dim-1. + # Even when EP is not used, we may still want to shard the experts + # on non-0 dim. For now it may not be worth the complexity to support + # shard_placement_fn on the outer TransformerBlock-level FSDP. + _experts_shard_placement_fn = None + assert dp_mod_ep_mesh is not None + if dp_mod_ep_mesh.size() * ep_degree > moe_block.experts.num_experts: + _experts_shard_placement_fn = lambda param: Shard(1) + + fully_shard( + moe_block.experts, + **fsdp_mod_ep_config, + reshard_after_forward=reshard_after_forward, + shard_placement_fn=_experts_shard_placement_fn, + ) + + # NOTE: # Although the FSDP sharding of experts is done on a mesh of + # a different size than other parameters, the gradient division + # factor should be consistent with data. + moe_block.experts.set_gradient_divide_factor( + gradient_divide_factor, + ) + + fully_shard( + transformer_block, + **fsdp_config, + reshard_after_forward=reshard_after_forward, + ) + + # As an optimization, do not reshard_after_forward the last layers by default + # since FSDP would prefetch them immediately after the forward pass + if model.norm is not None and model.output is not None: + fully_shard( + [model.norm, model.output], + **fsdp_config, + reshard_after_forward=reshard_after_forward_policy == "always", + ) + + fully_shard(model, **fsdp_config) + + # NOTE: set up explicit prefetching when EP is enabled, as D2H syncs + # in EP could interfere with implicit prefetching in FSDP + if ep_degree == 1: + return + + # forward + transformer_blocks = list(model.layers.values()) + next_transformer_blocks = transformer_blocks[1:] + [None] + + if model.tok_embeddings is not None and model.layers is not None: + model.tok_embeddings.set_modules_to_forward_prefetch([transformer_blocks[0]]) + + for transformer_block, next_transformer_block in zip( + transformer_blocks, next_transformer_blocks + ): + if next_transformer_block is not None: + if next_transformer_block.moe_enabled: + transformer_block.set_modules_to_forward_prefetch( + [next_transformer_block, next_transformer_block.mlp.experts] + ) + else: + transformer_block.set_modules_to_forward_prefetch( + [next_transformer_block] + ) + elif model.norm is not None and model.output is not None: + transformer_block.set_modules_to_forward_prefetch( + [model.norm, model.output] + ) + + # backward + reversed_transformer_blocks = list(reversed(model.layers.values())) + prev_transformer_blocks = reversed_transformer_blocks[1:] + [None] + + if model.norm is not None and model.output is not None and model.layers is not None: + model.output.set_modules_to_backward_prefetch([reversed_transformer_blocks[0]]) + + for transformer_block, prev_transformer_block 
in zip( + reversed_transformer_blocks, prev_transformer_blocks + ): + if prev_transformer_block is not None: + if prev_transformer_block.moe_enabled: + transformer_block.set_modules_to_backward_prefetch( + [prev_transformer_block, prev_transformer_block.mlp.experts] + ) + else: + transformer_block.set_modules_to_backward_prefetch( + [prev_transformer_block] + ) + elif model.tok_embeddings is not None: + transformer_block.set_modules_to_backward_prefetch([model.tok_embeddings]) diff --git a/torchtitan/experiments/transformers_backend/infra/pipeline.py b/torchtitan/experiments/transformers_backend/infra/pipeline.py index c8904f4352..53aee86180 100644 --- a/torchtitan/experiments/transformers_backend/infra/pipeline.py +++ b/torchtitan/experiments/transformers_backend/infra/pipeline.py @@ -14,13 +14,13 @@ ) from torchtitan.components.loss import LossFunction -from torchtitan.config.job_config import JobConfig from torchtitan.distributed import ParallelDims from torchtitan.distributed.pipeline_parallel import ( build_pipeline_schedule, generate_llm_fqn_per_model_part, pipeline_module_split, ) +from torchtitan.experiments.transformers_backend.job_config import JobConfig from torchtitan.protocols.train_spec import BaseModelArgs, ParallelizeFunction diff --git a/torchtitan/experiments/transformers_backend/job_config.py b/torchtitan/experiments/transformers_backend/job_config.py new file mode 100644 index 0000000000..f3b1667798 --- /dev/null +++ b/torchtitan/experiments/transformers_backend/job_config.py @@ -0,0 +1,18 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from dataclasses import dataclass, field + + +@dataclass +class HFTransformers: + model: str = "" + """HuggingFace model ID (e.g., 'Qwen/Qwen3-4B-Instruct-2507')""" + + +@dataclass +class JobConfig: + hf_transformers: HFTransformers = field(default_factory=HFTransformers) diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index d261dcd5e4..4093f66194 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -150,7 +150,7 @@ def __repr__(self) -> str: def update_from_config(self, job_config: JobConfig): # Load HF config (overwrites our HF attributes) hf_model_config = AutoConfig.from_pretrained( - job_config.model.name, + job_config.hf_transformers.model, attn_implementation=self.attn_implementation, trust_remote_code=True, ) diff --git a/torchtitan/protocols/train_spec.py b/torchtitan/protocols/train_spec.py index 3eed6ddd2f..22bfa7df9b 100644 --- a/torchtitan/protocols/train_spec.py +++ b/torchtitan/protocols/train_spec.py @@ -77,10 +77,6 @@ def get_train_spec(name: str) -> TrainSpec: from torchtitan.experiments import _supported_experiments from torchtitan.models import _supported_models - if "/" in name: - module = import_module("torchtitan.experiments.transformers_backend") - return module.get_train_spec() - if name in _supported_models: module = import_module(f"torchtitan.models.{name}") return module.get_train_spec() From 9be95dac760a5006d7362de5d629683caddaeb75 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 19 Nov 2025 11:06:44 +0000 Subject: [PATCH 125/129] Revert "reuse pipeline from torchtitan" This reverts commit 09f0c94790a5817eb9c2f5d40f5d11236f7c79b9. 
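Context: the local pipeline helpers restored by this revert differ from torchtitan.distributed.pipeline_parallel in two ways, as noted in the file itself — pruned top-level modules are replaced with nn.Identity instead of None, and every model part keeps the shared rotary_emb module. The sketch below is a minimal illustration of why the nn.Identity placeholder matters for a HuggingFace-style forward that calls each submodule unconditionally; the TinyStage model, its names, and its sizes are made up for the example and are not code from this patch.

```python
import torch
import torch.nn as nn


class TinyStage(nn.Module):
    """Toy model whose pipeline stages are built by pruning top-level modules."""

    def __init__(self):
        super().__init__()
        self.tok_embeddings = nn.Embedding(100, 16)
        self.layers = nn.ModuleDict({"0": nn.Linear(16, 16), "1": nn.Linear(16, 16)})
        self.norm = nn.LayerNorm(16)
        self.output = nn.Linear(16, 100)

    def forward(self, inputs):
        # Every submodule is called unconditionally, mirroring the HF wrapper.
        h = self.tok_embeddings(inputs)
        for layer in self.layers.values():
            h = layer(h)
        return self.output(self.norm(h))


# Build "stage 1": it owns only layers.1, norm and output.
stage1 = TinyStage()
stage1.tok_embeddings = nn.Identity()  # placeholder instead of None
del stage1.layers["0"]

hidden = torch.randn(2, 4, 16)   # activations received from stage 0
out = stage1(hidden)             # nn.Identity passes them straight through
assert out.shape == (2, 4, 100)  # a None placeholder would raise TypeError here
```

A plain None placeholder would make the unconditional call on later stages fail, which is the behavior this revert avoids for the HF-backed model.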
--- torchtitan/distributed/pipeline_parallel.py | 19 +- .../transformers_backend/__init__.py | 2 +- .../transformers_backend/infra/pipeline.py | 142 ------- .../infra/pipeline_parallel.py | 390 ++++++++++++++++++ 4 files changed, 394 insertions(+), 159 deletions(-) delete mode 100644 torchtitan/experiments/transformers_backend/infra/pipeline.py create mode 100644 torchtitan/experiments/transformers_backend/infra/pipeline_parallel.py diff --git a/torchtitan/distributed/pipeline_parallel.py b/torchtitan/distributed/pipeline_parallel.py index b954d32c19..06dba40d6f 100644 --- a/torchtitan/distributed/pipeline_parallel.py +++ b/torchtitan/distributed/pipeline_parallel.py @@ -228,7 +228,6 @@ def generate_llm_fqn_per_model_part( num_layers: int, input_weight: int = 1, output_weight: int = 1, - include_rotary_emb: bool = False, ) -> list[list[str]]: """ Programmatically generates module names model part, focused on LLMs models. @@ -238,7 +237,6 @@ def generate_llm_fqn_per_model_part( num_layers: Total number of transformer layers in the model input_weight: Weight for input modules (tok_embeddings) in layer calculation output_weight: Weight for output modules (norm + output) in layer calculation - include_rotary_emb: Whether to include rotary_emb in each model part Returns: List of lists containing module names for each model part @@ -253,10 +251,7 @@ def generate_llm_fqn_per_model_part( if num_stages == 1: # Single stage gets everything layer_names = [f"layers.{i}" for i in range(num_layers)] - result = [["tok_embeddings"] + layer_names + ["norm", "output"]] - if include_rotary_emb: - result[0].append("rotary_emb") - return result + return [["tok_embeddings"] + layer_names + ["norm", "output"]] # Calculate effective layers including weights num_effective_layers = num_layers + input_weight + output_weight @@ -334,8 +329,6 @@ def generate_llm_fqn_per_model_part( stage_modules.append(f"layers.{current_layer}") current_layer += 1 - if include_rotary_emb: - stage_modules.append("rotary_emb") module_names_per_stage.append(stage_modules) return module_names_per_stage @@ -347,7 +340,6 @@ def pipeline_module_split( pp_schedule: str, device: torch.device, module_names_per_stage: list[list[str]], - use_identity_for_missing_modules: bool = False, ) -> tuple[list[PipelineStage], list[nn.Module]]: """ This API creates pipeline stages based on specified module names for each stage. 
@@ -369,8 +361,6 @@ def pipeline_module_split( - "layers.0", "layers.1" for specific transformer layers - "norm" for the final normalization layer - "output" for the output projection layer - use_identity_for_missing_modules: If True, replace missing modules with nn.Identity(), - otherwise replace with None Returns: Tuple of (stages, models) where stages are PipelineStage objects and models are the @@ -427,11 +417,8 @@ def _build_stage_from_modules( setattr(model, module_name, nn.ModuleList()) # Handle simple module attributes (e.g., "linear", "norm") elif module_name not in modules_to_keep: - # Replace with Identity or None based on configuration - replacement = ( - nn.Identity() if use_identity_for_missing_modules else None - ) - setattr(model, module_name, replacement) + # Replace with None + setattr(model, module_name, None) stage = PipelineStage( model, diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index aec28a0bdd..dc4322623b 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -12,7 +12,7 @@ from .infra.parallelize import parallelize_hf_transformers -from .infra.pipeline import pipeline_hf_transformers +from .infra.pipeline_parallel import pipeline_hf_transformers from .model.args import HFTransformerModelArgs, TitanDenseModelArgs from .model.model import HFTransformerModel diff --git a/torchtitan/experiments/transformers_backend/infra/pipeline.py b/torchtitan/experiments/transformers_backend/infra/pipeline.py deleted file mode 100644 index 53aee86180..0000000000 --- a/torchtitan/experiments/transformers_backend/infra/pipeline.py +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
-import math - -import torch -import torch.nn as nn -from torch.distributed.pipelining.schedules import ( - _PipelineSchedule, - get_schedule_class, - PipelineScheduleSingle, -) - -from torchtitan.components.loss import LossFunction -from torchtitan.distributed import ParallelDims -from torchtitan.distributed.pipeline_parallel import ( - build_pipeline_schedule, - generate_llm_fqn_per_model_part, - pipeline_module_split, -) -from torchtitan.experiments.transformers_backend.job_config import JobConfig -from torchtitan.protocols.train_spec import BaseModelArgs, ParallelizeFunction - - -def pipeline_hf_transformers( - model: nn.Module, - parallel_dims: ParallelDims, - job_config: JobConfig, - device: torch.device, - model_args: BaseModelArgs, - parallelize_fn: ParallelizeFunction, - loss_fn: LossFunction, -) -> tuple[_PipelineSchedule, list[nn.Module], bool, bool]: - pp_mesh = parallel_dims.world_mesh["pp"] - - # Determine the number of virtual stages based on schedule type - schedule_class = get_schedule_class( - job_config.parallelism.pipeline_parallel_schedule - ) - is_single_stage_schedule = issubclass(schedule_class, PipelineScheduleSingle) - layers_per_stage = job_config.parallelism.pipeline_parallel_layers_per_stage - if hasattr(model_args, "n_layers"): - num_layers = model_args.n_layers - else: - raise ValueError("Model does not have n_layers attribute.") - - # You can adjust these weights based on the computational cost of embeddings and output layers - # Higher weights mean these modules are treated as "heavier" in the distribution - input_weight = job_config.parallelism.pipeline_parallel_first_stage_less_layers - output_weight = job_config.parallelism.pipeline_parallel_last_stage_less_layers - - # Calculate number of virtual stages - if layers_per_stage is not None: - - # Calculate number of virtual stages needed (using ceiling division) - # This allows for unequal distribution where stages can differ by at most 1 layer - num_virtual_stages = math.ceil( - (num_layers + input_weight + output_weight) / layers_per_stage - ) - - # Validation: check stages per rank based on schedule type - model_config_info = f"Model has {num_layers} layers with pipeline_parallel_layers_per_stage={layers_per_stage}" - stage_distribution_info = ( - f"resulting in {num_virtual_stages=} across {parallel_dims.pp} PP ranks" - ) - - if num_virtual_stages % parallel_dims.pp != 0: - raise ValueError( - f"Number of virtual stages ({num_virtual_stages}) must be divisible by " - f"pipeline parallel size ({parallel_dims.pp}). " - f"{model_config_info}. " - f"Please adjust pipeline_parallel_layers_per_stage to a value that results in a number of stages " - f"divisible by {parallel_dims.pp}." - ) - - stages_per_rank = num_virtual_stages // parallel_dims.pp - - if is_single_stage_schedule and stages_per_rank != 1: - raise ValueError( - f"Single stage schedule requires exactly 1 stage per rank, but got {stages_per_rank} stages per rank. " - f"{model_config_info}, {stage_distribution_info}. " - f"Please increase pipeline_parallel_layers_per_stage to {num_layers // parallel_dims.pp} or higher " - f"to achieve 1 stage per rank." - ) - - if not is_single_stage_schedule and stages_per_rank < 2: - raise ValueError( - f"Multi-stage schedule requires at least 2 stages per rank, but got {stages_per_rank} stages per rank. " - f"{model_config_info}, {stage_distribution_info}. " - f"Please decrease pipeline_parallel_layers_per_stage to achieve at least 2 stages per rank." 
- ) - else: - # Fallback to default behavior when layers_per_stage is not provided - # For multi-stage schedules, default is 2 virtual stages per rank - # For single-stage schedules, default is 1 virtual stage per rank - stages_per_rank = 1 if is_single_stage_schedule else 2 - num_virtual_stages = parallel_dims.pp * stages_per_rank - - module_names_per_stage = job_config.parallelism.module_fqns_per_model_part - if module_names_per_stage is None: - module_names_per_stage = generate_llm_fqn_per_model_part( - num_virtual_stages, - num_layers, - input_weight, - output_weight, - include_rotary_emb=True, - ) - - stages, model_parts = pipeline_module_split( - model, - pp_mesh, - job_config.parallelism.pipeline_parallel_schedule, - device, - module_names_per_stage, - use_identity_for_missing_modules=True, - ) - - # For PP with looped schedules, each item in model_parts is one stage-model-chunk. - # We need to iterate through model_parts to apply SPMD parallelisms, compilation, - # optimizer, and checkpointing - for i, m in enumerate(model_parts): - # apply SPMD-style PT-D techniques - m = parallelize_fn(m, parallel_dims, job_config) - model_parts[i] = m - # NOTE: this is to update the model in the stage - # in case the model is modified e.g. by torch.compile - stages[i].submod = m - - pp_schedule = build_pipeline_schedule(job_config, stages, loss_fn) - - # This is used in the train loop to determine whether to pass in the input_ids and labels - has_first_stage = False - has_last_stage = False - for stage in stages: - if stage.is_first: - has_first_stage = True - if stage.is_last: - has_last_stage = True - - return pp_schedule, model_parts, has_first_stage, has_last_stage diff --git a/torchtitan/experiments/transformers_backend/infra/pipeline_parallel.py b/torchtitan/experiments/transformers_backend/infra/pipeline_parallel.py new file mode 100644 index 0000000000..8610b201dc --- /dev/null +++ b/torchtitan/experiments/transformers_backend/infra/pipeline_parallel.py @@ -0,0 +1,390 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +import copy +import math + +import torch +import torch.nn as nn +from torch.distributed.device_mesh import DeviceMesh +from torch.distributed.pipelining import PipelineStage +from torch.distributed.pipelining.schedules import ( + _PipelineSchedule, + get_schedule_class, + PipelineScheduleSingle, + ScheduleDualPipeV, + ScheduleZBVZeroBubble, +) + +from torchtitan.components.loss import LossFunction +from torchtitan.distributed import ParallelDims +from torchtitan.distributed.pipeline_parallel import build_pipeline_schedule +from torchtitan.protocols.train_spec import BaseModelArgs, ParallelizeFunction +from torchtitan.experiments.transformers_backend.job_config import JobConfig +from torchtitan.tools.logging import logger + +# NOTE(3outeille): the only modifications comes from replacing None to nn.Identity and adding rotary_emb per model_part + +def generate_llm_fqn_per_model_part( + num_stages: int, + num_layers: int, + input_weight: int = 1, + output_weight: int = 1, +) -> list[list[str]]: + """ + Programmatically generates module names model part, focused on LLMs models. 
+ Args: + num_stages: Number of pipeline stages + num_layers: Total number of transformer layers in the model + input_weight: Weight for input modules (embed_tokens) in layer calculation + output_weight: Weight for output modules (norm + output) in layer calculation + Returns: + List of lists containing module names for each model part + Example: + generate_llm_fqn_per_model_part(2, 3, input_weight=2, output_weight=2) + treats embeddings as 2 layers and norm+output as 2 layers for distribution + """ + if num_stages < 1: + raise ValueError("Number of stages must be at least 1") + + if num_stages == 1: + # Single stage gets everything + layer_names = [f"layers.{i}" for i in range(num_layers)] + return [["tok_embeddings"] + layer_names + ["norm", "output", "rotary_emb"]] + + # Calculate effective layers including weights + num_effective_layers = num_layers + input_weight + output_weight + + if num_stages > num_effective_layers: + raise ValueError( + f"Number of stages ({num_stages}) cannot be greater than effective layers ({num_effective_layers})" + ) + + # Calculate layers per stage (distribute evenly) + layers_per_stage = num_effective_layers // num_stages + extra_layers = num_effective_layers % num_stages + + # Feasibility check: Ensure at least 1 layer in each PP stage + if layers_per_stage == 0: + raise ValueError( + f"Configuration would result in empty stages. " + f"With {num_stages} stages and {num_effective_layers} effective layers " + f"(num_layers={num_layers} + input_weight={input_weight} + output_weight={output_weight}), " + f"each stage would get {layers_per_stage} layers on average. " + f"Reduce num_stages or increase num_layers/weights." + ) + + # Balance check: Ensure weights don't exceed minimum layers per stage + if input_weight > layers_per_stage: + raise ValueError( + f"input_weight ({input_weight}) exceeds minimum layers per stage ({layers_per_stage})." + ) + if output_weight > layers_per_stage: + raise ValueError( + f"output_weight ({output_weight}) exceeds minimum layers per stage ({layers_per_stage})." 
+ ) + + module_names_per_stage = [] + current_layer = 0 + + for stage_idx in range(num_stages): + stage_modules = [] + + # Calculate effective layers for this stage + effective_layers_for_stage = layers_per_stage + if stage_idx < extra_layers: + effective_layers_for_stage += 1 + + # First stage: handle input modules with weighting + if stage_idx == 0: + stage_modules.append("tok_embeddings") + # Account for input weight in layer distribution + remaining_layers_for_stage = effective_layers_for_stage - input_weight + + # Add transformer layers + for _ in range(remaining_layers_for_stage): + if current_layer < num_layers: + stage_modules.append(f"layers.{current_layer}") + current_layer += 1 + + # Last stage: handle output modules with weighting + elif stage_idx == num_stages - 1: + # Account for output weight in layer distribution + remaining_layers_for_stage = effective_layers_for_stage - output_weight + + # Add transformer layers + for _ in range(remaining_layers_for_stage): + if current_layer < num_layers: + stage_modules.append(f"layers.{current_layer}") + current_layer += 1 + + # Add output modules + stage_modules.extend(["norm", "output"]) + + # Middle stages: only transformer layers + else: + for _ in range(effective_layers_for_stage): + if current_layer < num_layers: + stage_modules.append(f"layers.{current_layer}") + current_layer += 1 + + stage_modules.append("rotary_emb") + module_names_per_stage.append(stage_modules) + + return module_names_per_stage + + +def pipeline_module_split( + whole_model: nn.Module, + pp_mesh: DeviceMesh, + pp_schedule: str, + device: torch.device, + module_names_per_stage: list[list[str]], +) -> tuple[list[PipelineStage], list[nn.Module]]: + """ + This API creates pipeline stages based on specified module names for each stage. + + Some model restrictions include: + - forward() method should tolerate deleted layers + - weight initialization methods should tolerate deleted layers + - Does not support nested moduledict and modulelist structures + + Args: + whole_model: The complete model to be split + pp_mesh: Pipeline parallel device mesh + pp_schedule: Name of pipeline parallelism schedule + device: Device + module_names_per_stage: List of lists, where each inner list contains the module names + that should be included in that stage. Module names should be + dot-separated paths. 
Examples: + - "tok_embeddings" for token embeddings + - "layers.0", "layers.1" for specific transformer layers + - "norm" for the final normalization layer + - "output" for the output projection layer + + Returns: + Tuple of (stages, models) where stages are PipelineStage objects and models are the + corresponding model chunks + + Example usage: + module_names_per_stage = [ + ["tok_embeddings", "layers.0"], # Stage 0: embeddings + first layer + ["layers.1", "layers.2"], # Stage 1: middle layers + ["norm", "output"] # Stage 2: final norm + output + ] + """ + pp_rank = pp_mesh.get_local_rank() + pp_degree = pp_mesh.size() + + def _build_stage_from_modules( + stage_idx: int, module_names: list[str], num_stages: int + ) -> tuple[PipelineStage, nn.Module]: + model = copy.deepcopy(whole_model) + + # Create a set of modules to keep for faster lookup + modules_to_keep = set(module_names) + for module_name, module_value in model.named_children(): + # Handle layer-like structures (e.g., "layers.0", "layers.1") + if isinstance(module_value, (nn.ModuleDict, nn.ModuleList)): + layers_to_keep = { + name.split(".", 1)[1] + for name in modules_to_keep + if name.startswith(f"{module_name}.") + } + if layers_to_keep: + # Keep only specified layers + if isinstance(module_value, nn.ModuleDict): + for layer_name in list(module_value.keys()): + if layer_name not in layers_to_keep: + del module_value[layer_name] + elif isinstance(module_value, nn.ModuleList): + indices_to_keep = { + int(idx) for idx in layers_to_keep if idx.isdigit() + } + new_layers = nn.ModuleList( + [ + layer + for i, layer in enumerate(module_value) + if i in indices_to_keep + ] + ) + setattr(model, module_name, new_layers) + else: + # No layers from this structure needed, set to empty structure + if isinstance(module_value, nn.ModuleDict): + setattr(model, module_name, nn.ModuleDict()) + elif isinstance(module_value, nn.ModuleList): + setattr(model, module_name, nn.ModuleList()) + # Handle simple module attributes (e.g., "linear", "norm") + elif module_name not in modules_to_keep: + # Replace with Identity + setattr(model, module_name, nn.Identity()) + + stage = PipelineStage( + model, + stage_idx, + num_stages, + device, + group=pp_mesh.get_group("pp"), + ) + return stage, model + + num_stages = len(module_names_per_stage) + stages = [] + models = [] + + schedule_class = get_schedule_class(pp_schedule) + style = ( + "v" if schedule_class in (ScheduleZBVZeroBubble, ScheduleDualPipeV) else "loop" + ) + + def _get_stage_indices() -> tuple[int]: + """ + Compute the stage ids for the stages that will run on this pp rank + for either a looped or V style schedule + """ + assert ( + num_stages % pp_degree == 0 + ), f"num_stages {num_stages} must be evenly divisible by pp_degree {pp_degree}" + stages_per_rank = num_stages // pp_degree + if style == "loop": + return tuple(pp_rank + s * pp_degree for s in range(stages_per_rank)) + elif style == "v": + assert ( + stages_per_rank == 2 + ), f"v schedules assume 2 stages per rank, got {stages_per_rank}" + stage_v_pairs = list( + zip(range(pp_degree), range(num_stages - 1, pp_degree - 1, -1)) + ) + return stage_v_pairs[pp_rank] + + for stage_idx in _get_stage_indices(): + module_names = module_names_per_stage[stage_idx] + stage, model_chunk = _build_stage_from_modules( + stage_idx, + module_names, + num_stages, + ) + logger.info( + f"PP rank {pp_rank} is building stage_idx {stage_idx} " + f"with modules {module_names}" + ) + stages.append(stage) + models.append(model_chunk) + + return stages, models + + 
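+# Illustrative example of the split that pipeline_hf_transformers (below)
+# requests from generate_llm_fqn_per_model_part: with 2 virtual stages,
+# 6 layers, and the default input/output weights,
+#   generate_llm_fqn_per_model_part(2, 6)
+#   -> [["tok_embeddings", "layers.0", "layers.1", "layers.2", "rotary_emb"],
+#       ["layers.3", "layers.4", "layers.5", "norm", "output", "rotary_emb"]]
+# Every model part carries "rotary_emb", and modules a stage does not own are
+# replaced with nn.Identity by pipeline_module_split above.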
+def pipeline_hf_transformers( + model: nn.Module, + parallel_dims: ParallelDims, + job_config: JobConfig, + device: torch.device, + model_args: BaseModelArgs, + parallelize_fn: ParallelizeFunction, + loss_fn: LossFunction, +) -> tuple[_PipelineSchedule, list[nn.Module], bool, bool]: + pp_mesh = parallel_dims.world_mesh["pp"] + + # Determine the number of virtual stages based on schedule type + schedule_class = get_schedule_class( + job_config.parallelism.pipeline_parallel_schedule + ) + is_single_stage_schedule = issubclass(schedule_class, PipelineScheduleSingle) + layers_per_stage = job_config.parallelism.pipeline_parallel_layers_per_stage + if hasattr(model_args, "n_layers"): + num_layers = model_args.n_layers + else: + raise ValueError("Model does not have n_layers attribute.") + + # You can adjust these weights based on the computational cost of embeddings and output layers + # Higher weights mean these modules are treated as "heavier" in the distribution + input_weight = job_config.parallelism.pipeline_parallel_first_stage_less_layers + output_weight = job_config.parallelism.pipeline_parallel_last_stage_less_layers + + # Calculate number of virtual stages + if layers_per_stage is not None: + + # Calculate number of virtual stages needed (using ceiling division) + # This allows for unequal distribution where stages can differ by at most 1 layer + num_virtual_stages = math.ceil( + (num_layers + input_weight + output_weight) / layers_per_stage + ) + + # Validation: check stages per rank based on schedule type + model_config_info = f"Model has {num_layers} layers with pipeline_parallel_layers_per_stage={layers_per_stage}" + stage_distribution_info = ( + f"resulting in {num_virtual_stages=} across {parallel_dims.pp} PP ranks" + ) + + if num_virtual_stages % parallel_dims.pp != 0: + raise ValueError( + f"Number of virtual stages ({num_virtual_stages}) must be divisible by " + f"pipeline parallel size ({parallel_dims.pp}). " + f"{model_config_info}. " + f"Please adjust pipeline_parallel_layers_per_stage to a value that results in a number of stages " + f"divisible by {parallel_dims.pp}." + ) + + stages_per_rank = num_virtual_stages // parallel_dims.pp + + if is_single_stage_schedule and stages_per_rank != 1: + raise ValueError( + f"Single stage schedule requires exactly 1 stage per rank, but got {stages_per_rank} stages per rank. " + f"{model_config_info}, {stage_distribution_info}. " + f"Please increase pipeline_parallel_layers_per_stage to {num_layers // parallel_dims.pp} or higher " + f"to achieve 1 stage per rank." + ) + + if not is_single_stage_schedule and stages_per_rank < 2: + raise ValueError( + f"Multi-stage schedule requires at least 2 stages per rank, but got {stages_per_rank} stages per rank. " + f"{model_config_info}, {stage_distribution_info}. " + f"Please decrease pipeline_parallel_layers_per_stage to achieve at least 2 stages per rank." 
+ ) + else: + # Fallback to default behavior when layers_per_stage is not provided + # For multi-stage schedules, default is 2 virtual stages per rank + # For single-stage schedules, default is 1 virtual stage per rank + stages_per_rank = 1 if is_single_stage_schedule else 2 + num_virtual_stages = parallel_dims.pp * stages_per_rank + + module_names_per_stage = job_config.parallelism.module_fqns_per_model_part + if module_names_per_stage is None: + module_names_per_stage = generate_llm_fqn_per_model_part( + num_virtual_stages, num_layers, input_weight, output_weight + ) + + stages, model_parts = pipeline_module_split( + model, + pp_mesh, + job_config.parallelism.pipeline_parallel_schedule, + device, + module_names_per_stage, + ) + + # For PP with looped schedules, each item in model_parts is one stage-model-chunk. + # We need to iterate through model_parts to apply SPMD parallelisms, compilation, + # optimizer, and checkpointing + for i, m in enumerate(model_parts): + # apply SPMD-style PT-D techniques + m = parallelize_fn(m, parallel_dims, job_config) + model_parts[i] = m + # NOTE: this is to update the model in the stage + # in case the model is modified e.g. by torch.compile + stages[i].submod = m + + pp_schedule = build_pipeline_schedule(job_config, stages, loss_fn) + + # This is used in the train loop to determine whether to pass in the input_ids and labels + has_first_stage = False + has_last_stage = False + for stage in stages: + if stage.is_first: + has_first_stage = True + if stage.is_last: + has_last_stage = True + + return pp_schedule, model_parts, has_first_stage, has_last_stage From c0c273c5e9071db19c7e735ffd360860cc2cff2e Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 19 Nov 2025 11:24:46 +0000 Subject: [PATCH 126/129] pass deterministic.fill_uninitialized_memory to HF model --- torchtitan/distributed/utils.py | 2 -- torchtitan/experiments/transformers_backend/model/args.py | 2 ++ torchtitan/experiments/transformers_backend/model/model.py | 4 ++++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/torchtitan/distributed/utils.py b/torchtitan/distributed/utils.py index 60c05f1612..b209ddfd68 100644 --- a/torchtitan/distributed/utils.py +++ b/torchtitan/distributed/utils.py @@ -111,8 +111,6 @@ def set_determinism( ) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False - # Otherwise, Huggignface modeling register buffer for ROPE (inv_freq) and this will be by default be initialized to Nan - torch.utils.deterministic.fill_uninitialized_memory = False # env var for deterministic CuBLAS # https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index 4093f66194..25ab328f15 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -171,6 +171,8 @@ def update_from_config(self, job_config: JobConfig): self.max_seq_len = job_config.training.seq_len + self.deterministic = job_config.debug.deterministic + # Configure HF-specific settings to match TorchTitan settings # TODO: false ? 
self.attention_bias = False diff --git a/torchtitan/experiments/transformers_backend/model/model.py b/torchtitan/experiments/transformers_backend/model/model.py index 3b589b4d43..2b42a1abc6 100644 --- a/torchtitan/experiments/transformers_backend/model/model.py +++ b/torchtitan/experiments/transformers_backend/model/model.py @@ -50,6 +50,10 @@ class HFTransformerModel(nn.Module): def __init__(self, model_args: HFTransformerModelArgs): super().__init__() + #NOTE(3outeille): This prevents Hugging Face modeling from initializing ROPE (inv_freq) buffers to NaN. Usefull when loading from seed checkpoint. + if hasattr(model_args, 'deterministic') and model_args.deterministic: + torch.utils.deterministic.fill_uninitialized_memory = False + # Try to import the model class dynamically from the transformers library if not found in globals model_class_name = model_args.architectures[0] model_cls = globals().get(model_class_name, None) From 4c50a0005a93c606ad2cb1cb3157663c3458c4c9 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 19 Nov 2025 11:27:07 +0000 Subject: [PATCH 127/129] fix linting --- .../transformers_backend/infra/pipeline_parallel.py | 3 ++- torchtitan/experiments/transformers_backend/model/model.py | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/pipeline_parallel.py b/torchtitan/experiments/transformers_backend/infra/pipeline_parallel.py index 8610b201dc..04452c5ede 100644 --- a/torchtitan/experiments/transformers_backend/infra/pipeline_parallel.py +++ b/torchtitan/experiments/transformers_backend/infra/pipeline_parallel.py @@ -21,12 +21,13 @@ from torchtitan.components.loss import LossFunction from torchtitan.distributed import ParallelDims from torchtitan.distributed.pipeline_parallel import build_pipeline_schedule -from torchtitan.protocols.train_spec import BaseModelArgs, ParallelizeFunction from torchtitan.experiments.transformers_backend.job_config import JobConfig +from torchtitan.protocols.train_spec import BaseModelArgs, ParallelizeFunction from torchtitan.tools.logging import logger # NOTE(3outeille): the only modifications comes from replacing None to nn.Identity and adding rotary_emb per model_part + def generate_llm_fqn_per_model_part( num_stages: int, num_layers: int, diff --git a/torchtitan/experiments/transformers_backend/model/model.py b/torchtitan/experiments/transformers_backend/model/model.py index 2b42a1abc6..b88fffc54b 100644 --- a/torchtitan/experiments/transformers_backend/model/model.py +++ b/torchtitan/experiments/transformers_backend/model/model.py @@ -50,8 +50,9 @@ class HFTransformerModel(nn.Module): def __init__(self, model_args: HFTransformerModelArgs): super().__init__() - #NOTE(3outeille): This prevents Hugging Face modeling from initializing ROPE (inv_freq) buffers to NaN. Usefull when loading from seed checkpoint. - if hasattr(model_args, 'deterministic') and model_args.deterministic: + # NOTE(3outeille): This prevents Hugging Face modeling from initializing ROPE (inv_freq) buffers to NaN. + # Needed when loading from seed checkpoint. 
+ if hasattr(model_args, "deterministic") and model_args.deterministic: torch.utils.deterministic.fill_uninitialized_memory = False # Try to import the model class dynamically from the transformers library if not found in globals From 5b8d38c1c32f0e8cadad6c08ace83e87adad8e8b Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 19 Nov 2025 11:47:27 +0000 Subject: [PATCH 128/129] fix integration tests --- .../transformers_backend/tests/integration_tests.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torchtitan/experiments/transformers_backend/tests/integration_tests.py b/torchtitan/experiments/transformers_backend/tests/integration_tests.py index 5629b45f5c..f8a5c4e7e3 100644 --- a/torchtitan/experiments/transformers_backend/tests/integration_tests.py +++ b/torchtitan/experiments/transformers_backend/tests/integration_tests.py @@ -21,7 +21,8 @@ def build_transformers_backend_test_list() -> list[OverrideDefinitions]: OverrideDefinitions( [ [ - "--model.name meta-llama/Llama-3.2-1B", + "--job.custom_config_module=torchtitan.experiments.transformers_backend.job_config", + "--hf_transformers.model Qwen/Qwen2.5-7B", "--parallelism.data_parallel_shard_degree 2", "--parallelism.tensor_parallel_degree 2", "--parallelism.pipeline_parallel_degree 2", From 57bb8dd872b9253f5441f80f8d125a52c2a43074 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 20 Nov 2025 17:38:29 +0000 Subject: [PATCH 129/129] fix minor stuff --- torchtitan/experiments/README.md | 2 +- .../transformers_backend/__init__.py | 2 +- .../configs/{qwen3.toml => debug_model.toml} | 8 +- .../transformers_backend/configs/full.toml | 87 +++++++++++++++++++ .../{pipeline_parallel.py => pipeline.py} | 0 .../tests/integration_tests.py | 1 + 6 files changed, 93 insertions(+), 7 deletions(-) rename torchtitan/experiments/transformers_backend/configs/{qwen3.toml => debug_model.toml} (91%) create mode 100644 torchtitan/experiments/transformers_backend/configs/full.toml rename torchtitan/experiments/transformers_backend/infra/{pipeline_parallel.py => pipeline.py} (100%) diff --git a/torchtitan/experiments/README.md b/torchtitan/experiments/README.md index 9b25cdc7a6..08dc692bf9 100644 --- a/torchtitan/experiments/README.md +++ b/torchtitan/experiments/README.md @@ -31,4 +31,4 @@ We provide this `experiments/` folder to host experiments that add significant v | [moe_symm_mem_kernels](./moe_symm_mem_kernels/) | TBA | [@kwen2501](https://github.com/kwen2501) | | [gpt_oss](./gpt_oss/) | TBA | [@jianiw](https://github.com/jianiw) | | [compiler_toolkit](./compiler_toolkit/) | [![Compiler Toolkit 8 GPU Integration Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_compiler_toolkit.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_compiler_toolkit.yaml?query=branch%3Amain) | [@SherlockNoMad](https://github.com/SherlockNoMad) [@yiming0416](https://github.com/yiming0416) | -| [transformers_backend](./transformers_backend/) | ![Transformers Backend 8 GPU Integration Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_transformers_backend.yaml/badge.svg?branch=main) | [@3outeille](https://github.com/3outeille) | +| [transformers_backend](./transformers_backend/) | [![Transformers backend 8 GPU Integration 
Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_transformers_backend.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_transformers_backend.yaml?query=branch%3Amain) | [@3outeille](https://github.com/3outeille) | diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index dc4322623b..aec28a0bdd 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -12,7 +12,7 @@ from .infra.parallelize import parallelize_hf_transformers -from .infra.pipeline_parallel import pipeline_hf_transformers +from .infra.pipeline import pipeline_hf_transformers from .model.args import HFTransformerModelArgs, TitanDenseModelArgs from .model.model import HFTransformerModel diff --git a/torchtitan/experiments/transformers_backend/configs/qwen3.toml b/torchtitan/experiments/transformers_backend/configs/debug_model.toml similarity index 91% rename from torchtitan/experiments/transformers_backend/configs/qwen3.toml rename to torchtitan/experiments/transformers_backend/configs/debug_model.toml index 13e3f4ddf0..7b3de04b87 100644 --- a/torchtitan/experiments/transformers_backend/configs/qwen3.toml +++ b/torchtitan/experiments/transformers_backend/configs/debug_model.toml @@ -47,16 +47,14 @@ max_norm = 1.0 # grad norm clipping steps = 10 dataset = "c4_test" # supported datasets: c4_test (2K), c4 (177M) dataset_path = "./tests/assets/c4_test" -mixed_precision_param = "float32" # force float32 for comparison -mixed_precision_reduce = "float32" [parallelism] data_parallel_replicate_degree = 1 -data_parallel_shard_degree = 2 +data_parallel_shard_degree = -1 fsdp_reshard_after_forward = "default" # default / never / always -tensor_parallel_degree = 2 +tensor_parallel_degree = 1 enable_async_tensor_parallel = false -pipeline_parallel_degree = 2 +pipeline_parallel_degree = 1 pipeline_parallel_schedule = "1F1B" context_parallel_degree = 1 expert_parallel_degree = 1 diff --git a/torchtitan/experiments/transformers_backend/configs/full.toml b/torchtitan/experiments/transformers_backend/configs/full.toml new file mode 100644 index 0000000000..45eaa785de --- /dev/null +++ b/torchtitan/experiments/transformers_backend/configs/full.toml @@ -0,0 +1,87 @@ +# torchtitan Config.toml + +[job] +dump_folder = "./outputs" +description = "Qwen 3 full training" +print_config = true + +[profiling] +enable_profiling = false +save_traces_folder = "profile_trace" +profile_freq = 5 +enable_memory_snapshot = false +save_memory_snapshot_folder = "memory_snapshot" + +[metrics] +log_freq = 1 +disable_color_printing = false +enable_tensorboard = false +save_tb_folder = "tb" +enable_wandb = false + +[model] +name = "transformers_backend" +flavor = "full" +# test folder with tokenizer.json, for debug purpose only +hf_assets_path = "./tests/assets/tokenizer" +# converters = ["float8"] + +[hf_transformers] +model = "Qwen/Qwen3-4B-Instruct-2507" + +[optimizer] +name = "AdamW" +lr = 8e-4 +eps = 1e-8 + +[lr_scheduler] +warmup_steps = 2 # lr scheduler warm up, normally 20% of the train steps +decay_ratio = 0.8 # lr scheduler decay ratio, 80% of the train steps +decay_type = "linear" +min_lr_factor = 0.0 + +[training] +local_batch_size = 2 +seq_len = 2048 +max_norm = 1.0 # grad norm clipping +steps = 10 +dataset = "c4" # supported datasets: c4_test (2K), c4 (177M) + +[parallelism] +data_parallel_replicate_degree = 1 
+data_parallel_shard_degree = -1 +fsdp_reshard_after_forward = "default" # default / never / always +tensor_parallel_degree = 1 +enable_async_tensor_parallel = false +pipeline_parallel_degree = 1 +pipeline_parallel_schedule = "1F1B" +context_parallel_degree = 1 +expert_parallel_degree = 1 +expert_tensor_parallel_degree = 1 + +[checkpoint] +enable = false +folder = "checkpoint" +interval = 10 +last_save_model_only = false +export_dtype = "float32" +async_mode = "disabled" # ["disabled", "async", "async_with_pinned_mem"] + +[activation_checkpoint] +mode = "selective" # ["none", "selective", "full"] +selective_ac_option = '2' # 'int' = ac every positive int layer or 'op', ac based on ops policy + +[compile] +enable=false +components = ["model", "loss"] + +[quantize.linear.float8] +enable_fsdp_float8_all_gather = false +precompute_float8_dynamic_scale_for_fsdp = false +filter_fqns = ["output"] + +[validation] +enable = false +dataset = "c4_validation" +freq = 5 +steps = 10 diff --git a/torchtitan/experiments/transformers_backend/infra/pipeline_parallel.py b/torchtitan/experiments/transformers_backend/infra/pipeline.py similarity index 100% rename from torchtitan/experiments/transformers_backend/infra/pipeline_parallel.py rename to torchtitan/experiments/transformers_backend/infra/pipeline.py diff --git a/torchtitan/experiments/transformers_backend/tests/integration_tests.py b/torchtitan/experiments/transformers_backend/tests/integration_tests.py index f8a5c4e7e3..35d09d6a94 100644 --- a/torchtitan/experiments/transformers_backend/tests/integration_tests.py +++ b/torchtitan/experiments/transformers_backend/tests/integration_tests.py @@ -21,6 +21,7 @@ def build_transformers_backend_test_list() -> list[OverrideDefinitions]: OverrideDefinitions( [ [ + "--model.name transformers_backend", "--job.custom_config_module=torchtitan.experiments.transformers_backend.job_config", "--hf_transformers.model Qwen/Qwen2.5-7B", "--parallelism.data_parallel_shard_degree 2",
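To close the series, here is a minimal sketch of how the `[hf_transformers]` table introduced in PATCH 124 reaches the trainer: the extra `HFTransformers`/`JobConfig` dataclasses are registered through `--job.custom_config_module=torchtitan.experiments.transformers_backend.job_config`, and the table's `model` field selects the HuggingFace checkpoint. The sketch only mirrors those dataclasses with the standard library (Python 3.11+ `tomllib`); it does not go through torchtitan's actual config manager, and the config path is illustrative.

```python
import tomllib
from dataclasses import dataclass, field


@dataclass
class HFTransformers:
    model: str = ""  # HuggingFace model ID, e.g. "Qwen/Qwen3-4B-Instruct-2507"


@dataclass
class JobConfig:
    hf_transformers: HFTransformers = field(default_factory=HFTransformers)


# Illustrative path; in the repo the file lives under
# torchtitan/experiments/transformers_backend/configs/debug_model.toml
with open("torchtitan/experiments/transformers_backend/configs/debug_model.toml", "rb") as f:
    raw = tomllib.load(f)

cfg = JobConfig(hf_transformers=HFTransformers(**raw.get("hf_transformers", {})))
print(cfg.hf_transformers.model)  # -> "Qwen/Qwen3-4B-Instruct-2507"
```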