From 7488385b5a13fa8689115556e90a47c9e7d721cc Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 28 Aug 2025 07:56:30 +0000 Subject: [PATCH 001/129] create transformer_backend folder with debug run --- .../debug/configs/debug_fsdp_2_gpu.toml | 65 +++++++++++++++++++ .../transformers_backend/debug/run_train.sh | 33 ++++++++++ 2 files changed, 98 insertions(+) create mode 100644 torchtitan/experiments/transformers_backend/debug/configs/debug_fsdp_2_gpu.toml create mode 100755 torchtitan/experiments/transformers_backend/debug/run_train.sh diff --git a/torchtitan/experiments/transformers_backend/debug/configs/debug_fsdp_2_gpu.toml b/torchtitan/experiments/transformers_backend/debug/configs/debug_fsdp_2_gpu.toml new file mode 100644 index 0000000000..db97c9b339 --- /dev/null +++ b/torchtitan/experiments/transformers_backend/debug/configs/debug_fsdp_2_gpu.toml @@ -0,0 +1,65 @@ +# FSDP-only configuration for a 2-GPU setup. +# Model is sharded across GPUs. + +[job] +dump_folder = "./outputs" +description = "Llama 3 debug training with FSDP on 2 GPUs" +print_args = false +use_for_integration_test = true + +[profiling] +enable_profiling = false +save_traces_folder = "profile_trace" +profile_freq = 10 +enable_memory_snapshot = false +save_memory_snapshot_folder = "memory_snapshot" + +[metrics] +log_freq = 1 +disable_color_printing = false +enable_tensorboard = false +save_tb_folder = "tb" +enable_wandb = false + +[model] +name = "llama3" +flavor = "debugmodel" +tokenizer_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" + +[optimizer] +name = "AdamW" +lr = 8e-4 +eps = 1e-8 + +[lr_scheduler] +warmup_steps = 2 +decay_ratio = 0.8 +decay_type = "linear" +min_lr_factor = 0.0 + +[training] +local_batch_size = 8 +seq_len = 2048 +max_norm = 1.0 +steps = 10 +compile = false +dataset = "c4_test" +dataset_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test" + +[parallelism] +data_parallel_replicate_degree = 1 +data_parallel_shard_degree = 2 +tensor_parallel_degree = 1 +pipeline_parallel_degree = 1 +context_parallel_degree = 1 +expert_parallel_degree = 1 + +[checkpoint] +enable_checkpoint = false + +[activation_checkpoint] +mode = "selective" +selective_ac_option = '2' + +[validation] +enabled = false \ No newline at end of file diff --git a/torchtitan/experiments/transformers_backend/debug/run_train.sh b/torchtitan/experiments/transformers_backend/debug/run_train.sh new file mode 100755 index 0000000000..fc259612bc --- /dev/null +++ b/torchtitan/experiments/transformers_backend/debug/run_train.sh @@ -0,0 +1,33 @@ +#!/usr/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -ex + +# use envs as local overwrites for convenience +# e.g. 
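+# MODE=debug NGPU=2 CONFIG_FILE=configs/debug_fsdp_2_gpu.toml ./run_train.sh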
+# LOG_RANK=0,1 NGPU=4 ./run_train.sh +NGPU=${NGPU:-"8"} +export LOG_RANK=${LOG_RANK:-0} + +# Option to switch between debug and train +MODE=${MODE:-"train"} # Set MODE=debug or MODE=train + +CONFIG_FILE=${CONFIG_FILE:-"configs/debug_fsdp_2_gpu.toml"} + +if [ "$MODE" = "debug" ]; then + PYTHON_CMD="debugpy-run -m torch.distributed.run --" +else + PYTHON_CMD="torchrun" +fi + +TORCHFT_LIGHTHOUSE=${TORCHFT_LIGHTHOUSE:-"http://localhost:29510"} + +PYTORCH_ALLOC_CONF="expandable_segments:True" \ +TORCHFT_LIGHTHOUSE=${TORCHFT_LIGHTHOUSE} \ +$PYTHON_CMD --nproc_per_node=${NGPU} --rdzv_backend c10d --rdzv_endpoint="localhost:0" \ +--local-ranks-filter ${LOG_RANK} --role rank --tee 3 \ +-m torchtitan.train --job.config_file ${CONFIG_FILE} "$@" \ No newline at end of file From 39a3b34907975732d0777eba07aec2111bfd658f Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 28 Aug 2025 08:45:18 +0000 Subject: [PATCH 002/129] add hf config --- .../configs/debug_1_gpu.toml | 63 +++++++++++++++++++ .../configs/debug_1_gpu_hf.toml | 62 ++++++++++++++++++ .../{debug => }/configs/debug_fsdp_2_gpu.toml | 0 .../{debug => }/run_train.sh | 2 +- 4 files changed, 126 insertions(+), 1 deletion(-) create mode 100644 torchtitan/experiments/transformers_backend/configs/debug_1_gpu.toml create mode 100644 torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml rename torchtitan/experiments/transformers_backend/{debug => }/configs/debug_fsdp_2_gpu.toml (100%) rename torchtitan/experiments/transformers_backend/{debug => }/run_train.sh (94%) diff --git a/torchtitan/experiments/transformers_backend/configs/debug_1_gpu.toml b/torchtitan/experiments/transformers_backend/configs/debug_1_gpu.toml new file mode 100644 index 0000000000..c2f4dd7136 --- /dev/null +++ b/torchtitan/experiments/transformers_backend/configs/debug_1_gpu.toml @@ -0,0 +1,63 @@ +[job] +dump_folder = "./outputs" +description = "Llama 3 debug training with FSDP on 2 GPUs" +print_args = false +use_for_integration_test = true + +[profiling] +enable_profiling = false +save_traces_folder = "profile_trace" +profile_freq = 10 +enable_memory_snapshot = false +save_memory_snapshot_folder = "memory_snapshot" + +[metrics] +log_freq = 1 +disable_color_printing = false +enable_tensorboard = false +save_tb_folder = "tb" +enable_wandb = false + +[model] +name = "llama3" +hf_name = "Llama-3.2-3B" +flavor = "debugmodel" +tokenizer_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" + +[optimizer] +name = "AdamW" +lr = 8e-4 +eps = 1e-8 + +[lr_scheduler] +warmup_steps = 2 +decay_ratio = 0.8 +decay_type = "linear" +min_lr_factor = 0.0 + +[training] +local_batch_size = 8 +seq_len = 2048 +max_norm = 1.0 +steps = 10 +compile = false +dataset = "c4_test" +dataset_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test" + +[parallelism] +data_parallel_replicate_degree = 1 +data_parallel_shard_degree = 1 +tensor_parallel_degree = 1 +pipeline_parallel_degree = 1 +context_parallel_degree = 1 +expert_parallel_degree = 1 + +[checkpoint] +enable_checkpoint = false + +[activation_checkpoint] +mode = "selective" +selective_ac_option = '2' + +[validation] +enabled = false \ No newline at end of file diff --git a/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml b/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml new file mode 100644 index 0000000000..a314d1711e --- /dev/null +++ b/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml @@ -0,0 +1,62 @@ 
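+# Single-GPU debug configuration that selects the model by its Hugging Face hub id
+# (see the [model] section below) rather than a built-in torchtitan model name.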
+[job] +dump_folder = "./outputs" +description = "Llama 3 debug training with FSDP on 2 GPUs" +print_args = false +use_for_integration_test = true + +[profiling] +enable_profiling = false +save_traces_folder = "profile_trace" +profile_freq = 10 +enable_memory_snapshot = false +save_memory_snapshot_folder = "memory_snapshot" + +[metrics] +log_freq = 1 +disable_color_printing = false +enable_tensorboard = false +save_tb_folder = "tb" +enable_wandb = false + +[model] +name = "meta-llama/Llama-3.2-3B" +flavor = "debugmodel" +tokenizer_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" + +[optimizer] +name = "AdamW" +lr = 8e-4 +eps = 1e-8 + +[lr_scheduler] +warmup_steps = 2 +decay_ratio = 0.8 +decay_type = "linear" +min_lr_factor = 0.0 + +[training] +local_batch_size = 8 +seq_len = 2048 +max_norm = 1.0 +steps = 10 +compile = false +dataset = "c4_test" +dataset_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test" + +[parallelism] +data_parallel_replicate_degree = 1 +data_parallel_shard_degree = 1 +tensor_parallel_degree = 1 +pipeline_parallel_degree = 1 +context_parallel_degree = 1 +expert_parallel_degree = 1 + +[checkpoint] +enable_checkpoint = false + +[activation_checkpoint] +mode = "selective" +selective_ac_option = '2' + +[validation] +enabled = false \ No newline at end of file diff --git a/torchtitan/experiments/transformers_backend/debug/configs/debug_fsdp_2_gpu.toml b/torchtitan/experiments/transformers_backend/configs/debug_fsdp_2_gpu.toml similarity index 100% rename from torchtitan/experiments/transformers_backend/debug/configs/debug_fsdp_2_gpu.toml rename to torchtitan/experiments/transformers_backend/configs/debug_fsdp_2_gpu.toml diff --git a/torchtitan/experiments/transformers_backend/debug/run_train.sh b/torchtitan/experiments/transformers_backend/run_train.sh similarity index 94% rename from torchtitan/experiments/transformers_backend/debug/run_train.sh rename to torchtitan/experiments/transformers_backend/run_train.sh index fc259612bc..74ef5603b1 100755 --- a/torchtitan/experiments/transformers_backend/debug/run_train.sh +++ b/torchtitan/experiments/transformers_backend/run_train.sh @@ -16,7 +16,7 @@ export LOG_RANK=${LOG_RANK:-0} # Option to switch between debug and train MODE=${MODE:-"train"} # Set MODE=debug or MODE=train -CONFIG_FILE=${CONFIG_FILE:-"configs/debug_fsdp_2_gpu.toml"} +CONFIG_FILE=${CONFIG_FILE:-"configs/debug_1_gpu.toml"} if [ "$MODE" = "debug" ]; then PYTHON_CMD="debugpy-run -m torch.distributed.run --" From ea7c594c263aa856f1e82899427517bdc315bf8e Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 28 Aug 2025 09:22:45 +0000 Subject: [PATCH 003/129] can now register train spec for hf model --- torchtitan/experiments/__init__.py | 1 + .../transformers_backend/__init__.py | 56 ++ .../infra/parallelize_hf_transformers.py | 503 ++++++++++++++++++ .../model/hf_transformers_args.py | 127 +++++ 4 files changed, 687 insertions(+) create mode 100644 torchtitan/experiments/transformers_backend/__init__.py create mode 100644 torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py create mode 100644 torchtitan/experiments/transformers_backend/model/hf_transformers_args.py diff --git a/torchtitan/experiments/__init__.py b/torchtitan/experiments/__init__.py index 9d81f6b885..32a41004a2 100644 --- a/torchtitan/experiments/__init__.py +++ b/torchtitan/experiments/__init__.py @@ -7,3 +7,4 @@ import torchtitan.experiments.llama4 # noqa: F401 import torchtitan.experiments.qwen3 import 
torchtitan.experiments.simple_fsdp # noqa: F401 +import torchtitan.experiments.transformers_backend # noqa: F401 \ No newline at end of file diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py new file mode 100644 index 0000000000..5ec6386a2b --- /dev/null +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -0,0 +1,56 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import dataclasses + +from torchtitan.components.loss import build_cross_entropy_loss +from torchtitan.components.lr_scheduler import build_lr_schedulers +from torchtitan.components.optimizer import build_optimizers +from torchtitan.datasets.hf_datasets import build_hf_dataloader +from torchtitan.components.tokenizer import build_hf_tokenizer + +from torchtitan.models.llama3 import pipeline_llama +from torchtitan.protocols.train_spec import register_train_spec, TrainSpec + +from .infra.parallelize_hf_transformers import parallelize_hf_transformers +from .model.hf_transformers_args import HFTransformerModelArgs + +from transformers.models.llama.modeling_llama import LlamaForCausalLM + + +__all__ = [ + "HFTransformerModelArgs", + "LlamaForCausalLM", #TODO(3outeille): later use AutoModelForCausalLM + "hf_transformers_configs", +] + + +hf_configs = { + "debugmodel": HFTransformerModelArgs( + dim=256, + n_layers=6, + n_heads=16, + rope_theta=500000, + ), +} + +hf_train_spec = TrainSpec( + name="hf_auto_model", + model_cls=LlamaForCausalLM, + model_args=hf_configs, + parallelize_fn=parallelize_hf_transformers, + pipelining_fn=pipeline_llama, + build_optimizers_fn=build_optimizers, + build_lr_schedulers_fn=build_lr_schedulers, + build_dataloader_fn=build_hf_dataloader, + build_tokenizer_fn=build_hf_tokenizer, + build_loss_fn=build_cross_entropy_loss, +) + +# Register multiple train_specs under the same name +register_train_spec(hf_train_spec) +register_train_spec(dataclasses.replace(hf_train_spec, name="meta-llama/Llama-3.2-3B")) +register_train_spec(dataclasses.replace(hf_train_spec, name="meta-llama/Llama-3.2-1B")) \ No newline at end of file diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py new file mode 100644 index 0000000000..3f26036dc8 --- /dev/null +++ b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -0,0 +1,503 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
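+
+# Parallelization entry point for the transformers backend: applies tensor
+# parallelism (including MoE expert/tensor parallelism), activation checkpointing,
+# torch.compile, and FSDP/HSDP/DDP to the model. apply_ac and apply_ddp are
+# reused from the llama3 implementation.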
+ + +import torch +import torch.nn as nn +from torch.distributed.device_mesh import DeviceMesh +from torch.distributed.fsdp import CPUOffloadPolicy, fully_shard, MixedPrecisionPolicy +from torch.distributed.tensor import Partial, Replicate, Shard +from torch.distributed.tensor.parallel import ( + ColwiseParallel, + parallelize_module, + PrepareModuleInput, + PrepareModuleInputOutput, + RowwiseParallel, + SequenceParallel, +) +from torchtitan.config import JobConfig, TORCH_DTYPE_MAP +from torchtitan.distributed import ParallelDims + +from torchtitan.distributed.expert_parallel import ( + ExpertParallel, + ExpertTensorParallel, + NoParallel, + ReordererSequenceParallel, + TensorParallel, +) +from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp + +from torchtitan.models.llama3.infra.parallelize import apply_ac, apply_ddp +from torchtitan.tools.logging import logger + + +def parallelize_hf_transformers( + model: nn.Module, + parallel_dims: ParallelDims, + job_config: JobConfig, +): + """ + Apply tensor parallelism, activation checkpointing, torch.compile, and data + parallelism to the model. + + NOTE: The passed-in model preferably should be on meta device. Otherwise, + the model must fit on GPU or CPU memory. + """ + world_mesh = parallel_dims.world_mesh + # TODO: TP currently cannot handle uneven seq_len because we set + # `use_local_output=True` to use plain Tensors for legacy reasons. + # Need to revisit this. + assert ( + job_config.training.seq_len % parallel_dims.seq_len_divisor == 0 + ), f""" + Sequence length {job_config.training.seq_len} must be divisible by the product of TP degree + ({parallel_dims.tp}) and 2 * CP degree ({parallel_dims.cp}). + """ + + if ( + job_config.parallelism.context_parallel_degree > 1 + and model.model_args.use_flex_attn + ): + raise NotImplementedError("CP support for FlexAttention is still in progress.") + + if parallel_dims.tp_enabled: + enable_float8_linear = "float8" in job_config.model.converters + float8_is_rowwise = job_config.float8.recipe_name in ( + "rowwise", + "rowwise_with_gw_hp", + ) + + # For now, float8 all-gather with TP is only supported for tensorwise + # float8 scaling recipes. For rowwise recipes, we use regular TP and + # all-gather happens in high precision. 
+ enable_float8_tensorwise_tp = enable_float8_linear and not float8_is_rowwise + + apply_non_moe_tp( + model, + world_mesh["tp"], + loss_parallel=not job_config.parallelism.disable_loss_parallel, + enable_float8_tensorwise_tp=enable_float8_tensorwise_tp, + ) + maybe_enable_async_tp(job_config, world_mesh["tp"]) + + if parallel_dims.tp_enabled or parallel_dims.ep_enabled: + apply_moe_ep_tp( + model, + tp_mesh=world_mesh["tp"] if parallel_dims.tp_enabled else None, + ep_mesh=world_mesh["ep"] if parallel_dims.ep_enabled else None, + ep_tp_mesh=( + world_mesh["ep", "tp"] + if parallel_dims.tp_enabled + and parallel_dims.ep_enabled + and parallel_dims.etp_enabled + else None + ), + etp_enabled=parallel_dims.etp_enabled, + ) + + if job_config.activation_checkpoint.mode != "none": + apply_ac(model, job_config.activation_checkpoint) + + model_compile_enabled = ( + job_config.compile.enable and "model" in job_config.compile.components + ) + # turn on per-TransformerBlock compile after AC wrapping and before FSDP + if model_compile_enabled: + # NOTE: needed for torch.compile to work with dynamic shapes in token-choice MoE + torch._dynamo.config.capture_scalar_outputs = True + apply_compile(model) + + dp_mesh: DeviceMesh | None = None + if parallel_dims.fsdp_enabled or parallel_dims.ep_enabled: + # apply FSDP or HSDP, potentially with Context Parallel + if parallel_dims.dp_replicate_enabled: + dp_mesh_dim_names = ("dp_replicate", "dp_shard_cp") + else: + dp_mesh_dim_names = ("dp_shard_cp",) + dp_mesh = world_mesh[tuple(dp_mesh_dim_names)] + + # the mesh dim names of which the MoE params are sharded on via FSDP/HSDP + dp_mod_ep_mesh_dim_names = [] + if parallel_dims.ep_enabled: + if parallel_dims.dp_replicate_enabled: + dp_mod_ep_mesh_dim_names.append("dp_replicate") + dp_mod_ep_mesh_dim_names.append("dp_shard_mod_ep") + + apply_fsdp( + model, + dp_mesh, + param_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_param], + reduce_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_reduce], + pp_enabled=parallel_dims.pp_enabled, + cpu_offload=job_config.training.enable_cpu_offload, + reshard_after_forward_policy=job_config.parallelism.fsdp_reshard_after_forward, + ep_degree=parallel_dims.ep, + dp_mod_ep_mesh=( + world_mesh[tuple(dp_mod_ep_mesh_dim_names)] + if parallel_dims.ep_enabled + else None + ), + gradient_divide_factor=parallel_dims.fsdp_gradient_divide_factor, + ) + + if parallel_dims.dp_replicate_enabled: + logger.info("Applied HSDP to the model") + else: + logger.info("Applied FSDP to the model") + + if parallel_dims.cp_enabled: + logger.info("Applied Context Parallel to the model") + + if job_config.training.enable_cpu_offload: + logger.info("Applied CPU Offloading to the model") + elif parallel_dims.dp_replicate_enabled: + if world_mesh.ndim > 1: + raise RuntimeError("DDP has not supported > 1D parallelism") + dp_mesh = world_mesh + apply_ddp( + model, + dp_mesh, + enable_compile=model_compile_enabled, + enable_compiled_autograd=job_config.parallelism.enable_compiled_autograd, + ) + + return model + + +def apply_non_moe_tp( + model: nn.Module, + tp_mesh: DeviceMesh, + loss_parallel: bool, + enable_float8_tensorwise_tp: bool, +): + """Apply tensor parallelism.""" + # 1. Parallelize the embedding and shard its outputs (which are the first + # transformer block's inputs) + # 2. Parallelize the root norm layer over the sequence dim + # 3. 
Parallelize the final linear output layer + parallelize_module( + model, + tp_mesh, + { + "tok_embeddings": RowwiseParallel( + input_layouts=Replicate(), + output_layouts=Shard(1), + ), + "norm": SequenceParallel(), + "output": ColwiseParallel( + input_layouts=Shard(1), + output_layouts=Shard(-1) if loss_parallel else Replicate(), + use_local_output=not loss_parallel, + ), + }, + ) + + # Parallel styles used for transformer block linear weights and their + # inputs may be different for float8 linears with tensorwise scaling. + if enable_float8_tensorwise_tp: + # TODO(vkuzo): add the items below to __init__.py of torchao.float8 and import from there + from torchao.float8.float8_tensor_parallel import ( + Float8ColwiseParallel, + Float8RowwiseParallel, + PrepareFloat8ModuleInput, + ) + + rowwise_parallel, colwise_parallel, prepare_module_input = ( + Float8RowwiseParallel, + Float8ColwiseParallel, + PrepareFloat8ModuleInput, + ) + else: + rowwise_parallel, colwise_parallel, prepare_module_input = ( + RowwiseParallel, + ColwiseParallel, + PrepareModuleInput, + ) + + # Apply tensor + sequence parallelism to every transformer block + for transformer_block in model.layers.values(): + layer_plan = { + "attention_norm": SequenceParallel(), + "attention": prepare_module_input( + input_layouts=(Shard(1), None), + desired_input_layouts=(Replicate(), None), + ), + "attention.wq": colwise_parallel(), + "attention.wk": colwise_parallel(), + "attention.wv": colwise_parallel(), + "attention.wo": rowwise_parallel(output_layouts=Shard(1)), + "ffn_norm": SequenceParallel(), + } + if not transformer_block.moe_enabled: + layer_plan.update( + { + "feed_forward": prepare_module_input( + input_layouts=(Shard(1),), + desired_input_layouts=(Replicate(),), + ), + "feed_forward.w1": colwise_parallel(), + "feed_forward.w2": rowwise_parallel(output_layouts=Shard(1)), + "feed_forward.w3": colwise_parallel(), + } + ) + + parallelize_module( + module=transformer_block, + device_mesh=tp_mesh, + parallelize_plan=layer_plan, + ) + + logger.info( + f"Applied {'Float8 tensorwise ' if enable_float8_tensorwise_tp else ''}" + "Tensor Parallelism to the model" + ) + + +def apply_fsdp( + model: nn.Module, + dp_mesh: DeviceMesh, + param_dtype: torch.dtype, + reduce_dtype: torch.dtype, + pp_enabled: bool, + cpu_offload: bool = False, + reshard_after_forward_policy: str = "default", + ep_degree: int = 1, + dp_mod_ep_mesh: DeviceMesh | None = None, + gradient_divide_factor: int | None = None, +): + """ + Apply data parallelism (via FSDP2) to the model. + + Args: + model (nn.Module): The model to apply data parallelism to. + dp_mesh (DeviceMesh): The device mesh to use for data parallelism. + param_dtype (torch.dtype): The data type to use for model parameters. + reduce_dtype (torch.dtype): The data type to use for reduction operations. + pp_enabled (bool): Whether pipeline parallelism is enabled. + cpu_offload (bool, optional): Whether to offload model parameters to CPU. Defaults to False. + reshard_after_forward_policy (str, optional): The policy to use for resharding after forward pass. Defaults to "default". + Other options: "never", "always". + - "default" applies default resharding behavior, implementing "smart defaults" for known optimal scenarios. + - "always" will enable `reshard_after_forward` for all forward passes. + - "never" will disable `reshard_after_forward` for all forward passes. 
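+        ep_degree (int, optional): Expert parallel degree. When greater than 1, the routed
+            experts in each MoE layer are wrapped separately on dp_mod_ep_mesh. Defaults to 1.
+        dp_mod_ep_mesh (DeviceMesh, optional): Mesh used to shard the routed experts when
+            expert parallelism is enabled. Defaults to None.
+        gradient_divide_factor (int, optional): Gradient division factor applied to the
+            experts' FSDP group so gradient reduction stays consistent with the data mesh.
+            Defaults to None.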
+ + """ + mp_policy = MixedPrecisionPolicy(param_dtype=param_dtype, reduce_dtype=reduce_dtype) + fsdp_config = {"mesh": dp_mesh, "mp_policy": mp_policy} + if cpu_offload: + fsdp_config["offload_policy"] = CPUOffloadPolicy() + + match reshard_after_forward_policy: + case "always": + reshard_after_forward = True + case "never": + reshard_after_forward = False + case "default": + # For PP, by default do not reshard after forward to avoid per-microbatch + # all-gathers, which can be expensive and non-overlapped + reshard_after_forward = not pp_enabled + case _: + raise ValueError( + f"Invalid reshard_after_forward_policy: {reshard_after_forward_policy}." + ) + + if model.tok_embeddings is not None: + fully_shard( + model.tok_embeddings, + **fsdp_config, + reshard_after_forward=reshard_after_forward, + ) + + for layer_id, transformer_block in model.layers.items(): + # NOTE: When EP is enabled, In an MoE layer, we use the following FSDP wrapping + # - the router and the shared experts are sharded together with the TransformerBlock + # - the routed experts are sharded with the remaining dp_mod_ep_mesh + if transformer_block.moe_enabled and ep_degree > 1: + fsdp_mod_ep_config = fsdp_config.copy() + fsdp_mod_ep_config["mesh"] = dp_mod_ep_mesh + + # NOTE: EP alreadys shards the routed experts on dim 0 (num_experts). + # When dp_mod_ep * ep > num_experts, FSDP default dim-0 sharding + # causes inefficiency, so we choose to do FSDP sharding on dim-1. + # Even when EP is not used, we may still want to shard the experts + # on non-0 dim. For now it may not be worth the complexity to support + # shard_placement_fn on the outer TransformerBlock-level FSDP. + _experts_shard_placement_fn = None + assert dp_mod_ep_mesh is not None + assert hasattr(transformer_block, "moe") + if ( + dp_mod_ep_mesh.size() * ep_degree + > transformer_block.moe.experts.num_experts + ): + _experts_shard_placement_fn = lambda param: Shard(1) + + fully_shard( + transformer_block.moe.experts, + **fsdp_mod_ep_config, + reshard_after_forward=reshard_after_forward, + shard_placement_fn=_experts_shard_placement_fn, + ) + + # NOTE: # Although the FSDP sharding of experts is done on a mesh of + # a different size than other parameters, the gradient division + # factor should be consistent with data. 
+ transformer_block.moe.experts.set_gradient_divide_factor( + gradient_divide_factor, + ) + + fully_shard( + transformer_block, + **fsdp_config, + reshard_after_forward=reshard_after_forward, + ) + + # As an optimization, do not reshard_after_forward the last layers by default + # since FSDP would prefetch them immediately after the forward pass + if model.norm is not None and model.output is not None: + fully_shard( + [model.norm, model.output], + **fsdp_config, + reshard_after_forward=reshard_after_forward_policy == "always", + ) + + fully_shard(model, **fsdp_config) + + # NOTE: set up explicit prefetching when EP is enabled, as D2H syncs + # in EP could interfere with implicit prefetching in FSDP + if ep_degree == 1: + return + + # forward + transformer_blocks = list(model.layers.values()) + next_transformer_blocks = transformer_blocks[1:] + [None] + + if model.tok_embeddings is not None and model.layers is not None: + model.tok_embeddings.set_modules_to_forward_prefetch([transformer_blocks[0]]) + + for transformer_block, next_transformer_block in zip( + transformer_blocks, next_transformer_blocks + ): + if next_transformer_block is not None: + if next_transformer_block.moe_enabled: + transformer_block.set_modules_to_forward_prefetch( + [next_transformer_block, next_transformer_block.moe.experts] + ) + else: + transformer_block.set_modules_to_forward_prefetch( + [next_transformer_block] + ) + elif model.norm is not None and model.output is not None: + transformer_block.set_modules_to_forward_prefetch( + [model.norm, model.output] + ) + + # backward + reversed_transformer_blocks = list(reversed(model.layers.values())) + prev_transformer_blocks = reversed_transformer_blocks[1:] + [None] + + if model.norm is not None and model.output is not None and model.layers is not None: + model.output.set_modules_to_backward_prefetch([reversed_transformer_blocks[0]]) + + for transformer_block, prev_transformer_block in zip( + reversed_transformer_blocks, prev_transformer_blocks + ): + if prev_transformer_block is not None: + if prev_transformer_block.moe_enabled: + transformer_block.set_modules_to_backward_prefetch( + [prev_transformer_block, prev_transformer_block.moe.experts] + ) + else: + transformer_block.set_modules_to_backward_prefetch( + [prev_transformer_block] + ) + elif model.tok_embeddings is not None: + transformer_block.set_modules_to_backward_prefetch([model.tok_embeddings]) + + +def apply_moe_ep_tp( + model: nn.Module, + tp_mesh: DeviceMesh | None, + ep_mesh: DeviceMesh | None, + ep_tp_mesh: DeviceMesh | None, + etp_enabled: bool, +): + for transformer_block in model.layers.values(): + if not transformer_block.moe_enabled: + continue + + if tp_mesh is not None: + moe_layer_plan = { + # input / output sharding on the seqlen dim + # all-gather for input, reduce-scatter for output + "moe": PrepareModuleInputOutput( + input_layouts=(Shard(1),), + desired_input_layouts=(Replicate(),), + use_local_input=True, + output_layouts=(Partial(),), + desired_output_layouts=(Shard(1),), + ), + # replicate computation for the router + "moe.router.gate": NoParallel(), + } + if ep_mesh is not None and not etp_enabled: + # If TP is borrowed for EP, then split the tokens across TP ranks so that + # the reorderer, the all-to-all comms, and routed experts computation + # are effectively running Sequence Parallel (split along the folded bs*slen dim) + moe_layer_plan.update({"moe.reorderer": ReordererSequenceParallel()}) + if transformer_block.moe.shared_experts is not None: + # input Replicate, output Partial 
+ moe_layer_plan.update( + { + "moe.shared_experts.w1": ColwiseParallel(), + "moe.shared_experts.w2": RowwiseParallel( + output_layouts=Partial() + ), + "moe.shared_experts.w3": ColwiseParallel(), + } + ) + parallelize_module( + module=transformer_block, + device_mesh=tp_mesh, + parallelize_plan=moe_layer_plan, + ) + + experts_mesh, experts_plan = None, None + if ep_mesh is None: + experts_mesh = tp_mesh + # input Replicate, output Partial + experts_plan = TensorParallel() + elif tp_mesh is None: + experts_mesh = ep_mesh + # input / output sharding on the batch / tokens dim + experts_plan = ExpertParallel() + elif etp_enabled: + experts_mesh = ep_tp_mesh + experts_plan = ExpertTensorParallel(tp_mesh=tp_mesh, ep_mesh=ep_mesh) + else: + experts_mesh = ep_mesh + experts_plan = ExpertParallel() + + parallelize_module( + module=transformer_block.moe.experts, + device_mesh=experts_mesh, + parallelize_plan=experts_plan, + ) + + +def apply_compile(model: nn.Module): + """ + Apply torch.compile to each TransformerBlock, which makes compilation efficient due to + repeated structure. Alternatively one can compile the whole model (after applying DP). + """ + for layer_id, transformer_block in model.layers.named_children(): + # TODO: remove when torch.compile supports fullgraph=True for MoE + fullgraph = True + if transformer_block.moe_enabled: + fullgraph = False + transformer_block = torch.compile(transformer_block, fullgraph=fullgraph) + model.layers.register_module(layer_id, transformer_block) + + logger.info("Compiling each TransformerBlock with torch.compile") diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py new file mode 100644 index 0000000000..92e149625b --- /dev/null +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -0,0 +1,127 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +from dataclasses import dataclass, field + +from torch import nn + +from torchtitan.config import JobConfig + +from torchtitan.models.moe import MoEArgs +from torchtitan.protocols import BaseModelArgs +from torchtitan.tools.logging import logger +from torchtitan.tools.utils import has_cuda_capability + + +@dataclass +class HFTransformerModelArgs(BaseModelArgs): + dim: int = 4096 + n_layers: int = 32 + n_heads: int = 32 + n_kv_heads: int | None = None + vocab_size: int = 202048 + multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2 + ffn_dim_multiplier: float | None = None + norm_eps: float = 1e-5 + rope_theta: float = 10000 + + max_seq_len: int = 1048576 + # If `True`, then each transformer block init uses its layer ID, and if + # `False`, each uses the total number of transformer blocks + depth_init: bool = True + + use_flex_attn: bool = False + attn_mask_type: str = "causal" + # iRoPE settings + # When ``every_n_layers_nope`` is specified, NoPE (no positional embedding) is + # used every n layers. Other layers uses RoPE (rotary positional embedding) and + # the inner attention of those layer will use the fixed block size specified by + # ``fixed_attn_block_size``. ``fixed_attn_block_size`` means that the query will + # only attend to the tokens within the same block regardless how long is the + # sequence. 
+ every_n_layers_nope: int | None = None + fixed_attn_block_size: int = 8192 + + # MoE + moe_args: MoEArgs = field(default_factory=MoEArgs) + auto_scale_hidden_dim: bool = True + # frequency of using MoE layer instead of feedforward layer in a transformer block + interleave_moe_layer_step: int = 2 + + def update_from_config(self, job_config: JobConfig, **kwargs) -> None: + seq_len = job_config.training.seq_len + if seq_len > self.max_seq_len: + logger.warning( + f"Sequence length {seq_len} exceeds original maximum {self.max_seq_len}." + ) + self.max_seq_len = seq_len + + if self.moe_args.use_grouped_mm and not has_cuda_capability(9, 0): + logger.warning( + "Failed to use grouped mm, which is only supported on SM90 or later", + ) + self.moe_args.use_grouped_mm = False + + if job_config.parallelism.context_parallel_degree > 1 and self.use_flex_attn: + raise NotImplementedError( + "CP support for FlexAttention is still in progress." + ) + + def get_nparams_and_flops( + self, model: nn.Module, seq_len: int + ) -> tuple[int, float]: + return 0, 0 + # nparams_embedding = 0 + # nparams_moe_router = 0 + # nparams_shared_experts = 0 + # nparams_experts = 0 + # nparams_dense = 0 + + # for name, p in model.named_parameters(): + # if "embedding" in name: + # nparams_embedding += p.numel() + # nparams_dense += p.numel() + # elif "moe.shared_experts" in name: + # nparams_shared_experts += p.numel() + # elif "moe.router" in name: + # nparams_moe_router += p.numel() + # elif "moe.experts" in name: + # nparams_experts += p.numel() + # else: + # nparams_dense += p.numel() + + # nparams_sparse = nparams_moe_router + nparams_shared_experts + nparams_experts + # nparams = nparams_dense + nparams_sparse + # nparams_sparse_active = ( + # nparams_moe_router + # + nparams_shared_experts + # + nparams_experts * self.moe_args.top_k // self.moe_args.num_experts + # ) + + # logger.info( + # f"Total parameter count: dense {nparams_dense:,}, " + # f"sparse {nparams_sparse:,}, active {nparams_dense + nparams_sparse_active:,}" + # ) + + # l, h, q, t = ( + # self.n_layers, + # self.n_heads, + # self.dim // self.n_heads, + # seq_len, + # ) + # # Reasoning behind the factor of 12 for the self-attention part of the formula: + # # 1. each self-attention has 2 matmul in the forward and 4 in the backward (6) + # # 2. the flash attention does 1 more matmul recomputation in the backward + # # but recomputation should not be counted in calculating MFU (+0) + # # 3. each matmul performs 1 multiplication and 1 addition (*2) + # # 4. 
we follow the convention and do not account for sparsity in causal attention + # num_flops_per_token = ( + # 6 * (nparams_dense - nparams_embedding + nparams_sparse_active) + # + 12 * l * h * q * t + # ) + + # return nparams, num_flops_per_token From 5f0adf5c226aa9af321cdd27b7d379fd03823e10 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 28 Aug 2025 13:56:04 +0000 Subject: [PATCH 004/129] can now switch with different flavors using HF Llama modeling --- .../transformers_backend/__init__.py | 15 +- .../configs/debug_1_gpu.toml | 1 - .../configs/debug_1_gpu_hf.toml | 4 +- .../model/hf_transformers_args.py | 148 ++++++------------ 4 files changed, 64 insertions(+), 104 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 5ec6386a2b..e416731205 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -28,19 +28,26 @@ ] -hf_configs = { - "debugmodel": HFTransformerModelArgs( - dim=256, +flavors = { + "debug": HFTransformerModelArgs( + dim=1, n_layers=6, n_heads=16, rope_theta=500000, ), + "medium": HFTransformerModelArgs( + dim=40, + n_layers=24, + n_heads=32, + rope_theta=500000, + ), + "full": HFTransformerModelArgs(), } hf_train_spec = TrainSpec( name="hf_auto_model", model_cls=LlamaForCausalLM, - model_args=hf_configs, + model_args=flavors, parallelize_fn=parallelize_hf_transformers, pipelining_fn=pipeline_llama, build_optimizers_fn=build_optimizers, diff --git a/torchtitan/experiments/transformers_backend/configs/debug_1_gpu.toml b/torchtitan/experiments/transformers_backend/configs/debug_1_gpu.toml index c2f4dd7136..34f6953869 100644 --- a/torchtitan/experiments/transformers_backend/configs/debug_1_gpu.toml +++ b/torchtitan/experiments/transformers_backend/configs/debug_1_gpu.toml @@ -20,7 +20,6 @@ enable_wandb = false [model] name = "llama3" -hf_name = "Llama-3.2-3B" flavor = "debugmodel" tokenizer_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" diff --git a/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml b/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml index a314d1711e..30872e903c 100644 --- a/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml +++ b/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml @@ -19,8 +19,8 @@ save_tb_folder = "tb" enable_wandb = false [model] -name = "meta-llama/Llama-3.2-3B" -flavor = "debugmodel" +name = "meta-llama/Llama-3.2-1B" +flavor = "medium" tokenizer_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" [optimizer] diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 92e149625b..956ce5a853 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -4,124 +4,78 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
- -from dataclasses import dataclass, field +from dataclasses import dataclass +from typing import Optional, Union +import os from torch import nn - from torchtitan.config import JobConfig - -from torchtitan.models.moe import MoEArgs from torchtitan.protocols import BaseModelArgs from torchtitan.tools.logging import logger -from torchtitan.tools.utils import has_cuda_capability +from transformers.models.llama.configuration_llama import LlamaConfig @dataclass class HFTransformerModelArgs(BaseModelArgs): + # Torchtitan naming dim: int = 4096 n_layers: int = 32 n_heads: int = 32 - n_kv_heads: int | None = None - vocab_size: int = 202048 - multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2 - ffn_dim_multiplier: float | None = None - norm_eps: float = 1e-5 + n_kv_heads: Optional[int] = None + vocab_size: int = 128256 + multiple_of: int = 256 + ffn_dim_multiplier: Optional[float] = None rope_theta: float = 10000 - - max_seq_len: int = 1048576 - # If `True`, then each transformer block init uses its layer ID, and if - # `False`, each uses the total number of transformer blocks + max_seq_len: int = 2048 + + # HF compatibility + rms_norm_eps: float = 1e-6 + use_cache: bool = True depth_init: bool = True - use_flex_attn: bool = False attn_mask_type: str = "causal" - # iRoPE settings - # When ``every_n_layers_nope`` is specified, NoPE (no positional embedding) is - # used every n layers. Other layers uses RoPE (rotary positional embedding) and - # the inner attention of those layer will use the fixed block size specified by - # ``fixed_attn_block_size``. ``fixed_attn_block_size`` means that the query will - # only attend to the tokens within the same block regardless how long is the - # sequence. - every_n_layers_nope: int | None = None - fixed_attn_block_size: int = 8192 - - # MoE - moe_args: MoEArgs = field(default_factory=MoEArgs) - auto_scale_hidden_dim: bool = True - # frequency of using MoE layer instead of feedforward layer in a transformer block - interleave_moe_layer_step: int = 2 - - def update_from_config(self, job_config: JobConfig, **kwargs) -> None: + eos_id: int = 0 + + def update_from_config(self, job_config: JobConfig): + #TODO(3outeille): what if we dont specify flavor? Should use full as default + flavor = getattr(job_config.model, "flavor", None) + + if flavor == "full": + model_name_or_config: Union[LlamaConfig, str, os.PathLike] = job_config.model.name + hf_model_config = LlamaConfig.from_pretrained(model_name_or_config) + + #TODO(3outeille): use getattr to handle models that don't have all the attributes + self.dim = hf_model_config.hidden_size + self.n_layers = hf_model_config.num_hidden_layers + self.n_heads = hf_model_config.num_attention_heads + self.n_kv_heads = hf_model_config.num_key_value_heads + self.vocab_size = hf_model_config.vocab_size + self.rope_theta = getattr(hf_model_config, "rope_theta", 10000.0) + self.max_seq_len = hf_model_config.max_position_embeddings + self.rms_norm_eps = getattr(hf_model_config, "rms_norm_eps", 1e-6) + + if hasattr(hf_model_config, "intermediate_size") and hf_model_config.intermediate_size: + self.ffn_dim_multiplier = hf_model_config.intermediate_size / hf_model_config.hidden_size + + # Always update max_seq_len to match training seq_len, warn if exceeded seq_len = job_config.training.seq_len if seq_len > self.max_seq_len: - logger.warning( - f"Sequence length {seq_len} exceeds original maximum {self.max_seq_len}." 
- ) + logger.warning(f"Sequence length {seq_len} exceeds original maximum {self.max_seq_len}.") self.max_seq_len = seq_len - if self.moe_args.use_grouped_mm and not has_cuda_capability(9, 0): - logger.warning( - "Failed to use grouped mm, which is only supported on SM90 or later", - ) - self.moe_args.use_grouped_mm = False - if job_config.parallelism.context_parallel_degree > 1 and self.use_flex_attn: - raise NotImplementedError( - "CP support for FlexAttention is still in progress." - ) - - def get_nparams_and_flops( - self, model: nn.Module, seq_len: int - ) -> tuple[int, float]: - return 0, 0 - # nparams_embedding = 0 - # nparams_moe_router = 0 - # nparams_shared_experts = 0 - # nparams_experts = 0 - # nparams_dense = 0 - - # for name, p in model.named_parameters(): - # if "embedding" in name: - # nparams_embedding += p.numel() - # nparams_dense += p.numel() - # elif "moe.shared_experts" in name: - # nparams_shared_experts += p.numel() - # elif "moe.router" in name: - # nparams_moe_router += p.numel() - # elif "moe.experts" in name: - # nparams_experts += p.numel() - # else: - # nparams_dense += p.numel() - - # nparams_sparse = nparams_moe_router + nparams_shared_experts + nparams_experts - # nparams = nparams_dense + nparams_sparse - # nparams_sparse_active = ( - # nparams_moe_router - # + nparams_shared_experts - # + nparams_experts * self.moe_args.top_k // self.moe_args.num_experts - # ) + raise NotImplementedError("CP support for FlexAttention is still in progress.") - # logger.info( - # f"Total parameter count: dense {nparams_dense:,}, " - # f"sparse {nparams_sparse:,}, active {nparams_dense + nparams_sparse_active:,}" - # ) + return self - # l, h, q, t = ( - # self.n_layers, - # self.n_heads, - # self.dim // self.n_heads, - # seq_len, - # ) - # # Reasoning behind the factor of 12 for the self-attention part of the formula: - # # 1. each self-attention has 2 matmul in the forward and 4 in the backward (6) - # # 2. the flash attention does 1 more matmul recomputation in the backward - # # but recomputation should not be counted in calculating MFU (+0) - # # 3. each matmul performs 1 multiplication and 1 addition (*2) - # # 4. 
we follow the convention and do not account for sparsity in causal attention - # num_flops_per_token = ( - # 6 * (nparams_dense - nparams_embedding + nparams_sparse_active) - # + 12 * l * h * q * t - # ) + def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: + nparams = sum(p.numel() for p in model.parameters()) + nparams_embedding = sum( + sum(p.numel() for p in m.parameters()) + for m in model.children() + if isinstance(m, nn.Embedding) + ) - # return nparams, num_flops_per_token + l, h, q, t = self.n_layers, self.n_heads, self.dim // self.n_heads, seq_len + num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t + return nparams, num_flops_per_token From 7c3795cf1b1ada252aefdb9e89a900cd4b10f0f4 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 28 Aug 2025 14:30:24 +0000 Subject: [PATCH 005/129] it is now working up to apply_ac --- .../model/hf_transformers_args.py | 29 ++++++++++++++++++- torchtitan/train.py | 2 +- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 956ce5a853..a9c24dd30d 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Optional, Union import os @@ -36,6 +36,8 @@ class HFTransformerModelArgs(BaseModelArgs): attn_mask_type: str = "causal" eos_id: int = 0 + _torchtitan_args: dict = field(init=False, repr=False, default_factory=dict) + def update_from_config(self, job_config: JobConfig): #TODO(3outeille): what if we dont specify flavor? Should use full as default flavor = getattr(job_config.model, "flavor", None) @@ -45,6 +47,7 @@ def update_from_config(self, job_config: JobConfig): hf_model_config = LlamaConfig.from_pretrained(model_name_or_config) #TODO(3outeille): use getattr to handle models that don't have all the attributes + # Fill torchtitan args with HF ones self.dim = hf_model_config.hidden_size self.n_layers = hf_model_config.num_hidden_layers self.n_heads = hf_model_config.num_attention_heads @@ -66,8 +69,32 @@ def update_from_config(self, job_config: JobConfig): if job_config.parallelism.context_parallel_degree > 1 and self.use_flex_attn: raise NotImplementedError("CP support for FlexAttention is still in progress.") + self._torchtitan_args = { + "dim": self.dim, + "n_layers": self.n_layers, + "n_heads": self.n_heads, + "n_kv_heads": self.n_kv_heads, + "vocab_size": self.vocab_size, + "multiple_of": self.multiple_of, + "ffn_dim_multiplier": self.ffn_dim_multiplier, + "rope_theta": self.rope_theta, + "max_seq_len": self.max_seq_len, + "rms_norm_eps": self.rms_norm_eps, + "use_cache": self.use_cache, + "depth_init": self.depth_init, + "use_flex_attn": self.use_flex_attn, + "attn_mask_type": self.attn_mask_type, + "eos_id": self.eos_id, + } return self + def convert_to_hf_config(self) -> LlamaConfig: + if not self._torchtitan_args: + raise RuntimeError( + "`update_from_config` must be called before `convert_to_hf_config` to prepare the arguments." 
+ ) + return LlamaConfig(**self._torchtitan_args) + def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: nparams = sum(p.numel() for p in model.parameters()) nparams_embedding = sum( diff --git a/torchtitan/train.py b/torchtitan/train.py index 9b69fd6798..76737c5fc7 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -155,7 +155,7 @@ def __init__(self, job_config: JobConfig): f"Building {self.train_spec.name} {job_config.model.flavor} with {model_args}" ) with torch.device("meta"): - model = self.train_spec.model_cls(model_args) + model = self.train_spec.model_cls(model_args.convert_to_hf_config()) # Build the collection of model converters. No-op if `model.converters` empty model_converters = build_model_converters(job_config, parallel_dims) From 3fb2bf825e5224d805ca8845132b76226cb97984 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Sat, 6 Sep 2025 08:03:45 +0000 Subject: [PATCH 006/129] now working up to init_weights --- torchtitan/models/llama3/infra/parallelize.py | 11 +++++++++-- torchtitan/train.py | 7 ++++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/torchtitan/models/llama3/infra/parallelize.py b/torchtitan/models/llama3/infra/parallelize.py index 7d0b5de92b..8165f8e907 100644 --- a/torchtitan/models/llama3/infra/parallelize.py +++ b/torchtitan/models/llama3/infra/parallelize.py @@ -34,6 +34,7 @@ from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp from torchtitan.tools.logging import logger +from transformers.models.llama.modeling_llama import LlamaForCausalLM def parallelize_llama( model: nn.Module, @@ -325,11 +326,17 @@ def selective_checkpointing_context_fn(): def apply_ac(model: nn.Module, ac_config: ACConfig): """Apply activation checkpointing to the model.""" - for layer_id, transformer_block in model.layers.named_children(): + # TODO(3outeille): Make it more generic later + if isinstance(model, LlamaForCausalLM): + layers = model.model.layers + else: + layers = model.layers + + for layer_id, transformer_block in layers.named_children(): transformer_block = _apply_ac_to_transformer_block( transformer_block, ac_config, base_fqn=f"layers.{layer_id}" ) - model.layers.register_module(layer_id, transformer_block) + layers.register_module(layer_id, transformer_block) logger.info(f"Applied {ac_config.mode} activation checkpointing to the model") diff --git a/torchtitan/train.py b/torchtitan/train.py index 76737c5fc7..a21bd7bf9d 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -33,6 +33,8 @@ maybe_enable_profiling, ) +from torchtitan.experiments.transformers_backend.model.hf_transformers_args import HFTransformerModelArgs + class Trainer(torch.distributed.checkpoint.stateful.Stateful): # core configs @@ -155,7 +157,10 @@ def __init__(self, job_config: JobConfig): f"Building {self.train_spec.name} {job_config.model.flavor} with {model_args}" ) with torch.device("meta"): - model = self.train_spec.model_cls(model_args.convert_to_hf_config()) + if isinstance(model_args, HFTransformerModelArgs): + model = self.train_spec.model_cls(model_args.convert_to_hf_config()) + else: + model = self.train_spec.model_cls(model_args) # Build the collection of model converters. 
No-op if `model.converters` empty model_converters = build_model_converters(job_config, parallel_dims) From 25daecaaa1952cef8ade604708544c224e29f454 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Sat, 6 Sep 2025 09:19:03 +0000 Subject: [PATCH 007/129] fix mapping when convert_to_hf_config + add breaking test to ensure proper mapping --- .../model/hf_transformers_args.py | 27 +++++----- .../test_hf_torchtitan_model_args.py | 51 +++++++++++++++++++ torchtitan/train.py | 6 ++- 3 files changed, 68 insertions(+), 16 deletions(-) create mode 100644 torchtitan/experiments/transformers_backend/test_hf_torchtitan_model_args.py diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index a9c24dd30d..94b014dfd7 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -36,7 +36,7 @@ class HFTransformerModelArgs(BaseModelArgs): attn_mask_type: str = "causal" eos_id: int = 0 - _torchtitan_args: dict = field(init=False, repr=False, default_factory=dict) + _hf_args: dict = field(init=False, repr=False, default_factory=dict) def update_from_config(self, job_config: JobConfig): #TODO(3outeille): what if we dont specify flavor? Should use full as default @@ -69,31 +69,28 @@ def update_from_config(self, job_config: JobConfig): if job_config.parallelism.context_parallel_degree > 1 and self.use_flex_attn: raise NotImplementedError("CP support for FlexAttention is still in progress.") - self._torchtitan_args = { - "dim": self.dim, - "n_layers": self.n_layers, - "n_heads": self.n_heads, - "n_kv_heads": self.n_kv_heads, + self._hf_args = { + "hidden_size": self.dim, + "num_hidden_layers": self.n_layers, + "num_attention_heads": self.n_heads, + "num_key_value_heads": self.n_kv_heads, "vocab_size": self.vocab_size, - "multiple_of": self.multiple_of, - "ffn_dim_multiplier": self.ffn_dim_multiplier, + "rope_scaling": {"type": "dynamic", "factor": 2.0}, + "intermediate_size": self.ffn_dim_multiplier, "rope_theta": self.rope_theta, - "max_seq_len": self.max_seq_len, + "max_position_embeddings": self.max_seq_len, "rms_norm_eps": self.rms_norm_eps, "use_cache": self.use_cache, - "depth_init": self.depth_init, - "use_flex_attn": self.use_flex_attn, - "attn_mask_type": self.attn_mask_type, - "eos_id": self.eos_id, + "pad_token_id": self.eos_id, } return self def convert_to_hf_config(self) -> LlamaConfig: - if not self._torchtitan_args: + if not self._hf_args: raise RuntimeError( "`update_from_config` must be called before `convert_to_hf_config` to prepare the arguments." 
) - return LlamaConfig(**self._torchtitan_args) + return LlamaConfig(**self._hf_args) def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: nparams = sum(p.numel() for p in model.parameters()) diff --git a/torchtitan/experiments/transformers_backend/test_hf_torchtitan_model_args.py b/torchtitan/experiments/transformers_backend/test_hf_torchtitan_model_args.py new file mode 100644 index 0000000000..d83f268091 --- /dev/null +++ b/torchtitan/experiments/transformers_backend/test_hf_torchtitan_model_args.py @@ -0,0 +1,51 @@ +from transformers.models.llama.configuration_llama import LlamaConfig +from torchtitan.experiments.transformers_backend.model.hf_transformers_args import ( + HFTransformerModelArgs, +) +from torchtitan.config import JobConfig + + +def print_comparison_keys(ref_dict, tt_dict): + all_keys = sorted(list(set(ref_dict.keys()) | set(tt_dict.keys()))) + print(f"{'Attribute':<30} | {'Original HF':<20} | {'TorchTitan HF':<20}") + print("-" * 75) + for key in all_keys: + ref_val = ref_dict.get(key, "N/A") + tt_val = tt_dict.get(key, "N/A") + if str(ref_val) != str(tt_val): + # Red for different + print(f"\033[91m{key:<30} | {str(ref_val):<20} | {str(tt_val):<20}\033[0m") + else: + print(f"{key:<30} | {str(ref_val):<20} | {str(tt_val):<20}") + +def compare_hf_tt_configs(model_name, flavor): + ref_hf_config = LlamaConfig() + + model_args = HFTransformerModelArgs() + job_config = JobConfig() + job_config.model.name = model_name + job_config.model.flavor = flavor + model_args.update_from_config(job_config) + tt_hf_config = model_args.convert_to_hf_config() + + ref_dict = ref_hf_config.to_dict() + tt_dict = tt_hf_config.to_dict() + + try: + assert ref_dict == tt_dict + print(f"✅ Configs match for model name {model_name} with flavor: {flavor}") + except AssertionError: + print(f"❌ Configs do not match for model name {model_name} with flavor: {flavor}! 
Showing differences:") + print_comparison_keys(ref_dict, tt_dict) + raise + +if __name__ == "__main__": + model_names = [ + "meta-llama/Llama-3.2-1B", + ] + flavors = ["full"] + + for model_name in model_names: + for flavor in flavors: + print(f"\nTesting model name: {model_name} with flavor: {flavor}") + compare_hf_tt_configs(model_name, flavor) \ No newline at end of file diff --git a/torchtitan/train.py b/torchtitan/train.py index a21bd7bf9d..d4a00ad98e 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -34,6 +34,7 @@ ) from torchtitan.experiments.transformers_backend.model.hf_transformers_args import HFTransformerModelArgs +from transformers.models.llama.modeling_llama import LlamaForCausalLM class Trainer(torch.distributed.checkpoint.stateful.Stateful): @@ -266,7 +267,10 @@ def __init__(self, job_config: JobConfig): model.to_empty(device=init_device) with torch.no_grad(): - model.init_weights(buffer_device=buffer_device) + if isinstance(model, LlamaForCausalLM): + model.post_init() + else: + model.init_weights(buffer_device=buffer_device) model.train() self.model_parts = [model] From 3e67f2cccee7c889a3fd1d23e71dc8dc648f5ad8 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 8 Sep 2025 08:42:21 +0000 Subject: [PATCH 008/129] define own apply_ac for transformer backend instead of reusing llama3 --- .../infra/parallelize_hf_transformers.py | 142 +++++++++++++++++- .../model/hf_transformers_args.py | 2 +- torchtitan/models/llama3/infra/parallelize.py | 10 +- 3 files changed, 143 insertions(+), 11 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py index 3f26036dc8..04ffaaeffb 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -4,9 +4,15 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from collections import defaultdict +from typing import Optional import torch import torch.nn as nn +from torch.distributed._composable.replicate import replicate +from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( + checkpoint_wrapper as ptd_checkpoint_wrapper, +) from torch.distributed.device_mesh import DeviceMesh from torch.distributed.fsdp import CPUOffloadPolicy, fully_shard, MixedPrecisionPolicy from torch.distributed.tensor import Partial, Replicate, Shard @@ -29,10 +35,142 @@ TensorParallel, ) from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp - -from torchtitan.models.llama3.infra.parallelize import apply_ac, apply_ddp +from torchtitan.config.job_config import ActivationCheckpoint as ACConfig from torchtitan.tools.logging import logger +from transformers.models.llama.modeling_llama import LlamaForCausalLM + +# for selective op activation checkpointing +_save_list = { + torch.ops.aten.mm.default, + torch.ops.aten._scaled_dot_product_efficient_attention.default, + torch.ops.aten._scaled_dot_product_flash_attention.default, + torch.ops._c10d_functional.reduce_scatter_tensor.default, + # for low precision training, it's useful to always save + # the result of max, since the absolute maximum is + # used to compute the scaling factor for quantization. 
+ torch.ops.aten.max.default, + torch._higher_order_ops.flex_attention, +} + +def _apply_ac_to_transformer_block( + module: nn.Module, ac_config: ACConfig, *, base_fqn: Optional[str] = None +): + valid_ac_modes = ("full", "selective") + if ac_config.mode not in valid_ac_modes: + raise ValueError( + f"Invalid AC mode: {ac_config.mode}. Valid modes: {valid_ac_modes}" + ) + + if ac_config.mode == "full": + return ptd_checkpoint_wrapper(module, preserve_rng_state=False) + + assert ac_config.mode == "selective", f"{ac_config.mode}" + use_op_sac = ac_config.selective_ac_option == "op" + use_layer_sac = ac_config.selective_ac_option.isdigit() + if not use_op_sac and not use_layer_sac: + raise ValueError( + f"Invalid selective AC option: {ac_config.selective_ac_option}. " + f"Valid options: 'op' or a positive int representing layer frequency" + ) + if use_op_sac: + from torch.utils.checkpoint import ( + CheckpointPolicy, + create_selective_checkpoint_contexts, + ) + + mm_recompute_shapes = set() + if len(ac_config.per_op_sac_force_recompute_mm_shapes_by_fqns) > 0: + for module_fqn, submod in module.named_modules(): + fqn = module_fqn + if base_fqn is not None: + fqn = f"{base_fqn}.{module_fqn}" + if not any( + filter_fqn in fqn + for filter_fqn in ac_config.per_op_sac_force_recompute_mm_shapes_by_fqns + ): + continue + if not isinstance(submod, nn.Linear): + raise ValueError( + "per_op_sac_force_recompute_mm_shapes_by_fqns expected to match " + f"a nn.Linear, but got: {submod}" + ) + out_f, in_f = submod.weight.shape + mm_recompute_shapes.add((in_f, out_f)) + logger.debug( + f"Selective op AC force recomputing mms with rhs shapes {mm_recompute_shapes}" + ) + + def _get_custom_policy(meta): + def _custom_policy(ctx, func, *args, **kwargs): + mode = "recompute" if ctx.is_recompute else "forward" + mm_count_key = f"{mode}_mm_count" + if func == torch.ops.aten.mm.default: + if args[1].shape in mm_recompute_shapes: + return CheckpointPolicy.PREFER_RECOMPUTE + meta[mm_count_key] += 1 + # Saves output of all compute ops, except every second mm + to_save = func in _save_list and not ( + func == torch.ops.aten.mm.default and meta[mm_count_key] % 2 == 0 + ) + return ( + CheckpointPolicy.MUST_SAVE + if to_save + else CheckpointPolicy.PREFER_RECOMPUTE + ) + + return _custom_policy + + def selective_checkpointing_context_fn(): + meta = defaultdict(int) + return create_selective_checkpoint_contexts(_get_custom_policy(meta)) + + return ptd_checkpoint_wrapper( + module, + context_fn=selective_checkpointing_context_fn, + preserve_rng_state=False, + ) + elif use_layer_sac: + # Checkpoint every `ac_freq` of the modules passed to this function + ac_freq = int(ac_config.selective_ac_option) + ptd_checkpoint_wrapper.__dict__.setdefault("_count", 0) + ptd_checkpoint_wrapper._count += 1 + if not ac_freq or ptd_checkpoint_wrapper._count % ac_freq == 0: + return ptd_checkpoint_wrapper(module, preserve_rng_state=False) + else: + return module + +def apply_ac(model: nn.Module, ac_config: ACConfig): + """Apply activation checkpointing to the model.""" + # TODO(3outeille): Make it more generic later + layers = model.model.layers + + for layer_id, transformer_block in layers.named_children(): + transformer_block = _apply_ac_to_transformer_block( + transformer_block, ac_config, base_fqn=f"layers.{layer_id}" + ) + layers.register_module(layer_id, transformer_block) + + logger.info(f"Applied {ac_config.mode} activation checkpointing to the model") + +def apply_ddp( + model: nn.Module, + dp_mesh: DeviceMesh, + enable_compile: bool, 
+ enable_compiled_autograd: bool, +): + if enable_compile: + if enable_compiled_autograd: + torch._dynamo.config.optimize_ddp = ( + "python_reducer_without_compiled_forward" + ) + else: + torch._dynamo.config.optimize_ddp = "ddp_optimizer" + + replicate(model, device_mesh=dp_mesh, bucket_cap_mb=100) + + logger.info("Applied DDP to the model") + def parallelize_hf_transformers( model: nn.Module, diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 94b014dfd7..e20da24c5b 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -58,7 +58,7 @@ def update_from_config(self, job_config: JobConfig): self.rms_norm_eps = getattr(hf_model_config, "rms_norm_eps", 1e-6) if hasattr(hf_model_config, "intermediate_size") and hf_model_config.intermediate_size: - self.ffn_dim_multiplier = hf_model_config.intermediate_size / hf_model_config.hidden_size + self.ffn_dim_multiplier = hf_model_config.intermediate_size // hf_model_config.hidden_size # Always update max_seq_len to match training seq_len, warn if exceeded seq_len = job_config.training.seq_len diff --git a/torchtitan/models/llama3/infra/parallelize.py b/torchtitan/models/llama3/infra/parallelize.py index 8165f8e907..6da44a321d 100644 --- a/torchtitan/models/llama3/infra/parallelize.py +++ b/torchtitan/models/llama3/infra/parallelize.py @@ -326,17 +326,11 @@ def selective_checkpointing_context_fn(): def apply_ac(model: nn.Module, ac_config: ACConfig): """Apply activation checkpointing to the model.""" - # TODO(3outeille): Make it more generic later - if isinstance(model, LlamaForCausalLM): - layers = model.model.layers - else: - layers = model.layers - - for layer_id, transformer_block in layers.named_children(): + for layer_id, transformer_block in model.layers.named_children(): transformer_block = _apply_ac_to_transformer_block( transformer_block, ac_config, base_fqn=f"layers.{layer_id}" ) - layers.register_module(layer_id, transformer_block) + model.layers.register_module(layer_id, transformer_block) logger.info(f"Applied {ac_config.mode} activation checkpointing to the model") From 8c5c0ae63b0d784dac3476140ead3089fd62bdc4 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 9 Sep 2025 08:00:32 +0000 Subject: [PATCH 009/129] HF model without any parallelism now train (but grad_norm is high) --- .../transformers_backend/__init__.py | 14 +-- .../infra/parallelize_hf_transformers.py | 3 - .../model/hf_transformers_args.py | 104 +++++++++--------- torchtitan/train.py | 38 ++++++- 4 files changed, 89 insertions(+), 70 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index e416731205..504adfc88e 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -29,17 +29,13 @@ flavors = { - "debug": HFTransformerModelArgs( - dim=1, - n_layers=6, - n_heads=16, - rope_theta=500000, + "debugmodel": HFTransformerModelArgs( + n_layers=2, + vocab_size=2000, ), "medium": HFTransformerModelArgs( - dim=40, - n_layers=24, - n_heads=32, - rope_theta=500000, + dim=1024, + n_layers=12, ), "full": HFTransformerModelArgs(), } diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py 
b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py index 04ffaaeffb..2f0d9167b0 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -38,8 +38,6 @@ from torchtitan.config.job_config import ActivationCheckpoint as ACConfig from torchtitan.tools.logging import logger -from transformers.models.llama.modeling_llama import LlamaForCausalLM - # for selective op activation checkpointing _save_list = { torch.ops.aten.mm.default, @@ -142,7 +140,6 @@ def selective_checkpointing_context_fn(): def apply_ac(model: nn.Module, ac_config: ACConfig): """Apply activation checkpointing to the model.""" - # TODO(3outeille): Make it more generic later layers = model.model.layers for layer_id, transformer_block in layers.named_children(): diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index e20da24c5b..63e252d851 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -16,7 +16,7 @@ @dataclass -class HFTransformerModelArgs(BaseModelArgs): +class HFTransformerModelArgs(LlamaConfig, BaseModelArgs): # Torchtitan naming dim: int = 4096 n_layers: int = 32 @@ -25,72 +25,72 @@ class HFTransformerModelArgs(BaseModelArgs): vocab_size: int = 128256 multiple_of: int = 256 ffn_dim_multiplier: Optional[float] = None + norm_eps: float = 1e-5 rope_theta: float = 10000 - max_seq_len: int = 2048 - # HF compatibility - rms_norm_eps: float = 1e-6 - use_cache: bool = True + max_seq_len: int = 2048 depth_init: bool = True use_flex_attn: bool = False attn_mask_type: str = "causal" eos_id: int = 0 - - _hf_args: dict = field(init=False, repr=False, default_factory=dict) + + # HF args + attn_implementation: str = "eager" def update_from_config(self, job_config: JobConfig): - #TODO(3outeille): what if we dont specify flavor? 
Should use full as default - flavor = getattr(job_config.model, "flavor", None) - if flavor == "full": - model_name_or_config: Union[LlamaConfig, str, os.PathLike] = job_config.model.name - hf_model_config = LlamaConfig.from_pretrained(model_name_or_config) + #TODO(3outeille): clean this mess once grad norm is stabilized + default_args = HFTransformerModelArgs() - #TODO(3outeille): use getattr to handle models that don't have all the attributes - # Fill torchtitan args with HF ones - self.dim = hf_model_config.hidden_size - self.n_layers = hf_model_config.num_hidden_layers - self.n_heads = hf_model_config.num_attention_heads - self.n_kv_heads = hf_model_config.num_key_value_heads - self.vocab_size = hf_model_config.vocab_size - self.rope_theta = getattr(hf_model_config, "rope_theta", 10000.0) - self.max_seq_len = hf_model_config.max_position_embeddings - self.rms_norm_eps = getattr(hf_model_config, "rms_norm_eps", 1e-6) + args_to_override = {} + for key in default_args.__dict__: + if hasattr(self, key): + current_value = getattr(self, key) + default_value = getattr(default_args, key) + if current_value != default_value: + args_to_override[key] = current_value - if hasattr(hf_model_config, "intermediate_size") and hf_model_config.intermediate_size: - self.ffn_dim_multiplier = hf_model_config.intermediate_size // hf_model_config.hidden_size + hf_model_config = LlamaConfig.from_pretrained( + job_config.model.name, + attn_implementation=self.attn_implementation, + ) + # n_layers = 32 + self.__dict__.update(hf_model_config.__dict__) - # Always update max_seq_len to match training seq_len, warn if exceeded - seq_len = job_config.training.seq_len - if seq_len > self.max_seq_len: - logger.warning(f"Sequence length {seq_len} exceeds original maximum {self.max_seq_len}.") - self.max_seq_len = seq_len + # num_hidden_layers = 16 - if job_config.parallelism.context_parallel_degree > 1 and self.use_flex_attn: - raise NotImplementedError("CP support for FlexAttention is still in progress.") + # Update TT args with HF args (for keys that exist in both but differ in namings) + self.dim = self.hidden_size + self.n_layers = self.num_hidden_layers + self.n_heads = self.num_attention_heads + self.n_kv_heads = self.num_key_value_heads + self.norm_eps = self.rms_norm_eps + self.max_seq_len = self.max_position_embeddings + self.eos_id = self.eos_token_id - self._hf_args = { - "hidden_size": self.dim, - "num_hidden_layers": self.n_layers, - "num_attention_heads": self.n_heads, - "num_key_value_heads": self.n_kv_heads, - "vocab_size": self.vocab_size, - "rope_scaling": {"type": "dynamic", "factor": 2.0}, - "intermediate_size": self.ffn_dim_multiplier, - "rope_theta": self.rope_theta, - "max_position_embeddings": self.max_seq_len, - "rms_norm_eps": self.rms_norm_eps, - "use_cache": self.use_cache, - "pad_token_id": self.eos_id, - } - return self + # n_layers = 16 + + self.__dict__.update(args_to_override) + + # n_layers = 2 + # num_hidden_layers = 16 - def convert_to_hf_config(self) -> LlamaConfig: - if not self._hf_args: - raise RuntimeError( - "`update_from_config` must be called before `convert_to_hf_config` to prepare the arguments." 
- ) - return LlamaConfig(**self._hf_args) + # Update HF args with TT override args because HF modeling uses HF args and not TT args + # TODO(3outeille): find a cleaner way to handle the mapping + self.hidden_size = self.dim + self.num_hidden_layers = self.n_layers + self.num_attention_heads = self.n_heads + self.num_key_value_heads = self.n_kv_heads + self.rms_norm_eps = self.norm_eps + self.max_position_embeddings = self.max_seq_len + self.eos_token_id = self.eos_id + + # n_layers = 2 + # num_hidden_layers = 2 + + print(self) + self.use_cache = False + return self def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: nparams = sum(p.numel() for p in model.parameters()) diff --git a/torchtitan/train.py b/torchtitan/train.py index d4a00ad98e..bc8128d0fa 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -33,8 +33,34 @@ maybe_enable_profiling, ) -from torchtitan.experiments.transformers_backend.model.hf_transformers_args import HFTransformerModelArgs -from transformers.models.llama.modeling_llama import LlamaForCausalLM +from transformers.models.llama.modeling_llama import LlamaForCausalLM, CausalLMOutputWithPast +from transformers.modeling_utils import PreTrainedModel + + +# NOTE(3outeille): monkey-patch PreTrainedModel to handle meta device initialization correctly +# The default _initialize_weights sets _is_hf_initialized = True even on a meta device, +# which prevents subsequent proper initialization. +def _initialize_weights_patched(self, module): + """ + Patched version of _initialize_weights that skips initialization and setting + the _is_hf_initialized flag if the module is on a meta device. + """ + if getattr(module, "_is_hf_initialized", False): + return + + # Check if any parameter is on the meta device + for param in module.parameters(recurse=False): + if param.device.type == "meta": + return + + #TODO(3outeille): check if register bufffer is init + + # If not on a meta device, call the original weight initialization + self._init_weights(module) + module._is_hf_initialized = True + + +PreTrainedModel._initialize_weights = _initialize_weights_patched class Trainer(torch.distributed.checkpoint.stateful.Stateful): @@ -158,10 +184,7 @@ def __init__(self, job_config: JobConfig): f"Building {self.train_spec.name} {job_config.model.flavor} with {model_args}" ) with torch.device("meta"): - if isinstance(model_args, HFTransformerModelArgs): - model = self.train_spec.model_cls(model_args.convert_to_hf_config()) - else: - model = self.train_spec.model_cls(model_args) + model = self.train_spec.model_cls(model_args) # Build the collection of model converters. No-op if `model.converters` empty model_converters = build_model_converters(job_config, parallel_dims) @@ -468,6 +491,9 @@ def forward_backward_step( assert len(model_parts) == 1 with self.maybe_enable_amp: pred = model_parts[0](inputs) + #NOTE(3outeille): just trying to make it work for now. Will refactor later. 
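+            # HF causal-LM models return a ModelOutput (CausalLMOutputWithPast); torchtitan's loss_fn expects the raw logits tensor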
+ if isinstance(pred, CausalLMOutputWithPast): + pred = pred.logits loss = self.loss_fn(pred, labels) # need to free to before bwd to avoid peaking memory del pred From 4ae9560258936d3096052d4858ad5d79d1c857fe Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 10 Sep 2025 07:41:35 +0000 Subject: [PATCH 010/129] a bit cleaner way to get passed args --- .../transformers_backend/__init__.py | 19 +++-- .../transformers_backend/compare_tt_hf_run.sh | 76 +++++++++++++++++++ .../model/hf_transformers_args.py | 15 +--- 3 files changed, 93 insertions(+), 17 deletions(-) create mode 100755 torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 504adfc88e..876e7ae8fa 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -28,16 +28,25 @@ ] +def hf_transformer_model_args_builder(**kwargs): + # Capture the kwargs in the passed_args field + args = HFTransformerModelArgs(**kwargs) + args.passed_args = kwargs + return args + + flavors = { - "debugmodel": HFTransformerModelArgs( - n_layers=2, - vocab_size=2000, + "debugmodel": hf_transformer_model_args_builder( + # n_layers=2, + # vocab_size=2000, + max_seq_len=2048, + dim=256, n_layers=6, n_heads=16, vocab_size=2000, rope_theta=500000 ), - "medium": HFTransformerModelArgs( + "medium": hf_transformer_model_args_builder( dim=1024, n_layers=12, ), - "full": HFTransformerModelArgs(), + "full": hf_transformer_model_args_builder(), } hf_train_spec = TrainSpec( diff --git a/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh b/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh new file mode 100755 index 0000000000..4085461e3a --- /dev/null +++ b/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh @@ -0,0 +1,76 @@ +#!/usr/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
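+# Runs the native TorchTitan config and the HF-backend config back to back, filters
+# volatile details (timestamps, ports, PIDs) out of both logs, and diffs the results.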
+ +set -ex +set -o pipefail + +# Common settings +NGPU=${NGPU:-"1"} +export LOG_RANK=${LOG_RANK:-0} + + +run_tt() { + echo "##############################################" + echo "### Running TorchTitan (native) training ###" + echo "##############################################" + TT_CONFIG="/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/models/llama3/train_configs/my_debug_model.toml" + + # Use CUDA_VISIBLE_DEVICES=0 for TT run + CUDA_VISIBLE_DEVICES=0 \ + torchrun --nproc_per_node=${NGPU} --master_port 1234 --rdzv_backend c10d --rdzv_endpoint="localhost:0" \ + --local-ranks-filter ${LOG_RANK} --role rank --tee 3 \ + -m torchtitan.train --job.config_file ${TT_CONFIG} "$@" +} + +run_hf() { + echo "#######################################################" + echo "### Running TorchTitan with HF backend training ###" + echo "#######################################################" + HF_CONFIG="/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml" + + # Use CUDA_VISIBLE_DEVICES=1 for HF run + CUDA_VISIBLE_DEVICES=1 \ + torchrun --nproc_per_node=${NGPU} --master_port 1235 --rdzv_backend c10d --rdzv_endpoint="localhost:0" \ + --local-ranks-filter ${LOG_RANK} --role rank --tee 3 \ + -m torchtitan.train --job.config_file ${HF_CONFIG} "$@" +} + + +TT_LOG="tt_run.log" +HF_LOG="hf_run.log" +DIFF_LOG="run_diff.log" + +run_tt "$@" 2>&1 | tee ${TT_LOG} +run_hf "$@" 2>&1 | tee ${HF_LOG} + +# Filter logs to remove noisy differences +TT_LOG_FILTERED="${TT_LOG}.filtered" +HF_LOG_FILTERED="${HF_LOG}.filtered" + +# This sed command removes timestamps, PIDs, master ports, and other +# volatile details that change between runs. +# Feel free to adjust the regex patterns to better suit your log format. +sed -E \ + -e 's/([0-9]{4}-[0-9]{2}-[0-9]{2} )?[0-9]{2}:[0-9]{2}:[0-9]{2}(,[0-9]+)?/TIMESTAMP/g' \ + -e 's/torchrun.*--master_port[= ]([0-9]+)/torchrun ... --master_port=XXXX/g' \ + -e 's/PID [0-9]+/PID XXXX/g' \ + -e 's/localhost:[0-9]+/localhost:XXXX/g' \ + < "${TT_LOG}" > "${TT_LOG_FILTERED}" + +sed -E \ + -e 's/([0-9]{4}-[0-9]{2}-[0-9]{2} )?[0-9]{2}:[0-9]{2}:[0-9]{2}(,[0-9]+)?/TIMESTAMP/g' \ + -e 's/torchrun.*--master_port[= ]([0-9]+)/torchrun ... 
--master_port=XXXX/g' \ + -e 's/PID [0-9]+/PID XXXX/g' \ + -e 's/localhost:[0-9]+/localhost:XXXX/g' \ + < "${HF_LOG}" > "${HF_LOG_FILTERED}" + +echo "############################################" +echo "### Diff between TT and HF run logs ###" +echo "############################################" +echo "### Log diff is being saved to ${DIFF_LOG}" +echo "############################################" +git diff --no-index --color=always --word-diff=color "${TT_LOG_FILTERED}" "${HF_LOG_FILTERED}" | tee "${DIFF_LOG}" || true diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 63e252d851..bb9d1b814d 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -37,19 +37,10 @@ class HFTransformerModelArgs(LlamaConfig, BaseModelArgs): # HF args attn_implementation: str = "eager" + passed_args: dict = field(init=False, repr=False, default_factory=dict) + def update_from_config(self, job_config: JobConfig): - #TODO(3outeille): clean this mess once grad norm is stabilized - default_args = HFTransformerModelArgs() - - args_to_override = {} - for key in default_args.__dict__: - if hasattr(self, key): - current_value = getattr(self, key) - default_value = getattr(default_args, key) - if current_value != default_value: - args_to_override[key] = current_value - hf_model_config = LlamaConfig.from_pretrained( job_config.model.name, attn_implementation=self.attn_implementation, @@ -70,7 +61,7 @@ def update_from_config(self, job_config: JobConfig): # n_layers = 16 - self.__dict__.update(args_to_override) + self.__dict__.update(self.passed_args) # n_layers = 2 # num_hidden_layers = 16 From 9be95f98f518efbdd25c3e90e5edd5a60971d8d0 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 10 Sep 2025 09:01:12 +0000 Subject: [PATCH 011/129] now same number of params + same attention backend but noticed higher gradnorm and less tps with HF model --- .../transformers_backend/__init__.py | 3 +- .../configs/debug_1_gpu_hf.toml | 4 +- .../model/hf_transformers_args.py | 63 ++++++++++++++++++- .../transformers_backend/run_train.sh | 17 ++++- 4 files changed, 79 insertions(+), 8 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 876e7ae8fa..b8fc47b9e7 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -40,7 +40,8 @@ def hf_transformer_model_args_builder(**kwargs): # n_layers=2, # vocab_size=2000, max_seq_len=2048, - dim=256, n_layers=6, n_heads=16, vocab_size=2000, rope_theta=500000 + #TODO(3outeille): n_kv_heads=n_heads may be handle somewhere else + dim=256, n_layers=6, n_heads=16, vocab_size=2000, rope_theta=500000, n_kv_heads=16 ), "medium": hf_transformer_model_args_builder( dim=1024, diff --git a/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml b/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml index 30872e903c..3144011b62 100644 --- a/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml +++ b/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml @@ -1,6 +1,6 @@ [job] dump_folder = "./outputs" -description = "Llama 3 debug training with FSDP on 2 GPUs" +description = "HF Llama 3 debug training" print_args = false use_for_integration_test = 
true @@ -20,7 +20,7 @@ enable_wandb = false [model] name = "meta-llama/Llama-3.2-1B" -flavor = "medium" +flavor = "debugmodel" tokenizer_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" [optimizer] diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index bb9d1b814d..b21a0604a2 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -35,7 +35,7 @@ class HFTransformerModelArgs(LlamaConfig, BaseModelArgs): eos_id: int = 0 # HF args - attn_implementation: str = "eager" + attn_implementation: str = "sdpa" passed_args: dict = field(init=False, repr=False, default_factory=dict) @@ -76,15 +76,74 @@ def update_from_config(self, job_config: JobConfig): self.max_position_embeddings = self.max_seq_len self.eos_token_id = self.eos_id + # Match torchtitan parameter counts + self.tie_word_embeddings = False + self.attention_bias = False + self.mlp_bias = False + + # Match torchtitan intermediate size calculation + ffn_hidden_size = 4 * self.hidden_size + ffn_hidden_size = int(2 * ffn_hidden_size / 3) + if self.ffn_dim_multiplier is not None: + ffn_hidden_size = int(self.ffn_dim_multiplier * ffn_hidden_size) + self.intermediate_size = self.multiple_of * ( + (ffn_hidden_size + self.multiple_of - 1) // self.multiple_of + ) + # Forced it as HF has config.head_dim and the modeling retrieves it instead of doing config.hidden_size // config.num_attention_heads + self.head_dim = self.dim // self.num_attention_heads + # n_layers = 2 # num_hidden_layers = 2 - print(self) self.use_cache = False return self def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: nparams = sum(p.numel() for p in model.parameters()) + + layer_params = {} # int -> int + embedding_params = 0 + norm_params = 0 + lm_head_params = 0 + misc_params = {} + + for name, p in model.named_parameters(): + if "model.embed_tokens" in name: + embedding_params += p.numel() + elif "model.layers." 
in name: + try: + layer_num = int(name.split("layers.")[1].split(".")[0]) + if layer_num not in layer_params: + layer_params[layer_num] = 0 + layer_params[layer_num] += p.numel() + except (ValueError, IndexError): + # Should not happen with standard HF llama names + component = "misc_layer_parts" + if component not in misc_params: + misc_params[component] = 0 + misc_params[component] += p.numel() + elif "model.norm" in name: + norm_params += p.numel() + elif "lm_head" in name: + lm_head_params += p.numel() + else: + # Catch anything else + component = name.split(".")[0] + if component not in misc_params: + misc_params[component] = 0 + misc_params[component] += p.numel() + + logger.info("Parameter breakdown:") + logger.info(f" - embedding: {embedding_params:,} parameters") + for layer_num in sorted(layer_params.keys()): + params = layer_params[layer_num] + logger.info(f" - layer_{layer_num}: {params:,} parameters") + logger.info(f" - final_norm: {norm_params:,} parameters") + logger.info(f" - lm_head: {lm_head_params:,} parameters") + if misc_params: + for name, params in misc_params.items(): + logger.info(f" - {name} (misc): {params:,} parameters") + nparams_embedding = sum( sum(p.numel() for p in m.parameters()) for m in model.children() diff --git a/torchtitan/experiments/transformers_backend/run_train.sh b/torchtitan/experiments/transformers_backend/run_train.sh index 74ef5603b1..6151fcda64 100755 --- a/torchtitan/experiments/transformers_backend/run_train.sh +++ b/torchtitan/experiments/transformers_backend/run_train.sh @@ -9,17 +9,28 @@ set -ex # use envs as local overwrites for convenience # e.g. -# LOG_RANK=0,1 NGPU=4 ./run_train.sh +# BACKEND=tt LOG_RANK=0,1 NGPU=4 ./run_train.sh NGPU=${NGPU:-"8"} export LOG_RANK=${LOG_RANK:-0} +DEBUG_PORT=${DEBUG_PORT:-5678} # Option to switch between debug and train MODE=${MODE:-"train"} # Set MODE=debug or MODE=train -CONFIG_FILE=${CONFIG_FILE:-"configs/debug_1_gpu.toml"} +# Option to switch between hf and tt backend +BACKEND=${BACKEND:-"hf"} + +if [ "$BACKEND" = "tt" ]; then + CONFIG_FILE=${CONFIG_FILE:-"/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/models/llama3/train_configs/my_debug_model.toml"} +elif [ "$BACKEND" = "hf" ]; then + CONFIG_FILE=${CONFIG_FILE:-"configs/debug_1_gpu_hf.toml"} +else + echo "Invalid BACKEND set: ${BACKEND}" + exit 1 +fi if [ "$MODE" = "debug" ]; then - PYTHON_CMD="debugpy-run -m torch.distributed.run --" + PYTHON_CMD="debugpy-run -p ${DEBUG_PORT} -m torch.distributed.run --" else PYTHON_CMD="torchrun" fi From bf9144779ca28fed110aa010e5eaece0ae0278bc Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 11 Sep 2025 08:35:37 +0000 Subject: [PATCH 012/129] fix seed and deterministic --- .../experiments/transformers_backend/compare_tt_hf_run.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh b/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh index 4085461e3a..81b33091fb 100755 --- a/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh +++ b/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh @@ -23,7 +23,7 @@ run_tt() { CUDA_VISIBLE_DEVICES=0 \ torchrun --nproc_per_node=${NGPU} --master_port 1234 --rdzv_backend c10d --rdzv_endpoint="localhost:0" \ --local-ranks-filter ${LOG_RANK} --role rank --tee 3 \ - -m torchtitan.train --job.config_file ${TT_CONFIG} "$@" + -m torchtitan.train --job.config_file ${TT_CONFIG} --training.seed 42 --training.deterministic "$@" } run_hf() { 
@@ -36,7 +36,7 @@ run_hf() { CUDA_VISIBLE_DEVICES=1 \ torchrun --nproc_per_node=${NGPU} --master_port 1235 --rdzv_backend c10d --rdzv_endpoint="localhost:0" \ --local-ranks-filter ${LOG_RANK} --role rank --tee 3 \ - -m torchtitan.train --job.config_file ${HF_CONFIG} "$@" + -m torchtitan.train --job.config_file ${HF_CONFIG} --training.seed 42 --training.deterministic "$@" } @@ -45,7 +45,9 @@ HF_LOG="hf_run.log" DIFF_LOG="run_diff.log" run_tt "$@" 2>&1 | tee ${TT_LOG} -run_hf "$@" 2>&1 | tee ${HF_LOG} +# run_hf "$@" 2>&1 | tee ${HF_LOG} +run_tt "$@" 2>&1 | tee ${HF_LOG} + # Filter logs to remove noisy differences TT_LOG_FILTERED="${TT_LOG}.filtered" From 4c2fc0bbd04aa1667296ccd124f26c3cb8cf15fb Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 11 Sep 2025 13:41:45 +0000 Subject: [PATCH 013/129] fix torch deterministic for HF modeling that was producing Nans --- torchtitan/train.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/torchtitan/train.py b/torchtitan/train.py index bc8128d0fa..3c9718df1b 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -8,6 +8,7 @@ import os import time from datetime import timedelta +from transformers.utils import is_torch_deterministic from typing import Any, Generator, Iterable, Optional import torch @@ -287,7 +288,9 @@ def __init__(self, job_config: JobConfig): else: # apply PT-D Tensor Parallel, activation checkpointing, torch.compile, Data Parallel model = self.train_spec.parallelize_fn(model, parallel_dims, job_config) - + if is_torch_deterministic(): + # Otherwise, HF register buffer for ROPE (inv_freq) and this will be by default be initialized to Nan + torch.utils.deterministic.fill_uninitialized_memory = False model.to_empty(device=init_device) with torch.no_grad(): if isinstance(model, LlamaForCausalLM): From 9bffa386f7a97454ff580ee11c5ba39a5a1b51fe Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 15 Sep 2025 13:10:57 +0000 Subject: [PATCH 014/129] HF model now numerically stable compared to TT (given a fixed attention backend) --- .../model/hf_transformers_args.py | 99 +++++++++++++++++++ torchtitan/models/attention.py | 4 +- torchtitan/train.py | 31 +----- 3 files changed, 105 insertions(+), 29 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index b21a0604a2..61282d9fb0 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -14,6 +14,102 @@ from torchtitan.tools.logging import logger from transformers.models.llama.configuration_llama import LlamaConfig +from transformers.modeling_utils import PreTrainedModel +from transformers.models.llama.modeling_llama import LlamaAttention, LlamaMLP + +# NOTE(3outeille): monkey-patch PreTrainedModel to handle meta device initialization correctly +# The default _initialize_weights sets _is_hf_initialized = True even on a meta device, +# which prevents subsequent proper initialization. +def _initialize_weights_patched(self, module): + """ + Patched version of _initialize_weights that skips initialization and setting + the _is_hf_initialized flag if the module is on a meta device. 
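+    Parameters left on the meta device are materialized later via model.to_empty() and only then initialized.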
+ """ + if getattr(module, "_is_hf_initialized", False): + return + + # Check if any parameter is on the meta device + for param in module.parameters(recurse=False): + if param.device.type == "meta": + return + + #TODO(3outeille): check if register bufffer is init + + # If not on a meta device, call the original weight initialization + self._init_weights(module) + module._is_hf_initialized = True + + +#TODO(3outeille): find a better way to do this +from transformers.models.llama.modeling_llama import LlamaDecoderLayer + +_original_llama_decoder_layer_init = LlamaDecoderLayer.__init__ + +def _llama_decoder_layer_init_patched(self, config: LlamaConfig, layer_idx: int): + _original_llama_decoder_layer_init(self, config, layer_idx) + self.mlp.layer_idx = layer_idx + +LlamaDecoderLayer.__init__ = _llama_decoder_layer_init_patched + + +def _init_weights_patched(self, module): + """ + Patched version of _init_weights to match TorchTitan's initialization for Llama. + `self` is a LlamaPreTrainedModel instance. + """ + config = self.config + + if isinstance(module, (LlamaAttention, LlamaMLP)): + layer_idx = module.layer_idx + + if config.depth_init: + init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 + else: + init_std = 0.02 / (2 * config.num_hidden_layers) ** 0.5 + + if isinstance(module, LlamaAttention): + nn.init.trunc_normal_(module.q_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(module.k_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(module.v_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(module.o_proj.weight, mean=0.0, std=init_std) + + elif isinstance(module, LlamaMLP): + nn.init.trunc_normal_(module.gate_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(module.up_proj.weight, mean=0.0, std=init_std) + nn.init.trunc_normal_(module.down_proj.weight, mean=0.0, std=init_std) + + elif module is getattr(self, "lm_head", None): #TODO(3outeille): find a better way to detect lm_head + final_out_std = config.hidden_size**-0.5 + cutoff_factor = 3 + nn.init.trunc_normal_( + module.weight, + mean=0.0, + std=final_out_std, + a=-cutoff_factor * final_out_std, + b=cutoff_factor * final_out_std, + ) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + std = config.initializer_range + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + elif ( + isinstance(module, (nn.GroupNorm, nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)) + or "LayerNorm" in module.__class__.__name__ + or "RMSNorm" in module.__class__.__name__ + ): + # Norms can exist without weights (in which case they are None from torch primitives) + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(1.0) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.zero_() + + +PreTrainedModel._init_weights = _init_weights_patched +PreTrainedModel._initialize_weights = _initialize_weights_patched @dataclass class HFTransformerModelArgs(LlamaConfig, BaseModelArgs): @@ -96,6 +192,9 @@ def update_from_config(self, job_config: JobConfig): # num_hidden_layers = 2 self.use_cache = False + + # HF numerical stability matching + self.initializer_range = 1.0 # use as std for normal init in embedding return self def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: diff --git a/torchtitan/models/attention.py b/torchtitan/models/attention.py index f66361a6d2..9d99622cc1 100644 --- a/torchtitan/models/attention.py +++ 
b/torchtitan/models/attention.py @@ -205,9 +205,9 @@ def _init_backend(cls) -> None: # Add CuDNN on B200 w/ highest priority cls.backends = [ - SDPBackend.FLASH_ATTENTION, + # SDPBackend.FLASH_ATTENTION, SDPBackend.EFFICIENT_ATTENTION, - SDPBackend.MATH, + # SDPBackend.MATH, ] if has_cuda_capability(10, 0): cls.backends.insert(0, SDPBackend.CUDNN_ATTENTION) diff --git a/torchtitan/train.py b/torchtitan/train.py index 3c9718df1b..7b43e6b866 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -35,33 +35,6 @@ ) from transformers.models.llama.modeling_llama import LlamaForCausalLM, CausalLMOutputWithPast -from transformers.modeling_utils import PreTrainedModel - - -# NOTE(3outeille): monkey-patch PreTrainedModel to handle meta device initialization correctly -# The default _initialize_weights sets _is_hf_initialized = True even on a meta device, -# which prevents subsequent proper initialization. -def _initialize_weights_patched(self, module): - """ - Patched version of _initialize_weights that skips initialization and setting - the _is_hf_initialized flag if the module is on a meta device. - """ - if getattr(module, "_is_hf_initialized", False): - return - - # Check if any parameter is on the meta device - for param in module.parameters(recurse=False): - if param.device.type == "meta": - return - - #TODO(3outeille): check if register bufffer is init - - # If not on a meta device, call the original weight initialization - self._init_weights(module) - module._is_hf_initialized = True - - -PreTrainedModel._initialize_weights = _initialize_weights_patched class Trainer(torch.distributed.checkpoint.stateful.Stateful): @@ -294,6 +267,10 @@ def __init__(self, job_config: JobConfig): model.to_empty(device=init_device) with torch.no_grad(): if isinstance(model, LlamaForCausalLM): + print("Now done with meta device, calling post_init") + for m in model.modules(): + if hasattr(m, "_is_hf_initialized"): + m._is_hf_initialized = False model.post_init() else: model.init_weights(buffer_device=buffer_device) From 40d84cc4098c51ceac8e30fc966dc787a0905a43 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 15 Sep 2025 14:02:03 +0000 Subject: [PATCH 015/129] handling the is_hf_initialized flag in patch --- .../transformers_backend/model/hf_transformers_args.py | 3 +-- torchtitan/train.py | 4 ---- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 61282d9fb0..5a64dd0dc6 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -28,8 +28,7 @@ def _initialize_weights_patched(self, module): if getattr(module, "_is_hf_initialized", False): return - # Check if any parameter is on the meta device - for param in module.parameters(recurse=False): + for param in module.parameters(recurse=True): if param.device.type == "meta": return diff --git a/torchtitan/train.py b/torchtitan/train.py index 7b43e6b866..f3617eb415 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -267,10 +267,6 @@ def __init__(self, job_config: JobConfig): model.to_empty(device=init_device) with torch.no_grad(): if isinstance(model, LlamaForCausalLM): - print("Now done with meta device, calling post_init") - for m in model.modules(): - if hasattr(m, "_is_hf_initialized"): - m._is_hf_initialized = False model.post_init() else: 
model.init_weights(buffer_device=buffer_device) From bd3f3327060b1ef56583f27a8f01c8b7d8390e74 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 16 Sep 2025 09:12:46 +0000 Subject: [PATCH 016/129] refactor HF transformer model args --- .../model/hf_transformers_args.py | 209 ++++++++++++------ 1 file changed, 147 insertions(+), 62 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 5a64dd0dc6..d558ec1550 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -89,6 +89,7 @@ def _init_weights_patched(self, module): ) if module.bias is not None: module.bias.data.zero_() + elif isinstance(module, nn.Embedding): std = config.initializer_range module.weight.data.normal_(mean=0.0, std=std) @@ -112,88 +113,172 @@ def _init_weights_patched(self, module): @dataclass class HFTransformerModelArgs(LlamaConfig, BaseModelArgs): - # Torchtitan naming - dim: int = 4096 - n_layers: int = 32 - n_heads: int = 32 - n_kv_heads: Optional[int] = None - vocab_size: int = 128256 - multiple_of: int = 256 - ffn_dim_multiplier: Optional[float] = None - norm_eps: float = 1e-5 - rope_theta: float = 10000 - - max_seq_len: int = 2048 - depth_init: bool = True - use_flex_attn: bool = False - attn_mask_type: str = "causal" - eos_id: int = 0 - - # HF args - attn_implementation: str = "sdpa" - - passed_args: dict = field(init=False, repr=False, default_factory=dict) + """ + Configuration class that bridges TorchTitan and HuggingFace Transformers naming conventions. + + Uses properties to provide TorchTitan-style access while maintaining HuggingFace compatibility. 
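+    Minimal usage sketch (illustrative values only):
+        args = HFTransformerModelArgs(dim=256, n_layers=6)
+        assert args.hidden_size == 256  # `dim` aliases the HF `hidden_size` field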
+ """ + + def __init__( + self, + # TorchTitan args + dim: int = 4096, + n_layers: int = 32, + n_heads: int = 32, + n_kv_heads: Optional[int] = None, + vocab_size: int = 128256, + multiple_of: int = 256, + ffn_dim_multiplier: Optional[float] = None, + norm_eps: float = 1e-5, + rope_theta: float = 10000, + max_seq_len: int = 2048, + depth_init: bool = True, + use_flex_attn: bool = False, + attn_mask_type: str = "causal", + eos_id: int = 0, + # HuggingFace specific args + attn_implementation: str = "sdpa", + **kwargs + ): + # Map TorchTitan arguments to HuggingFace arguments for parent class initialization + hf_config_dict = dict( + hidden_size=dim, + num_hidden_layers=n_layers, + num_attention_heads=n_heads, + num_key_value_heads=n_kv_heads, + vocab_size=vocab_size, + rms_norm_eps=norm_eps, + rope_theta=rope_theta, + max_position_embeddings=max_seq_len, + eos_token_id=eos_id, + **kwargs + ) + + super().__init__(**hf_config_dict) + + # Store TorchTitan-specific args (no HF equivalent) + self.multiple_of = multiple_of + self.ffn_dim_multiplier = ffn_dim_multiplier + self.depth_init = depth_init + self.use_flex_attn = use_flex_attn + self.attn_mask_type = attn_mask_type + + # HuggingFace specific args + self.attn_implementation = attn_implementation + + self._passed_args = dict( + dim=dim, + n_layers=n_layers, + n_heads=n_heads, + n_kv_heads=n_kv_heads, + vocab_size=vocab_size, + multiple_of=multiple_of, + ffn_dim_multiplier=ffn_dim_multiplier, + norm_eps=norm_eps, + rope_theta=rope_theta, + max_seq_len=max_seq_len, + depth_init=depth_init, + use_flex_attn=use_flex_attn, + attn_mask_type=attn_mask_type, + eos_id=eos_id, + attn_implementation=attn_implementation, + **kwargs + ) + + @property + def dim(self) -> int: + """TorchTitan: Model dimension (alias for HF hidden_size)""" + return self.hidden_size + + @dim.setter + def dim(self, value: int): + self.hidden_size = value + + @property + def n_layers(self) -> int: + """TorchTitan: Number of layers (alias for HF num_hidden_layers)""" + return self.num_hidden_layers + + @n_layers.setter + def n_layers(self, value: int): + self.num_hidden_layers = value + + @property + def n_heads(self) -> int: + """TorchTitan: Number of attention heads (alias for HF num_attention_heads)""" + return self.num_attention_heads + + @n_heads.setter + def n_heads(self, value: int): + self.num_attention_heads = value + + @property + def n_kv_heads(self) -> Optional[int]: + """TorchTitan: Number of key-value heads (alias for HF num_key_value_heads)""" + return self.num_key_value_heads + + @n_kv_heads.setter + def n_kv_heads(self, value: Optional[int]): + self.num_key_value_heads = value + + @property + def norm_eps(self) -> float: + """TorchTitan: Layer norm epsilon (alias for HF rms_norm_eps)""" + return self.rms_norm_eps + + @norm_eps.setter + def norm_eps(self, value: float): + self.rms_norm_eps = value + + @property + def max_seq_len(self) -> int: + """TorchTitan: Maximum sequence length (alias for HF max_position_embeddings)""" + return self.max_position_embeddings + + @max_seq_len.setter + def max_seq_len(self, value: int): + self.max_position_embeddings = value + + @property + def eos_id(self) -> int: + """TorchTitan: End of sequence token ID (alias for HF eos_token_id)""" + return self.eos_token_id + + @eos_id.setter + def eos_id(self, value: int): + self.eos_token_id = value def update_from_config(self, job_config: JobConfig): - + # Load HF config (overwrites our HF attributes) hf_model_config = LlamaConfig.from_pretrained( job_config.model.name, 
attn_implementation=self.attn_implementation, ) - # n_layers = 32 - self.__dict__.update(hf_model_config.__dict__) - - # num_hidden_layers = 16 - - # Update TT args with HF args (for keys that exist in both but differ in namings) - self.dim = self.hidden_size - self.n_layers = self.num_hidden_layers - self.n_heads = self.num_attention_heads - self.n_kv_heads = self.num_key_value_heads - self.norm_eps = self.rms_norm_eps - self.max_seq_len = self.max_position_embeddings - self.eos_id = self.eos_token_id - # n_layers = 16 - - self.__dict__.update(self.passed_args) + self.__dict__.update(hf_model_config.__dict__) - # n_layers = 2 - # num_hidden_layers = 16 - - # Update HF args with TT override args because HF modeling uses HF args and not TT args - # TODO(3outeille): find a cleaner way to handle the mapping - self.hidden_size = self.dim - self.num_hidden_layers = self.n_layers - self.num_attention_heads = self.n_heads - self.num_key_value_heads = self.n_kv_heads - self.rms_norm_eps = self.norm_eps - self.max_position_embeddings = self.max_seq_len - self.eos_token_id = self.eos_id + # Update our attributes with the passed args from flavors + for key, value in self._passed_args.items(): + if hasattr(self, key): + setattr(self, key, value) - # Match torchtitan parameter counts + # Configure HF-specific settings to match TorchTitan settings self.tie_word_embeddings = False self.attention_bias = False self.mlp_bias = False - - # Match torchtitan intermediate size calculation - ffn_hidden_size = 4 * self.hidden_size + self.use_cache = False + self.initializer_range = 1.0 # use as std for normal init in embedding + + ffn_hidden_size = 4 * self.dim ffn_hidden_size = int(2 * ffn_hidden_size / 3) if self.ffn_dim_multiplier is not None: ffn_hidden_size = int(self.ffn_dim_multiplier * ffn_hidden_size) self.intermediate_size = self.multiple_of * ( (ffn_hidden_size + self.multiple_of - 1) // self.multiple_of ) - # Forced it as HF has config.head_dim and the modeling retrieves it instead of doing config.hidden_size // config.num_attention_heads + self.head_dim = self.dim // self.num_attention_heads - # n_layers = 2 - # num_hidden_layers = 2 - - self.use_cache = False - - # HF numerical stability matching - self.initializer_range = 1.0 # use as std for normal init in embedding return self def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: From 249be928393bb82534dcbbf34986c2386bb7332a Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 16 Sep 2025 09:33:07 +0000 Subject: [PATCH 017/129] wrapper model class to avoid transformers to be explicit in train.py --- .../transformers_backend/__init__.py | 6 +-- .../model/hf_transformers_args.py | 39 ++++++++++++++++++- torchtitan/train.py | 5 +-- 3 files changed, 41 insertions(+), 9 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index b8fc47b9e7..9f7ee13484 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -16,14 +16,14 @@ from torchtitan.protocols.train_spec import register_train_spec, TrainSpec from .infra.parallelize_hf_transformers import parallelize_hf_transformers -from .model.hf_transformers_args import HFTransformerModelArgs +from .model.hf_transformers_args import HFTransformerModelArgs, HFTransformerModel from transformers.models.llama.modeling_llama import LlamaForCausalLM __all__ = [ "HFTransformerModelArgs", - "LlamaForCausalLM", #TODO(3outeille): 
later use AutoModelForCausalLM + "HFTransformerModel", "hf_transformers_configs", ] @@ -52,7 +52,7 @@ def hf_transformer_model_args_builder(**kwargs): hf_train_spec = TrainSpec( name="hf_auto_model", - model_cls=LlamaForCausalLM, + model_cls=HFTransformerModel, model_args=flavors, parallelize_fn=parallelize_hf_transformers, pipelining_fn=pipeline_llama, diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index d558ec1550..fad9e35f28 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -16,6 +16,7 @@ from transformers.modeling_utils import PreTrainedModel from transformers.models.llama.modeling_llama import LlamaAttention, LlamaMLP +from transformers.models.llama.modeling_llama import LlamaForCausalLM # NOTE(3outeille): monkey-patch PreTrainedModel to handle meta device initialization correctly # The default _initialize_weights sets _is_hf_initialized = True even on a meta device, @@ -32,8 +33,6 @@ def _initialize_weights_patched(self, module): if param.device.type == "meta": return - #TODO(3outeille): check if register bufffer is init - # If not on a meta device, call the original weight initialization self._init_weights(module) module._is_hf_initialized = True @@ -336,3 +335,39 @@ def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, in l, h, q, t = self.n_layers, self.n_heads, self.dim // self.n_heads, seq_len num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t return nparams, num_flops_per_token + + +class HFTransformerModel(LlamaForCausalLM): + def __init__(self, model_args: HFTransformerModelArgs): + super().__init__(model_args) + + def init_weights(self, *args, **kwargs): + # Taken from transformers.modeling_utils.PreTrainedModel.init_weights + super().init_weights() + self._backward_compatibility_gradient_checkpointing() + + # Make sure the modules correctly exist if the flag is active + if self._keep_in_fp32_modules is not None or self._keep_in_fp32_modules_strict is not None: + all_parameters = {name for name, _ in self.named_parameters() if len(name) > 0} + unique_module_names = set() + # Get all unique module names in the module graph, without the prefixes + for param in all_parameters: + unique_module_names.update( + [name for name in param.split(".") if not name.isnumeric() and name not in ["weight", "bias"]] + ) + # Check that every module in the keep_in_fp32 list is part of the module graph + if self._keep_in_fp32_modules is not None: + for module in self._keep_in_fp32_modules: + if module not in unique_module_names: + raise ValueError( + f"{module} was specified in the `_keep_in_fp32_modules` list, but is not part of the modules in" + f" {self.__class__.__name__}" + ) + + if self._keep_in_fp32_modules_strict is not None: + for module in self._keep_in_fp32_modules_strict: + if module not in unique_module_names: + raise ValueError( + f"{module} was specified in the `_keep_in_fp32_modules_strict` list, but is not part of the modules in" + f" {self.__class__.__name__}" + ) \ No newline at end of file diff --git a/torchtitan/train.py b/torchtitan/train.py index f3617eb415..7ae5881f2a 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -266,10 +266,7 @@ def __init__(self, job_config: JobConfig): torch.utils.deterministic.fill_uninitialized_memory = False model.to_empty(device=init_device) with 
torch.no_grad(): - if isinstance(model, LlamaForCausalLM): - model.post_init() - else: - model.init_weights(buffer_device=buffer_device) + model.init_weights(buffer_device=buffer_device) model.train() self.model_parts = [model] From e2d4adaca2bd00bce2d069b9e968a6dcc8e51c1d Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 16 Sep 2025 10:02:07 +0000 Subject: [PATCH 018/129] add better testing script with reference log for later sanity check --- .../transformers_backend/compare_tt_hf_run.sh | 10 ++- .../configs/debug_1_gpu.toml | 62 -------------- .../configs/debug_1_gpu_hf.toml | 6 +- .../configs/debug_1_gpu_tt.toml | 83 +++++++++++++++++++ .../reference_diff_llama3_1gpu.log | 61 ++++++++++++++ .../test_hf_torchtitan_model_args.py | 51 ------------ torchtitan/models/llama3/model/args.py | 75 +++++++++++++---- 7 files changed, 212 insertions(+), 136 deletions(-) delete mode 100644 torchtitan/experiments/transformers_backend/configs/debug_1_gpu.toml create mode 100644 torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.toml create mode 100644 torchtitan/experiments/transformers_backend/reference_diff_llama3_1gpu.log delete mode 100644 torchtitan/experiments/transformers_backend/test_hf_torchtitan_model_args.py diff --git a/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh b/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh index 81b33091fb..0461ebfb7b 100755 --- a/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh +++ b/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh @@ -17,7 +17,7 @@ run_tt() { echo "##############################################" echo "### Running TorchTitan (native) training ###" echo "##############################################" - TT_CONFIG="/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/models/llama3/train_configs/my_debug_model.toml" + TT_CONFIG="/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.toml" # Use CUDA_VISIBLE_DEVICES=0 for TT run CUDA_VISIBLE_DEVICES=0 \ @@ -44,9 +44,11 @@ TT_LOG="tt_run.log" HF_LOG="hf_run.log" DIFF_LOG="run_diff.log" -run_tt "$@" 2>&1 | tee ${TT_LOG} -# run_hf "$@" 2>&1 | tee ${HF_LOG} -run_tt "$@" 2>&1 | tee ${HF_LOG} +export DEBUG_JSON_PATH="/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/debug_mode_hf" +run_hf "$@" 2>&1 | tee ${HF_LOG} || true +export DEBUG_JSON_PATH="/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/debug_mode_tt" +run_tt "$@" 2>&1 | tee ${TT_LOG} || true +# run_tt "$@" 2>&1 | tee ${HF_LOG} # Filter logs to remove noisy differences diff --git a/torchtitan/experiments/transformers_backend/configs/debug_1_gpu.toml b/torchtitan/experiments/transformers_backend/configs/debug_1_gpu.toml deleted file mode 100644 index 34f6953869..0000000000 --- a/torchtitan/experiments/transformers_backend/configs/debug_1_gpu.toml +++ /dev/null @@ -1,62 +0,0 @@ -[job] -dump_folder = "./outputs" -description = "Llama 3 debug training with FSDP on 2 GPUs" -print_args = false -use_for_integration_test = true - -[profiling] -enable_profiling = false -save_traces_folder = "profile_trace" -profile_freq = 10 -enable_memory_snapshot = false -save_memory_snapshot_folder = "memory_snapshot" - -[metrics] -log_freq = 1 -disable_color_printing = false -enable_tensorboard = false -save_tb_folder = "tb" -enable_wandb = false - -[model] -name = "llama3" -flavor = "debugmodel" -tokenizer_path = 
"/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" - -[optimizer] -name = "AdamW" -lr = 8e-4 -eps = 1e-8 - -[lr_scheduler] -warmup_steps = 2 -decay_ratio = 0.8 -decay_type = "linear" -min_lr_factor = 0.0 - -[training] -local_batch_size = 8 -seq_len = 2048 -max_norm = 1.0 -steps = 10 -compile = false -dataset = "c4_test" -dataset_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test" - -[parallelism] -data_parallel_replicate_degree = 1 -data_parallel_shard_degree = 1 -tensor_parallel_degree = 1 -pipeline_parallel_degree = 1 -context_parallel_degree = 1 -expert_parallel_degree = 1 - -[checkpoint] -enable_checkpoint = false - -[activation_checkpoint] -mode = "selective" -selective_ac_option = '2' - -[validation] -enabled = false \ No newline at end of file diff --git a/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml b/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml index 3144011b62..95aa9599b2 100644 --- a/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml +++ b/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml @@ -5,9 +5,9 @@ print_args = false use_for_integration_test = true [profiling] -enable_profiling = false -save_traces_folder = "profile_trace" -profile_freq = 10 +enable_profiling = true +save_traces_folder = "profile_trace_hf" +profile_freq = 5 enable_memory_snapshot = false save_memory_snapshot_folder = "memory_snapshot" diff --git a/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.toml b/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.toml new file mode 100644 index 0000000000..b153a98f21 --- /dev/null +++ b/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.toml @@ -0,0 +1,83 @@ +# torchtitan Config.toml + +[job] +dump_folder = "./outputs" +description = "Llama 3 debug training" +print_args = false +use_for_integration_test = true + +[profiling] +enable_profiling = true +save_traces_folder = "profile_trace" +profile_freq = 5 +enable_memory_snapshot = false +save_memory_snapshot_folder = "memory_snapshot" + +[metrics] +log_freq = 1 +disable_color_printing = false +enable_tensorboard = false +save_tb_folder = "tb" +enable_wandb = false + +[model] +name = "llama3" +flavor = "debugmodel" +# test folder with tokenizer.json, for debug purpose only +hf_assets_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" +# converters = ["float8"] + +[optimizer] +name = "AdamW" +lr = 8e-4 +eps = 1e-8 + +[lr_scheduler] +warmup_steps = 2 # lr scheduler warm up, normally 20% of the train steps +decay_ratio = 0.8 # lr scheduler decay ratio, 80% of the train steps +decay_type = "linear" +min_lr_factor = 0.0 + +[training] +local_batch_size = 8 +seq_len = 2048 +max_norm = 1.0 # grad norm clipping +steps = 10 +dataset = "c4_test" # supported datasets: c4_test (2K), c4 (177M) +dataset_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test" + +[parallelism] +data_parallel_replicate_degree = 1 +data_parallel_shard_degree = 1 +fsdp_reshard_after_forward = "default" # default / never / always +tensor_parallel_degree = 1 +enable_async_tensor_parallel = false +pipeline_parallel_degree = 1 +context_parallel_degree = 1 + +[checkpoint] +enable = false +folder = "checkpoint" +interval = 10 +last_save_model_only = false +export_dtype = "float32" +async_mode = "disabled" # ["disabled", "async", "async_with_pinned_mem"] + +[activation_checkpoint] +mode = 
"selective" # ["none", "selective", "full"] +selective_ac_option = '2' # 'int' = ac every positive int layer or 'op', ac based on ops policy + +[compile] +enable=false +components = ["model", "loss"] + +[float8] +enable_fsdp_float8_all_gather = false +precompute_float8_dynamic_scale_for_fsdp = false +filter_fqns = ["output"] + +[validation] +enable = false +dataset = "c4_validation" +freq = 5 +steps = 10 diff --git a/torchtitan/experiments/transformers_backend/reference_diff_llama3_1gpu.log b/torchtitan/experiments/transformers_backend/reference_diff_llama3_1gpu.log new file mode 100644 index 0000000000..e134f15115 --- /dev/null +++ b/torchtitan/experiments/transformers_backend/reference_diff_llama3_1gpu.log @@ -0,0 +1,61 @@ +diff --git a/tt_run.log.filtered b/hf_run.log.filtered +index d3be70f..0f9a180 100644 +--- a/tt_run.log.filtered ++++ b/hf_run.log.filtered +@@ -1,22 +1,23 @@ ++ echo '##############################################' +##############################################'#######################################################' +####################################################### ++ echo '### Running TorchTitan (native)with HF backend training ###' +### Running TorchTitan (native)with HF backend training ### ++ echo '##############################################' +##############################################'#######################################################' +####################################################### ++ TT_CONFIG=/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.tomlHF_CONFIG=/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml ++ CUDA_VISIBLE_DEVICES=0CUDA_VISIBLE_DEVICES=1 ++ torchrun ... --master_port=XXXX --rdzv_backend c10d --rdzv_endpoint=localhost:XXXX --local-ranks-filter 0 --role rank --tee 3 -m torchtitan.train --job.config_file /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.toml/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml --training.seed 42 --training.deterministic +[rank0]:/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/transformers/src/transformers/utils/hub.py:111: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. +[rank0]: warnings.warn( +[rank0]:[titan] TIMESTAMP - root - WARNING - tokenizer_path is deprecated, use model.hf_assets_path instead. Setting hf_assets_path to tokenizer_path temporarily. +[rank0]:[titan] TIMESTAMP - root - INFO - Starting job: HF Llama 3 debug training +[rank0]:[titan] TIMESTAMP - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config +[rank0]:[titan] TIMESTAMP - root - INFO - Building 0-D device mesh with [], [] +[rank0]:[titan] TIMESTAMP - root - INFO - [GC] Initial GC collection 0.00 seconds +[rank0]:[titan] TIMESTAMP - root - INFO - Deterministic algorithm enabled (expect perf degradation). 
+[rank0]:[titan] TIMESTAMP - root - INFO - Loading tokenizer from tokenizer.json +[rank0]:[titan] TIMESTAMP - root - INFO - Preparing c4_test dataset from /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test +[rank0]:[titan] TIMESTAMP - root - INFO - Building llama3meta-llama/Llama-3.2-1B debugmodel with TransformerModelArgs(_enforced='ThisHFTransformerModelArgs(_enforced='This field is used to enforce all fields have defaults.', dim=256, n_layers=6, n_heads=16, n_kv_heads=None, vocab_size=2000, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, rope_theta=500000, max_seq_len=2048, depth_init=True, use_flex_attn=False, attn_mask_type='causal', eos_id=0)defaults.') +[rank0]:[titan] TIMESTAMP - root - INFO - CUDA capacity: NVIDIA H100 80GB HBM3 with 79.44GiB memory +[rank0]:[titan] TIMESTAMP - root - INFO - Parameter breakdown: +[rank0]:[titan] TIMESTAMP - root - INFO - - embedding: 512,000 parameters +@@ -28,30 +29,29 @@ +[rank0]:[titan] TIMESTAMP - root - INFO - - layer_5: 852,480 parameters +[rank0]:[titan] TIMESTAMP - root - INFO - - final_norm: 256 parameters +[rank0]:[titan] TIMESTAMP - root - INFO - - lm_head: 512,000 parameters +[rank0]:[titan] TIMESTAMP - root - INFO - Model llama3meta-llama/Llama-3.2-1B debugmodel size: 6,139,136 total parameters +[rank0]:[titan] TIMESTAMP - root - INFO - Applied selective activation checkpointing to the model +[rank0]:[titan] TIMESTAMP - root - INFO - Peak FLOPS used for computing MFU: 9.890e+14 +[rank0]:[titan] TIMESTAMP - root - INFO - CUDA memory usage for model: 0.04GiB(0.05%) +[rank0]:[titan] TIMESTAMP - root - WARNING - model.safetensors.index.json not found at hf_assets_path: /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer/model.safetensors.index.json. Defaulting to saving a single safetensors file if checkpoint is saved in HF format +[rank0]:[titan] TIMESTAMP - root - INFO - Mixed precision training is handled by AMP +[rank0]:[titan] TIMESTAMP - root - INFO - Trainer is initialized with local batch size 8, global batch size 8, gradient accumulation steps 1, sequence length 2048, total steps 10 (warmup 2) +[rank0]:[titan] TIMESTAMP - root - INFO - Training starts at step 1 +[rank0]:[titan] TIMESTAMP - root - INFO - Profiling active. 
Traces will be saved at ./outputs/profile_trace./outputs/profile_trace_hf +[rank0]:[titan] TIMESTAMP - root - INFO - step: 1 loss: 7.87237.8704 grad_norm: 1.51671.5185 memory: 1.39GiB(1.75%)1.67GiB(2.10%) tps: 44,58534,083 tflops: 3.192.54 mfu: 0.32%0.26% +[rank0]:[titan] TIMESTAMP - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40 +[rank0]:[titan] TIMESTAMP - root - INFO - step: 2 loss: 7.52467.5209 grad_norm: 1.63591.6373 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 21,05219,870 tflops: 1.511.48 mfu: 0.15% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 3 loss: 6.79006.7789 grad_norm: 2.03452.0390 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 334,947199,616 tflops: 23.9514.89 mfu: 2.42%1.51% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 4 loss: 5.98295.9673 grad_norm: 2.41292.4176 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 357,001207,967 tflops: 25.5315.51 mfu: 2.58%1.57% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 5 loss: 5.05365.0388 grad_norm: 2.53052.5275 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 279,263188,745 tflops: 19.9714.08 mfu: 2.02%1.42% +[rank0]:[titan] TIMESTAMP - root - INFO - Dumping profiler traces at step 5 +[rank0]:[titan] TIMESTAMP - root - INFO - Finished dumping profiler traces in 0.020.04 seconds +[rank0]:[titan] TIMESTAMP - root - INFO - step: 6 loss: 4.63704.6283 grad_norm: 2.28262.2818 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 129,46483,088 tflops: 9.266.20 mfu: 0.94%0.63% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 7 loss: 4.31334.3077 grad_norm: 2.10192.1023 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 298,394175,561 tflops: 21.3413.09 mfu: 2.16%1.32% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 8 loss: 4.13984.1349 grad_norm: 1.93421.9334 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 352,929206,086 tflops: 25.2415.37 mfu: 2.55%1.55% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 9 loss: 4.53264.5289 grad_norm: 1.51111.5103 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 357,192208,947 tflops: 25.5415.58 mfu: 2.58%1.58% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 10 loss: 3.98593.9828 grad_norm: 1.77991.7849 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 287,408189,593 tflops: 20.5514.14 mfu: 2.08%1.43% +[rank0]:[titan] TIMESTAMP - root - INFO - Dumping profiler traces at step 10 +[rank0]:[titan] TIMESTAMP - root - INFO - Finished dumping profiler traces in 0.030.04 seconds +[rank0]:[titan] TIMESTAMP - root - INFO - Sleeping 2 seconds for other ranks to complete +[rank0]:[titan] TIMESTAMP - root - INFO - Training completed +[rank0]:[titan] TIMESTAMP - root - INFO - Process group destroyed diff --git a/torchtitan/experiments/transformers_backend/test_hf_torchtitan_model_args.py b/torchtitan/experiments/transformers_backend/test_hf_torchtitan_model_args.py deleted file mode 100644 index d83f268091..0000000000 --- a/torchtitan/experiments/transformers_backend/test_hf_torchtitan_model_args.py +++ /dev/null @@ -1,51 +0,0 @@ -from transformers.models.llama.configuration_llama import LlamaConfig -from torchtitan.experiments.transformers_backend.model.hf_transformers_args import ( - HFTransformerModelArgs, -) -from torchtitan.config import JobConfig - - -def print_comparison_keys(ref_dict, tt_dict): - all_keys = sorted(list(set(ref_dict.keys()) | set(tt_dict.keys()))) - print(f"{'Attribute':<30} | {'Original HF':<20} | {'TorchTitan HF':<20}") - print("-" * 75) - for key in all_keys: - ref_val = ref_dict.get(key, "N/A") - tt_val = tt_dict.get(key, "N/A") - if str(ref_val) != str(tt_val): - # Red for different - print(f"\033[91m{key:<30} 
| {str(ref_val):<20} | {str(tt_val):<20}\033[0m") - else: - print(f"{key:<30} | {str(ref_val):<20} | {str(tt_val):<20}") - -def compare_hf_tt_configs(model_name, flavor): - ref_hf_config = LlamaConfig() - - model_args = HFTransformerModelArgs() - job_config = JobConfig() - job_config.model.name = model_name - job_config.model.flavor = flavor - model_args.update_from_config(job_config) - tt_hf_config = model_args.convert_to_hf_config() - - ref_dict = ref_hf_config.to_dict() - tt_dict = tt_hf_config.to_dict() - - try: - assert ref_dict == tt_dict - print(f"✅ Configs match for model name {model_name} with flavor: {flavor}") - except AssertionError: - print(f"❌ Configs do not match for model name {model_name} with flavor: {flavor}! Showing differences:") - print_comparison_keys(ref_dict, tt_dict) - raise - -if __name__ == "__main__": - model_names = [ - "meta-llama/Llama-3.2-1B", - ] - flavors = ["full"] - - for model_name in model_names: - for flavor in flavors: - print(f"\nTesting model name: {model_name} with flavor: {flavor}") - compare_hf_tt_configs(model_name, flavor) \ No newline at end of file diff --git a/torchtitan/models/llama3/model/args.py b/torchtitan/models/llama3/model/args.py index e2f698f8b1..1728d9b93e 100644 --- a/torchtitan/models/llama3/model/args.py +++ b/torchtitan/models/llama3/model/args.py @@ -53,25 +53,68 @@ def update_from_config(self, job_config: JobConfig, **kwargs) -> None: self.max_seq_len = seq_len def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: + """ + Count parameters and estimate flops for a TT (TorchTitan) model. + + Args: + model (nn.Module): The TT model (not HF). + seq_len (int): Sequence length. + + Returns: + tuple[int, int]: (nparams, num_flops_per_token) + """ nparams = sum(p.numel() for p in model.parameters()) + + layer_params = {} # layer_id -> int + embedding_params = 0 + norm_params = 0 + lm_head_params = 0 + misc_params = {} + + # TT model: top-level modules are tok_embeddings, layers (ModuleDict), norm, output + for name, p in model.named_parameters(): + if name.startswith("tok_embeddings."): + embedding_params += p.numel() + elif name.startswith("layers."): + try: + # layers.. 
+ layer_id = int(name.split(".")[1]) + if layer_id not in layer_params: + layer_params[layer_id] = 0 + layer_params[layer_id] += p.numel() + except (ValueError, IndexError): + # Should not happen, but catch any oddities + component = "misc_layer_parts" + if component not in misc_params: + misc_params[component] = 0 + misc_params[component] += p.numel() + elif name.startswith("norm."): + norm_params += p.numel() + elif name.startswith("output."): + lm_head_params += p.numel() + else: + # Catch anything else + component = name.split(".")[0] + if component not in misc_params: + misc_params[component] = 0 + misc_params[component] += p.numel() + + logger.info("Parameter breakdown:") + logger.info(f" - embedding: {embedding_params:,} parameters") + for layer_num in sorted(layer_params.keys()): + params = layer_params[layer_num] + logger.info(f" - layer_{layer_num}: {params:,} parameters") + logger.info(f" - final_norm: {norm_params:,} parameters") + logger.info(f" - lm_head: {lm_head_params:,} parameters") + if misc_params: + for name, params in misc_params.items(): + logger.info(f" - {name} (misc): {params:,} parameters") + + # For TT, embedding is always model.tok_embeddings nparams_embedding = sum( - sum(p.numel() for p in m.parameters()) - for m in model.children() - if isinstance(m, nn.Embedding) + p.numel() for p in getattr(model, "tok_embeddings", nn.Module()).parameters() ) - l, h, q, t = ( - self.n_layers, - self.n_heads, - self.dim // self.n_heads, - seq_len, - ) - # Reasoning behind the factor of 12 for the self-attention part of the formula: - # 1. each self-attention has 2 matmul in the forward and 4 in the backward (6) - # 2. the flash attention does 1 more matmul recomputation in the backward - # but recomputation should not be counted in calculating MFU (+0) - # 3. each matmul performs 1 multiplication and 1 addition (*2) - # 4. 
we follow the convention and do not account for sparsity in causal attention + l, h, q, t = self.n_layers, self.n_heads, self.dim // self.n_heads, seq_len num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t - return nparams, num_flops_per_token From 4b498a94fa9a02679005147cc6ae23460e11af45 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 16 Sep 2025 12:05:49 +0000 Subject: [PATCH 019/129] no need to fill passed args --- .../transformers_backend/__init__.py | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 9f7ee13484..c4b5256d83 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -18,36 +18,25 @@ from .infra.parallelize_hf_transformers import parallelize_hf_transformers from .model.hf_transformers_args import HFTransformerModelArgs, HFTransformerModel -from transformers.models.llama.modeling_llama import LlamaForCausalLM - - __all__ = [ "HFTransformerModelArgs", "HFTransformerModel", "hf_transformers_configs", ] - -def hf_transformer_model_args_builder(**kwargs): - # Capture the kwargs in the passed_args field - args = HFTransformerModelArgs(**kwargs) - args.passed_args = kwargs - return args - - flavors = { - "debugmodel": hf_transformer_model_args_builder( + "debugmodel": HFTransformerModelArgs( # n_layers=2, # vocab_size=2000, max_seq_len=2048, #TODO(3outeille): n_kv_heads=n_heads may be handle somewhere else dim=256, n_layers=6, n_heads=16, vocab_size=2000, rope_theta=500000, n_kv_heads=16 ), - "medium": hf_transformer_model_args_builder( + "medium": HFTransformerModelArgs( dim=1024, n_layers=12, ), - "full": hf_transformer_model_args_builder(), + "full": HFTransformerModelArgs(), } hf_train_spec = TrainSpec( From eb403d5e0a45b7da7586e8d384562c6f22214e86 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 16 Sep 2025 15:11:04 +0000 Subject: [PATCH 020/129] can now handle multiple HF modeling --- .../infra/parallelize_hf_transformers.py | 6 +- .../model/hf_transformers_args.py | 94 ++++++++----------- torchtitan/models/llama3/infra/parallelize.py | 2 - torchtitan/train.py | 2 +- 4 files changed, 41 insertions(+), 63 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py index 2f0d9167b0..76d2d8adb4 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -140,13 +140,11 @@ def selective_checkpointing_context_fn(): def apply_ac(model: nn.Module, ac_config: ACConfig): """Apply activation checkpointing to the model.""" - layers = model.model.layers - - for layer_id, transformer_block in layers.named_children(): + for layer_id, transformer_block in model.layers.named_children(): transformer_block = _apply_ac_to_transformer_block( transformer_block, ac_config, base_fqn=f"layers.{layer_id}" ) - layers.register_module(layer_id, transformer_block) + model.layers.register_module(layer_id, transformer_block) logger.info(f"Applied {ac_config.mode} activation checkpointing to the model") diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index fad9e35f28..b0db5ba36b 100644 
--- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -4,19 +4,18 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -from dataclasses import dataclass, field -from typing import Optional, Union -import os +from dataclasses import dataclass +from typing import Optional from torch import nn from torchtitan.config import JobConfig from torchtitan.protocols import BaseModelArgs from torchtitan.tools.logging import logger from transformers.models.llama.configuration_llama import LlamaConfig - -from transformers.modeling_utils import PreTrainedModel from transformers.models.llama.modeling_llama import LlamaAttention, LlamaMLP -from transformers.models.llama.modeling_llama import LlamaForCausalLM +from transformers.modeling_utils import PreTrainedModel +from transformers import AutoConfig +from transformers.configuration_utils import PretrainedConfig # NOTE(3outeille): monkey-patch PreTrainedModel to handle meta device initialization correctly # The default _initialize_weights sets _is_hf_initialized = True even on a meta device, @@ -111,7 +110,7 @@ def _init_weights_patched(self, module): PreTrainedModel._initialize_weights = _initialize_weights_patched @dataclass -class HFTransformerModelArgs(LlamaConfig, BaseModelArgs): +class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): """ Configuration class that bridges TorchTitan and HuggingFace Transformers naming conventions. @@ -138,23 +137,7 @@ def __init__( # HuggingFace specific args attn_implementation: str = "sdpa", **kwargs - ): - # Map TorchTitan arguments to HuggingFace arguments for parent class initialization - hf_config_dict = dict( - hidden_size=dim, - num_hidden_layers=n_layers, - num_attention_heads=n_heads, - num_key_value_heads=n_kv_heads, - vocab_size=vocab_size, - rms_norm_eps=norm_eps, - rope_theta=rope_theta, - max_position_embeddings=max_seq_len, - eos_token_id=eos_id, - **kwargs - ) - - super().__init__(**hf_config_dict) - + ): # Store TorchTitan-specific args (no HF equivalent) self.multiple_of = multiple_of self.ffn_dim_multiplier = ffn_dim_multiplier @@ -249,7 +232,7 @@ def eos_id(self, value: int): def update_from_config(self, job_config: JobConfig): # Load HF config (overwrites our HF attributes) - hf_model_config = LlamaConfig.from_pretrained( + hf_model_config = AutoConfig.from_pretrained( job_config.model.name, attn_implementation=self.attn_implementation, ) @@ -337,37 +320,36 @@ def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, in return nparams, num_flops_per_token -class HFTransformerModel(LlamaForCausalLM): +class HFTransformerModel(nn.Module): def __init__(self, model_args: HFTransformerModelArgs): - super().__init__(model_args) + super().__init__() + + # Try to import the model class dynamically from the transformers library if not found in globals + model_class_name = model_args.architectures[0] + model_cls = globals().get(model_class_name, None) + if model_cls is None: + try: + import importlib + transformers_mod = importlib.import_module("transformers") + model_cls = getattr(transformers_mod, model_class_name) + except (ImportError, AttributeError) as e: + raise ImportError( + f"Could not find model class '{model_class_name}' in globals or transformers. " + f"Make sure the class is available. 
Original error: {e}" + ) + self.model = model_cls(config=model_args) + + @property + def layers(self): + """Returns the model's layers, handling different Hugging Face model structures.""" + if hasattr(self.model, "model") and hasattr(self.model.model, "layers"): # Llama-like + return self.model.model.layers + else: + # Add more cases here if needed for other model architectures + raise AttributeError("Could not find layers in the model. Please check the model structure.") + + def forward(self, *args, **kwargs): + return self.model(*args, **kwargs) def init_weights(self, *args, **kwargs): - # Taken from transformers.modeling_utils.PreTrainedModel.init_weights - super().init_weights() - self._backward_compatibility_gradient_checkpointing() - - # Make sure the modules correctly exist if the flag is active - if self._keep_in_fp32_modules is not None or self._keep_in_fp32_modules_strict is not None: - all_parameters = {name for name, _ in self.named_parameters() if len(name) > 0} - unique_module_names = set() - # Get all unique module names in the module graph, without the prefixes - for param in all_parameters: - unique_module_names.update( - [name for name in param.split(".") if not name.isnumeric() and name not in ["weight", "bias"]] - ) - # Check that every module in the keep_in_fp32 list is part of the module graph - if self._keep_in_fp32_modules is not None: - for module in self._keep_in_fp32_modules: - if module not in unique_module_names: - raise ValueError( - f"{module} was specified in the `_keep_in_fp32_modules` list, but is not part of the modules in" - f" {self.__class__.__name__}" - ) - - if self._keep_in_fp32_modules_strict is not None: - for module in self._keep_in_fp32_modules_strict: - if module not in unique_module_names: - raise ValueError( - f"{module} was specified in the `_keep_in_fp32_modules_strict` list, but is not part of the modules in" - f" {self.__class__.__name__}" - ) \ No newline at end of file + self.model.post_init() \ No newline at end of file diff --git a/torchtitan/models/llama3/infra/parallelize.py b/torchtitan/models/llama3/infra/parallelize.py index 6da44a321d..1a2528be6d 100644 --- a/torchtitan/models/llama3/infra/parallelize.py +++ b/torchtitan/models/llama3/infra/parallelize.py @@ -34,8 +34,6 @@ from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp from torchtitan.tools.logging import logger -from transformers.models.llama.modeling_llama import LlamaForCausalLM - def parallelize_llama( model: nn.Module, parallel_dims: ParallelDims, diff --git a/torchtitan/train.py b/torchtitan/train.py index 7ae5881f2a..179f455e98 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -34,7 +34,7 @@ maybe_enable_profiling, ) -from transformers.models.llama.modeling_llama import LlamaForCausalLM, CausalLMOutputWithPast +from transformers.models.llama.modeling_llama import CausalLMOutputWithPast class Trainer(torch.distributed.checkpoint.stateful.Stateful): From a0d67a78ab45aba67e594d59b54116dd4c06742d Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 16 Sep 2025 15:14:10 +0000 Subject: [PATCH 021/129] handle pref logits accessing inside HF model wrapper --- .../transformers_backend/model/hf_transformers_args.py | 9 +++++++-- torchtitan/train.py | 3 --- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index b0db5ba36b..c257cbfcfd 100644 --- 
a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import importlib from dataclasses import dataclass from typing import Optional @@ -16,6 +17,8 @@ from transformers.modeling_utils import PreTrainedModel from transformers import AutoConfig from transformers.configuration_utils import PretrainedConfig +from transformers.modeling_outputs import CausalLMOutputWithPast + # NOTE(3outeille): monkey-patch PreTrainedModel to handle meta device initialization correctly # The default _initialize_weights sets _is_hf_initialized = True even on a meta device, @@ -329,7 +332,6 @@ def __init__(self, model_args: HFTransformerModelArgs): model_cls = globals().get(model_class_name, None) if model_cls is None: try: - import importlib transformers_mod = importlib.import_module("transformers") model_cls = getattr(transformers_mod, model_class_name) except (ImportError, AttributeError) as e: @@ -349,7 +351,10 @@ def layers(self): raise AttributeError("Could not find layers in the model. Please check the model structure.") def forward(self, *args, **kwargs): - return self.model(*args, **kwargs) + output = self.model(*args, **kwargs) + if isinstance(output, CausalLMOutputWithPast): + return output.logits + return output def init_weights(self, *args, **kwargs): self.model.post_init() \ No newline at end of file diff --git a/torchtitan/train.py b/torchtitan/train.py index 179f455e98..d7a399a1ce 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -464,9 +464,6 @@ def forward_backward_step( assert len(model_parts) == 1 with self.maybe_enable_amp: pred = model_parts[0](inputs) - #NOTE(3outeille): just trying to make it work for now. Will refactor later. 
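            # NOTE: the CausalLMOutputWithPast unwrapping removed below is not lost; it now
            # lives in HFTransformerModel.forward (see the hf_transformers_args.py hunk above
            # in this same patch), which returns output.logits when the HF model hands back a
            # CausalLMOutputWithPast, so the trainer keeps a single code path for native
            # TorchTitan models and HF-backed models.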
- if isinstance(pred, CausalLMOutputWithPast): - pred = pred.logits loss = self.loss_fn(pred, labels) # need to free to before bwd to avoid peaking memory del pred From ea05552507082936e4d7b92a71691fe4d37bac01 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 16 Sep 2025 15:21:17 +0000 Subject: [PATCH 022/129] isolate HF patch for llama in another file --- .../model/hf_llama_patch.py | 90 +++++++++++++++++ .../model/hf_transformers_args.py | 97 +------------------ 2 files changed, 92 insertions(+), 95 deletions(-) create mode 100644 torchtitan/experiments/transformers_backend/model/hf_llama_patch.py diff --git a/torchtitan/experiments/transformers_backend/model/hf_llama_patch.py b/torchtitan/experiments/transformers_backend/model/hf_llama_patch.py new file mode 100644 index 0000000000..28888f61a6 --- /dev/null +++ b/torchtitan/experiments/transformers_backend/model/hf_llama_patch.py @@ -0,0 +1,90 @@ + + +import torch.nn as nn + +from transformers.models.llama.configuration_llama import LlamaConfig +from transformers.models.llama.modeling_llama import LlamaAttention, LlamaMLP, LlamaDecoderLayer +from transformers.modeling_utils import PreTrainedModel + +_original_llama_decoder_layer_init = LlamaDecoderLayer.__init__ + +def _llama_decoder_layer_init_patched(self, config: LlamaConfig, layer_idx: int): + _original_llama_decoder_layer_init(self, config, layer_idx) + self.mlp.layer_idx = layer_idx + +def _initialize_weights_patched(self, module): + # NOTE(3outeille): monkey-patch PreTrainedModel to handle meta device initialization correctly + # The default _initialize_weights sets _is_hf_initialized = True even on a meta device, + # which prevents subsequent proper initialization. + if getattr(module, "_is_hf_initialized", False): + return + + for param in module.parameters(recurse=True): + if param.device.type == "meta": + return + + # If not on a meta device, call the original weight initialization + self._init_weights(module) + module._is_hf_initialized = True + +def _init_weights_patched(self, module): + """ + Patched version of _init_weights to match TorchTitan's initialization for Llama. + `self` is a LlamaPreTrainedModel instance. 
+ """ + config = self.config + + if isinstance(module, (LlamaAttention, LlamaMLP)): + layer_idx = module.layer_idx + + if config.depth_init: + init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 + else: + init_std = 0.02 / (2 * config.num_hidden_layers) ** 0.5 + + if isinstance(module, LlamaAttention): + nn.init.trunc_normal_(module.q_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(module.k_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(module.v_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(module.o_proj.weight, mean=0.0, std=init_std) + + elif isinstance(module, LlamaMLP): + nn.init.trunc_normal_(module.gate_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(module.up_proj.weight, mean=0.0, std=init_std) + nn.init.trunc_normal_(module.down_proj.weight, mean=0.0, std=init_std) + + elif module is getattr(self, "lm_head", None): #TODO(3outeille): find a better way to detect lm_head + final_out_std = config.hidden_size**-0.5 + cutoff_factor = 3 + nn.init.trunc_normal_( + module.weight, + mean=0.0, + std=final_out_std, + a=-cutoff_factor * final_out_std, + b=cutoff_factor * final_out_std, + ) + if module.bias is not None: + module.bias.data.zero_() + + elif isinstance(module, nn.Embedding): + std = config.initializer_range + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + elif ( + isinstance(module, (nn.GroupNorm, nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)) + or "LayerNorm" in module.__class__.__name__ + or "RMSNorm" in module.__class__.__name__ + ): + # Norms can exist without weights (in which case they are None from torch primitives) + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(1.0) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.zero_() + + +def patch_hf_llama(): + LlamaDecoderLayer.__init__ = _llama_decoder_layer_init_patched + PreTrainedModel._init_weights = _init_weights_patched + PreTrainedModel._initialize_weights = _initialize_weights_patched \ No newline at end of file diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index c257cbfcfd..5a8b724397 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -12,105 +12,12 @@ from torchtitan.config import JobConfig from torchtitan.protocols import BaseModelArgs from torchtitan.tools.logging import logger -from transformers.models.llama.configuration_llama import LlamaConfig -from transformers.models.llama.modeling_llama import LlamaAttention, LlamaMLP -from transformers.modeling_utils import PreTrainedModel from transformers import AutoConfig from transformers.configuration_utils import PretrainedConfig from transformers.modeling_outputs import CausalLMOutputWithPast - -# NOTE(3outeille): monkey-patch PreTrainedModel to handle meta device initialization correctly -# The default _initialize_weights sets _is_hf_initialized = True even on a meta device, -# which prevents subsequent proper initialization. -def _initialize_weights_patched(self, module): - """ - Patched version of _initialize_weights that skips initialization and setting - the _is_hf_initialized flag if the module is on a meta device. 
- """ - if getattr(module, "_is_hf_initialized", False): - return - - for param in module.parameters(recurse=True): - if param.device.type == "meta": - return - - # If not on a meta device, call the original weight initialization - self._init_weights(module) - module._is_hf_initialized = True - - -#TODO(3outeille): find a better way to do this -from transformers.models.llama.modeling_llama import LlamaDecoderLayer - -_original_llama_decoder_layer_init = LlamaDecoderLayer.__init__ - -def _llama_decoder_layer_init_patched(self, config: LlamaConfig, layer_idx: int): - _original_llama_decoder_layer_init(self, config, layer_idx) - self.mlp.layer_idx = layer_idx - -LlamaDecoderLayer.__init__ = _llama_decoder_layer_init_patched - - -def _init_weights_patched(self, module): - """ - Patched version of _init_weights to match TorchTitan's initialization for Llama. - `self` is a LlamaPreTrainedModel instance. - """ - config = self.config - - if isinstance(module, (LlamaAttention, LlamaMLP)): - layer_idx = module.layer_idx - - if config.depth_init: - init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 - else: - init_std = 0.02 / (2 * config.num_hidden_layers) ** 0.5 - - if isinstance(module, LlamaAttention): - nn.init.trunc_normal_(module.q_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(module.k_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(module.v_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(module.o_proj.weight, mean=0.0, std=init_std) - - elif isinstance(module, LlamaMLP): - nn.init.trunc_normal_(module.gate_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(module.up_proj.weight, mean=0.0, std=init_std) - nn.init.trunc_normal_(module.down_proj.weight, mean=0.0, std=init_std) - - elif module is getattr(self, "lm_head", None): #TODO(3outeille): find a better way to detect lm_head - final_out_std = config.hidden_size**-0.5 - cutoff_factor = 3 - nn.init.trunc_normal_( - module.weight, - mean=0.0, - std=final_out_std, - a=-cutoff_factor * final_out_std, - b=cutoff_factor * final_out_std, - ) - if module.bias is not None: - module.bias.data.zero_() - - elif isinstance(module, nn.Embedding): - std = config.initializer_range - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - elif ( - isinstance(module, (nn.GroupNorm, nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)) - or "LayerNorm" in module.__class__.__name__ - or "RMSNorm" in module.__class__.__name__ - ): - # Norms can exist without weights (in which case they are None from torch primitives) - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(1.0) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.zero_() - - -PreTrainedModel._init_weights = _init_weights_patched -PreTrainedModel._initialize_weights = _initialize_weights_patched +from .hf_llama_patch import patch_hf_llama +patch_hf_llama() @dataclass class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): From adefa2cd616cd848956e5bea252a9bcd63515942 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 16 Sep 2025 18:42:23 +0000 Subject: [PATCH 023/129] find hacky way to pass HF model.name through CLI --- torchtitan/experiments/transformers_backend/__init__.py | 5 +---- torchtitan/protocols/train_spec.py | 9 ++++++++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 
c4b5256d83..7ac18a1752 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -52,7 +52,4 @@ build_loss_fn=build_cross_entropy_loss, ) -# Register multiple train_specs under the same name -register_train_spec(hf_train_spec) -register_train_spec(dataclasses.replace(hf_train_spec, name="meta-llama/Llama-3.2-3B")) -register_train_spec(dataclasses.replace(hf_train_spec, name="meta-llama/Llama-3.2-1B")) \ No newline at end of file +register_train_spec(hf_train_spec) \ No newline at end of file diff --git a/torchtitan/protocols/train_spec.py b/torchtitan/protocols/train_spec.py index 06fa3a1bc6..0feaaa38cc 100644 --- a/torchtitan/protocols/train_spec.py +++ b/torchtitan/protocols/train_spec.py @@ -5,6 +5,7 @@ # LICENSE file in the root directory of this source tree. from collections.abc import Callable +import dataclasses from dataclasses import dataclass from typing import Mapping, TypeAlias @@ -69,8 +70,14 @@ def register_train_spec(train_spec: TrainSpec) -> None: def get_train_spec(name: str) -> TrainSpec: global _train_specs - if name not in _train_specs: + + if "/" in name: # HF model (dynamic loading) + hf_spec = _train_specs["hf_auto_model"] + new_spec = dataclasses.replace(hf_spec, name=name) + _train_specs[name] = new_spec + elif name not in _train_specs: # Torchtitan raise ValueError(f"Model {name} is not registered.") + return _train_specs[name] From a2358631c2b430c9bbbc061db2055eb1c81f8abf Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 17 Sep 2025 08:12:29 +0000 Subject: [PATCH 024/129] more granularity of logging when doing parameter breakdown --- .../model/hf_transformers_args.py | 82 ++++++------- .../reference_diff_llama3_1gpu.log | 112 ++++++++++++++---- torchtitan/models/deepseek_v3/model/args.py | 16 +++ torchtitan/models/llama3/model/args.py | 91 +++++--------- torchtitan/train.py | 4 +- 5 files changed, 180 insertions(+), 125 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 5a8b724397..4b2f38ffa1 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -175,60 +175,52 @@ def update_from_config(self, job_config: JobConfig): def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: nparams = sum(p.numel() for p in model.parameters()) - - layer_params = {} # int -> int - embedding_params = 0 - norm_params = 0 - lm_head_params = 0 - misc_params = {} - - for name, p in model.named_parameters(): - if "model.embed_tokens" in name: - embedding_params += p.numel() - elif "model.layers." 
in name: - try: - layer_num = int(name.split("layers.")[1].split(".")[0]) - if layer_num not in layer_params: - layer_params[layer_num] = 0 - layer_params[layer_num] += p.numel() - except (ValueError, IndexError): - # Should not happen with standard HF llama names - component = "misc_layer_parts" - if component not in misc_params: - misc_params[component] = 0 - misc_params[component] += p.numel() - elif "model.norm" in name: - norm_params += p.numel() - elif "lm_head" in name: - lm_head_params += p.numel() - else: - # Catch anything else - component = name.split(".")[0] - if component not in misc_params: - misc_params[component] = 0 - misc_params[component] += p.numel() - - logger.info("Parameter breakdown:") - logger.info(f" - embedding: {embedding_params:,} parameters") - for layer_num in sorted(layer_params.keys()): - params = layer_params[layer_num] - logger.info(f" - layer_{layer_num}: {params:,} parameters") - logger.info(f" - final_norm: {norm_params:,} parameters") - logger.info(f" - lm_head: {lm_head_params:,} parameters") - if misc_params: - for name, params in misc_params.items(): - logger.info(f" - {name} (misc): {params:,} parameters") - nparams_embedding = sum( sum(p.numel() for p in m.parameters()) for m in model.children() if isinstance(m, nn.Embedding) ) - l, h, q, t = self.n_layers, self.n_heads, self.dim // self.n_heads, seq_len + l, h, q, t = ( + self.n_layers, + self.n_heads, + self.dim // self.n_heads, + seq_len, + ) + # Reasoning behind the factor of 12 for the self-attention part of the formula: + # 1. each self-attention has 2 matmul in the forward and 4 in the backward (6) + # 2. the flash attention does 1 more matmul recomputation in the backward + # but recomputation should not be counted in calculating MFU (+0) + # 3. each matmul performs 1 multiplication and 1 addition (*2) + # 4. we follow the convention and do not account for sparsity in causal attention num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t + return nparams, num_flops_per_token + def debug_structure_param(self, model: nn.Module): + logger.info("Model Structure Parameter Breakdown:") + + def _format_module(module: nn.Module, prefix: str = ""): + for name, sub_module in module.named_children(): + sub_module_params = sum(p.numel() for p in sub_module.parameters()) + if sub_module_params == 0: + continue + + # For HF models, we want to "unwrap" the ".model" attribute + # to get a view comparable to the native TorchTitan models. 
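                # e.g. for the debugmodel flavor, the wrapped LlamaForCausalLM is then reported as
                #   (embed_tokens): Embedding - 512,000 params
                #   (layers): ModuleList - 5,114,880 params
                #   (norm): LlamaRMSNorm - 256 params
                #   (lm_head): Linear - 512,000 params
                # which lines up one-to-one with the native TorchTitan breakdown
                # ((tok_embeddings), (layers), (norm), (output)) shown in reference_diff_llama3_1gpu.log.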
+ if name == "model": + _format_module(sub_module, prefix) + else: + logger.info( + f"{prefix}({name}): {sub_module.__class__.__name__} - {sub_module_params:,} params" + ) + _format_module(sub_module, prefix + " ") + + total_params = sum(p.numel() for p in model.parameters()) + logger.info(f"{model.__class__.__name__} - {total_params:,} params") + _format_module(model, " ") + + class HFTransformerModel(nn.Module): def __init__(self, model_args: HFTransformerModelArgs): diff --git a/torchtitan/experiments/transformers_backend/reference_diff_llama3_1gpu.log b/torchtitan/experiments/transformers_backend/reference_diff_llama3_1gpu.log index e134f15115..44bbbae2d1 100644 --- a/torchtitan/experiments/transformers_backend/reference_diff_llama3_1gpu.log +++ b/torchtitan/experiments/transformers_backend/reference_diff_llama3_1gpu.log @@ -1,8 +1,8 @@ diff --git a/tt_run.log.filtered b/hf_run.log.filtered -index d3be70f..0f9a180 100644 +index 28327e0..abbe4d7 100644 --- a/tt_run.log.filtered +++ b/hf_run.log.filtered -@@ -1,22 +1,23 @@ +@@ -1,125 +1,125 @@ + echo '##############################################' ##############################################'#######################################################' ####################################################### @@ -13,7 +13,7 @@ ####################################################### + TT_CONFIG=/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.tomlHF_CONFIG=/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml + CUDA_VISIBLE_DEVICES=0CUDA_VISIBLE_DEVICES=1 -+ torchrun ... --master_port=XXXX --rdzv_backend c10d --rdzv_endpoint=localhost:XXXX --local-ranks-filter 0 --role rank --tee 3 -m torchtitan.train --job.config_file /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.toml/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml --training.seed 42 --training.deterministic ++ torchrun ... --master_port=XXXX --rdzv_backend c10d --rdzv_endpoint=localhost:XXXX --local-ranks-filter 0 --role rank --tee 3 -m torchtitan.train --job.config_file /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.toml/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml --training.seed 42 --training.deterministic --model.name llama3meta-llama/Llama-3.2-1B [rank0]:/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/transformers/src/transformers/utils/hub.py:111: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. [rank0]: warnings.warn( [rank0]:[titan] TIMESTAMP - root - WARNING - tokenizer_path is deprecated, use model.hf_assets_path instead. Setting hf_assets_path to tokenizer_path temporarily. 
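(Reading note for this reference diff: tt_run.log.filtered and hf_run.log.filtered are diffed word by word, so interleaved runs such as "7.87237.8704" read as the TorchTitan value, 7.8723, immediately followed by the HF-backend value, 7.8704; the same pattern applies to "llama3meta-llama/Llama-3.2-1B" for --model.name and to module names such as "(tok_embeddings):(embed_tokens)".)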
@@ -26,12 +26,84 @@ [rank0]:[titan] TIMESTAMP - root - INFO - Preparing c4_test dataset from /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test [rank0]:[titan] TIMESTAMP - root - INFO - Building llama3meta-llama/Llama-3.2-1B debugmodel with TransformerModelArgs(_enforced='ThisHFTransformerModelArgs(_enforced='This field is used to enforce all fields have defaults.', dim=256, n_layers=6, n_heads=16, n_kv_heads=None, vocab_size=2000, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, rope_theta=500000, max_seq_len=2048, depth_init=True, use_flex_attn=False, attn_mask_type='causal', eos_id=0)defaults.') [rank0]:[titan] TIMESTAMP - root - INFO - CUDA capacity: NVIDIA H100 80GB HBM3 with 79.44GiB memory -[rank0]:[titan] TIMESTAMP - root - INFO - Parameter breakdown: -[rank0]:[titan] TIMESTAMP - root - INFO - - embedding: 512,000 parameters -@@ -28,30 +29,29 @@ -[rank0]:[titan] TIMESTAMP - root - INFO - - layer_5: 852,480 parameters -[rank0]:[titan] TIMESTAMP - root - INFO - - final_norm: 256 parameters -[rank0]:[titan] TIMESTAMP - root - INFO - - lm_head: 512,000 parameters +[rank0]:[titan] TIMESTAMP - root - INFO - Model Structure Parameter Breakdown: +[rank0]:[titan] TIMESTAMP - root - INFO - TransformerHFTransformerModel - 6,139,136 params +[rank0]:[titan] TIMESTAMP - root - INFO - (tok_embeddings):(embed_tokens): Embedding - 512,000 params +[rank0]:[titan] TIMESTAMP - root - INFO - (layers): ModuleDictModuleList - 5,114,880 params +[rank0]:[titan] TIMESTAMP - root - INFO - (0): TransformerBlockLlamaDecoderLayer - 852,480 params +[rank0]:[titan] TIMESTAMP - root - INFO - (attention): Attention(self_attn): LlamaAttention - 262,144 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wq):(q_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wk):(k_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wv):(v_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wo):(o_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (feed_forward): FeedForward(mlp): LlamaMLP - 589,824 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w1):(gate_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w2):(up_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w3):(down_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (attention_norm): RMSNorm(input_layernorm): LlamaRMSNorm - 256 params +[rank0]:[titan] TIMESTAMP - root - INFO - (ffn_norm): RMSNorm(post_attention_layernorm): LlamaRMSNorm - 256 params +[rank0]:[titan] TIMESTAMP - root - INFO - (1): TransformerBlockLlamaDecoderLayer - 852,480 params +[rank0]:[titan] TIMESTAMP - root - INFO - (attention): Attention(self_attn): LlamaAttention - 262,144 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wq):(q_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wk):(k_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wv):(v_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wo):(o_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (feed_forward): FeedForward(mlp): LlamaMLP - 589,824 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w1):(gate_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w2):(up_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w3):(down_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (attention_norm): 
RMSNorm(input_layernorm): LlamaRMSNorm - 256 params +[rank0]:[titan] TIMESTAMP - root - INFO - (ffn_norm): RMSNorm(post_attention_layernorm): LlamaRMSNorm - 256 params +[rank0]:[titan] TIMESTAMP - root - INFO - (2): TransformerBlockLlamaDecoderLayer - 852,480 params +[rank0]:[titan] TIMESTAMP - root - INFO - (attention): Attention(self_attn): LlamaAttention - 262,144 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wq):(q_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wk):(k_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wv):(v_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wo):(o_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (feed_forward): FeedForward(mlp): LlamaMLP - 589,824 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w1):(gate_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w2):(up_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w3):(down_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (attention_norm): RMSNorm(input_layernorm): LlamaRMSNorm - 256 params +[rank0]:[titan] TIMESTAMP - root - INFO - (ffn_norm): RMSNorm(post_attention_layernorm): LlamaRMSNorm - 256 params +[rank0]:[titan] TIMESTAMP - root - INFO - (3): TransformerBlockLlamaDecoderLayer - 852,480 params +[rank0]:[titan] TIMESTAMP - root - INFO - (attention): Attention(self_attn): LlamaAttention - 262,144 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wq):(q_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wk):(k_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wv):(v_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wo):(o_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (feed_forward): FeedForward(mlp): LlamaMLP - 589,824 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w1):(gate_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w2):(up_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w3):(down_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (attention_norm): RMSNorm(input_layernorm): LlamaRMSNorm - 256 params +[rank0]:[titan] TIMESTAMP - root - INFO - (ffn_norm): RMSNorm(post_attention_layernorm): LlamaRMSNorm - 256 params +[rank0]:[titan] TIMESTAMP - root - INFO - (4): TransformerBlockLlamaDecoderLayer - 852,480 params +[rank0]:[titan] TIMESTAMP - root - INFO - (attention): Attention(self_attn): LlamaAttention - 262,144 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wq):(q_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wk):(k_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wv):(v_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wo):(o_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (feed_forward): FeedForward(mlp): LlamaMLP - 589,824 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w1):(gate_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w2):(up_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w3):(down_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (attention_norm): RMSNorm(input_layernorm): LlamaRMSNorm - 256 params +[rank0]:[titan] TIMESTAMP - root - INFO - (ffn_norm): RMSNorm(post_attention_layernorm): LlamaRMSNorm - 256 params +[rank0]:[titan] TIMESTAMP - 
root - INFO - (5): TransformerBlockLlamaDecoderLayer - 852,480 params +[rank0]:[titan] TIMESTAMP - root - INFO - (attention): Attention(self_attn): LlamaAttention - 262,144 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wq):(q_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wk):(k_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wv):(v_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wo):(o_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (feed_forward): FeedForward(mlp): LlamaMLP - 589,824 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w1):(gate_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w2):(up_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w3):(down_proj): Linear - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (attention_norm): RMSNorm(input_layernorm): LlamaRMSNorm - 256 params +[rank0]:[titan] TIMESTAMP - root - INFO - (ffn_norm): RMSNorm(post_attention_layernorm): LlamaRMSNorm - 256 params +[rank0]:[titan] TIMESTAMP - root - INFO - (norm): RMSNormLlamaRMSNorm - 256 params +[rank0]:[titan] TIMESTAMP - root - INFO - (output):(lm_head): Linear - 512,000 params [rank0]:[titan] TIMESTAMP - root - INFO - Model llama3meta-llama/Llama-3.2-1B debugmodel size: 6,139,136 total parameters [rank0]:[titan] TIMESTAMP - root - INFO - Applied selective activation checkpointing to the model [rank0]:[titan] TIMESTAMP - root - INFO - Peak FLOPS used for computing MFU: 9.890e+14 @@ -41,19 +113,19 @@ [rank0]:[titan] TIMESTAMP - root - INFO - Trainer is initialized with local batch size 8, global batch size 8, gradient accumulation steps 1, sequence length 2048, total steps 10 (warmup 2) [rank0]:[titan] TIMESTAMP - root - INFO - Training starts at step 1 [rank0]:[titan] TIMESTAMP - root - INFO - Profiling active. 
Traces will be saved at ./outputs/profile_trace./outputs/profile_trace_hf -[rank0]:[titan] TIMESTAMP - root - INFO - step: 1 loss: 7.87237.8704 grad_norm: 1.51671.5185 memory: 1.39GiB(1.75%)1.67GiB(2.10%) tps: 44,58534,083 tflops: 3.192.54 mfu: 0.32%0.26% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 1 loss: 7.87237.8704 grad_norm: 1.51671.5185 memory: 1.39GiB(1.75%)1.67GiB(2.10%) tps: 43,79234,528 tflops: 3.132.58 mfu: 0.32%0.26% [rank0]:[titan] TIMESTAMP - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40 -[rank0]:[titan] TIMESTAMP - root - INFO - step: 2 loss: 7.52467.5209 grad_norm: 1.63591.6373 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 21,05219,870 tflops: 1.511.48 mfu: 0.15% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 3 loss: 6.79006.7789 grad_norm: 2.03452.0390 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 334,947199,616 tflops: 23.9514.89 mfu: 2.42%1.51% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 4 loss: 5.98295.9673 grad_norm: 2.41292.4176 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 357,001207,967 tflops: 25.5315.51 mfu: 2.58%1.57% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 5 loss: 5.05365.0388 grad_norm: 2.53052.5275 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 279,263188,745 tflops: 19.9714.08 mfu: 2.02%1.42% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 2 loss: 7.52467.5209 grad_norm: 1.63591.6373 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 21,38419,712 tflops: 1.531.47 mfu: 0.15% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 3 loss: 6.79006.7789 grad_norm: 2.03452.0390 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 336,714197,260 tflops: 24.0814.71 mfu: 2.43%1.49% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 4 loss: 5.98295.9673 grad_norm: 2.41292.4176 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 360,388206,932 tflops: 25.7715.43 mfu: 2.61%1.56% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 5 loss: 5.05365.0388 grad_norm: 2.53052.5275 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 286,298186,563 tflops: 20.4713.91 mfu: 2.07%1.41% [rank0]:[titan] TIMESTAMP - root - INFO - Dumping profiler traces at step 5 -[rank0]:[titan] TIMESTAMP - root - INFO - Finished dumping profiler traces in 0.020.04 seconds -[rank0]:[titan] TIMESTAMP - root - INFO - step: 6 loss: 4.63704.6283 grad_norm: 2.28262.2818 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 129,46483,088 tflops: 9.266.20 mfu: 0.94%0.63% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 7 loss: 4.31334.3077 grad_norm: 2.10192.1023 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 298,394175,561 tflops: 21.3413.09 mfu: 2.16%1.32% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 8 loss: 4.13984.1349 grad_norm: 1.93421.9334 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 352,929206,086 tflops: 25.2415.37 mfu: 2.55%1.55% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 9 loss: 4.53264.5289 grad_norm: 1.51111.5103 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 357,192208,947 tflops: 25.5415.58 mfu: 2.58%1.58% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 10 loss: 3.98593.9828 grad_norm: 1.77991.7849 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 287,408189,593 tflops: 20.5514.14 mfu: 2.08%1.43% +[rank0]:[titan] TIMESTAMP - root - INFO - Finished dumping profiler traces in 0.030.05 seconds +[rank0]:[titan] TIMESTAMP - root - INFO - step: 6 loss: 4.63704.6283 grad_norm: 2.28262.2818 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 129,44780,608 tflops: 9.266.01 mfu: 0.94%0.61% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 7 loss: 4.31334.3077 grad_norm: 2.10192.1023 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 300,058177,619 
tflops: 21.4613.25 mfu: 2.17%1.34% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 8 loss: 4.13984.1349 grad_norm: 1.93421.9334 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 361,523205,777 tflops: 25.8515.35 mfu: 2.61%1.55% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 9 loss: 4.53264.5289 grad_norm: 1.51111.5103 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 363,412207,933 tflops: 25.9915.51 mfu: 2.63%1.57% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 10 loss: 3.98593.9828 grad_norm: 1.77991.7849 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 295,360188,228 tflops: 21.1214.04 mfu: 2.14%1.42% [rank0]:[titan] TIMESTAMP - root - INFO - Dumping profiler traces at step 10 [rank0]:[titan] TIMESTAMP - root - INFO - Finished dumping profiler traces in 0.030.04 seconds [rank0]:[titan] TIMESTAMP - root - INFO - Sleeping 2 seconds for other ranks to complete diff --git a/torchtitan/models/deepseek_v3/model/args.py b/torchtitan/models/deepseek_v3/model/args.py index d6afedfa34..9451f01b01 100644 --- a/torchtitan/models/deepseek_v3/model/args.py +++ b/torchtitan/models/deepseek_v3/model/args.py @@ -159,3 +159,19 @@ def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, in ) return nparams, num_flops_per_token + + def debug_structure_param(self, model: nn.Module): + logger.info("Model Structure Parameter Breakdown:") + + def _format_module(module: nn.Module, prefix: str = ""): + for name, sub_module in module.named_children(): + sub_module_params = sum(p.numel() for p in sub_module.parameters()) + if sub_module_params > 0: + logger.info( + f"{prefix}({name}): {sub_module.__class__.__name__} - {sub_module_params:,} params" + ) + _format_module(sub_module, prefix + " ") + + total_params = sum(p.numel() for p in model.parameters()) + logger.info(f"{model.__class__.__name__} - {total_params:,} params") + _format_module(model, " ") diff --git a/torchtitan/models/llama3/model/args.py b/torchtitan/models/llama3/model/args.py index 1728d9b93e..5aaf3839ed 100644 --- a/torchtitan/models/llama3/model/args.py +++ b/torchtitan/models/llama3/model/args.py @@ -53,68 +53,41 @@ def update_from_config(self, job_config: JobConfig, **kwargs) -> None: self.max_seq_len = seq_len def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: - """ - Count parameters and estimate flops for a TT (TorchTitan) model. - - Args: - model (nn.Module): The TT model (not HF). - seq_len (int): Sequence length. - - Returns: - tuple[int, int]: (nparams, num_flops_per_token) - """ nparams = sum(p.numel() for p in model.parameters()) - - layer_params = {} # layer_id -> int - embedding_params = 0 - norm_params = 0 - lm_head_params = 0 - misc_params = {} - - # TT model: top-level modules are tok_embeddings, layers (ModuleDict), norm, output - for name, p in model.named_parameters(): - if name.startswith("tok_embeddings."): - embedding_params += p.numel() - elif name.startswith("layers."): - try: - # layers.. 
- layer_id = int(name.split(".")[1]) - if layer_id not in layer_params: - layer_params[layer_id] = 0 - layer_params[layer_id] += p.numel() - except (ValueError, IndexError): - # Should not happen, but catch any oddities - component = "misc_layer_parts" - if component not in misc_params: - misc_params[component] = 0 - misc_params[component] += p.numel() - elif name.startswith("norm."): - norm_params += p.numel() - elif name.startswith("output."): - lm_head_params += p.numel() - else: - # Catch anything else - component = name.split(".")[0] - if component not in misc_params: - misc_params[component] = 0 - misc_params[component] += p.numel() - - logger.info("Parameter breakdown:") - logger.info(f" - embedding: {embedding_params:,} parameters") - for layer_num in sorted(layer_params.keys()): - params = layer_params[layer_num] - logger.info(f" - layer_{layer_num}: {params:,} parameters") - logger.info(f" - final_norm: {norm_params:,} parameters") - logger.info(f" - lm_head: {lm_head_params:,} parameters") - if misc_params: - for name, params in misc_params.items(): - logger.info(f" - {name} (misc): {params:,} parameters") - - # For TT, embedding is always model.tok_embeddings nparams_embedding = sum( - p.numel() for p in getattr(model, "tok_embeddings", nn.Module()).parameters() + sum(p.numel() for p in m.parameters()) + for m in model.children() + if isinstance(m, nn.Embedding) ) - l, h, q, t = self.n_layers, self.n_heads, self.dim // self.n_heads, seq_len + l, h, q, t = ( + self.n_layers, + self.n_heads, + self.dim // self.n_heads, + seq_len, + ) + # Reasoning behind the factor of 12 for the self-attention part of the formula: + # 1. each self-attention has 2 matmul in the forward and 4 in the backward (6) + # 2. the flash attention does 1 more matmul recomputation in the backward + # but recomputation should not be counted in calculating MFU (+0) + # 3. each matmul performs 1 multiplication and 1 addition (*2) + # 4. 
we follow the convention and do not account for sparsity in causal attention num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t + return nparams, num_flops_per_token + + def debug_structure_param(self, model: nn.Module): + logger.info("Model Structure Parameter Breakdown:") + + def _format_module(module: nn.Module, prefix: str = ""): + for name, sub_module in module.named_children(): + sub_module_params = sum(p.numel() for p in sub_module.parameters()) + if sub_module_params > 0: + logger.info( + f"{prefix}({name}): {sub_module.__class__.__name__} - {sub_module_params:,} params" + ) + _format_module(sub_module, prefix + " ") + + total_params = sum(p.numel() for p in model.parameters()) + logger.info(f"{model.__class__.__name__} - {total_params:,} params") + _format_module(model, " ") \ No newline at end of file diff --git a/torchtitan/train.py b/torchtitan/train.py index d7a399a1ce..b15cd73e2c 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -180,7 +180,9 @@ def __init__(self, job_config: JobConfig): model_param_count, self.metrics_processor.num_flops_per_token, ) = model_args.get_nparams_and_flops(model, job_config.training.seq_len) - + + model_args.debug_structure_param(model) + logger.info( f"{color.blue}Model {self.train_spec.name} {job_config.model.flavor} " f"{color.red}size: {model_param_count:,} total parameters{color.reset}" From fc43dc84adcc482f21b3c163e229109f859b965e Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 17 Sep 2025 08:20:05 +0000 Subject: [PATCH 025/129] add __repr__ to HFTransformerModelArgs for better debugging logs --- .../model/hf_transformers_args.py | 9 +++++++ .../reference_diff_llama3_1gpu.log | 26 +++++++++---------- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 4b2f38ffa1..75610d8203 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -77,6 +77,15 @@ def __init__( **kwargs ) + def __repr__(self) -> str: + # HFTransformerModelArgs is a dataclass that also inherits from PretrainedConfig. + # PretrainedConfig has a __repr__ that serializes the object to JSON, but it + # doesn't work well with how HFTransformerModelArgs is initialized. + # This custom __repr__ provides a dataclass-like representation that correctly + # displays the arguments passed during initialization. + args_str = ", ".join(f"{k}={v!r}" for k, v in self._passed_args.items()) + return f"{self.__class__.__name__}({args_str})" + @property def dim(self) -> int: """TorchTitan: Model dimension (alias for HF hidden_size)""" diff --git a/torchtitan/experiments/transformers_backend/reference_diff_llama3_1gpu.log b/torchtitan/experiments/transformers_backend/reference_diff_llama3_1gpu.log index 44bbbae2d1..84eff10ff8 100644 --- a/torchtitan/experiments/transformers_backend/reference_diff_llama3_1gpu.log +++ b/torchtitan/experiments/transformers_backend/reference_diff_llama3_1gpu.log @@ -1,5 +1,5 @@ diff --git a/tt_run.log.filtered b/hf_run.log.filtered -index 28327e0..abbe4d7 100644 +index 1f72d39..c1856a6 100644 --- a/tt_run.log.filtered +++ b/hf_run.log.filtered @@ -1,125 +1,125 @@ @@ -24,7 +24,7 @@ [rank0]:[titan] TIMESTAMP - root - INFO - Deterministic algorithm enabled (expect perf degradation). 
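(A quick sanity check of the num_flops_per_token formula in the llama3 args.py hunk above, plugging in the debugmodel settings used throughout these runs (dim=256, n_layers=6, n_heads=16, seq_len=2048) and the 6,139,136-parameter count reported in this reference log; the embedding size is an assumption, vocab_size * dim. A standalone sketch, not part of the patch:)

# Sanity check of num_flops_per_token for the llama3 debugmodel settings:
# dim=256, n_layers=6, n_heads=16, seq_len=2048, 6,139,136 total params,
# embedding assumed to be vocab_size * dim = 2000 * 256.
dim, n_layers, n_heads, seq_len = 256, 6, 16, 2048
nparams = 6_139_136
nparams_embedding = 2000 * dim

l, h, q, t = n_layers, n_heads, dim // n_heads, seq_len
num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t
print(num_flops_per_token)  # 71,511,552 -> roughly 7.2e7 FLOPs per token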
[rank0]:[titan] TIMESTAMP - root - INFO - Loading tokenizer from tokenizer.json [rank0]:[titan] TIMESTAMP - root - INFO - Preparing c4_test dataset from /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test -[rank0]:[titan] TIMESTAMP - root - INFO - Building llama3meta-llama/Llama-3.2-1B debugmodel with TransformerModelArgs(_enforced='ThisHFTransformerModelArgs(_enforced='This field is used to enforce all fields have defaults.', dim=256, n_layers=6, n_heads=16, n_kv_heads=None, vocab_size=2000, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, rope_theta=500000, max_seq_len=2048, depth_init=True, use_flex_attn=False, attn_mask_type='causal', eos_id=0)defaults.') +[rank0]:[titan] TIMESTAMP - root - INFO - Building llama3meta-llama/Llama-3.2-1B debugmodel with TransformerModelArgs(_enforced='This field is used to enforce all fields have defaults.', dim=256,HFTransformerModelArgs(dim=256, n_layers=6, n_heads=16, n_kv_heads=None,n_kv_heads=16, vocab_size=2000, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, rope_theta=500000, max_seq_len=2048, depth_init=True, use_flex_attn=False, attn_mask_type='causal', eos_id=0)eos_id=0, attn_implementation='sdpa') [rank0]:[titan] TIMESTAMP - root - INFO - CUDA capacity: NVIDIA H100 80GB HBM3 with 79.44GiB memory [rank0]:[titan] TIMESTAMP - root - INFO - Model Structure Parameter Breakdown: [rank0]:[titan] TIMESTAMP - root - INFO - TransformerHFTransformerModel - 6,139,136 params @@ -113,19 +113,19 @@ [rank0]:[titan] TIMESTAMP - root - INFO - Trainer is initialized with local batch size 8, global batch size 8, gradient accumulation steps 1, sequence length 2048, total steps 10 (warmup 2) [rank0]:[titan] TIMESTAMP - root - INFO - Training starts at step 1 [rank0]:[titan] TIMESTAMP - root - INFO - Profiling active. 
Traces will be saved at ./outputs/profile_trace./outputs/profile_trace_hf -[rank0]:[titan] TIMESTAMP - root - INFO - step: 1 loss: 7.87237.8704 grad_norm: 1.51671.5185 memory: 1.39GiB(1.75%)1.67GiB(2.10%) tps: 43,79234,528 tflops: 3.132.58 mfu: 0.32%0.26% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 1 loss: 7.87237.8704 grad_norm: 1.51671.5185 memory: 1.39GiB(1.75%)1.67GiB(2.10%) tps: 43,37532,685 tflops: 3.102.44 mfu: 0.31%0.25% [rank0]:[titan] TIMESTAMP - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40 -[rank0]:[titan] TIMESTAMP - root - INFO - step: 2 loss: 7.52467.5209 grad_norm: 1.63591.6373 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 21,38419,712 tflops: 1.531.47 mfu: 0.15% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 3 loss: 6.79006.7789 grad_norm: 2.03452.0390 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 336,714197,260 tflops: 24.0814.71 mfu: 2.43%1.49% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 4 loss: 5.98295.9673 grad_norm: 2.41292.4176 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 360,388206,932 tflops: 25.7715.43 mfu: 2.61%1.56% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 5 loss: 5.05365.0388 grad_norm: 2.53052.5275 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 286,298186,563 tflops: 20.4713.91 mfu: 2.07%1.41% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 2 loss: 7.52467.5209 grad_norm: 1.63591.6373 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 20,83419,798 tflops: 1.491.48 mfu: 0.15% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 3 loss: 6.79006.7789 grad_norm: 2.03452.0390 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 338,323199,161 tflops: 24.1914.85 mfu: 2.45%1.50% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 4 loss: 5.98295.9673 grad_norm: 2.41292.4176 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 362,741207,198 tflops: 25.9415.45 mfu: 2.62%1.56% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 5 loss: 5.05365.0388 grad_norm: 2.53052.5275 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 287,315187,882 tflops: 20.5514.01 mfu: 2.08%1.42% [rank0]:[titan] TIMESTAMP - root - INFO - Dumping profiler traces at step 5 -[rank0]:[titan] TIMESTAMP - root - INFO - Finished dumping profiler traces in 0.030.05 seconds -[rank0]:[titan] TIMESTAMP - root - INFO - step: 6 loss: 4.63704.6283 grad_norm: 2.28262.2818 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 129,44780,608 tflops: 9.266.01 mfu: 0.94%0.61% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 7 loss: 4.31334.3077 grad_norm: 2.10192.1023 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 300,058177,619 tflops: 21.4613.25 mfu: 2.17%1.34% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 8 loss: 4.13984.1349 grad_norm: 1.93421.9334 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 361,523205,777 tflops: 25.8515.35 mfu: 2.61%1.55% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 9 loss: 4.53264.5289 grad_norm: 1.51111.5103 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 363,412207,933 tflops: 25.9915.51 mfu: 2.63%1.57% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 10 loss: 3.98593.9828 grad_norm: 1.77991.7849 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 295,360188,228 tflops: 21.1214.04 mfu: 2.14%1.42% +[rank0]:[titan] TIMESTAMP - root - INFO - Finished dumping profiler traces in 0.030.04 seconds +[rank0]:[titan] TIMESTAMP - root - INFO - step: 6 loss: 4.63704.6283 grad_norm: 2.28262.2818 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 130,12183,115 tflops: 9.316.20 mfu: 0.94%0.63% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 7 loss: 4.31334.3077 grad_norm: 2.10192.1023 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 295,546174,068 
tflops: 21.1312.98 mfu: 2.14%1.31% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 8 loss: 4.13984.1349 grad_norm: 1.93421.9334 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 361,129206,837 tflops: 25.8215.43 mfu: 2.61%1.56% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 9 loss: 4.53264.5289 grad_norm: 1.51111.5103 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 363,728208,233 tflops: 26.0115.53 mfu: 2.63%1.57% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 10 loss: 3.98593.9828 grad_norm: 1.77991.7849 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 294,013188,295 tflops: 21.0314.04 mfu: 2.13%1.42% [rank0]:[titan] TIMESTAMP - root - INFO - Dumping profiler traces at step 10 [rank0]:[titan] TIMESTAMP - root - INFO - Finished dumping profiler traces in 0.030.04 seconds [rank0]:[titan] TIMESTAMP - root - INFO - Sleeping 2 seconds for other ranks to complete From 23ae3785e7718d42b3d76bdb54c955c7da2fb9c8 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 17 Sep 2025 13:33:34 +0000 Subject: [PATCH 026/129] HF deepseek v3 is now training --- .../transformers_backend/__init__.py | 53 +++++++-- .../model/hf_transformers_args.py | 112 ++++++++++++++++-- 2 files changed, 142 insertions(+), 23 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 7ac18a1752..6273dd2dd3 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -18,25 +18,58 @@ from .infra.parallelize_hf_transformers import parallelize_hf_transformers from .model.hf_transformers_args import HFTransformerModelArgs, HFTransformerModel +from torchtitan.models.moe import MoEArgs + __all__ = [ "HFTransformerModelArgs", "HFTransformerModel", "hf_transformers_configs", ] +#TODO(3outeille): identify that if MoE model is used, we add a moe_args field +# flavors = { +# "debugmodel": HFTransformerModelArgs( +# # n_layers=2, +# # vocab_size=2000, +# max_seq_len=2048, +# #TODO(3outeille): n_kv_heads=n_heads may be handle somewhere else +# dim=256, n_layers=6, n_heads=16, vocab_size=2000, rope_theta=500000, n_kv_heads=16 +# ), +# "medium": HFTransformerModelArgs( +# dim=1024, +# n_layers=12, +# ), +# "full": HFTransformerModelArgs(), +# } + flavors = { "debugmodel": HFTransformerModelArgs( - # n_layers=2, - # vocab_size=2000, - max_seq_len=2048, - #TODO(3outeille): n_kv_heads=n_heads may be handle somewhere else - dim=256, n_layers=6, n_heads=16, vocab_size=2000, rope_theta=500000, n_kv_heads=16 - ), - "medium": HFTransformerModelArgs( - dim=1024, - n_layers=12, + n_layers=3, + vocab_size=2000, + dim=256, + inter_dim=1024, + moe_inter_dim=256, + n_dense_layers=1, + n_heads=16, + n_group=2, + topk_group=1, + moe_args=MoEArgs( + num_experts=8, + num_shared_experts=2, + top_k=3, + score_func="softmax", + route_norm=True, + score_before_experts=False, + ), + kv_lora_rank=16, + q_lora_rank=0, + qk_rope_head_dim=16, + qk_nope_head_dim=32, + v_head_dim=32, + mscale=0.70, + # TO REMOVE: + n_kv_heads=16 ), - "full": HFTransformerModelArgs(), } hf_train_spec = TrainSpec( diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 75610d8203..821a20f61f 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -16,6 +16,8 @@ from transformers.configuration_utils import PretrainedConfig from 
transformers.modeling_outputs import CausalLMOutputWithPast +from torchtitan.models.moe import MoEArgs + from .hf_llama_patch import patch_hf_llama patch_hf_llama() @@ -44,20 +46,44 @@ def __init__( use_flex_attn: bool = False, attn_mask_type: str = "causal", eos_id: int = 0, + moe_args: Optional[MoEArgs] = None, + # DeepSeekV3 specific args + n_group: Optional[int] = None, + topk_group: Optional[int] = None, + inter_dim: Optional[int] = None, + moe_inter_dim: Optional[int] = None, + n_dense_layers: Optional[int] = None, + n_expert_groups: Optional[int] = None, + n_limited_groups: Optional[int] = None, + q_lora_rank: Optional[int] = None, + kv_lora_rank: Optional[int] = None, + qk_nope_head_dim: Optional[int] = None, + qk_rope_head_dim: Optional[int] = None, + v_head_dim: Optional[int] = None, + original_seq_len: Optional[int] = None, + rope_factor: Optional[float] = None, + beta_fast: Optional[int] = None, + beta_slow: Optional[int] = None, + mscale: Optional[float] = None, # HuggingFace specific args attn_implementation: str = "sdpa", - **kwargs - ): + **kwargs, + ): # Store TorchTitan-specific args (no HF equivalent) self.multiple_of = multiple_of self.ffn_dim_multiplier = ffn_dim_multiplier self.depth_init = depth_init self.use_flex_attn = use_flex_attn self.attn_mask_type = attn_mask_type - + # HuggingFace specific args self.attn_implementation = attn_implementation + # For DeepSeekV3, setting q_lora_rank to 0 in TorchTitan is equivalent to + # setting it to None in HuggingFace. + if q_lora_rank == 0: + q_lora_rank = None + self._passed_args = dict( dim=dim, n_layers=n_layers, @@ -74,17 +100,53 @@ def __init__( attn_mask_type=attn_mask_type, eos_id=eos_id, attn_implementation=attn_implementation, - **kwargs + # DeepSeekV3 specific args + n_group=n_group, + topk_group=topk_group, + inter_dim=inter_dim, + moe_inter_dim=moe_inter_dim, + n_dense_layers=n_dense_layers, + n_expert_groups=n_expert_groups, + n_limited_groups=n_limited_groups, + q_lora_rank=q_lora_rank, + kv_lora_rank=kv_lora_rank, + qk_nope_head_dim=qk_nope_head_dim, + qk_rope_head_dim=qk_rope_head_dim, + v_head_dim=v_head_dim, + original_seq_len=original_seq_len, + rope_factor=rope_factor, + beta_fast=beta_fast, + beta_slow=beta_slow, + mscale=mscale, + **kwargs, ) + if moe_args is not None: + # MoE args for HF config + # HF uses different names for these + self.num_experts_per_tok = moe_args.top_k + self.n_routed_experts = moe_args.num_experts + self.n_shared_experts = moe_args.num_shared_experts + self.moe_intermediate_size = moe_inter_dim + self._passed_args.update( + dict( + num_experts_per_tok=moe_args.top_k, + n_routed_experts=moe_args.num_experts, + n_shared_experts=moe_args.num_shared_experts, + moe_intermediate_size=moe_inter_dim, + ) + ) + + def __repr__(self) -> str: # HFTransformerModelArgs is a dataclass that also inherits from PretrainedConfig. # PretrainedConfig has a __repr__ that serializes the object to JSON, but it # doesn't work well with how HFTransformerModelArgs is initialized. # This custom __repr__ provides a dataclass-like representation that correctly # displays the arguments passed during initialization. 
- args_str = ", ".join(f"{k}={v!r}" for k, v in self._passed_args.items()) - return f"{self.__class__.__name__}({args_str})" + args_lines = [f"{k}={v!r}" for k, v in sorted(self._passed_args.items())] + args_str = "\n".join(args_lines) + return f"{self.__class__.__name__}(\n{args_str}\n)" @property def dim(self) -> int: @@ -149,6 +211,25 @@ def eos_id(self) -> int: def eos_id(self, value: int): self.eos_token_id = value + # === DeepSeekV3 specific properties === + @property + def inter_dim(self) -> int: + """TorchTitan: Intermediate dimension (alias for HF intermediate_size)""" + return self.intermediate_size + + @inter_dim.setter + def inter_dim(self, value: int): + self.intermediate_size = value + + @property + def n_dense_layers(self) -> int: + """TorchTitan: Number of dense layers (alias for HF first_k_dense_replace)""" + return self.first_k_dense_replace + + @n_dense_layers.setter + def n_dense_layers(self, value: int): + self.first_k_dense_replace = value + def update_from_config(self, job_config: JobConfig): # Load HF config (overwrites our HF attributes) hf_model_config = AutoConfig.from_pretrained( @@ -163,6 +244,10 @@ def update_from_config(self, job_config: JobConfig): if hasattr(self, key): setattr(self, key, value) + # MoE + if hasattr(self, "qk_nope_head_dim") and hasattr(self, "qk_rope_head_dim"): + self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim + # Configure HF-specific settings to match TorchTitan settings self.tie_word_embeddings = False self.attention_bias = False @@ -170,13 +255,14 @@ def update_from_config(self, job_config: JobConfig): self.use_cache = False self.initializer_range = 1.0 # use as std for normal init in embedding - ffn_hidden_size = 4 * self.dim - ffn_hidden_size = int(2 * ffn_hidden_size / 3) - if self.ffn_dim_multiplier is not None: - ffn_hidden_size = int(self.ffn_dim_multiplier * ffn_hidden_size) - self.intermediate_size = self.multiple_of * ( - (ffn_hidden_size + self.multiple_of - 1) // self.multiple_of - ) + if self.inter_dim is None: # Only for llama model + ffn_hidden_size = 4 * self.dim + ffn_hidden_size = int(2 * ffn_hidden_size / 3) + if self.ffn_dim_multiplier is not None: + ffn_hidden_size = int(self.ffn_dim_multiplier * ffn_hidden_size) + self.intermediate_size = self.multiple_of * ( + (ffn_hidden_size + self.multiple_of - 1) // self.multiple_of + ) self.head_dim = self.dim // self.num_attention_heads From 2573be482c7d1467c9d947a526c2952d0535a4ce Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 17 Sep 2025 13:54:34 +0000 Subject: [PATCH 027/129] refactor to make it clear which args comes from which parts --- .../transformers_backend/__init__.py | 80 +++--- .../model/hf_transformers_args.py | 264 +++++++----------- 2 files changed, 141 insertions(+), 203 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 6273dd2dd3..422df5621c 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -27,51 +27,51 @@ ] #TODO(3outeille): identify that if MoE model is used, we add a moe_args field -# flavors = { -# "debugmodel": HFTransformerModelArgs( -# # n_layers=2, -# # vocab_size=2000, -# max_seq_len=2048, -# #TODO(3outeille): n_kv_heads=n_heads may be handle somewhere else -# dim=256, n_layers=6, n_heads=16, vocab_size=2000, rope_theta=500000, n_kv_heads=16 -# ), -# "medium": HFTransformerModelArgs( -# dim=1024, -# n_layers=12, -# ), -# "full": 
HFTransformerModelArgs(), -# } - flavors = { "debugmodel": HFTransformerModelArgs( - n_layers=3, - vocab_size=2000, - dim=256, - inter_dim=1024, - moe_inter_dim=256, - n_dense_layers=1, - n_heads=16, - n_group=2, - topk_group=1, - moe_args=MoEArgs( - num_experts=8, - num_shared_experts=2, - top_k=3, - score_func="softmax", - route_norm=True, - score_before_experts=False, - ), - kv_lora_rank=16, - q_lora_rank=0, - qk_rope_head_dim=16, - qk_nope_head_dim=32, - v_head_dim=32, - mscale=0.70, - # TO REMOVE: - n_kv_heads=16 + # n_layers=2, + # vocab_size=2000, + max_seq_len=2048, + #TODO(3outeille): n_kv_heads=n_heads may be handle somewhere else + dim=256, n_layers=6, n_heads=16, vocab_size=2000, rope_theta=500000, n_kv_heads=16 ), + "medium": HFTransformerModelArgs( + dim=1024, + n_layers=12, + ), + "full": HFTransformerModelArgs(), } +# flavors = { +# "debugmodel": HFTransformerModelArgs( +# n_layers=3, +# vocab_size=2000, +# dim=256, +# inter_dim=1024, +# moe_inter_dim=256, +# n_dense_layers=1, +# n_heads=16, +# n_group=2, +# topk_group=1, +# moe_args=MoEArgs( +# num_experts=8, +# num_shared_experts=2, +# top_k=3, +# score_func="softmax", +# route_norm=True, +# score_before_experts=False, +# ), +# kv_lora_rank=16, +# q_lora_rank=0, +# qk_rope_head_dim=16, +# qk_nope_head_dim=32, +# v_head_dim=32, +# mscale=0.70, +# # TO REMOVE: +# n_kv_heads=16 +# ), +# } + hf_train_spec = TrainSpec( name="hf_auto_model", model_cls=HFTransformerModel, diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 821a20f61f..f5f04ce77b 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -21,123 +21,144 @@ from .hf_llama_patch import patch_hf_llama patch_hf_llama() +class AliasedPropertiesMeta(type): + """ + This metaclass automatically creates aliased properties on a class. + It looks for a `_TITAN_TO_HF_MAPPING` dictionary in the class + namespace and generates properties based on its contents. 
+ """ + + def __new__(cls, name, bases, dct): + def _create_aliased_property(hf_name: str) -> property: + def getter(self): + return getattr(self, hf_name) + def setter(self, value): + setattr(self, hf_name, value) + return property(getter, setter) + + mapping = dct.get('_TITAN_TO_HF_MAPPING', {}) + for titan_name, hf_name in mapping.items(): + dct[titan_name] = _create_aliased_property(hf_name) + return super().__new__(cls, name, bases, dct) + +@dataclass +class TitanModelArgs: + """Arguments for the base TorchTitan model.""" + + dim: int = 4096 + n_layers: int = 32 + n_heads: int = 32 + n_kv_heads: Optional[int] = None + vocab_size: int = 128256 + multiple_of: int = 256 + ffn_dim_multiplier: Optional[float] = None + norm_eps: float = 1e-5 + rope_theta: float = 10000 + max_seq_len: int = 2048 + depth_init: bool = True + use_flex_attn: bool = False + attn_mask_type: str = "causal" + eos_id: int = 0 + moe_args: Optional[MoEArgs] = None + + +@dataclass +class DeepSeekV3Args: + """Arguments specific to DeepSeekV3 models.""" + + n_group: Optional[int] = None + topk_group: Optional[int] = None + inter_dim: Optional[int] = None + moe_inter_dim: Optional[int] = None + n_dense_layers: Optional[int] = None + n_expert_groups: Optional[int] = None + n_limited_groups: Optional[int] = None + q_lora_rank: Optional[int] = None + kv_lora_rank: Optional[int] = None + qk_nope_head_dim: Optional[int] = None + qk_rope_head_dim: Optional[int] = None + v_head_dim: Optional[int] = None + original_seq_len: Optional[int] = None + rope_factor: Optional[float] = None + beta_fast: Optional[int] = None + beta_slow: Optional[int] = None + mscale: Optional[float] = None + + @dataclass -class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): +class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs, metaclass=AliasedPropertiesMeta): """ Configuration class that bridges TorchTitan and HuggingFace Transformers naming conventions. Uses properties to provide TorchTitan-style access while maintaining HuggingFace compatibility. 
""" + _TITAN_TO_HF_MAPPING = { + # TorchTitan Name: HuggingFace Name + "dim": "hidden_size", + "n_layers": "num_hidden_layers", + "n_heads": "num_attention_heads", + "n_kv_heads": "num_key_value_heads", + "norm_eps": "rms_norm_eps", + "max_seq_len": "max_position_embeddings", + "eos_id": "eos_token_id", + # DeepSeekV3 specific aliases + "inter_dim": "intermediate_size", + "n_dense_layers": "first_k_dense_replace", + } + def __init__( self, - # TorchTitan args - dim: int = 4096, - n_layers: int = 32, - n_heads: int = 32, - n_kv_heads: Optional[int] = None, - vocab_size: int = 128256, - multiple_of: int = 256, - ffn_dim_multiplier: Optional[float] = None, - norm_eps: float = 1e-5, - rope_theta: float = 10000, - max_seq_len: int = 2048, - depth_init: bool = True, - use_flex_attn: bool = False, - attn_mask_type: str = "causal", - eos_id: int = 0, - moe_args: Optional[MoEArgs] = None, - # DeepSeekV3 specific args - n_group: Optional[int] = None, - topk_group: Optional[int] = None, - inter_dim: Optional[int] = None, - moe_inter_dim: Optional[int] = None, - n_dense_layers: Optional[int] = None, - n_expert_groups: Optional[int] = None, - n_limited_groups: Optional[int] = None, - q_lora_rank: Optional[int] = None, - kv_lora_rank: Optional[int] = None, - qk_nope_head_dim: Optional[int] = None, - qk_rope_head_dim: Optional[int] = None, - v_head_dim: Optional[int] = None, - original_seq_len: Optional[int] = None, - rope_factor: Optional[float] = None, - beta_fast: Optional[int] = None, - beta_slow: Optional[int] = None, - mscale: Optional[float] = None, + titan_args: Optional[TitanModelArgs] = None, + deepseek_v3_args: Optional[DeepSeekV3Args] = None, # HuggingFace specific args attn_implementation: str = "sdpa", **kwargs, ): + titan_args = titan_args or TitanModelArgs() + deepseek_v3_args = deepseek_v3_args or DeepSeekV3Args() + # Store TorchTitan-specific args (no HF equivalent) - self.multiple_of = multiple_of - self.ffn_dim_multiplier = ffn_dim_multiplier - self.depth_init = depth_init - self.use_flex_attn = use_flex_attn - self.attn_mask_type = attn_mask_type + self.multiple_of = titan_args.multiple_of + self.ffn_dim_multiplier = titan_args.ffn_dim_multiplier + self.depth_init = titan_args.depth_init + self.use_flex_attn = titan_args.use_flex_attn + self.attn_mask_type = titan_args.attn_mask_type # HuggingFace specific args self.attn_implementation = attn_implementation # For DeepSeekV3, setting q_lora_rank to 0 in TorchTitan is equivalent to # setting it to None in HuggingFace. 
+ q_lora_rank = deepseek_v3_args.q_lora_rank if q_lora_rank == 0: q_lora_rank = None + deepseek_v3_args.q_lora_rank = q_lora_rank - self._passed_args = dict( - dim=dim, - n_layers=n_layers, - n_heads=n_heads, - n_kv_heads=n_kv_heads, - vocab_size=vocab_size, - multiple_of=multiple_of, - ffn_dim_multiplier=ffn_dim_multiplier, - norm_eps=norm_eps, - rope_theta=rope_theta, - max_seq_len=max_seq_len, - depth_init=depth_init, - use_flex_attn=use_flex_attn, - attn_mask_type=attn_mask_type, - eos_id=eos_id, - attn_implementation=attn_implementation, - # DeepSeekV3 specific args - n_group=n_group, - topk_group=topk_group, - inter_dim=inter_dim, - moe_inter_dim=moe_inter_dim, - n_dense_layers=n_dense_layers, - n_expert_groups=n_expert_groups, - n_limited_groups=n_limited_groups, - q_lora_rank=q_lora_rank, - kv_lora_rank=kv_lora_rank, - qk_nope_head_dim=qk_nope_head_dim, - qk_rope_head_dim=qk_rope_head_dim, - v_head_dim=v_head_dim, - original_seq_len=original_seq_len, - rope_factor=rope_factor, - beta_fast=beta_fast, - beta_slow=beta_slow, - mscale=mscale, - **kwargs, - ) + self._passed_args = { + **titan_args.__dict__, + **deepseek_v3_args.__dict__, + "attn_implementation": attn_implementation, + } + self._passed_args.update(kwargs) - if moe_args is not None: + if titan_args.moe_args is not None: # MoE args for HF config # HF uses different names for these + moe_args = titan_args.moe_args self.num_experts_per_tok = moe_args.top_k self.n_routed_experts = moe_args.num_experts self.n_shared_experts = moe_args.num_shared_experts - self.moe_intermediate_size = moe_inter_dim + self.moe_intermediate_size = deepseek_v3_args.moe_inter_dim self._passed_args.update( dict( num_experts_per_tok=moe_args.top_k, n_routed_experts=moe_args.num_experts, n_shared_experts=moe_args.num_shared_experts, - moe_intermediate_size=moe_inter_dim, + moe_intermediate_size=deepseek_v3_args.moe_inter_dim, ) ) - def __repr__(self) -> str: # HFTransformerModelArgs is a dataclass that also inherits from PretrainedConfig. 
# PretrainedConfig has a __repr__ that serializes the object to JSON, but it @@ -148,88 +169,6 @@ def __repr__(self) -> str: args_str = "\n".join(args_lines) return f"{self.__class__.__name__}(\n{args_str}\n)" - @property - def dim(self) -> int: - """TorchTitan: Model dimension (alias for HF hidden_size)""" - return self.hidden_size - - @dim.setter - def dim(self, value: int): - self.hidden_size = value - - @property - def n_layers(self) -> int: - """TorchTitan: Number of layers (alias for HF num_hidden_layers)""" - return self.num_hidden_layers - - @n_layers.setter - def n_layers(self, value: int): - self.num_hidden_layers = value - - @property - def n_heads(self) -> int: - """TorchTitan: Number of attention heads (alias for HF num_attention_heads)""" - return self.num_attention_heads - - @n_heads.setter - def n_heads(self, value: int): - self.num_attention_heads = value - - @property - def n_kv_heads(self) -> Optional[int]: - """TorchTitan: Number of key-value heads (alias for HF num_key_value_heads)""" - return self.num_key_value_heads - - @n_kv_heads.setter - def n_kv_heads(self, value: Optional[int]): - self.num_key_value_heads = value - - @property - def norm_eps(self) -> float: - """TorchTitan: Layer norm epsilon (alias for HF rms_norm_eps)""" - return self.rms_norm_eps - - @norm_eps.setter - def norm_eps(self, value: float): - self.rms_norm_eps = value - - @property - def max_seq_len(self) -> int: - """TorchTitan: Maximum sequence length (alias for HF max_position_embeddings)""" - return self.max_position_embeddings - - @max_seq_len.setter - def max_seq_len(self, value: int): - self.max_position_embeddings = value - - @property - def eos_id(self) -> int: - """TorchTitan: End of sequence token ID (alias for HF eos_token_id)""" - return self.eos_token_id - - @eos_id.setter - def eos_id(self, value: int): - self.eos_token_id = value - - # === DeepSeekV3 specific properties === - @property - def inter_dim(self) -> int: - """TorchTitan: Intermediate dimension (alias for HF intermediate_size)""" - return self.intermediate_size - - @inter_dim.setter - def inter_dim(self, value: int): - self.intermediate_size = value - - @property - def n_dense_layers(self) -> int: - """TorchTitan: Number of dense layers (alias for HF first_k_dense_replace)""" - return self.first_k_dense_replace - - @n_dense_layers.setter - def n_dense_layers(self, value: int): - self.first_k_dense_replace = value - def update_from_config(self, job_config: JobConfig): # Load HF config (overwrites our HF attributes) hf_model_config = AutoConfig.from_pretrained( @@ -316,7 +255,6 @@ def _format_module(module: nn.Module, prefix: str = ""): _format_module(model, " ") - class HFTransformerModel(nn.Module): def __init__(self, model_args: HFTransformerModelArgs): super().__init__() From 46ae0a3caa1dfa02b1a00b426e0bfad74eba7be8 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 18 Sep 2025 08:05:27 +0000 Subject: [PATCH 028/129] fix refactor and simplify things --- .../transformers_backend/__init__.py | 136 ++++++++---- .../model/hf_transformers_args.py | 201 +++++++----------- 2 files changed, 178 insertions(+), 159 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 422df5621c..59900b0408 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -4,7 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root 
directory of this source tree. -import dataclasses +from dataclasses import dataclass +from typing import Optional from torchtitan.components.loss import build_cross_entropy_loss from torchtitan.components.lr_scheduler import build_lr_schedulers @@ -20,58 +21,113 @@ from torchtitan.models.moe import MoEArgs + __all__ = [ "HFTransformerModelArgs", "HFTransformerModel", "hf_transformers_configs", ] -#TODO(3outeille): identify that if MoE model is used, we add a moe_args field -flavors = { - "debugmodel": HFTransformerModelArgs( - # n_layers=2, - # vocab_size=2000, - max_seq_len=2048, - #TODO(3outeille): n_kv_heads=n_heads may be handle somewhere else - dim=256, n_layers=6, n_heads=16, vocab_size=2000, rope_theta=500000, n_kv_heads=16 - ), - "medium": HFTransformerModelArgs( - dim=1024, - n_layers=12, - ), - "full": HFTransformerModelArgs(), -} +@dataclass +class TitanModelArgs: + """Arguments for the base TorchTitan model.""" + + dim: int = 4096 + n_layers: int = 32 + n_heads: int = 32 + n_kv_heads: Optional[int] = None + vocab_size: int = 128256 + multiple_of: int = 256 + ffn_dim_multiplier: Optional[float] = None + norm_eps: float = 1e-5 + rope_theta: float = 10000 + max_seq_len: int = 2048 + depth_init: bool = True + use_flex_attn: bool = False + attn_mask_type: str = "causal" + eos_id: int = 0 + + +@dataclass +class DeepSeekV3Args: + """Arguments specific to DeepSeekV3 models.""" + moe_args: Optional[MoEArgs] = None + n_group: Optional[int] = None + topk_group: Optional[int] = None + inter_dim: Optional[int] = None + moe_inter_dim: Optional[int] = None + n_dense_layers: Optional[int] = None + n_expert_groups: Optional[int] = None + n_limited_groups: Optional[int] = None + q_lora_rank: Optional[int] = None + kv_lora_rank: Optional[int] = None + qk_nope_head_dim: Optional[int] = None + qk_rope_head_dim: Optional[int] = None + v_head_dim: Optional[int] = None + original_seq_len: Optional[int] = None + rope_factor: Optional[float] = None + beta_fast: Optional[int] = None + beta_slow: Optional[int] = None + mscale: Optional[float] = None +# #TODO(3outeille): identify that if MoE model is used, we add a moe_args field # flavors = { # "debugmodel": HFTransformerModelArgs( -# n_layers=3, -# vocab_size=2000, -# dim=256, -# inter_dim=1024, -# moe_inter_dim=256, -# n_dense_layers=1, -# n_heads=16, -# n_group=2, -# topk_group=1, -# moe_args=MoEArgs( -# num_experts=8, -# num_shared_experts=2, -# top_k=3, -# score_func="softmax", -# route_norm=True, -# score_before_experts=False, +# titan_args=TitanModelArgs( +# max_seq_len=2048, +# dim=256, +# n_layers=6, +# n_heads=16, +# n_kv_heads=16, +# vocab_size=2000, +# rope_theta=500000 # ), -# kv_lora_rank=16, -# q_lora_rank=0, -# qk_rope_head_dim=16, -# qk_nope_head_dim=32, -# v_head_dim=32, -# mscale=0.70, -# # TO REMOVE: -# n_kv_heads=16 +# ), +# "medium": HFTransformerModelArgs( +# titan_args=TitanModelArgs( +# dim=1024, +# n_layers=12, +# ), +# ), +# "full": HFTransformerModelArgs( +# titan_args=TitanModelArgs(), # ), # } +# DeepSeekV3 flavors +flavors = { + "debugmodel": HFTransformerModelArgs( + titan_args=TitanModelArgs( + dim=256, + n_layers=3, + n_heads=16, + n_kv_heads=16, + vocab_size=2000, + ), + deepseek_v3_args=DeepSeekV3Args( + inter_dim=1024, + moe_inter_dim=256, + n_dense_layers=1, + n_group=2, + topk_group=1, + kv_lora_rank=16, + q_lora_rank=0, + qk_nope_head_dim=32, + qk_rope_head_dim=16, + v_head_dim=32, + mscale=0.70, + moe_args=MoEArgs( + num_experts=8, + num_shared_experts=2, + top_k=3, + score_func="softmax", + route_norm=True, + 
score_before_experts=False, + ), + ) + ), +} + hf_train_spec = TrainSpec( name="hf_auto_model", model_cls=HFTransformerModel, diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index f5f04ce77b..2fa18e9abb 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -16,109 +16,59 @@ from transformers.configuration_utils import PretrainedConfig from transformers.modeling_outputs import CausalLMOutputWithPast -from torchtitan.models.moe import MoEArgs - from .hf_llama_patch import patch_hf_llama patch_hf_llama() -class AliasedPropertiesMeta(type): - """ - This metaclass automatically creates aliased properties on a class. - It looks for a `_TITAN_TO_HF_MAPPING` dictionary in the class - namespace and generates properties based on its contents. - """ - - def __new__(cls, name, bases, dct): - def _create_aliased_property(hf_name: str) -> property: - def getter(self): - return getattr(self, hf_name) - def setter(self, value): - setattr(self, hf_name, value) - return property(getter, setter) - - mapping = dct.get('_TITAN_TO_HF_MAPPING', {}) - for titan_name, hf_name in mapping.items(): - dct[titan_name] = _create_aliased_property(hf_name) - return super().__new__(cls, name, bases, dct) - -@dataclass -class TitanModelArgs: - """Arguments for the base TorchTitan model.""" - - dim: int = 4096 - n_layers: int = 32 - n_heads: int = 32 - n_kv_heads: Optional[int] = None - vocab_size: int = 128256 - multiple_of: int = 256 - ffn_dim_multiplier: Optional[float] = None - norm_eps: float = 1e-5 - rope_theta: float = 10000 - max_seq_len: int = 2048 - depth_init: bool = True - use_flex_attn: bool = False - attn_mask_type: str = "causal" - eos_id: int = 0 - moe_args: Optional[MoEArgs] = None - - @dataclass -class DeepSeekV3Args: - """Arguments specific to DeepSeekV3 models.""" - - n_group: Optional[int] = None - topk_group: Optional[int] = None - inter_dim: Optional[int] = None - moe_inter_dim: Optional[int] = None - n_dense_layers: Optional[int] = None - n_expert_groups: Optional[int] = None - n_limited_groups: Optional[int] = None - q_lora_rank: Optional[int] = None - kv_lora_rank: Optional[int] = None - qk_nope_head_dim: Optional[int] = None - qk_rope_head_dim: Optional[int] = None - v_head_dim: Optional[int] = None - original_seq_len: Optional[int] = None - rope_factor: Optional[float] = None - beta_fast: Optional[int] = None - beta_slow: Optional[int] = None - mscale: Optional[float] = None - - -@dataclass -class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs, metaclass=AliasedPropertiesMeta): +class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): """ Configuration class that bridges TorchTitan and HuggingFace Transformers naming conventions. Uses properties to provide TorchTitan-style access while maintaining HuggingFace compatibility. + Properties are created dynamically based on which arguments are provided. 
""" - _TITAN_TO_HF_MAPPING = { - # TorchTitan Name: HuggingFace Name - "dim": "hidden_size", - "n_layers": "num_hidden_layers", - "n_heads": "num_attention_heads", - "n_kv_heads": "num_key_value_heads", - "norm_eps": "rms_norm_eps", - "max_seq_len": "max_position_embeddings", - "eos_id": "eos_token_id", - # DeepSeekV3 specific aliases - "inter_dim": "intermediate_size", - "n_dense_layers": "first_k_dense_replace", + # Define all possible mappings organized by argument type + _ALL_MAPPINGS = { + "base": { + # Core TorchTitan mappings (always available) + "dim": "hidden_size", + "n_layers": "num_hidden_layers", + "n_heads": "num_attention_heads", + "n_kv_heads": "num_key_value_heads", + "norm_eps": "rms_norm_eps", + "max_seq_len": "max_position_embeddings", + "eos_id": "eos_token_id", + }, + "deepseek_v3": { + # DeepSeekV3 specific mappings (only when deepseek_v3_args provided) + "inter_dim": "intermediate_size", + "n_dense_layers": "first_k_dense_replace", + }, } def __init__( self, - titan_args: Optional[TitanModelArgs] = None, - deepseek_v3_args: Optional[DeepSeekV3Args] = None, + titan_args, + deepseek_v3_args=None, # HuggingFace specific args attn_implementation: str = "sdpa", **kwargs, ): - titan_args = titan_args or TitanModelArgs() - deepseek_v3_args = deepseek_v3_args or DeepSeekV3Args() + assert titan_args is not None, "titan_args is required" + + active_mappings = {} + + active_mappings.update(self._ALL_MAPPINGS["base"]) + + if deepseek_v3_args is not None: + active_mappings.update(self._ALL_MAPPINGS["deepseek_v3"]) + + self._active_mappings = active_mappings + + self._create_dynamic_properties() - # Store TorchTitan-specific args (no HF equivalent) + # Fill all TorchTitan-specific args (no HF equivalent) self.multiple_of = titan_args.multiple_of self.ffn_dim_multiplier = titan_args.ffn_dim_multiplier self.depth_init = titan_args.depth_init @@ -128,36 +78,49 @@ def __init__( # HuggingFace specific args self.attn_implementation = attn_implementation - # For DeepSeekV3, setting q_lora_rank to 0 in TorchTitan is equivalent to - # setting it to None in HuggingFace. - q_lora_rank = deepseek_v3_args.q_lora_rank - if q_lora_rank == 0: - q_lora_rank = None - deepseek_v3_args.q_lora_rank = q_lora_rank - - self._passed_args = { - **titan_args.__dict__, - **deepseek_v3_args.__dict__, - "attn_implementation": attn_implementation, - } + # Start with passed_args as just titan_args + self._passed_args = {**titan_args.__dict__, "attn_implementation": attn_implementation} self._passed_args.update(kwargs) - if titan_args.moe_args is not None: - # MoE args for HF config - # HF uses different names for these - moe_args = titan_args.moe_args - self.num_experts_per_tok = moe_args.top_k - self.n_routed_experts = moe_args.num_experts - self.n_shared_experts = moe_args.num_shared_experts - self.moe_intermediate_size = deepseek_v3_args.moe_inter_dim - self._passed_args.update( - dict( - num_experts_per_tok=moe_args.top_k, - n_routed_experts=moe_args.num_experts, - n_shared_experts=moe_args.num_shared_experts, - moe_intermediate_size=deepseek_v3_args.moe_inter_dim, + # If DeepSeekV3 args are provided, fill the rest + if deepseek_v3_args is not None: + # For DeepSeekV3, setting q_lora_rank to 0 in TorchTitan is equivalent to + # setting it to None in HuggingFace. 
+ q_lora_rank = deepseek_v3_args.q_lora_rank + if q_lora_rank == 0: + q_lora_rank = None + deepseek_v3_args.q_lora_rank = q_lora_rank + + self._passed_args.update(**deepseek_v3_args.__dict__) + + if deepseek_v3_args.moe_args is not None: + moe_args = deepseek_v3_args.moe_args + self.num_experts_per_tok = moe_args.top_k + self.n_routed_experts = moe_args.num_experts + self.n_shared_experts = moe_args.num_shared_experts + self.moe_intermediate_size = deepseek_v3_args.moe_inter_dim + self._passed_args.update( + dict( + num_experts_per_tok=moe_args.top_k, + n_routed_experts=moe_args.num_experts, + n_shared_experts=moe_args.num_shared_experts, + moe_intermediate_size=deepseek_v3_args.moe_inter_dim, + ) ) - ) + + def _create_dynamic_properties(self): + """Create properties dynamically based on active mappings.""" + def _create_property(hf_name: str) -> property: + def getter(self): + return getattr(self, hf_name) + def setter(self, value): + setattr(self, hf_name, value) + return property(getter, setter) + + for titan_name, hf_name in self._active_mappings.items(): + # Create getter/setter for attribute that don't already exist + if not hasattr(self.__class__, titan_name): + setattr(self.__class__, titan_name, _create_property(hf_name)) def __repr__(self) -> str: # HFTransformerModelArgs is a dataclass that also inherits from PretrainedConfig. @@ -194,14 +157,14 @@ def update_from_config(self, job_config: JobConfig): self.use_cache = False self.initializer_range = 1.0 # use as std for normal init in embedding - if self.inter_dim is None: # Only for llama model - ffn_hidden_size = 4 * self.dim - ffn_hidden_size = int(2 * ffn_hidden_size / 3) - if self.ffn_dim_multiplier is not None: - ffn_hidden_size = int(self.ffn_dim_multiplier * ffn_hidden_size) - self.intermediate_size = self.multiple_of * ( - (ffn_hidden_size + self.multiple_of - 1) // self.multiple_of - ) + # if self.inter_dim is None: # Only for llama model + ffn_hidden_size = 4 * self.dim + ffn_hidden_size = int(2 * ffn_hidden_size / 3) + if self.ffn_dim_multiplier is not None: + ffn_hidden_size = int(self.ffn_dim_multiplier * ffn_hidden_size) + self.intermediate_size = self.multiple_of * ( + (ffn_hidden_size + self.multiple_of - 1) // self.multiple_of + ) self.head_dim = self.dim // self.num_attention_heads From b33d5758763963786d5c2fedfa30134d87f1bfb9 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 18 Sep 2025 08:21:35 +0000 Subject: [PATCH 029/129] hacky way to switch flavors for now --- .../transformers_backend/__init__.py | 111 +++++++++--------- .../transformers_backend/compare_tt_hf_run.sh | 34 +++++- .../model/hf_transformers_args.py | 17 +-- 3 files changed, 97 insertions(+), 65 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 59900b0408..de81b18794 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -3,7 +3,7 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
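(For the LLaMA-style intermediate_size computation kept in update_from_config a few hunks above, the debugmodel numbers work out as follows — a standalone sketch, not part of the patch, assuming dim=256, multiple_of=256 and no ffn_dim_multiplier:)

dim, multiple_of, ffn_dim_multiplier = 256, 256, None

ffn_hidden_size = 4 * dim                       # 1024
ffn_hidden_size = int(2 * ffn_hidden_size / 3)  # 682
if ffn_dim_multiplier is not None:
    ffn_hidden_size = int(ffn_dim_multiplier * ffn_hidden_size)
intermediate_size = multiple_of * ((ffn_hidden_size + multiple_of - 1) // multiple_of)
print(intermediate_size)  # 768, i.e. 682 rounded up to the next multiple of 256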
- +import os from dataclasses import dataclass from typing import Optional @@ -71,62 +71,65 @@ class DeepSeekV3Args: mscale: Optional[float] = None # #TODO(3outeille): identify that if MoE model is used, we add a moe_args field -# flavors = { -# "debugmodel": HFTransformerModelArgs( -# titan_args=TitanModelArgs( -# max_seq_len=2048, -# dim=256, -# n_layers=6, -# n_heads=16, -# n_kv_heads=16, -# vocab_size=2000, -# rope_theta=500000 -# ), -# ), -# "medium": HFTransformerModelArgs( -# titan_args=TitanModelArgs( -# dim=1024, -# n_layers=12, -# ), -# ), -# "full": HFTransformerModelArgs( -# titan_args=TitanModelArgs(), -# ), -# } -# DeepSeekV3 flavors -flavors = { - "debugmodel": HFTransformerModelArgs( - titan_args=TitanModelArgs( - dim=256, - n_layers=3, - n_heads=16, - n_kv_heads=16, - vocab_size=2000, +if os.environ.get("MODEL_TYPE") == "llama": + print("Using llama model") + flavors = { + "debugmodel": HFTransformerModelArgs( + titan_args=TitanModelArgs( + max_seq_len=2048, + dim=256, + n_layers=6, + n_heads=16, + n_kv_heads=16, + vocab_size=2000, + rope_theta=500000 + ), + ), + "medium": HFTransformerModelArgs( + titan_args=TitanModelArgs( + dim=1024, + n_layers=12, + ), + ), + "full": HFTransformerModelArgs( + titan_args=TitanModelArgs(), ), - deepseek_v3_args=DeepSeekV3Args( - inter_dim=1024, - moe_inter_dim=256, - n_dense_layers=1, - n_group=2, - topk_group=1, - kv_lora_rank=16, - q_lora_rank=0, - qk_nope_head_dim=32, - qk_rope_head_dim=16, - v_head_dim=32, - mscale=0.70, - moe_args=MoEArgs( - num_experts=8, - num_shared_experts=2, - top_k=3, - score_func="softmax", - route_norm=True, - score_before_experts=False, + } +else: + print("Using deepseek model") + flavors = { + "debugmodel": HFTransformerModelArgs( + titan_args=TitanModelArgs( + vocab_size=2000, + dim=256, + n_layers=3, + n_heads=16, + n_kv_heads=16, ), - ) - ), -} + deepseek_v3_args=DeepSeekV3Args( + inter_dim=1024, + moe_inter_dim=256, + n_dense_layers=1, + n_group=2, + topk_group=1, + kv_lora_rank=16, + q_lora_rank=0, + qk_nope_head_dim=32, + qk_rope_head_dim=16, + v_head_dim=32, + mscale=0.70, + moe_args=MoEArgs( + num_experts=8, + num_shared_experts=2, + top_k=3, + score_func="softmax", + route_norm=True, + score_before_experts=False, + ), + ) + ), + } hf_train_spec = TrainSpec( name="hf_auto_model", diff --git a/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh b/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh index 0461ebfb7b..e49a2a5803 100755 --- a/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh +++ b/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh @@ -12,6 +12,35 @@ set -o pipefail NGPU=${NGPU:-"1"} export LOG_RANK=${LOG_RANK:-0} +# Parse command line arguments for model selection +MODEL_TYPE=${1:-"llama"} +export MODEL_TYPE + +# Set model names based on argument +case $MODEL_TYPE in + "llama") + TT_MODEL_NAME="llama3" + HF_MODEL_NAME="meta-llama/Llama-3.2-1B" + ;; + "deepseek") + TT_MODEL_NAME="deepseek_v3" + HF_MODEL_NAME="deepseek-ai/DeepSeek-V3" + ;; + *) + echo "Error: Unsupported model type '$MODEL_TYPE'" + echo "Usage: $0 [llama|deepseek] [additional_args...]" + echo " llama - Uses llama3 for TT and meta-llama/Llama-3.2-1B for HF" + echo " deepseek - Uses deepseek_v3 for TT and deepseek-ai/DeepSeek-V3 for HF" + exit 1 + ;; +esac + +echo "Using model type: $MODEL_TYPE" +echo " TT model: $TT_MODEL_NAME" +echo " HF model: $HF_MODEL_NAME" + +# Shift to remove the model type argument, pass remaining args to training +shift run_tt() { echo 
"##############################################" @@ -23,7 +52,7 @@ run_tt() { CUDA_VISIBLE_DEVICES=0 \ torchrun --nproc_per_node=${NGPU} --master_port 1234 --rdzv_backend c10d --rdzv_endpoint="localhost:0" \ --local-ranks-filter ${LOG_RANK} --role rank --tee 3 \ - -m torchtitan.train --job.config_file ${TT_CONFIG} --training.seed 42 --training.deterministic "$@" + -m torchtitan.train --job.config_file ${TT_CONFIG} --training.seed 42 --training.deterministic --model.name ${TT_MODEL_NAME} "$@" } run_hf() { @@ -36,10 +65,9 @@ run_hf() { CUDA_VISIBLE_DEVICES=1 \ torchrun --nproc_per_node=${NGPU} --master_port 1235 --rdzv_backend c10d --rdzv_endpoint="localhost:0" \ --local-ranks-filter ${LOG_RANK} --role rank --tee 3 \ - -m torchtitan.train --job.config_file ${HF_CONFIG} --training.seed 42 --training.deterministic "$@" + -m torchtitan.train --job.config_file ${HF_CONFIG} --training.seed 42 --training.deterministic --model.name ${HF_MODEL_NAME} "$@" } - TT_LOG="tt_run.log" HF_LOG="hf_run.log" DIFF_LOG="run_diff.log" diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 2fa18e9abb..4366467129 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -157,20 +157,21 @@ def update_from_config(self, job_config: JobConfig): self.use_cache = False self.initializer_range = 1.0 # use as std for normal init in embedding - # if self.inter_dim is None: # Only for llama model - ffn_hidden_size = 4 * self.dim - ffn_hidden_size = int(2 * ffn_hidden_size / 3) - if self.ffn_dim_multiplier is not None: - ffn_hidden_size = int(self.ffn_dim_multiplier * ffn_hidden_size) - self.intermediate_size = self.multiple_of * ( - (ffn_hidden_size + self.multiple_of - 1) // self.multiple_of - ) + if not hasattr(self, "inter_dim"): # Only for llama model + ffn_hidden_size = 4 * self.dim + ffn_hidden_size = int(2 * ffn_hidden_size / 3) + if self.ffn_dim_multiplier is not None: + ffn_hidden_size = int(self.ffn_dim_multiplier * ffn_hidden_size) + self.intermediate_size = self.multiple_of * ( + (ffn_hidden_size + self.multiple_of - 1) // self.multiple_of + ) self.head_dim = self.dim // self.num_attention_heads return self def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: + #TODO(3outeille): adapt to handle MoE nparams = sum(p.numel() for p in model.parameters()) nparams_embedding = sum( sum(p.numel() for p in m.parameters()) From 007f00555724cf6e59957cef3cdf1322b6e57178 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 18 Sep 2025 13:48:53 +0000 Subject: [PATCH 030/129] hf deepseek train while matching same param counts as tt deepseek --- .../experiments/transformers_backend/__init__.py | 10 ++++++---- .../transformers_backend/model/hf_transformers_args.py | 8 +++++--- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index de81b18794..06d8524c14 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -69,6 +69,7 @@ class DeepSeekV3Args: beta_fast: Optional[int] = None beta_slow: Optional[int] = None mscale: Optional[float] = None + partial_rotary_factor: Optional[float] = None # #TODO(3outeille): identify that if MoE model is used, we add a moe_args field @@ -108,16 
+109,17 @@ class DeepSeekV3Args: n_kv_heads=16, ), deepseek_v3_args=DeepSeekV3Args( + partial_rotary_factor=4.0, inter_dim=1024, moe_inter_dim=256, n_dense_layers=1, n_group=2, topk_group=1, - kv_lora_rank=16, + kv_lora_rank=512, q_lora_rank=0, - qk_nope_head_dim=32, - qk_rope_head_dim=16, - v_head_dim=32, + qk_nope_head_dim=128, + qk_rope_head_dim=64, + v_head_dim=128, mscale=0.70, moe_args=MoEArgs( num_experts=8, diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 4366467129..2e3b3e93f0 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -29,7 +29,7 @@ class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): """ # Define all possible mappings organized by argument type - _ALL_MAPPINGS = { + _TT_TO_HF_MAPPINGS = { "base": { # Core TorchTitan mappings (always available) "dim": "hidden_size", @@ -59,10 +59,10 @@ def __init__( active_mappings = {} - active_mappings.update(self._ALL_MAPPINGS["base"]) + active_mappings.update(self._TT_TO_HF_MAPPINGS["base"]) if deepseek_v3_args is not None: - active_mappings.update(self._ALL_MAPPINGS["deepseek_v3"]) + active_mappings.update(self._TT_TO_HF_MAPPINGS["deepseek_v3"]) self._active_mappings = active_mappings @@ -93,6 +93,8 @@ def __init__( self._passed_args.update(**deepseek_v3_args.__dict__) + self.partial_rotary_factor = deepseek_v3_args.partial_rotary_factor + if deepseek_v3_args.moe_args is not None: moe_args = deepseek_v3_args.moe_args self.num_experts_per_tok = moe_args.top_k From dd2b04cf947f97c0735b2f8e1b8f188440ee1c29 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 22 Sep 2025 09:31:00 +0000 Subject: [PATCH 031/129] wtf deepseek q_proj weight init differ ??? 
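(One way to localize this kind of init mismatch is to dump per-parameter statistics for each model and diff the two outputs, in the same spirit as compare_tt_hf_run.sh diffs the training logs. A rough sketch, not part of the patch; `model` is assumed to be an already-built nn.Module, run once for the TorchTitan model and once for the HF one:)

import torch

def dump_param_stats(model: torch.nn.Module) -> None:
    # Print deterministic, per-tensor statistics so two runs can be diffed line by line.
    for name, p in sorted(model.named_parameters(), key=lambda kv: kv[0]):
        p = p.detach().float()
        print(f"{name} shape={tuple(p.shape)} mean={p.mean().item():+.6e} std={p.std().item():+.6e}")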
--- .../transformers_backend/__init__.py | 7 +- .../model/hf_deepseek_v3_patch.py | 115 ++++++++++++++++++ .../model/hf_transformers_args.py | 3 - 3 files changed, 121 insertions(+), 4 deletions(-) create mode 100644 torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 06d8524c14..0cecbfb199 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -20,6 +20,9 @@ from .model.hf_transformers_args import HFTransformerModelArgs, HFTransformerModel from torchtitan.models.moe import MoEArgs +from .model.hf_llama_patch import patch_hf_llama +from .model.hf_deepseek_v3_patch import patch_hf_deepseek_v3 + __all__ = [ @@ -75,6 +78,7 @@ class DeepSeekV3Args: if os.environ.get("MODEL_TYPE") == "llama": print("Using llama model") + patch_hf_llama() flavors = { "debugmodel": HFTransformerModelArgs( titan_args=TitanModelArgs( @@ -99,12 +103,13 @@ class DeepSeekV3Args: } else: print("Using deepseek model") + patch_hf_deepseek_v3() flavors = { "debugmodel": HFTransformerModelArgs( titan_args=TitanModelArgs( vocab_size=2000, dim=256, - n_layers=3, + n_layers=2, n_heads=16, n_kv_heads=16, ), diff --git a/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py b/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py new file mode 100644 index 0000000000..53769b9cc9 --- /dev/null +++ b/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py @@ -0,0 +1,115 @@ + + +import torch.nn as nn + +from transformers.models.deepseek_v3.configuration_deepseek_v3 import DeepseekV3Config +from transformers.models.deepseek_v3.modeling_deepseek_v3 import DeepseekV3Attention, DeepseekV3MLP, DeepseekV3MoE, DeepseekV3DecoderLayer +from transformers.modeling_utils import PreTrainedModel + +_original_deepseek_v3_decoder_layer_init = DeepseekV3DecoderLayer.__init__ + +def _deepseek_v3_decoder_layer_init_patched(self, config: DeepseekV3Config, layer_idx: int): + _original_deepseek_v3_decoder_layer_init(self, config, layer_idx) + + self.mlp.layer_idx = layer_idx + + if hasattr(self.mlp, 'experts'): + for expert in self.mlp.experts: + expert.layer_idx = layer_idx + self.mlp.shared_experts.layer_idx = layer_idx + +def _initialize_weights_patched(self, module): + # NOTE(3outeille): monkey-patch PreTrainedModel to handle meta device initialization correctly + # The default _initialize_weights sets _is_hf_initialized = True even on a meta device, + # which prevents subsequent proper initialization. + if getattr(module, "_is_hf_initialized", False): + return + + for param in module.parameters(recurse=True): + if param.device.type == "meta": + return + + # If not on a meta device, call the original weight initialization + self._init_weights(module) + module._is_hf_initialized = True + +def _init_weights_patched(self, module): + """ + Patched version of _init_weights to match TorchTitan's initialization for Llama. + `self` is a LlamaPreTrainedModel instance. + """ + config = self.config + + #TODO(3outeille): only out_proj/down_proj needs std=init_std. 
so we can refactor to loop over module and only init last layer with std=init_std + if isinstance(module, (DeepseekV3Attention, DeepseekV3MLP, DeepseekV3MoE)): + layer_idx = module.layer_idx + init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 + + if isinstance(module, DeepseekV3Attention): + print("DeepseekV3Attention", module.layer_idx) + if hasattr(module, 'q_proj'): + nn.init.trunc_normal_(module.q_proj.weight, mean=0.0, std=0.02) + # NOTE(3outeille): module.smart_apply is called on parent class, we have 3 child so init will be called 3 times + # That's why we need to set _is_hf_initialized to True to avoid triple initialization + print(f"module.q_proj.weight: {module.q_proj.weight}") + else: + nn.init.trunc_normal_(module.q_a_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(module.q_b_proj.weight, mean=0.0, std=0.02) + + nn.init.trunc_normal_(module.kv_a_proj_with_mqa.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(module.kv_b_proj.weight, mean=0.0, std=0.02) + + nn.init.trunc_normal_(module.o_proj.weight, mean=0.0, std=init_std) + print("=====") + + elif isinstance(module, DeepseekV3MLP): + nn.init.trunc_normal_(module.gate_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(module.up_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(module.down_proj.weight, mean=0.0, std=init_std) + + elif isinstance(module, DeepseekV3MoE): + nn.init.trunc_normal_(module.gate.weight, mean=0.0, std=init_std) + for expert in module.experts: + nn.init.trunc_normal_(expert.gate_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(expert.up_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(expert.down_proj.weight, mean=0.0, std=init_std) + + nn.init.trunc_normal_(module.shared_experts.gate_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(module.shared_experts.up_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(module.shared_experts.down_proj.weight, mean=0.0, std=init_std) + + elif module is getattr(self, "lm_head", None): #TODO(3outeille): find a better way to detect lm_head + final_out_std = config.hidden_size**-0.5 + cutoff_factor = 3 + nn.init.trunc_normal_( + module.weight, + mean=0.0, + std=final_out_std, + a=-cutoff_factor * final_out_std, + b=cutoff_factor * final_out_std, + ) + if module.bias is not None: + module.bias.data.zero_() + + elif isinstance(module, nn.Embedding): + std = config.initializer_range + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + elif ( + isinstance(module, (nn.GroupNorm, nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)) + or "LayerNorm" in module.__class__.__name__ + or "RMSNorm" in module.__class__.__name__ + ): + # Norms can exist without weights (in which case they are None from torch primitives) + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(1.0) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.zero_() + + +def patch_hf_deepseek_v3(): + DeepseekV3DecoderLayer.__init__ = _deepseek_v3_decoder_layer_init_patched + PreTrainedModel._init_weights = _init_weights_patched + PreTrainedModel._initialize_weights = _initialize_weights_patched diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 2e3b3e93f0..64fb64d72f 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ 
b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -16,9 +16,6 @@ from transformers.configuration_utils import PretrainedConfig from transformers.modeling_outputs import CausalLMOutputWithPast -from .hf_llama_patch import patch_hf_llama -patch_hf_llama() - @dataclass class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): """ From 9abdae34fdb4a2e6a22a654f78babf3163df725e Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 22 Sep 2025 11:19:50 +0000 Subject: [PATCH 032/129] deepseek now has same weight init in HF & TT. Reasons was rng_state was not same as we call weight init at different time --- .../transformers_backend/compare_tt_hf_run.sh | 7 +- .../model/hf_deepseek_v3_patch.py | 36 +++- .../reference_diff_deepseekv3_1gpu.log | 163 ++++++++++++++++++ torchtitan/models/deepseek_v3/__init__.py | 2 +- torchtitan/models/deepseek_v3/model/model.py | 34 ++++ torchtitan/models/moe.py | 34 ++++ 6 files changed, 265 insertions(+), 11 deletions(-) create mode 100644 torchtitan/experiments/transformers_backend/reference_diff_deepseekv3_1gpu.log diff --git a/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh b/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh index e49a2a5803..be7243f81b 100755 --- a/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh +++ b/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh @@ -15,7 +15,8 @@ export LOG_RANK=${LOG_RANK:-0} # Parse command line arguments for model selection MODEL_TYPE=${1:-"llama"} export MODEL_TYPE - +SEED=${SEED:-42} +export SEED # Set model names based on argument case $MODEL_TYPE in "llama") @@ -52,7 +53,7 @@ run_tt() { CUDA_VISIBLE_DEVICES=0 \ torchrun --nproc_per_node=${NGPU} --master_port 1234 --rdzv_backend c10d --rdzv_endpoint="localhost:0" \ --local-ranks-filter ${LOG_RANK} --role rank --tee 3 \ - -m torchtitan.train --job.config_file ${TT_CONFIG} --training.seed 42 --training.deterministic --model.name ${TT_MODEL_NAME} "$@" + -m torchtitan.train --job.config_file ${TT_CONFIG} --training.seed ${SEED} --training.deterministic --model.name ${TT_MODEL_NAME} "$@" } run_hf() { @@ -65,7 +66,7 @@ run_hf() { CUDA_VISIBLE_DEVICES=1 \ torchrun --nproc_per_node=${NGPU} --master_port 1235 --rdzv_backend c10d --rdzv_endpoint="localhost:0" \ --local-ranks-filter ${LOG_RANK} --role rank --tee 3 \ - -m torchtitan.train --job.config_file ${HF_CONFIG} --training.seed 42 --training.deterministic --model.name ${HF_MODEL_NAME} "$@" + -m torchtitan.train --job.config_file ${HF_CONFIG} --training.seed ${SEED} --training.deterministic --model.name ${HF_MODEL_NAME} "$@" } TT_LOG="tt_run.log" diff --git a/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py b/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py index 53769b9cc9..346a400260 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py +++ b/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py @@ -1,6 +1,7 @@ - - +import os +import torch import torch.nn as nn +import functools from transformers.models.deepseek_v3.configuration_deepseek_v3 import DeepseekV3Config from transformers.models.deepseek_v3.modeling_deepseek_v3 import DeepseekV3Attention, DeepseekV3MLP, DeepseekV3MoE, DeepseekV3DecoderLayer @@ -8,6 +9,31 @@ _original_deepseek_v3_decoder_layer_init = DeepseekV3DecoderLayer.__init__ +def seeded_init_decorator_for_test(seed): + """ + Decorator that adds torch.manual_seed before every nn.init.trunc_normal_ call + and 
prints layer weights after initialization. + """ + import lovely_tensors as lt; lt.monkey_patch() + def decorator(func): + @functools.wraps(func) + def wrapper(self, module): + original_trunc_normal = nn.init.trunc_normal_ + + def seeded_trunc_normal(*args, **kwargs): + torch.manual_seed(seed) + tensor = args[0] # First argument is always the tensor + result = original_trunc_normal(*args, **kwargs) + # module_name = getattr(module, "__class__", type(module)).__name__ + # print(f"Module: {module_name}, Tensor value: {tensor}") + return result + + nn.init.trunc_normal_ = seeded_trunc_normal + return func(self, module) + + return wrapper + return decorator + def _deepseek_v3_decoder_layer_init_patched(self, config: DeepseekV3Config, layer_idx: int): _original_deepseek_v3_decoder_layer_init(self, config, layer_idx) @@ -33,6 +59,7 @@ def _initialize_weights_patched(self, module): self._init_weights(module) module._is_hf_initialized = True +@seeded_init_decorator_for_test(seed=os.environ.get("SEED")) def _init_weights_patched(self, module): """ Patched version of _init_weights to match TorchTitan's initialization for Llama. @@ -46,12 +73,8 @@ def _init_weights_patched(self, module): init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 if isinstance(module, DeepseekV3Attention): - print("DeepseekV3Attention", module.layer_idx) if hasattr(module, 'q_proj'): nn.init.trunc_normal_(module.q_proj.weight, mean=0.0, std=0.02) - # NOTE(3outeille): module.smart_apply is called on parent class, we have 3 child so init will be called 3 times - # That's why we need to set _is_hf_initialized to True to avoid triple initialization - print(f"module.q_proj.weight: {module.q_proj.weight}") else: nn.init.trunc_normal_(module.q_a_proj.weight, mean=0.0, std=0.02) nn.init.trunc_normal_(module.q_b_proj.weight, mean=0.0, std=0.02) @@ -60,7 +83,6 @@ def _init_weights_patched(self, module): nn.init.trunc_normal_(module.kv_b_proj.weight, mean=0.0, std=0.02) nn.init.trunc_normal_(module.o_proj.weight, mean=0.0, std=init_std) - print("=====") elif isinstance(module, DeepseekV3MLP): nn.init.trunc_normal_(module.gate_proj.weight, mean=0.0, std=0.02) diff --git a/torchtitan/experiments/transformers_backend/reference_diff_deepseekv3_1gpu.log b/torchtitan/experiments/transformers_backend/reference_diff_deepseekv3_1gpu.log new file mode 100644 index 0000000000..1155c9a5db --- /dev/null +++ b/torchtitan/experiments/transformers_backend/reference_diff_deepseekv3_1gpu.log @@ -0,0 +1,163 @@ +diff --git a/tt_run.log.filtered b/hf_run.log.filtered +index 9726db6..84b6138 100644 +--- a/tt_run.log.filtered ++++ b/hf_run.log.filtered +@@ -1,85 +1,153 @@ ++ echo '##############################################' +##############################################'#######################################################' +####################################################### ++ echo '### Running TorchTitan (native)with HF backend training ###' +### Running TorchTitan (native)with HF backend training ### ++ echo '##############################################' +##############################################'#######################################################' +####################################################### ++ TT_CONFIG=/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.tomlHF_CONFIG=/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml ++ CUDA_VISIBLE_DEVICES=0CUDA_VISIBLE_DEVICES=1 ++ torchrun ... 
--master_port=XXXX --rdzv_backend c10d --rdzv_endpoint=localhost:XXXX --local-ranks-filter 0 --role rank --tee 3 -m torchtitan.train --job.config_file /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.toml/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml --training.seed 42 --training.deterministic --model.name deepseek_v3deepseek-ai/DeepSeek-V3 +[rank0]:/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/transformers/src/transformers/utils/hub.py:111: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. +[rank0]: warnings.warn( +[rank0]:[titan] TIMESTAMP - root - WARNING - tokenizer_path is deprecated, use model.hf_assets_path instead. Setting hf_assets_path to tokenizer_path temporarily. +[rank0]:[titan] TIMESTAMP - root - INFO - Starting job: HF Llama 3 debug training +[rank0]:[titan] TIMESTAMP - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config +[rank0]:[titan] TIMESTAMP - root - INFO - Building 0-D device mesh with [], [] +[rank0]:[titan] TIMESTAMP - root - INFO - [GC] Initial GC collection 0.00 seconds +[rank0]:[titan] TIMESTAMP - root - INFO - Deterministic algorithm enabled (expect perf degradation). +[rank0]:[titan] TIMESTAMP - root - INFO - Loading tokenizer from tokenizer.json +[rank0]:[titan] TIMESTAMP - root - INFO - Preparing c4_test dataset from /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test +[rank0]:[titan] TIMESTAMP - root - INFO - Building deepseek_v3deepseek-ai/DeepSeek-V3 debugmodel with DeepSeekV3ModelArgs(_enforced='This field is used to enforce all fields have defaults.', max_batch_size=8, max_seq_len=2048, vocab_size=2000, dim=256, inter_dim=1024, moe_inter_dim=256, n_layers=2, n_dense_layers=1, n_heads=16, norm_eps=1e-05, moe_args=MoEArgs(num_experts=8,HFTransformerModelArgs( +[rank0]:attn_implementation='sdpa' +[rank0]:attn_mask_type='causal' +[rank0]:beta_fast=None +[rank0]:beta_slow=None +[rank0]:depth_init=True +[rank0]:dim=256 +[rank0]:eos_id=0 +[rank0]:ffn_dim_multiplier=None +[rank0]:inter_dim=1024 +[rank0]:kv_lora_rank=512 +[rank0]:max_seq_len=2048 +[rank0]:moe_args=MoEArgs(num_experts=8, num_shared_experts=2, score_func='softmax', route_norm=True, route_scale=1.0, score_before_experts=False, top_k=3, use_grouped_mm=True, load_balance_coeff=0.001), n_expert_groups=1, n_limited_groups=1, q_lora_rank=0, kv_lora_rank=512, qk_nope_head_dim=128, qk_rope_head_dim=64, v_head_dim=128, use_flex_attn=False, attn_mask_type='causal', original_seq_len=4096, rope_theta=10000.0, rope_factor=40, beta_fast=32, beta_slow=1, mscale=0.7)load_balance_coeff=0.001) +[rank0]:moe_inter_dim=256 +[rank0]:moe_intermediate_size=256 +[rank0]:mscale=0.7 +[rank0]:multiple_of=256 +[rank0]:n_dense_layers=1 +[rank0]:n_expert_groups=None +[rank0]:n_group=2 +[rank0]:n_heads=16 +[rank0]:n_kv_heads=16 +[rank0]:n_layers=2 +[rank0]:n_limited_groups=None +[rank0]:n_routed_experts=8 +[rank0]:n_shared_experts=2 +[rank0]:norm_eps=1e-05 +[rank0]:num_experts_per_tok=3 +[rank0]:original_seq_len=None +[rank0]:partial_rotary_factor=4.0 +[rank0]:q_lora_rank=None +[rank0]:qk_nope_head_dim=128 +[rank0]:qk_rope_head_dim=64 +[rank0]:rope_factor=None +[rank0]:rope_theta=10000 +[rank0]:topk_group=1 +[rank0]:use_flex_attn=False +[rank0]:v_head_dim=128 +[rank0]:vocab_size=2000 +[rank0]:) +[rank0]:[titan] TIMESTAMP - root - 
INFO - CUDA capacity: NVIDIA H100 80GB HBM3 with 79.44GiB memory +[rank0]:[titan] TIMESTAMP - root - INFO - Total parameter count: dense 8,923,392, sparse 1,968,128, active 9,908,480 +[rank0]:[titan] TIMESTAMP - root - INFO - Model Structure Parameter Breakdown: +[rank0]:[titan] TIMESTAMP - root - INFO - DeepSeekV3ModelHFTransformerModel - 10,891,520 params +[rank0]:[titan] TIMESTAMP - root - INFO - (tok_embeddings):(embed_tokens): Embedding - 512,000 params +[rank0]:[titan] TIMESTAMP - root - INFO - (layers): ModuleDictModuleList - 9,867,264 params +[rank0]:[titan] TIMESTAMP - root - INFO - (0): TransformerBlockDeepseekV3DecoderLayer - 4,342,784 params +[rank0]:[titan] TIMESTAMP - root - INFO - (attention): Attention(self_attn): DeepseekV3Attention - 3,555,840 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wq):(q_proj): Linear - 786,432 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wkv_a):(kv_a_proj_with_mqa): Linear - 147,456 params +[rank0]:[titan] TIMESTAMP - root - INFO - (kv_norm): RMSNorm(kv_a_layernorm): DeepseekV3RMSNorm - 512 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wkv_b):(kv_b_proj): Linear - 2,097,152 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wo):(o_proj): Linear - 524,288 params +[rank0]:[titan] TIMESTAMP - root - INFO - (attention_norm): RMSNorm(mlp): DeepseekV3MLP - 256786,432 params +[rank0]:[titan] TIMESTAMP - root - INFO - (ffn_norm): RMSNorm(gate_proj): Linear - 256262,144 params +[rank0]:[titan] TIMESTAMP - root - INFO - (feed_forward): FeedForward(up_proj): Linear - 786,432262,144 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w1):(down_proj): Linear - 262,144 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w2): Linear(input_layernorm): DeepseekV3RMSNorm - 262,144256 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w3): Linear(post_attention_layernorm): DeepseekV3RMSNorm - 262,144256 params +[rank0]:[titan] TIMESTAMP - root - INFO - (1): TransformerBlockDeepseekV3DecoderLayer - 5,524,480 params +[rank0]:[titan] TIMESTAMP - root - INFO - (attention): Attention(self_attn): DeepseekV3Attention - 3,555,840 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wq):(q_proj): Linear - 786,432 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wkv_a):(kv_a_proj_with_mqa): Linear - 147,456 params +[rank0]:[titan] TIMESTAMP - root - INFO - (kv_norm): RMSNorm(kv_a_layernorm): DeepseekV3RMSNorm - 512 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wkv_b):(kv_b_proj): Linear - 2,097,152 params +[rank0]:[titan] TIMESTAMP - root - INFO - (wo):(o_proj): Linear - 524,288 params +[rank0]:[titan] TIMESTAMP - root - INFO - (attention_norm): RMSNorm(mlp): DeepseekV3MoE - 2561,968,128 params +[rank0]:[titan] TIMESTAMP - root - INFO - (ffn_norm): RMSNorm(experts): ModuleList - 2561,572,864 params +[rank0]:[titan] TIMESTAMP - root - INFO - (moe): MoE(0): DeepseekV3MLP - 1,968,128196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (experts): GroupedExperts(gate_proj): Linear - 1,572,86465,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (router): TokenChoiceTopKRouter(up_proj): Linear - 2,04865,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (gate):(down_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (1): DeepseekV3MLP - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (gate_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (up_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (down_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (2): 
DeepseekV3MLP - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (gate_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (up_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (down_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (3): DeepseekV3MLP - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (gate_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (up_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (down_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (4): DeepseekV3MLP - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (gate_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (up_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (down_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (5): DeepseekV3MLP - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (gate_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (up_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (down_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (6): DeepseekV3MLP - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (gate_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (up_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (down_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (7): DeepseekV3MLP - 196,608 params +[rank0]:[titan] TIMESTAMP - root - INFO - (gate_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (up_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (down_proj): Linear - 65,536 params +[rank0]:[titan] TIMESTAMP - root - INFO - (gate): DeepseekV3TopkRouter - 2,048 params +[rank0]:[titan] TIMESTAMP - root - INFO - (shared_experts): FeedForwardDeepseekV3MLP - 393,216 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w1):(gate_proj): Linear - 131,072 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w2):(up_proj): Linear - 131,072 params +[rank0]:[titan] TIMESTAMP - root - INFO - (w3):(down_proj): Linear - 131,072 params +[rank0]:[titan] TIMESTAMP - root - INFO - (input_layernorm): DeepseekV3RMSNorm - 256 params +[rank0]:[titan] TIMESTAMP - root - INFO - (post_attention_layernorm): DeepseekV3RMSNorm - 256 params +[rank0]:[titan] TIMESTAMP - root - INFO - (norm): RMSNormDeepseekV3RMSNorm - 256 params +[rank0]:[titan] TIMESTAMP - root - INFO - (output):(lm_head): Linear - 512,000 params +[rank0]:[titan] TIMESTAMP - root - INFO - Model deepseek_v3deepseek-ai/DeepSeek-V3 debugmodel size: 10,891,520 total parameters +[rank0]:[titan] TIMESTAMP - root - INFO - Applied selective activation checkpointing to the model +[rank0]:[titan] TIMESTAMP - root - INFO - Peak FLOPS used for computing MFU: 9.890e+14 +[rank0]:[titan] TIMESTAMP - root - INFO - CUDA memory usage for model: 0.05GiB(0.06%) +[rank0]:[titan] TIMESTAMP - root - INFO - Mixed precision training is handled by AMP +[rank0]:[titan] TIMESTAMP - root - INFO - Trainer is initialized with local batch size 8, global batch size 8, gradient accumulation steps 1, sequence length 2048, total steps 10 (warmup 2) +[rank0]:[titan] TIMESTAMP - root - INFO - Training starts at step 1 +[rank0]:[titan] TIMESTAMP - root - INFO - Profiling active. 
Traces will be saved at ./outputs/profile_trace +[rank0]:/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/env_torchtitan_official/lib/python3.12/site-packages/torch/nn/functional.py:2920: UserWarning: Mismatch dtype between input and weight: input dtype = c10::BFloat16, weight dtype = float, Cannot dispatch to fused implementation. (Triggered internally at /pytorch/aten/src/ATen/native/layer_norm.cpp:344.) +[rank0]: return torch.rms_norm(input, normalized_shape, weight, eps)./outputs/profile_trace_hf +[rank0]:[titan] TIMESTAMP - root - INFO - step: 1 loss: 8.13818.1218 grad_norm: 2.73742.7807 memory: 2.14GiB(2.70%)2.48GiB(3.13%) tps: 18,02411,445 tflops: 1.240.89 mfu: 0.13%0.09% +[rank0]:[titan] TIMESTAMP - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40 +[rank0]:[titan] TIMESTAMP - root - INFO - step: 2 loss: 7.02086.8905 grad_norm: 3.26153.2709 memory: 2.15GiB(2.71%)2.49GiB(3.13%) tps: 20,23217,755 tflops: 1.401.38 mfu: 0.14% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 3 loss: 5.26425.1682 grad_norm: 2.87352.8229 memory: 2.15GiB(2.71%)2.49GiB(3.13%) tps: 325,066119,606 tflops: 22.429.32 mfu: 2.27%0.94% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 4 loss: 4.82864.7719 grad_norm: 2.18852.2433 memory: 2.15GiB(2.71%)2.51GiB(3.15%) tps: 345,536135,937 tflops: 23.8310.59 mfu: 2.41%1.07% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 5 loss: 4.43704.3827 grad_norm: 2.30532.3779 memory: 2.15GiB(2.71%)2.51GiB(3.15%) tps: 296,009133,266 tflops: 20.4110.39 mfu: 2.06%1.05% +[rank0]:[titan] TIMESTAMP - root - INFO - Dumping profiler traces at step 5 +[rank0]:[titan] TIMESTAMP - root - INFO - Finished dumping profiler traces in 0.030.05 seconds +[rank0]:[titan] TIMESTAMP - root - INFO - step: 6 loss: 4.30634.2368 grad_norm: 2.24452.2557 memory: 2.15GiB(2.71%)2.71GiB(3.41%) tps: 136,06566,465 tflops: 9.385.18 mfu: 0.95%0.52% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 7 loss: 4.12534.0403 grad_norm: 1.96261.9132 memory: 2.15GiB(2.71%)2.71GiB(3.41%) tps: 299,863131,077 tflops: 20.6810.22 mfu: 2.09%1.03% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 8 loss: 4.06453.9796 grad_norm: 1.82991.8154 memory: 2.15GiB(2.71%)2.71GiB(3.41%) tps: 343,855147,955 tflops: 23.7111.53 mfu: 2.40%1.17% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 9 loss: 4.47584.4010 grad_norm: 1.47431.4965 memory: 2.15GiB(2.71%)2.71GiB(3.41%) tps: 346,707139,416 tflops: 23.9110.87 mfu: 2.42%1.10% +[rank0]:[titan] TIMESTAMP - root - INFO - step: 10 loss: 3.94833.8448 grad_norm: 1.62401.6185 memory: 2.15GiB(2.71%)2.71GiB(3.41%) tps: 303,029139,581 tflops: 20.9010.88 mfu: 2.11%1.10% +[rank0]:[titan] TIMESTAMP - root - INFO - Dumping profiler traces at step 10 +[rank0]:[titan] TIMESTAMP - root - INFO - Finished dumping profiler traces in 0.020.04 seconds +[rank0]:[titan] TIMESTAMP - root - INFO - Sleeping 2 seconds for other ranks to complete +[rank0]:[titan] TIMESTAMP - root - INFO - Training completed +[rank0]:[titan] TIMESTAMP - root - INFO - Process group destroyed diff --git a/torchtitan/models/deepseek_v3/__init__.py b/torchtitan/models/deepseek_v3/__init__.py index 1c3d2b19d2..3322ad0a83 100644 --- a/torchtitan/models/deepseek_v3/__init__.py +++ b/torchtitan/models/deepseek_v3/__init__.py @@ -35,7 +35,7 @@ dim=256, inter_dim=1024, moe_inter_dim=256, - n_layers=3, + n_layers=2, n_dense_layers=1, n_heads=16, moe_args=MoEArgs( diff --git a/torchtitan/models/deepseek_v3/model/model.py b/torchtitan/models/deepseek_v3/model/model.py index e2c4bbeda9..5547840e27 100644 --- 
a/torchtitan/models/deepseek_v3/model/model.py +++ b/torchtitan/models/deepseek_v3/model/model.py @@ -5,6 +5,8 @@ # LICENSE file in the root directory of this source tree. import math +import os +import functools from typing import Tuple import torch @@ -17,6 +19,35 @@ from .args import DeepSeekV3ModelArgs +def seeded_init_decorator_for_test(seed): + """ + Decorator that adds torch.manual_seed before every nn.init.trunc_normal_ call + and prints layer weights after initialization. + """ + import lovely_tensors as lt; lt.monkey_patch() + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + original_trunc_normal = nn.init.trunc_normal_ + + def seeded_trunc_normal(*trunc_args, **trunc_kwargs): + torch.manual_seed(seed) + tensor = trunc_args[0] # First argument is always the tensor + result = original_trunc_normal(*trunc_args, **trunc_kwargs) + # # Try to get module info from the calling context + # module_name = "Unknown" + # if len(args) > 0 and hasattr(args[0], "__class__"): + # module_name = args[0].__class__.__name__ + # print(f"Module: {module_name}, Tensor value: {tensor}") + return result + + nn.init.trunc_normal_ = seeded_trunc_normal + return func(*args, **kwargs) + + return wrapper + return decorator + + # Adapted from https://github.com/DeepSeek-ai/DeepSeek-V3/blob/main/inference/model.py#L294 def precompute_freqs_cis(args: DeepSeekV3ModelArgs) -> torch.Tensor: """ @@ -240,6 +271,7 @@ def forward( output = output.view(bsz, seqlen, -1) # (bsz, seqlen, n_heads * v_head_dim) return self.wo(output) # (bsz, seqlen, dim) + @seeded_init_decorator_for_test(seed=os.environ.get("SEED")) def init_weights(self, init_std: float): linear_list = [ self.wkv_a, @@ -302,6 +334,7 @@ def forward(self, x: torch.Tensor, freqs_cis: torch.Tensor): x = x + self.feed_forward(self.ffn_norm(x)) return x + @seeded_init_decorator_for_test(seed=os.environ.get("SEED")) def init_weights(self, buffer_device: torch.device): for norm in (self.attention_norm, self.ffn_norm): norm.reset_parameters() @@ -339,6 +372,7 @@ def __init__(self, model_args: DeepSeekV3ModelArgs): self.model_args = model_args self.init_weights() + @seeded_init_decorator_for_test(seed=os.environ.get("SEED")) def init_weights(self, buffer_device: torch.device | None = None) -> None: buffer_device = buffer_device or self.freqs_cis.device with torch.device(buffer_device): diff --git a/torchtitan/models/moe.py b/torchtitan/models/moe.py index 8be14ecbf0..5ba63b9157 100644 --- a/torchtitan/models/moe.py +++ b/torchtitan/models/moe.py @@ -12,8 +12,38 @@ from torch import nn from torchtitan.distributed.expert_parallel import expert_parallel +import os +import functools +def seeded_init_decorator_for_test(seed): + """ + Decorator that adds torch.manual_seed before every nn.init.trunc_normal_ call + and prints layer weights after initialization. 
+ """ + import lovely_tensors as lt; lt.monkey_patch() + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + original_trunc_normal = nn.init.trunc_normal_ + + def seeded_trunc_normal(*trunc_args, **trunc_kwargs): + torch.manual_seed(seed) + tensor = trunc_args[0] # First argument is always the tensor + result = original_trunc_normal(*trunc_args, **trunc_kwargs) + # # Try to get module info from the calling context + # module_name = "Unknown" + # if len(args) > 0 and hasattr(args[0], "__class__"): + # module_name = args[0].__class__.__name__ + # print(f"Module: {module_name}, Tensor value: {tensor}") + return result + + nn.init.trunc_normal_ = seeded_trunc_normal + return func(*args, **kwargs) + + return wrapper + return decorator + @dataclass class MoEArgs: num_experts: int = 8 @@ -57,6 +87,7 @@ def __init__( def forward(self, x: torch.Tensor) -> torch.Tensor: return self.w2(F.silu(self.w1(x)) * self.w3(x)) + @seeded_init_decorator_for_test(seed=os.environ.get("SEED")) def init_weights(self, init_std: float = 0.02): nn.init.trunc_normal_(self.w1.weight, mean=0.0, std=0.02) for linear in (self.w2, self.w3): @@ -153,6 +184,7 @@ def forward( self.w1, self.w2, self.w3, x, num_tokens_per_expert ) + @seeded_init_decorator_for_test(seed=os.environ.get("SEED")) def init_weights(self, init_std: float): nn.init.trunc_normal_(self.w1, mean=0.0, std=0.02) nn.init.trunc_normal_(self.w2, mean=0.0, std=init_std) @@ -246,6 +278,7 @@ def forward( return top_scores, selected_experts_indices, num_tokens_per_expert + @seeded_init_decorator_for_test(seed=os.environ.get("SEED")) def init_weights(self, init_std: float): nn.init.trunc_normal_(self.gate.weight, mean=0.0, std=init_std) @@ -435,6 +468,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: out = out.reshape(bs, slen, dim) return out + @seeded_init_decorator_for_test(seed=os.environ.get("SEED")) def init_weights( self, init_std: float, From f9e90bc03aba179edb1cb6c488ac2b0e9a002de4 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 22 Sep 2025 11:41:27 +0000 Subject: [PATCH 033/129] adapt mfu to handle moe --- .../model/hf_transformers_args.py | 89 ++++++++++++++----- 1 file changed, 69 insertions(+), 20 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 64fb64d72f..704f83a534 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -170,27 +170,76 @@ def update_from_config(self, job_config: JobConfig): return self def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: - #TODO(3outeille): adapt to handle MoE - nparams = sum(p.numel() for p in model.parameters()) - nparams_embedding = sum( - sum(p.numel() for p in m.parameters()) - for m in model.children() - if isinstance(m, nn.Embedding) - ) + # Check if this is a MoE model by looking for MoE attributes + is_moe = hasattr(self, 'n_routed_experts') and hasattr(self, 'num_experts_per_tok') + + if is_moe: + # MoE parameter counting (adapted from DeepSeek V3 implementation) + nparams_embedding = 0 + nparams_moe_router = 0 + nparams_shared_experts = 0 + nparams_experts = 0 + nparams_dense = 0 - l, h, q, t = ( - self.n_layers, - self.n_heads, - self.dim // self.n_heads, - seq_len, - ) - # Reasoning behind the factor of 12 for the self-attention part of the formula: - # 1. 
each self-attention has 2 matmul in the forward and 4 in the backward (6) - # 2. the flash attention does 1 more matmul recomputation in the backward - # but recomputation should not be counted in calculating MFU (+0) - # 3. each matmul performs 1 multiplication and 1 addition (*2) - # 4. we follow the convention and do not account for sparsity in causal attention - num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t + for name, p in model.named_parameters(): + if "embedding" in name: + nparams_embedding += p.numel() + nparams_dense += p.numel() + elif "moe.shared_experts" in name: + nparams_shared_experts += p.numel() + elif "moe.router" in name: + nparams_moe_router += p.numel() + elif "moe.experts" in name: + nparams_experts += p.numel() + else: + nparams_dense += p.numel() + + nparams_sparse = nparams_moe_router + nparams_shared_experts + nparams_experts + nparams = nparams_dense + nparams_sparse + nparams_sparse_active = ( + nparams_moe_router + + nparams_shared_experts + + nparams_experts * self.num_experts_per_tok // self.n_routed_experts + ) + + logger.info( + f"Total parameter count: dense {nparams_dense:,}, " + f"sparse {nparams_sparse:,}, active {nparams_dense + nparams_sparse_active:,}" + ) + + l, h, q, t = ( + self.n_layers, + self.n_heads, + self.dim // self.n_heads, + seq_len, + ) + # Use active parameters for FLOPS calculation in MoE + num_flops_per_token = ( + 6 * (nparams_dense - nparams_embedding + nparams_sparse_active) + + 12 * l * h * q * t + ) + else: + # Dense model parameter counting (original implementation) + nparams = sum(p.numel() for p in model.parameters()) + nparams_embedding = sum( + sum(p.numel() for p in m.parameters()) + for m in model.children() + if isinstance(m, nn.Embedding) + ) + + l, h, q, t = ( + self.n_layers, + self.n_heads, + self.dim // self.n_heads, + seq_len, + ) + # Reasoning behind the factor of 12 for the self-attention part of the formula: + # 1. each self-attention has 2 matmul in the forward and 4 in the backward (6) + # 2. the flash attention does 1 more matmul recomputation in the backward + # but recomputation should not be counted in calculating MFU (+0) + # 3. each matmul performs 1 multiplication and 1 addition (*2) + # 4. we follow the convention and do not account for sparsity in causal attention + num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t return nparams, num_flops_per_token From ba5d6d1e1d2aa7b1168efac27f0d0859db7e4976 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 23 Sep 2025 12:06:18 +0000 Subject: [PATCH 034/129] beginning parallelism by setting tests --- .../compare_distributed_run.py | 564 ++++++++++++++++++ .../compare_distributed_run.sh | 6 + .../transformers_backend/compare_tt_hf_run.sh | 5 - 3 files changed, 570 insertions(+), 5 deletions(-) create mode 100644 torchtitan/experiments/transformers_backend/compare_distributed_run.py create mode 100755 torchtitan/experiments/transformers_backend/compare_distributed_run.sh diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.py b/torchtitan/experiments/transformers_backend/compare_distributed_run.py new file mode 100644 index 0000000000..08e8057c90 --- /dev/null +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.py @@ -0,0 +1,564 @@ +#!/usr/bin/env python3 +""" +compare_distributed_run.py - Test different parallelism configurations against baseline +Based on TorchTitan convergence guidelines + +Copyright (c) Meta Platforms, Inc. and affiliates. 
+All rights reserved. + +This source code is licensed under the BSD-style license found in the +LICENSE file in the root directory of this source tree. +""" + +import argparse +import os +import re +import shutil +import subprocess +import sys +from pathlib import Path +from typing import Dict, List, Tuple, Optional, NamedTuple +import tempfile +import json +from dataclasses import dataclass +from enum import Enum +import logging + +# Configure logging with colors +class Colors: + RED = '\033[0;31m' + GREEN = '\033[0;32m' + YELLOW = '\033[1;33m' + BLUE = '\033[0;34m' + MAGENTA = '\033[0;35m' + CYAN = '\033[0;36m' + NC = '\033[0m' # No Color + +class LogLevel(Enum): + INFO = "INFO" + SUCCESS = "SUCCESS" + WARNING = "WARNING" + ERROR = "ERROR" + TEST_PASS = "TEST_PASS" + TEST_FAIL = "TEST_FAIL" + +def log_message(level: LogLevel, message: str) -> None: + """Log a message with appropriate color coding.""" + color_map = { + LogLevel.INFO: Colors.BLUE, + LogLevel.SUCCESS: Colors.GREEN, + LogLevel.WARNING: Colors.YELLOW, + LogLevel.ERROR: Colors.RED, + LogLevel.TEST_PASS: Colors.GREEN, + LogLevel.TEST_FAIL: Colors.RED, + } + + prefix_map = { + LogLevel.INFO: "[INFO]", + LogLevel.SUCCESS: "[SUCCESS]", + LogLevel.WARNING: "[WARNING]", + LogLevel.ERROR: "[ERROR]", + LogLevel.TEST_PASS: "✅ TEST PASS", + LogLevel.TEST_FAIL: "❌ TEST FAIL", + } + + color = color_map[level] + prefix = prefix_map[level] + print(f"{color}{prefix}{Colors.NC} {message}") + +@dataclass +class ParallelismConfig: + """Configuration for a parallelism setup.""" + name: str + dp_replicate: int + dp_shard: int + tp: int + pp: int + pp_schedule: str + cp: int + ep: int + eptp: int + +@dataclass +class TrainingMetrics: + """Training metrics extracted from logs.""" + loss: Optional[float] = None + grad_norm: Optional[float] = None + +class CompareDistributedRun: + """Main class for running distributed parallelism comparison tests.""" + + # Default values + DEFAULT_THRESHOLD_LOSS = 1e-4 + DEFAULT_THRESHOLD_GRAD_NORM = 1e-3 + DEFAULT_STEPS = 10 + DEFAULT_SEED = 42 + DEFAULT_FLAVOR = "debugmodel" + + # HF Model lists - extendable for different model families + HF_MODEL_LISTS = { + "llama": "meta-llama/Llama-3.2-1B", + "deepseek": "deepseek-ai/DeepSeek-V3", + } + + # Available flavors per model type + MODEL_FLAVORS = { + "llama": ["debugmodel", "medium", "full"], + "deepseek": ["debugmodel"], + } + + # Available ND parallelisms <-> number of GPUs + ND_PARALLEL_TO_NB_GPUS = { + "1d": 2, + "2d": 4, + "3d": 8, + "4d": 16, + } + + def __init__(self): + self.script_dir = Path(__file__).parent.absolute() + self.torchtitan_root = self.script_dir.parent.parent + self.results_dir = self.script_dir / "comparison_results" + self.config_dir = self.script_dir / "generated_configs" + + # Configuration parameters + self.loss_threshold = self.DEFAULT_THRESHOLD_LOSS + self.grad_norm_threshold = self.DEFAULT_THRESHOLD_GRAD_NORM + self.nd_parallel_to_nb_gpus = self.ND_PARALLEL_TO_NB_GPUS + self.steps = self.DEFAULT_STEPS + self.seed = self.DEFAULT_SEED + self.model_filter = "" + self.flavor = self.DEFAULT_FLAVOR + self.verbose = False + self.parallelism_configs: List[ParallelismConfig] = [] + + def generate_parallelism_configs(self) -> None: + """Generate parallelism configurations based on the number of GPUs.""" + ngpu = self.nd_parallel_to_nb_gpus[self.nd_parallel] + configs = [] + + def _get_factors(n: int) -> List[int]: + factors = set() + for i in range(1, int(n**0.5) + 1): + if n % i == 0: + factors.add(i) + factors.add(n // i) + return 
sorted(list(factors)) + + # Baseline FSDP + configs.append(ParallelismConfig(name="fsdp", dp_replicate=1, dp_shard=ngpu, tp=1, pp=1, pp_schedule="Interleaved1F1B", cp=1, ep=1, eptp=1)) + + possible_tp = _get_factors(ngpu) + possible_pp = _get_factors(ngpu) + possible_ep = _get_factors(ngpu) + #TODO(3outeille): is CP borrowing degree from DP ? + #TODO(3outeille): is EP borrowing degree from DP ? + + # Is that correct ? + for tp in possible_tp: + for pp in possible_pp: + for ep in possible_ep: + if tp * pp * ep > ngpu: + continue + + if ngpu % (tp * pp * ep) == 0: + dp = ngpu // (tp * pp * ep) + if dp > 0 and (tp > 1 or pp > 1 or ep > 1 or dp > 1): + # DDP style + if dp > 1: + configs.append( + ParallelismConfig( + name=f"tp{tp}_pp{pp}_ep{ep}_ddp{dp}", + dp_replicate=dp, + dp_shard=1, + tp=tp, + pp=pp, + pp_schedule="Interleaved1F1B", + cp=1, + ep=ep, + eptp=1 + ) + ) + # FSDP with other parallelisms + if tp > 1 or pp > 1 or ep > 1: + configs.append( + ParallelismConfig( + name=f"tp{tp}_pp{pp}_ep{ep}_fsdp", + dp_replicate=1, + dp_shard=-1, + tp=tp, + pp=pp, + pp_schedule="Interleaved1F1B", + cp=1, + ep=ep, + eptp=1 + ) + ) + + # HSDP requires a DP degree that can be split + for dp in _get_factors(ngpu): + if dp > 1: + dp_factors = _get_factors(dp) + for replicate in dp_factors: + if replicate > 1: + shard = dp // replicate + if shard > 1: + configs.append( + ParallelismConfig( + name=f"hsdp_r{replicate}_s{shard}", + dp_replicate=replicate, + dp_shard=shard, + tp=1, + pp=1, + pp_schedule="Interleaved1F1B", + cp=1, + ep=1, + eptp=1 + ) + ) + + # Remove duplicates and assign to instance + unique_configs = [] + seen_configs = set() + for config in configs: + # Create a tuple of the config values to check for duplicates + config_tuple = (config.dp_replicate, config.dp_shard, config.tp, config.pp, config.ep) + if config_tuple not in seen_configs: + unique_configs.append(config) + seen_configs.add(config_tuple) + + self.parallelism_configs = unique_configs + + log_message(LogLevel.INFO, f"Generated {len(self.parallelism_configs)} parallelism configurations for {ngpu} GPUs.") + if self.verbose: + for config in self.parallelism_configs: + log_message(LogLevel.INFO, f" - {config.name}: dp_replicate={config.dp_replicate}, dp_shard={config.dp_shard}, tp={config.tp}, pp={config.pp}, ep={config.ep}") + def generate_config(self, config: ParallelismConfig, model_name: str, model_type: str) -> Path: + """Generate configuration file for a parallelism setup.""" + config_file = self.config_dir / f"{config.name}_{model_type}_{self.flavor}_{self.nd_parallel_to_nb_gpus[self.nd_parallel]}gpu.toml" + + #TODO(3outeille): create template instead + if model_type == "llama": + base_config = self.script_dir / "configs" / "debug_1_gpu_tt.toml" + else: + base_config = self.script_dir / "configs" / "debug_1_gpu_hf.toml" + + shutil.copy2(base_config, config_file) + + with open(config_file, 'r') as f: + content = f.read() + + # Update model name if it's HF backend + if model_type != "llama": + content = re.sub(r'name = ".*"', f'name = "{model_name}"', content) + + # Update model flavor + content = re.sub(r'flavor = ".*"', f'flavor = "{self.flavor}"', content) + + # Validate flavor for model type + if model_type in self.MODEL_FLAVORS: + if self.flavor not in self.MODEL_FLAVORS[model_type]: + log_message(LogLevel.WARNING, + f"Flavor '{self.flavor}' not available for {model_type}. 
" + f"Available: {self.MODEL_FLAVORS[model_type]}") + + # Update training steps and seed + content = re.sub(r'steps = .*', f'steps = {self.steps}', content) + if 'seed = ' in content: + content = re.sub(r'seed = .*', f'seed = {self.seed}', content) + else: + content = re.sub(r'(steps = .*)', f'\\1\nseed = {self.seed}', content) + + #TODO(3outeille): is this correct ? + # Ensure deterministic training + if 'deterministic = true' not in content: + content = re.sub(r'(seed = .*)', '\\1\ndeterministic = true', content) + + # Update parallelism configuration + content = re.sub(r'data_parallel_replicate_degree = .*', + f'data_parallel_replicate_degree = {config.dp_replicate}', content) + content = re.sub(r'data_parallel_shard_degree = .*', + f'data_parallel_shard_degree = {config.dp_shard}', content) + content = re.sub(r'tensor_parallel_degree = .*', + f'tensor_parallel_degree = {config.tp}', content) + content = re.sub(r'pipeline_parallel_degree = .*', + f'pipeline_parallel_degree = {config.pp}', content) + content = re.sub(r'pipeline_parallel_schedule = .*', + f'pipeline_parallel_schedule = "{config.pp_schedule}"', content) + content = re.sub(r'context_parallel_degree = .*', + f'context_parallel_degree = {config.cp}', content) + content = re.sub(r'expert_parallel_degree = .*', + f'expert_parallel_degree = {config.ep}', content) + + content = re.sub(r'expert_tensor_parallel_degree = .*', + f'expert_tensor_parallel_degree = {config.eptp}', content) + + # Write modified config + with open(config_file, 'w') as f: + f.write(content) + + log_message(LogLevel.INFO, f"Created config file: {config_file} for config '{config.name}' (model: {model_name}, type: {model_type})") + return config_file + + def extract_metrics(self, log_file: Path) -> TrainingMetrics: + """Extract metrics from log file.""" + metrics = TrainingMetrics() + + try: + with open(log_file, 'r') as f: + content = f.read() + + # Extract final loss and grad_norm from the last step + loss_matches = re.findall(r'loss:\s*([0-9]+\.?[0-9]*)', content) + grad_norm_matches = re.findall(r'grad_norm:\s*([0-9]+\.?[0-9]*)', content) + + if loss_matches: + metrics.loss = float(loss_matches[-1]) + if grad_norm_matches: + metrics.grad_norm = float(grad_norm_matches[-1]) + + except Exception as e: + log_message(LogLevel.WARNING, f"Could not extract metrics from {log_file}: {e}") + + if metrics.loss is None or metrics.grad_norm is None: + log_message(LogLevel.WARNING, f"Could not extract metrics from {log_file}") + + return metrics + + def compare_metrics(self, baseline_metrics: TrainingMetrics, test_metrics: TrainingMetrics, + config_name: str) -> bool: + """Compare metrics between baseline and test configuration.""" + if (baseline_metrics.loss is None or baseline_metrics.grad_norm is None or + test_metrics.loss is None or test_metrics.grad_norm is None): + log_message(LogLevel.TEST_FAIL, f"{config_name} - Unable to extract metrics") + return False + + # Calculate absolute differences + loss_diff = abs(baseline_metrics.loss - test_metrics.loss) + grad_norm_diff = abs(baseline_metrics.grad_norm - test_metrics.grad_norm) + + # Check if differences are within thresholds + loss_pass = loss_diff < self.loss_threshold + grad_pass = grad_norm_diff < self.grad_norm_threshold + + if loss_pass and grad_pass: + log_message(LogLevel.TEST_PASS, + f"{config_name} - Loss diff: {loss_diff:.2e} (< {self.loss_threshold:.2e}), " + f"Grad norm diff: {grad_norm_diff:.2e} (< {self.grad_norm_threshold:.2e})") + return True + else: + log_message(LogLevel.TEST_FAIL, + 
f"{config_name} - Loss diff: {loss_diff:.2e} (threshold: {self.loss_threshold:.2e}), " + f"Grad norm diff: {grad_norm_diff:.2e} (threshold: {self.grad_norm_threshold:.2e})") + return False + + def generate_diff(self, baseline_log: Path, log_path: Path, diff_file: Path) -> None: + """Generate diff between baseline and test logs.""" + + def _filter_log(log_file: Path) -> Path: + """Filter log file to normalize volatile information.""" + filtered_file = log_file.with_suffix(log_file.suffix + '.filtered') + + with open(log_file, 'r') as infile, open(filtered_file, 'w') as outfile: + for line in infile: + # Apply filtering patterns + line = re.sub(r'([0-9]{4}-[0-9]{2}-[0-9]{2} )?[0-9]{2}:[0-9]{2}:[0-9]{2}(,[0-9]+)?', + 'TIMESTAMP', line) + line = re.sub(r'torchrun.*--master_port[= ]([0-9]+)', + 'torchrun ... --master_port=XXXX', line) + line = re.sub(r'PID [0-9]+', 'PID XXXX', line) + line = re.sub(r'localhost:[0-9]+', 'localhost:XXXX', line) + line = re.sub(r'memory: [0-9]+\.[0-9]+GiB', 'memory: XX.XXGiB', line) + line = re.sub(r'tps: [0-9,]+', 'tps: XXXXX', line) + line = re.sub(r'tflops: [0-9]+\.[0-9]+', 'tflops: XX.XX', line) + line = re.sub(r'mfu: [0-9]+\.[0-9]+%', 'mfu: XX.XX%', line) + outfile.write(line) + + return filtered_file + try: + # Filter logs to remove timestamps and volatile information + baseline_filtered = _filter_log(baseline_log) + test_filtered = _filter_log(log_path) + + # Generate colored diff using git diff + cmd = ["git", "diff", "--no-index", "--color=always", "--word-diff=color", + str(baseline_filtered), str(test_filtered)] + + with open(diff_file, 'w') as f: + subprocess.run(cmd, stdout=f, stderr=subprocess.DEVNULL) + + # Clean up filtered files + baseline_filtered.unlink() + test_filtered.unlink() + + except Exception as e: + log_message(LogLevel.WARNING, f"Could not generate diff: {e}") + + def run_training(self, config_file: Path, log_file: Path, config_name: str, model_name: str) -> bool: + """Run training with given configuration.""" + log_message(LogLevel.INFO, f"Running training: {config_name} with model {model_name}") + cmd = [ + "torchrun", + f"--nproc_per_node={self.nd_parallel_to_nb_gpus[self.nd_parallel]}", + "--rdzv_backend", "c10d", + "--rdzv_endpoint=localhost:0", + "--local-ranks-filter", "0", + "--role", "rank", + "--tee", "3", + "-m", "torchtitan.train", + "--job.config_file", str(config_file) + ] + + env = os.environ.copy() + + if self.verbose: + log_message(LogLevel.INFO, f"Command: {' '.join(cmd)}") + + try: + with open(log_file, 'w') as f: + result = subprocess.run( + cmd, + cwd=self.torchtitan_root, + stdout=f, + stderr=subprocess.STDOUT, + env=env, + check=True + ) + + if self.verbose: + log_message(LogLevel.SUCCESS, f"Training completed: {config_name}") + return True + + except subprocess.CalledProcessError as e: + log_message(LogLevel.ERROR, f"Training failed: {config_name}") + return False + + def run(self) -> int: + """Main execution function. 
Runs all test suites for all models.""" + parser = argparse.ArgumentParser( + description="Test different parallelism configurations against a baseline FSDP model.", + ) + parser.add_argument("-m", "--model-filter", default="", + help="Filter models by name pattern (e.g., 'llama')") + parser.add_argument("-t", "--loss-threshold", type=float, default=self.DEFAULT_THRESHOLD_LOSS, + help=f"Loss difference threshold (default: {self.DEFAULT_THRESHOLD_LOSS})") + parser.add_argument("-g", "--grad-threshold", type=float, default=self.DEFAULT_THRESHOLD_GRAD_NORM, + help=f"Grad norm difference threshold (default: {self.DEFAULT_THRESHOLD_GRAD_NORM})") + parser.add_argument("-nd", "--nd_parallel", type=str, default="2d", + help=f"Parallelism to use (default: {self.ND_PARALLEL_TO_NB_GPUS.keys()})") + parser.add_argument("-s", "--steps", type=int, default=self.DEFAULT_STEPS, + help=f"Training steps (default: {self.DEFAULT_STEPS})") + parser.add_argument("--seed", type=int, default=self.DEFAULT_SEED, + help=f"Random seed (default: {self.DEFAULT_SEED})") + parser.add_argument("--flavor", default=self.DEFAULT_FLAVOR, + help=f"Model flavor/size (default: {self.DEFAULT_FLAVOR}). " + f"Available: llama=[debugmodel, medium, full], deepseek=[debugmodel]") + parser.add_argument("-v", "--verbose", action="store_true", + help="Verbose output") + + args = parser.parse_args() + + self.loss_threshold = args.loss_threshold + self.grad_norm_threshold = args.grad_threshold + self.nd_parallel = args.nd_parallel + self.steps = args.steps + self.seed = args.seed + self.model_filter = args.model_filter + self.flavor = args.flavor + self.verbose = args.verbose + + log_message(LogLevel.INFO, "=== TorchTitan Distributed Parallelism Comparison ===") + log_message(LogLevel.INFO, f"Loss threshold: {self.loss_threshold}") + log_message(LogLevel.INFO, f"Grad norm threshold: {self.grad_norm_threshold}") + log_message(LogLevel.INFO, f"GPUs: {self.nd_parallel_to_nb_gpus[self.nd_parallel]}") + log_message(LogLevel.INFO, f"Steps: {self.steps}") + log_message(LogLevel.INFO, f"Seed: {self.seed}") + log_message(LogLevel.INFO, f"Model filter: {self.model_filter or 'all'}") + log_message(LogLevel.INFO, f"Model flavor: {self.flavor}") + print() + + self.results_dir.mkdir(exist_ok=True) + self.config_dir.mkdir(exist_ok=True) + + if self.verbose: + log_message(LogLevel.INFO, f"Results directory: {self.results_dir}") + log_message(LogLevel.INFO, f"Config directory: {self.config_dir}") + + self.generate_parallelism_configs() + + total_model_failures = 0 + + for model_type, model_name in self.HF_MODEL_LISTS.items(): + # Apply model filter if specified + if self.model_filter and self.model_filter not in model_type: + continue + + log_message(LogLevel.INFO, f"Testing model: {model_type} ({model_name})") + total_tests = 0 + passed_tests = 0 + failed_tests = 0 + configs_to_run = [] + + for config in self.parallelism_configs: + # Skip configurations that require more GPUs than available + required_gpus = config.dp_replicate * config.tp * config.pp + if config.dp_shard != -1: + required_gpus *= config.dp_shard + + if required_gpus > self.nd_parallel_to_nb_gpus[self.nd_parallel]: + log_message(LogLevel.WARNING, + f"Skipping {config.name}: requires {required_gpus} GPUs but only {self.ngpu} available") + continue + + config_file = self.generate_config(config, model_name, model_type) + configs_to_run.append((config, config_file)) + + # # Test each parallelism configuration + # for config, config_file in configs_to_run: + # log_path = self.results_dir / 
f"{config.name}_{model_type}_{self.flavor}_{self.ngpu}gpu.log" + # if not self.run_training(config_file, log_path, config.name, model_name): + # log_message(LogLevel.TEST_FAIL, f"{config.name} - Training failed") + # failed_tests += 1 + # continue + # test_metrics = self.extract_metrics(log_path) + # if self.compare_metrics(baseline_metrics, test_metrics, config.name): + # passed_tests += 1 + # else: + # failed_tests += 1 + # diff_file = self.results_dir / f"diff_{config.name}_vs_baseline_{model_type}_{self.flavor}_{self.ngpu}gpu.log" + # self.generate_diff(baseline_log, log_path, diff_file) + # log_message(LogLevel.INFO, f"Diff saved to: {diff_file}") + # total_tests += 1 + + # Print summary for this model + print() + log_message(LogLevel.INFO, f"=== TEST SUMMARY for {model_type} ===") + log_message(LogLevel.INFO, f"Total tests: {total_tests}") + log_message(LogLevel.SUCCESS, f"Passed: {passed_tests}") + if failed_tests > 0: + log_message(LogLevel.TEST_FAIL, f"Failed: {failed_tests}") + else: + log_message(LogLevel.INFO, f"Failed: {failed_tests}") + print() + + if failed_tests > 0: + total_model_failures += 1 + + # Final summary + print() + log_message(LogLevel.INFO, "=== FINAL SUMMARY ===") + if total_model_failures == 0: + log_message(LogLevel.SUCCESS, "All model tests passed! 🎉") + return 0 + else: + log_message(LogLevel.TEST_FAIL, f"{total_model_failures} model(s) had test failures") + log_message(LogLevel.INFO, f"Check the diff files in {self.results_dir} for details") + return 1 + +def main(): + """Entry point for the script.""" + runner = CompareDistributedRun() + return runner.run() + +if __name__ == "__main__": + sys.exit(main()) diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.sh b/torchtitan/experiments/transformers_backend/compare_distributed_run.sh new file mode 100755 index 0000000000..80bb2d04ca --- /dev/null +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.sh @@ -0,0 +1,6 @@ +#!/usr/bin/bash + +# python compare_distributed_run.py --steps 5 --model-filter llama --flavor debugmodel + +# python compare_distributed_run.py --steps 5 --model-filter llama --flavor debugmodel --nd_parallel 2d +debugpy-run compare_distributed_run.py --steps 5 --model-filter llama --flavor debugmodel --nd_parallel 2d diff --git a/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh b/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh index be7243f81b..703a9b55c9 100755 --- a/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh +++ b/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh @@ -1,9 +1,4 @@ #!/usr/bin/bash -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. - -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
set -ex set -o pipefail From 338a25006716ed6a9ef23631ca54421a9b00779e Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 24 Sep 2025 12:05:24 +0000 Subject: [PATCH 035/129] better compare_distributed_run test --- .../transformers_backend/__init__.py | 2 +- .../compare_distributed_run.py | 466 ++++++++++-------- .../compare_distributed_run.sh | 5 +- 3 files changed, 258 insertions(+), 215 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 0cecbfb199..fa8cc4c119 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -76,7 +76,7 @@ class DeepSeekV3Args: # #TODO(3outeille): identify that if MoE model is used, we add a moe_args field -if os.environ.get("MODEL_TYPE") == "llama": +if os.environ.get("MODEL_TYPE") == "llama3" or os.environ.get("MODEL_TYPE") == "meta-llama/Llama-3.2-1B": print("Using llama model") patch_hf_llama() flavors = { diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.py b/torchtitan/experiments/transformers_backend/compare_distributed_run.py index 08e8057c90..9be6b52acf 100644 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.py +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.py @@ -1,15 +1,46 @@ -#!/usr/bin/env python3 """ -compare_distributed_run.py - Test different parallelism configurations against baseline -Based on TorchTitan convergence guidelines +python compare_distributed_run.py --steps 5 --model-filter llama3 --flavor debugmodel --nd_parallel 2d --verbose +python compare_distributed_run.py --steps 5 --model-filter llama3 --flavor flavor --nd_parallel 2d --verbose + +Methodology: + - train on FSDP with TT (baseline) + - train on FSDP with HF (baseline) + - For all parallelism, train with nd-// with HF + - If one train fails: + - generated diff between HF FSDP (baseline) HF nd-// + - train the nd-// TT counterpart + - diff between TT nd-// and HF nd-// + - diff between TT FSDP (baseline) and HF nd-// +results/ +|_ meta-llama + |_ Llama-3.2-1B + |_ 2D + |_ debugmodel + |_ baseline_hf_fsdp_4gpu.log + |_ baseline_tt_fsdp_4gpu.log + |_ baseline_fsdp_debugmodel_4gpu_huggingface.toml + |_ baseline_fsdp_debugmodel_4gpu_torchtitan.toml + |_ fsdp1_cp1_tp2_pp2_debugmodel_4gpu_huggingface/ + |_ fsdp1_cp1_tp2_pp2_debugmodel_4gpu_huggingface.toml + |_ fsdp1_cp1_tp2_pp2_debugmodel_4gpu_torchtitan.toml + |_ fsdp1_cp1_tp2_pp2_debugmodel_4gpu_huggingface.log + |_ diff_hf_baseline_vs_hf_nd_parallelism.log + |_ diff_tt_nd_parallelism_vs_hf_nd_parallelism.log + |_ diff_tt_baseline_vs_hf_nd_parallelism.log + |_ full + |_ baseline_hf_fsdp_4gpu.log + |_ baseline_tt_fsdp_4gpu.log + |_ baseline_fsdp_full_4gpu_huggingface.toml + |_ baseline_fsdp_full_4gpu_torchtitan.toml + |_ fsdp1_cp1_tp2_pp2_full_4gpu_huggingface/ + |_ fsdp1_cp1_tp2_pp2_full_4gpu_huggingface.toml + |_ fsdp1_cp1_tp2_pp2_full_4gpu_torchtitan.toml + |_ fsdp1_cp1_tp2_pp2_full_4gpu_huggingface.log + |_ diff_hf_baseline_vs_hf_nd_parallelism.log + |_ diff_tt_nd_parallelism_vs_hf_nd_parallelism.log + |_ diff_tt_baseline_vs_hf_nd_parallelism.log -Copyright (c) Meta Platforms, Inc. and affiliates. -All rights reserved. - -This source code is licensed under the BSD-style license found in the -LICENSE file in the root directory of this source tree. 
""" - import argparse import os import re @@ -17,12 +48,9 @@ import subprocess import sys from pathlib import Path -from typing import Dict, List, Tuple, Optional, NamedTuple -import tempfile -import json +from typing import List, Optional from dataclasses import dataclass from enum import Enum -import logging # Configure logging with colors class Colors: @@ -95,31 +123,32 @@ class CompareDistributedRun: DEFAULT_SEED = 42 DEFAULT_FLAVOR = "debugmodel" - # HF Model lists - extendable for different model families - HF_MODEL_LISTS = { - "llama": "meta-llama/Llama-3.2-1B", - "deepseek": "deepseek-ai/DeepSeek-V3", + MODEL_LISTS = { + "torchtitan": ["llama3", "deepseek_v3"], + "huggingface": ["meta-llama/Llama-3.2-1B", "deepseek-ai/DeepSeek-V3"] } - # Available flavors per model type MODEL_FLAVORS = { - "llama": ["debugmodel", "medium", "full"], - "deepseek": ["debugmodel"], + "llama3": ["debugmodel", "medium", "full"], + "deepseek_v3": ["debugmodel"], + "meta-llama/Llama-3.2-1B": ["debugmodel", "medium", "full"], + "deepseek-ai/DeepSeek-V3": ["debugmodel"], } # Available ND parallelisms <-> number of GPUs ND_PARALLEL_TO_NB_GPUS = { + "0d": 1, "1d": 2, "2d": 4, "3d": 8, "4d": 16, + "5d": 32, } def __init__(self): self.script_dir = Path(__file__).parent.absolute() self.torchtitan_root = self.script_dir.parent.parent - self.results_dir = self.script_dir / "comparison_results" - self.config_dir = self.script_dir / "generated_configs" + self.base_results_dir = self.script_dir / "results" # Configuration parameters self.loss_threshold = self.DEFAULT_THRESHOLD_LOSS @@ -131,6 +160,7 @@ def __init__(self): self.flavor = self.DEFAULT_FLAVOR self.verbose = False self.parallelism_configs: List[ParallelismConfig] = [] + self.results_dir: Optional[Path] = None def generate_parallelism_configs(self) -> None: """Generate parallelism configurations based on the number of GPUs.""" @@ -148,81 +178,65 @@ def _get_factors(n: int) -> List[int]: # Baseline FSDP configs.append(ParallelismConfig(name="fsdp", dp_replicate=1, dp_shard=ngpu, tp=1, pp=1, pp_schedule="Interleaved1F1B", cp=1, ep=1, eptp=1)) + #NOTE(3outeille): No need to handle DDP (dp_replicate) as DDP is not supported > 1D parallelism" + #(cf https://github.com/pytorch/torchtitan/blob/main/torchtitan/models/llama3/infra/parallelize.py#L139) + possible_fsdp = _get_factors(ngpu) # dp_shard + possible_cp = _get_factors(ngpu) possible_tp = _get_factors(ngpu) possible_pp = _get_factors(ngpu) - possible_ep = _get_factors(ngpu) - #TODO(3outeille): is CP borrowing degree from DP ? - #TODO(3outeille): is EP borrowing degree from DP ? - - # Is that correct ? 
- for tp in possible_tp: - for pp in possible_pp: - for ep in possible_ep: - if tp * pp * ep > ngpu: - continue - - if ngpu % (tp * pp * ep) == 0: - dp = ngpu // (tp * pp * ep) - if dp > 0 and (tp > 1 or pp > 1 or ep > 1 or dp > 1): - # DDP style - if dp > 1: - configs.append( - ParallelismConfig( - name=f"tp{tp}_pp{pp}_ep{ep}_ddp{dp}", - dp_replicate=dp, - dp_shard=1, - tp=tp, - pp=pp, - pp_schedule="Interleaved1F1B", - cp=1, - ep=ep, - eptp=1 - ) - ) - # FSDP with other parallelisms - if tp > 1 or pp > 1 or ep > 1: - configs.append( - ParallelismConfig( - name=f"tp{tp}_pp{pp}_ep{ep}_fsdp", - dp_replicate=1, - dp_shard=-1, - tp=tp, - pp=pp, - pp_schedule="Interleaved1F1B", - cp=1, - ep=ep, - eptp=1 - ) - ) - - # HSDP requires a DP degree that can be split - for dp in _get_factors(ngpu): - if dp > 1: - dp_factors = _get_factors(dp) - for replicate in dp_factors: - if replicate > 1: - shard = dp // replicate - if shard > 1: - configs.append( - ParallelismConfig( - name=f"hsdp_r{replicate}_s{shard}", - dp_replicate=replicate, - dp_shard=shard, - tp=1, - pp=1, - pp_schedule="Interleaved1F1B", - cp=1, - ep=1, - eptp=1 - ) + + #TODO(3outeille): handle HSDP later + + for dp_shard in possible_fsdp: + for cp in possible_cp: + for tp in possible_tp: + for pp in possible_pp: + + if dp_shard * cp * tp * pp != ngpu: + continue + + num_parallelisms_used = sum(parallel_degree > 1 for parallel_degree in [dp_shard, cp, tp, pp]) + ndims_required = int(self.nd_parallel[0]) + #NOTE(3outeille): if 2D//, we need at least 2 parallelisms to be active (> 1). For 3D //, least 3 parallelisms > 1 etc. + if ndims_required > 1 and num_parallelisms_used < ndims_required: + continue + + configs.append( + ParallelismConfig( + name=f"fsdp{dp_shard}_cp{cp}_tp{tp}_pp{pp}", + dp_replicate=1, + dp_shard=dp_shard, + tp=tp, + pp=pp, + pp_schedule="Interleaved1F1B", + cp=cp, + ep=1, + eptp=1 ) + ) + + # NOTE(3outeille): EP borrowing degree from dp_shard + configs.append( + ParallelismConfig( + name=f"fsdp{dp_shard}_cp{cp}_tp{tp}_pp{pp}_ep{dp_shard}", + dp_replicate=1, + dp_shard=dp_shard, + tp=tp, + pp=pp, + pp_schedule="Interleaved1F1B", + cp=cp, + ep=dp_shard, + eptp=1 + ) + ) + # Remove duplicates and assign to instance unique_configs = [] seen_configs = set() for config in configs: # Create a tuple of the config values to check for duplicates - config_tuple = (config.dp_replicate, config.dp_shard, config.tp, config.pp, config.ep) + config_tuple = (config.dp_replicate, config.dp_shard, config.tp, config.pp, config.cp, config.ep, config.eptp) if config_tuple not in seen_configs: unique_configs.append(config) seen_configs.add(config_tuple) @@ -232,72 +246,66 @@ def _get_factors(n: int) -> List[int]: log_message(LogLevel.INFO, f"Generated {len(self.parallelism_configs)} parallelism configurations for {ngpu} GPUs.") if self.verbose: for config in self.parallelism_configs: - log_message(LogLevel.INFO, f" - {config.name}: dp_replicate={config.dp_replicate}, dp_shard={config.dp_shard}, tp={config.tp}, pp={config.pp}, ep={config.ep}") - def generate_config(self, config: ParallelismConfig, model_name: str, model_type: str) -> Path: + log_message(LogLevel.INFO, f" - {config.name}: dp_replicate={config.dp_replicate}, dp_shard={config.dp_shard}, tp={config.tp}, pp={config.pp}, cp={config.cp}, ep={config.ep}, eptp={config.eptp}") + + def generate_config(self, config_dir: Path, config: ParallelismConfig, model_name: str, backend: str, filename: Optional[str] = None) -> Path: """Generate configuration file for a parallelism setup.""" - 
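# Worked example of the enumeration loop above, assuming --nd_parallel 2d (ngpu = 4):
# a (dp_shard, cp, tp, pp) tuple is kept only if the degrees multiply to 4 and at least
# two of them are > 1; each kept tuple is additionally re-emitted as an ep=dp_shard
# variant before deduplication.
ngpu = 4
factors = [d for d in range(1, ngpu + 1) if ngpu % d == 0]  # [1, 2, 4]
kept = [
    (dp_shard, cp, tp, pp)
    for dp_shard in factors
    for cp in factors
    for tp in factors
    for pp in factors
    if dp_shard * cp * tp * pp == ngpu
    and sum(degree > 1 for degree in (dp_shard, cp, tp, pp)) >= 2
]
# 6 permutations of (2, 2, 1, 1), named fsdp2_cp2_tp1_pp1, fsdp2_cp1_tp2_pp1, ...
print(kept)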
config_file = self.config_dir / f"{config.name}_{model_type}_{self.flavor}_{self.nd_parallel_to_nb_gpus[self.nd_parallel]}gpu.toml" - - #TODO(3outeille): create template instead - if model_type == "llama": - base_config = self.script_dir / "configs" / "debug_1_gpu_tt.toml" + import toml + + if filename: + config_file = config_dir / filename else: - base_config = self.script_dir / "configs" / "debug_1_gpu_hf.toml" - + config_file = config_dir / f"{config.name}_{self.flavor}_{self.nd_parallel_to_nb_gpus[self.nd_parallel]}gpu_{backend}.toml" + + base_config = self.script_dir / "configs" / "test_template.toml" shutil.copy2(base_config, config_file) + # Load the TOML file as a dict with open(config_file, 'r') as f: - content = f.read() - - # Update model name if it's HF backend - if model_type != "llama": - content = re.sub(r'name = ".*"', f'name = "{model_name}"', content) - - # Update model flavor - content = re.sub(r'flavor = ".*"', f'flavor = "{self.flavor}"', content) - + config_data = toml.load(f) + + # Update [model] section + if "model" not in config_data: + config_data["model"] = {} + config_data["model"]["name"] = model_name + config_data["model"]["flavor"] = self.flavor + # Validate flavor for model type - if model_type in self.MODEL_FLAVORS: - if self.flavor not in self.MODEL_FLAVORS[model_type]: + if model_name in self.MODEL_FLAVORS: + if self.flavor not in self.MODEL_FLAVORS[model_name]: log_message(LogLevel.WARNING, - f"Flavor '{self.flavor}' not available for {model_type}. " - f"Available: {self.MODEL_FLAVORS[model_type]}") - - # Update training steps and seed - content = re.sub(r'steps = .*', f'steps = {self.steps}', content) - if 'seed = ' in content: - content = re.sub(r'seed = .*', f'seed = {self.seed}', content) - else: - content = re.sub(r'(steps = .*)', f'\\1\nseed = {self.seed}', content) - - #TODO(3outeille): is this correct ? - # Ensure deterministic training - if 'deterministic = true' not in content: - content = re.sub(r'(seed = .*)', '\\1\ndeterministic = true', content) - - # Update parallelism configuration - content = re.sub(r'data_parallel_replicate_degree = .*', - f'data_parallel_replicate_degree = {config.dp_replicate}', content) - content = re.sub(r'data_parallel_shard_degree = .*', - f'data_parallel_shard_degree = {config.dp_shard}', content) - content = re.sub(r'tensor_parallel_degree = .*', - f'tensor_parallel_degree = {config.tp}', content) - content = re.sub(r'pipeline_parallel_degree = .*', - f'pipeline_parallel_degree = {config.pp}', content) - content = re.sub(r'pipeline_parallel_schedule = .*', - f'pipeline_parallel_schedule = "{config.pp_schedule}"', content) - content = re.sub(r'context_parallel_degree = .*', - f'context_parallel_degree = {config.cp}', content) - content = re.sub(r'expert_parallel_degree = .*', - f'expert_parallel_degree = {config.ep}', content) - - content = re.sub(r'expert_tensor_parallel_degree = .*', - f'expert_tensor_parallel_degree = {config.eptp}', content) + f"Flavor '{self.flavor}' not available for {model_name}. 
" + f"Available: {self.MODEL_FLAVORS[model_name]}") + + # Update [training] section + if "training" not in config_data: + config_data["training"] = {} + config_data["training"]["steps"] = self.steps + config_data["training"]["seed"] = self.seed + + # Update [parallelism] section + if "parallelism" not in config_data: + config_data["parallelism"] = {} + config_data["parallelism"]["data_parallel_replicate_degree"] = config.dp_replicate + config_data["parallelism"]["data_parallel_shard_degree"] = config.dp_shard + config_data["parallelism"]["tensor_parallel_degree"] = config.tp + config_data["parallelism"]["pipeline_parallel_degree"] = config.pp + config_data["parallelism"]["pipeline_parallel_schedule"] = config.pp_schedule + config_data["parallelism"]["context_parallel_degree"] = config.cp + config_data["parallelism"]["expert_parallel_degree"] = config.ep + config_data["parallelism"]["expert_tensor_parallel_degree"] = config.eptp + + # Write back the modified TOML + with open(config_file, 'w') as f: + toml.dump(config_data, f) + + log_message(LogLevel.INFO, f"Created config file: {config_file} for config '{config.name}' (model: {model_name})") + return config_file - # Write modified config with open(config_file, 'w') as f: f.write(content) - log_message(LogLevel.INFO, f"Created config file: {config_file} for config '{config.name}' (model: {model_name}, type: {model_type})") + log_message(LogLevel.INFO, f"Created config file: {config_file} for config '{config.name}' (model: {model_name})") return config_file def extract_metrics(self, log_file: Path) -> TrainingMetrics: @@ -352,7 +360,7 @@ def compare_metrics(self, baseline_metrics: TrainingMetrics, test_metrics: Train f"Grad norm diff: {grad_norm_diff:.2e} (threshold: {self.grad_norm_threshold:.2e})") return False - def generate_diff(self, baseline_log: Path, log_path: Path, diff_file: Path) -> None: + def generate_diff(self, baseline_log: Path, test_log: Path, diff_file: Path) -> None: """Generate diff between baseline and test logs.""" def _filter_log(log_file: Path) -> Path: @@ -378,7 +386,7 @@ def _filter_log(log_file: Path) -> Path: try: # Filter logs to remove timestamps and volatile information baseline_filtered = _filter_log(baseline_log) - test_filtered = _filter_log(log_path) + test_filtered = _filter_log(test_log) # Generate colored diff using git diff cmd = ["git", "diff", "--no-index", "--color=always", "--word-diff=color", @@ -410,6 +418,8 @@ def run_training(self, config_file: Path, log_file: Path, config_name: str, mode ] env = os.environ.copy() + env["SEED"] = str(self.seed) + env["MODEL_TYPE"] = model_name if self.verbose: log_message(LogLevel.INFO, f"Command: {' '.join(cmd)}") @@ -439,7 +449,7 @@ def run(self) -> int: description="Test different parallelism configurations against a baseline FSDP model.", ) parser.add_argument("-m", "--model-filter", default="", - help="Filter models by name pattern (e.g., 'llama')") + help="Filter models by name pattern (e.g., 'llama3')") parser.add_argument("-t", "--loss-threshold", type=float, default=self.DEFAULT_THRESHOLD_LOSS, help=f"Loss difference threshold (default: {self.DEFAULT_THRESHOLD_LOSS})") parser.add_argument("-g", "--grad-threshold", type=float, default=self.DEFAULT_THRESHOLD_GRAD_NORM, @@ -448,11 +458,9 @@ def run(self) -> int: help=f"Parallelism to use (default: {self.ND_PARALLEL_TO_NB_GPUS.keys()})") parser.add_argument("-s", "--steps", type=int, default=self.DEFAULT_STEPS, help=f"Training steps (default: {self.DEFAULT_STEPS})") - parser.add_argument("--seed", type=int, 
default=self.DEFAULT_SEED, - help=f"Random seed (default: {self.DEFAULT_SEED})") parser.add_argument("--flavor", default=self.DEFAULT_FLAVOR, help=f"Model flavor/size (default: {self.DEFAULT_FLAVOR}). " - f"Available: llama=[debugmodel, medium, full], deepseek=[debugmodel]") + f"Available: llama3=[debugmodel, medium, full], deepseek_v3=[debugmodel]") parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output") @@ -461,97 +469,133 @@ def run(self) -> int: self.loss_threshold = args.loss_threshold self.grad_norm_threshold = args.grad_threshold self.nd_parallel = args.nd_parallel + self.ngpu = self.nd_parallel_to_nb_gpus[self.nd_parallel] self.steps = args.steps - self.seed = args.seed self.model_filter = args.model_filter self.flavor = args.flavor self.verbose = args.verbose - log_message(LogLevel.INFO, "=== TorchTitan Distributed Parallelism Comparison ===") + log_message(LogLevel.INFO, "=== Distributed Parallelism Comparison ===") log_message(LogLevel.INFO, f"Loss threshold: {self.loss_threshold}") log_message(LogLevel.INFO, f"Grad norm threshold: {self.grad_norm_threshold}") - log_message(LogLevel.INFO, f"GPUs: {self.nd_parallel_to_nb_gpus[self.nd_parallel]}") + log_message(LogLevel.INFO, f"GPUs: {self.ngpu}") log_message(LogLevel.INFO, f"Steps: {self.steps}") log_message(LogLevel.INFO, f"Seed: {self.seed}") log_message(LogLevel.INFO, f"Model filter: {self.model_filter or 'all'}") log_message(LogLevel.INFO, f"Model flavor: {self.flavor}") print() - self.results_dir.mkdir(exist_ok=True) - self.config_dir.mkdir(exist_ok=True) + self.base_results_dir.mkdir(exist_ok=True) + + self.generate_parallelism_configs() + #TODO(3outeille): make it more generic later + if self.model_filter == "llama3": + hf_model_name = "meta-llama/Llama-3.2-1B" + tt_model_name = "llama3" + elif self.model_filter == "deepseek_v3": + hf_model_name = "deepseek-ai/DeepSeek-V3" + tt_model_name = "deepseek_v3" + else: + raise ValueError(f"Model filter {self.model_filter} not supported") + + model_owner, model_repo = hf_model_name.split("/", 1) + nd_parallel_upper = self.nd_parallel.upper() + self.results_dir = self.base_results_dir / model_owner / model_repo / nd_parallel_upper / self.flavor + self.results_dir.mkdir(parents=True, exist_ok=True) + if self.verbose: log_message(LogLevel.INFO, f"Results directory: {self.results_dir}") - log_message(LogLevel.INFO, f"Config directory: {self.config_dir}") - self.generate_parallelism_configs() + log_message(LogLevel.INFO, "--- Running baseline (FSDP) for huggingface backend ---") + + log_message(LogLevel.INFO, f"Testing model {hf_model_name} (HF) for {self.nd_parallel} parallelism") + + baseline_config = next((c for c in self.parallelism_configs if c.name == "fsdp"), None) + + baseline_config_filename_hf = f"baseline_{baseline_config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.toml" + baseline_config_file_hf = self.generate_config(config_dir=self.results_dir, config=baseline_config, model_name=hf_model_name, backend="huggingface", filename=baseline_config_filename_hf) + baseline_log_hf = self.results_dir / f"baseline_hf_{baseline_config.name}_{self.ngpu}gpu.log" + if not self.run_training(config_file=baseline_config_file_hf, log_file=baseline_log_hf, config_name=baseline_config.name, model_name=hf_model_name): + log_message(LogLevel.ERROR, f"Huggingface baseline (FSDP) training failed for {hf_model_name}") + # raise ValueError(f"Huggingface baseline (FSDP) training failed for {hf_model_name}") + + hf_baseline_metrics = self.extract_metrics(baseline_log_hf) 
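# A small illustration of what extract_metrics() pulls out of the baseline log written
# just above. The regexes are the ones from the method; the sample line is fabricated but
# shaped like the "step: ... loss: ... grad_norm: ..." lines the trainer emits.
import re

sample = "step:  5  loss:  7.1234  grad_norm:  1.0321"
loss = float(re.findall(r"loss:\s*([0-9]+\.?[0-9]*)", sample)[-1])
grad_norm = float(re.findall(r"grad_norm:\s*([0-9]+\.?[0-9]*)", sample)[-1])
assert (loss, grad_norm) == (7.1234, 1.0321)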
+ if hf_baseline_metrics.loss is None or hf_baseline_metrics.grad_norm is None: + log_message(LogLevel.ERROR, f"Could not extract huggingface baseline metrics for {hf_model_name}") + # raise ValueError(f"Could not extract huggingface baseline metrics for {hf_model_name}") - total_model_failures = 0 + log_message(LogLevel.INFO, "--- Running baseline (FSDP) for torchtitan backend ---") - for model_type, model_name in self.HF_MODEL_LISTS.items(): - # Apply model filter if specified - if self.model_filter and self.model_filter not in model_type: - continue + log_message(LogLevel.INFO, f"Testing model {hf_model_name} (TT) for {self.nd_parallel} parallelism") - log_message(LogLevel.INFO, f"Testing model: {model_type} ({model_name})") - total_tests = 0 - passed_tests = 0 - failed_tests = 0 - configs_to_run = [] + baseline_config_filename_tt = f"baseline_{baseline_config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" + baseline_config_file_tt = self.generate_config(config_dir=self.results_dir, config=baseline_config, model_name=tt_model_name, backend="torchtitan", filename=baseline_config_filename_tt) + baseline_log_tt = self.results_dir / f"baseline_tt_{baseline_config.name}_{self.ngpu}gpu.log" + if not self.run_training(config_file=baseline_config_file_tt, log_file=baseline_log_tt, config_name=baseline_config.name, model_name=tt_model_name): + raise ValueError(f"TorchTitan baseline (FSDP) training failed for {tt_model_name}") - for config in self.parallelism_configs: - # Skip configurations that require more GPUs than available - required_gpus = config.dp_replicate * config.tp * config.pp - if config.dp_shard != -1: - required_gpus *= config.dp_shard - - if required_gpus > self.nd_parallel_to_nb_gpus[self.nd_parallel]: - log_message(LogLevel.WARNING, - f"Skipping {config.name}: requires {required_gpus} GPUs but only {self.ngpu} available") - continue - - config_file = self.generate_config(config, model_name, model_type) - configs_to_run.append((config, config_file)) - - # # Test each parallelism configuration - # for config, config_file in configs_to_run: - # log_path = self.results_dir / f"{config.name}_{model_type}_{self.flavor}_{self.ngpu}gpu.log" - # if not self.run_training(config_file, log_path, config.name, model_name): - # log_message(LogLevel.TEST_FAIL, f"{config.name} - Training failed") - # failed_tests += 1 - # continue - # test_metrics = self.extract_metrics(log_path) - # if self.compare_metrics(baseline_metrics, test_metrics, config.name): - # passed_tests += 1 - # else: - # failed_tests += 1 - # diff_file = self.results_dir / f"diff_{config.name}_vs_baseline_{model_type}_{self.flavor}_{self.ngpu}gpu.log" - # self.generate_diff(baseline_log, log_path, diff_file) - # log_message(LogLevel.INFO, f"Diff saved to: {diff_file}") - # total_tests += 1 - - # Print summary for this model - print() - log_message(LogLevel.INFO, f"=== TEST SUMMARY for {model_type} ===") - log_message(LogLevel.INFO, f"Total tests: {total_tests}") - log_message(LogLevel.SUCCESS, f"Passed: {passed_tests}") - if failed_tests > 0: - log_message(LogLevel.TEST_FAIL, f"Failed: {failed_tests}") + tt_baseline_metrics = self.extract_metrics(baseline_log_tt) + if tt_baseline_metrics.loss is None or tt_baseline_metrics.grad_norm is None: + raise ValueError(f"Could not extract TorchTitan baseline metrics for {tt_model_name}") + + log_message(LogLevel.INFO, "--- Comparing other parallelism configurations (huggingface) ---") + + passed_tests = 0 + failed_tests = 0 + test_configs = [c for c in self.parallelism_configs if 
c.name != "fsdp"] + total_tests = len(test_configs) + + for config in test_configs: + # Create a subdirectory for each test configuration + test_dir_name = f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface" + test_dir = self.results_dir / test_dir_name + test_dir.mkdir(exist_ok=True) + + config_filename_hf = f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.toml" + config_file_hf = self.generate_config(config_dir=test_dir, config=config, model_name=hf_model_name, backend="huggingface", filename=config_filename_hf) + log_path_hf = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.log" + + successful_hf_run = self.run_training(config_file=config_file_hf, log_file=log_path_hf, config_name=config.name, model_name=hf_model_name) + + # Compare metrics between baseline (HF) and current (HF) nd-parallelism run + hf_metrics = self.extract_metrics(log_path_hf) + successful_hf_extract = self.compare_metrics(hf_baseline_metrics, hf_metrics, f"{config.name} (huggingface)") + + if successful_hf_run and successful_hf_extract: + passed_tests += 1 else: - log_message(LogLevel.INFO, f"Failed: {failed_tests}") - print() + failed_tests += 1 + # Generate diff with baseline (HF) + diff_file_hf_vs_baseline = test_dir / "diff_hf_baseline_vs_hf_nd_parallelism.log" + self.generate_diff(baseline_log_hf, log_path_hf, diff_file_hf_vs_baseline) + log_message(LogLevel.INFO, f"Diff between baseline (HF) and current (HF) nd-parallelism run saved to: {diff_file_hf_vs_baseline}") + + # Run TT counterpart and generated diff between nd-paralellism TT and current hf nd-parallelism run + config_filename_tt = f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" + config_file_tt = self.generate_config(config_dir=test_dir, config=config, model_name=tt_model_name, backend="torchtitan", filename=config_filename_tt) + log_path_tt = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.log" + if not self.run_training(config_file=config_file_tt, log_file=log_path_tt, config_name=config.name, model_name=tt_model_name): + raise ValueError(f"TorchTitan training failed for {tt_model_name}") + + # generated diff between nd-paralellism TT and current hf nd-parallelism run + diff_file_tt_vs_hf = test_dir / "diff_tt_nd_parallelism_vs_hf_nd_parallelism.log" + self.generate_diff(log_path_tt, log_path_hf, diff_file_tt_vs_hf) + log_message(LogLevel.INFO, f"Diff between nd-paralellism TT and current (HF) nd-parallelism run saved to: {diff_file_tt_vs_hf}") - if failed_tests > 0: - total_model_failures += 1 + # generated diff between baseline TT and current hf nd-parallelism run + diff_file_tt_baseline_vs_hf = test_dir / "diff_tt_baseline_vs_hf_nd_parallelism.log" + self.generate_diff(baseline_log_tt, log_path_hf, diff_file_tt_baseline_vs_hf) + log_message(LogLevel.INFO, f"Diff between baseline TT and current (HF) nd-parallelism run saved to: {diff_file_tt_baseline_vs_hf}") - # Final summary print() + log_message(LogLevel.INFO, "=== FINAL SUMMARY ===") - if total_model_failures == 0: + if passed_tests == total_tests: log_message(LogLevel.SUCCESS, "All model tests passed! 
🎉") return 0 else: - log_message(LogLevel.TEST_FAIL, f"{total_model_failures} model(s) had test failures") + log_message(LogLevel.TEST_FAIL, f"{failed_tests} model(s) had test failures") log_message(LogLevel.INFO, f"Check the diff files in {self.results_dir} for details") return 1 diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.sh b/torchtitan/experiments/transformers_backend/compare_distributed_run.sh index 80bb2d04ca..4d0319a03f 100755 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.sh +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.sh @@ -1,6 +1,5 @@ #!/usr/bin/bash -# python compare_distributed_run.py --steps 5 --model-filter llama --flavor debugmodel +python compare_distributed_run.py --steps 5 --model-filter llama3 --flavor debugmodel --nd_parallel 1d --verbose -# python compare_distributed_run.py --steps 5 --model-filter llama --flavor debugmodel --nd_parallel 2d -debugpy-run compare_distributed_run.py --steps 5 --model-filter llama --flavor debugmodel --nd_parallel 2d +# debugpy-run compare_distributed_run.py --steps 5 --model-filter llama3 --flavor debugmodel --nd_parallel 0d From 36a5673476b6866e4de97e76dda6292f077432a5 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 24 Sep 2025 12:12:11 +0000 Subject: [PATCH 036/129] add seed + deterministic to compare_distributed_run --- .../experiments/transformers_backend/compare_distributed_run.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.py b/torchtitan/experiments/transformers_backend/compare_distributed_run.py index 9be6b52acf..6f7b539b98 100644 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.py +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.py @@ -414,6 +414,8 @@ def run_training(self, config_file: Path, log_file: Path, config_name: str, mode "--role", "rank", "--tee", "3", "-m", "torchtitan.train", + "--training.seed", str(self.seed), + "--training.deterministic", "--job.config_file", str(config_file) ] From ed892a2cfeb6a060d645a8d032bf24ecd7c2847b Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 24 Sep 2025 12:40:58 +0000 Subject: [PATCH 037/129] better extract and compare metrics --- .../compare_distributed_run.py | 129 ++++++++++-------- .../compare_distributed_run.sh | 2 +- 2 files changed, 76 insertions(+), 55 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.py b/torchtitan/experiments/transformers_backend/compare_distributed_run.py index 6f7b539b98..a933f9ed56 100644 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.py +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.py @@ -49,8 +49,9 @@ import sys from pathlib import Path from typing import List, Optional -from dataclasses import dataclass +from dataclasses import dataclass, field from enum import Enum +import torch # Configure logging with colors class Colors: @@ -110,18 +111,26 @@ class ParallelismConfig: @dataclass class TrainingMetrics: """Training metrics extracted from logs.""" - loss: Optional[float] = None - grad_norm: Optional[float] = None + steps: List[int] = field(default_factory=list) + loss: List[float] = field(default_factory=list) + grad_norm: List[float] = field(default_factory=list) + memory: List[float] = field(default_factory=list) + tps: List[int] = field(default_factory=list) + tflops: List[float] = field(default_factory=list) + mfu: List[float] = 
field(default_factory=list) class CompareDistributedRun: """Main class for running distributed parallelism comparison tests.""" # Default values - DEFAULT_THRESHOLD_LOSS = 1e-4 - DEFAULT_THRESHOLD_GRAD_NORM = 1e-3 DEFAULT_STEPS = 10 DEFAULT_SEED = 42 DEFAULT_FLAVOR = "debugmodel" + # value chosen based on diff of llama3 1GPU + DEFAULT_LOSS_ATOL = 0.02 + DEFAULT_LOSS_RTOL = 1e-5 + DEFAULT_GRAD_NORM_ATOL = 0.005 + DEFAULT_GRAD_NORM_RTOL = 1e-5 MODEL_LISTS = { "torchtitan": ["llama3", "deepseek_v3"], @@ -151,14 +160,16 @@ def __init__(self): self.base_results_dir = self.script_dir / "results" # Configuration parameters - self.loss_threshold = self.DEFAULT_THRESHOLD_LOSS - self.grad_norm_threshold = self.DEFAULT_THRESHOLD_GRAD_NORM self.nd_parallel_to_nb_gpus = self.ND_PARALLEL_TO_NB_GPUS self.steps = self.DEFAULT_STEPS self.seed = self.DEFAULT_SEED self.model_filter = "" self.flavor = self.DEFAULT_FLAVOR self.verbose = False + self.loss_atol = self.DEFAULT_LOSS_ATOL + self.loss_rtol = self.DEFAULT_LOSS_RTOL + self.grad_norm_atol = self.DEFAULT_GRAD_NORM_ATOL + self.grad_norm_rtol = self.DEFAULT_GRAD_NORM_RTOL self.parallelism_configs: List[ParallelismConfig] = [] self.results_dir: Optional[Path] = None @@ -301,12 +312,6 @@ def generate_config(self, config_dir: Path, config: ParallelismConfig, model_nam log_message(LogLevel.INFO, f"Created config file: {config_file} for config '{config.name}' (model: {model_name})") return config_file - - with open(config_file, 'w') as f: - f.write(content) - - log_message(LogLevel.INFO, f"Created config file: {config_file} for config '{config.name}' (model: {model_name})") - return config_file def extract_metrics(self, log_file: Path) -> TrainingMetrics: """Extract metrics from log file.""" @@ -315,20 +320,23 @@ def extract_metrics(self, log_file: Path) -> TrainingMetrics: try: with open(log_file, 'r') as f: content = f.read() - - # Extract final loss and grad_norm from the last step - loss_matches = re.findall(r'loss:\s*([0-9]+\.?[0-9]*)', content) - grad_norm_matches = re.findall(r'grad_norm:\s*([0-9]+\.?[0-9]*)', content) - - if loss_matches: - metrics.loss = float(loss_matches[-1]) - if grad_norm_matches: - metrics.grad_norm = float(grad_norm_matches[-1]) + + # Regex to capture all metrics from a log line, ignoring ANSI color codes + pattern = re.compile( + r"step:\s*(\d+)\s*" + r".*?loss:\s*([0-9]+\.?[0-9]*)\s*" + r".*?grad_norm:\s*([0-9]+\.?[0-9]*)\s*" + ) + + for match in pattern.finditer(content): + metrics.steps.append(int(match.group(1))) + metrics.loss.append(float(match.group(2))) + metrics.grad_norm.append(float(match.group(3))) except Exception as e: log_message(LogLevel.WARNING, f"Could not extract metrics from {log_file}: {e}") - if metrics.loss is None or metrics.grad_norm is None: + if not metrics.loss or not metrics.grad_norm: log_message(LogLevel.WARNING, f"Could not extract metrics from {log_file}") return metrics @@ -336,28 +344,33 @@ def extract_metrics(self, log_file: Path) -> TrainingMetrics: def compare_metrics(self, baseline_metrics: TrainingMetrics, test_metrics: TrainingMetrics, config_name: str) -> bool: """Compare metrics between baseline and test configuration.""" - if (baseline_metrics.loss is None or baseline_metrics.grad_norm is None or - test_metrics.loss is None or test_metrics.grad_norm is None): + if not baseline_metrics.loss or not test_metrics.loss: log_message(LogLevel.TEST_FAIL, f"{config_name} - Unable to extract metrics") return False - # Calculate absolute differences - loss_diff = abs(baseline_metrics.loss - 
test_metrics.loss) - grad_norm_diff = abs(baseline_metrics.grad_norm - test_metrics.grad_norm) + # Convert to tensors + baseline_loss = torch.tensor(baseline_metrics.loss) + test_loss = torch.tensor(test_metrics.loss) + baseline_grad_norm = torch.tensor(baseline_metrics.grad_norm) + test_grad_norm = torch.tensor(test_metrics.grad_norm) - # Check if differences are within thresholds - loss_pass = loss_diff < self.loss_threshold - grad_pass = grad_norm_diff < self.grad_norm_threshold + # Check if tensors are close + loss_pass = torch.allclose(baseline_loss, test_loss, atol=self.loss_atol, rtol=self.loss_rtol) + grad_pass = torch.allclose(baseline_grad_norm, test_grad_norm, atol=self.grad_norm_atol, rtol=self.grad_norm_rtol) + + # Calculate max absolute differences for logging + loss_diff = torch.max(torch.abs(baseline_loss - test_loss)).item() if baseline_loss.numel() > 0 and test_loss.numel() > 0 else 0.0 + grad_norm_diff = torch.max(torch.abs(baseline_grad_norm - test_grad_norm)).item() if baseline_grad_norm.numel() > 0 and test_grad_norm.numel() > 0 else 0.0 if loss_pass and grad_pass: log_message(LogLevel.TEST_PASS, - f"{config_name} - Loss diff: {loss_diff:.2e} (< {self.loss_threshold:.2e}), " - f"Grad norm diff: {grad_norm_diff:.2e} (< {self.grad_norm_threshold:.2e})") + f"{config_name} - Max loss diff: {loss_diff:.2e}, " + f"Max grad norm diff: {grad_norm_diff:.2e}") return True else: log_message(LogLevel.TEST_FAIL, - f"{config_name} - Loss diff: {loss_diff:.2e} (threshold: {self.loss_threshold:.2e}), " - f"Grad norm diff: {grad_norm_diff:.2e} (threshold: {self.grad_norm_threshold:.2e})") + f"{config_name} - Max loss diff: {loss_diff:.2e}, " + f"Max grad norm diff: {grad_norm_diff:.2e}") return False def generate_diff(self, baseline_log: Path, test_log: Path, diff_file: Path) -> None: @@ -452,10 +465,6 @@ def run(self) -> int: ) parser.add_argument("-m", "--model-filter", default="", help="Filter models by name pattern (e.g., 'llama3')") - parser.add_argument("-t", "--loss-threshold", type=float, default=self.DEFAULT_THRESHOLD_LOSS, - help=f"Loss difference threshold (default: {self.DEFAULT_THRESHOLD_LOSS})") - parser.add_argument("-g", "--grad-threshold", type=float, default=self.DEFAULT_THRESHOLD_GRAD_NORM, - help=f"Grad norm difference threshold (default: {self.DEFAULT_THRESHOLD_GRAD_NORM})") parser.add_argument("-nd", "--nd_parallel", type=str, default="2d", help=f"Parallelism to use (default: {self.ND_PARALLEL_TO_NB_GPUS.keys()})") parser.add_argument("-s", "--steps", type=int, default=self.DEFAULT_STEPS, @@ -465,21 +474,29 @@ def run(self) -> int: f"Available: llama3=[debugmodel, medium, full], deepseek_v3=[debugmodel]") parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output") + parser.add_argument("--loss-atol", type=float, default=self.DEFAULT_LOSS_ATOL, + help=f"Absolute tolerance for loss comparison (default: {self.DEFAULT_LOSS_ATOL})") + parser.add_argument("--loss-rtol", type=float, default=self.DEFAULT_LOSS_RTOL, + help=f"Relative tolerance for loss comparison (default: {self.DEFAULT_LOSS_RTOL})") + parser.add_argument("--grad-norm-atol", type=float, default=self.DEFAULT_GRAD_NORM_ATOL, + help=f"Absolute tolerance for grad norm comparison (default: {self.DEFAULT_GRAD_NORM_ATOL})") + parser.add_argument("--grad-norm-rtol", type=float, default=self.DEFAULT_GRAD_NORM_RTOL, + help=f"Relative tolerance for grad norm comparison (default: {self.DEFAULT_GRAD_NORM_RTOL})") args = parser.parse_args() - self.loss_threshold = args.loss_threshold - 
self.grad_norm_threshold = args.grad_threshold self.nd_parallel = args.nd_parallel self.ngpu = self.nd_parallel_to_nb_gpus[self.nd_parallel] self.steps = args.steps self.model_filter = args.model_filter self.flavor = args.flavor self.verbose = args.verbose + self.loss_atol = args.loss_atol + self.loss_rtol = args.loss_rtol + self.grad_norm_atol = args.grad_norm_atol + self.grad_norm_rtol = args.grad_norm_rtol log_message(LogLevel.INFO, "=== Distributed Parallelism Comparison ===") - log_message(LogLevel.INFO, f"Loss threshold: {self.loss_threshold}") - log_message(LogLevel.INFO, f"Grad norm threshold: {self.grad_norm_threshold}") log_message(LogLevel.INFO, f"GPUs: {self.ngpu}") log_message(LogLevel.INFO, f"Steps: {self.steps}") log_message(LogLevel.INFO, f"Seed: {self.seed}") @@ -523,7 +540,7 @@ def run(self) -> int: # raise ValueError(f"Huggingface baseline (FSDP) training failed for {hf_model_name}") hf_baseline_metrics = self.extract_metrics(baseline_log_hf) - if hf_baseline_metrics.loss is None or hf_baseline_metrics.grad_norm is None: + if not hf_baseline_metrics.loss or not hf_baseline_metrics.grad_norm: log_message(LogLevel.ERROR, f"Could not extract huggingface baseline metrics for {hf_model_name}") # raise ValueError(f"Could not extract huggingface baseline metrics for {hf_model_name}") @@ -538,9 +555,13 @@ def run(self) -> int: raise ValueError(f"TorchTitan baseline (FSDP) training failed for {tt_model_name}") tt_baseline_metrics = self.extract_metrics(baseline_log_tt) - if tt_baseline_metrics.loss is None or tt_baseline_metrics.grad_norm is None: + if not tt_baseline_metrics.loss or not tt_baseline_metrics.grad_norm: raise ValueError(f"Could not extract TorchTitan baseline metrics for {tt_model_name}") + + if not self.compare_metrics(tt_baseline_metrics, hf_baseline_metrics, "baseline (TT) vs baseline (HF)"): + raise ValueError(f"Baseline (TT) vs baseline (HF) metrics comparison failed for {tt_model_name}") + log_message(LogLevel.INFO, "--- Comparing other parallelism configurations (huggingface) ---") passed_tests = 0 @@ -569,9 +590,9 @@ def run(self) -> int: else: failed_tests += 1 # Generate diff with baseline (HF) - diff_file_hf_vs_baseline = test_dir / "diff_hf_baseline_vs_hf_nd_parallelism.log" - self.generate_diff(baseline_log_hf, log_path_hf, diff_file_hf_vs_baseline) - log_message(LogLevel.INFO, f"Diff between baseline (HF) and current (HF) nd-parallelism run saved to: {diff_file_hf_vs_baseline}") + diff_hf_baseline_vs_hf_nd_parallelism = test_dir / "diff_hf_baseline_vs_hf_nd_parallelism.log" + self.generate_diff(baseline_log_hf, log_path_hf, diff_hf_baseline_vs_hf_nd_parallelism) + log_message(LogLevel.INFO, f"Diff between baseline (HF) and current (HF) nd-parallelism run saved to: {diff_hf_baseline_vs_hf_nd_parallelism}") # Run TT counterpart and generated diff between nd-paralellism TT and current hf nd-parallelism run config_filename_tt = f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" @@ -581,14 +602,14 @@ def run(self) -> int: raise ValueError(f"TorchTitan training failed for {tt_model_name}") # generated diff between nd-paralellism TT and current hf nd-parallelism run - diff_file_tt_vs_hf = test_dir / "diff_tt_nd_parallelism_vs_hf_nd_parallelism.log" - self.generate_diff(log_path_tt, log_path_hf, diff_file_tt_vs_hf) - log_message(LogLevel.INFO, f"Diff between nd-paralellism TT and current (HF) nd-parallelism run saved to: {diff_file_tt_vs_hf}") + diff_file_tt_nd_parallelism_vs_hf_nd_parallelism = test_dir / 
"diff_tt_nd_parallelism_vs_hf_nd_parallelism.log" + self.generate_diff(log_path_tt, log_path_hf, diff_file_tt_nd_parallelism_vs_hf_nd_parallelism) + log_message(LogLevel.INFO, f"Diff between nd-paralellism TT and current (HF) nd-parallelism run saved to: {diff_file_tt_nd_parallelism_vs_hf_nd_parallelism}") # generated diff between baseline TT and current hf nd-parallelism run - diff_file_tt_baseline_vs_hf = test_dir / "diff_tt_baseline_vs_hf_nd_parallelism.log" - self.generate_diff(baseline_log_tt, log_path_hf, diff_file_tt_baseline_vs_hf) - log_message(LogLevel.INFO, f"Diff between baseline TT and current (HF) nd-parallelism run saved to: {diff_file_tt_baseline_vs_hf}") + diff_file_tt_baseline_vs_hf_nd_parallelism = test_dir / "diff_tt_baseline_vs_hf_nd_parallelism.log" + self.generate_diff(baseline_log_tt, log_path_hf, diff_file_tt_baseline_vs_hf_nd_parallelism) + log_message(LogLevel.INFO, f"Diff between baseline TT and current (HF) nd-parallelism run saved to: {diff_file_tt_baseline_vs_hf_nd_parallelism}") print() diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.sh b/torchtitan/experiments/transformers_backend/compare_distributed_run.sh index 4d0319a03f..d7e5b77bcb 100755 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.sh +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.sh @@ -1,5 +1,5 @@ #!/usr/bin/bash -python compare_distributed_run.py --steps 5 --model-filter llama3 --flavor debugmodel --nd_parallel 1d --verbose +python compare_distributed_run.py --steps 5 --model-filter llama3 --flavor debugmodel --nd_parallel 0d --verbose # debugpy-run compare_distributed_run.py --steps 5 --model-filter llama3 --flavor debugmodel --nd_parallel 0d From 1c1452fa29922403f16b6a74fc9abe9c6f645eb9 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 24 Sep 2025 13:07:51 +0000 Subject: [PATCH 038/129] refactor to introduce slurm --- .../compare_distributed_run.py | 109 +++++++++++------- 1 file changed, 66 insertions(+), 43 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.py b/torchtitan/experiments/transformers_backend/compare_distributed_run.py index a933f9ed56..a72e2abf7f 100644 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.py +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.py @@ -166,6 +166,8 @@ def __init__(self): self.model_filter = "" self.flavor = self.DEFAULT_FLAVOR self.verbose = False + self.use_slurm = False + self.slurm_options = [] self.loss_atol = self.DEFAULT_LOSS_ATOL self.loss_rtol = self.DEFAULT_LOSS_RTOL self.grad_norm_atol = self.DEFAULT_GRAD_NORM_ATOL @@ -420,7 +422,7 @@ def run_training(self, config_file: Path, log_file: Path, config_name: str, mode log_message(LogLevel.INFO, f"Running training: {config_name} with model {model_name}") cmd = [ "torchrun", - f"--nproc_per_node={self.nd_parallel_to_nb_gpus[self.nd_parallel]}", + f"--nproc_per_node={self.ngpu}", "--rdzv_backend", "c10d", "--rdzv_endpoint=localhost:0", "--local-ranks-filter", "0", @@ -431,7 +433,6 @@ def run_training(self, config_file: Path, log_file: Path, config_name: str, mode "--training.deterministic", "--job.config_file", str(config_file) ] - env = os.environ.copy() env["SEED"] = str(self.seed) env["MODEL_TYPE"] = model_name @@ -458,11 +459,62 @@ def run_training(self, config_file: Path, log_file: Path, config_name: str, mode log_message(LogLevel.ERROR, f"Training failed: {config_name}") return False + def 
_compare_one_parallelism_config( + self, + config: "ParallelismConfig", + hf_model_name: str, + tt_model_name: str, + hf_baseline_metrics: "TrainingMetrics", + baseline_log_hf: Path, + baseline_log_tt: Path, + ) -> bool: + """Compares a single parallelism configuration against the baseline.""" + # Create a subdirectory for each test configuration + test_dir_name = f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface" + test_dir = self.results_dir / test_dir_name + test_dir.mkdir(exist_ok=True) + + config_filename_hf = f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.toml" + config_file_hf = self.generate_config(config_dir=test_dir, config=config, model_name=hf_model_name, backend="huggingface", filename=config_filename_hf) + log_path_hf = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.log" + + successful_hf_run = self.run_training(config_file=config_file_hf, log_file=log_path_hf, config_name=config.name, model_name=hf_model_name) + + # Compare metrics between baseline (HF) and current (HF) nd-parallelism run + hf_metrics = self.extract_metrics(log_path_hf) + successful_hf_extract = self.compare_metrics(hf_baseline_metrics, hf_metrics, f"{config.name} (huggingface)") + + if successful_hf_run and successful_hf_extract: + return True + else: + # Generate diff with baseline (HF) + diff_hf_baseline_vs_hf_nd_parallelism = test_dir / "diff_hf_baseline_vs_hf_nd_parallelism.log" + self.generate_diff(baseline_log_hf, log_path_hf, diff_hf_baseline_vs_hf_nd_parallelism) + log_message(LogLevel.INFO, f"Diff between baseline (HF) and current (HF) nd-parallelism run saved to: {diff_hf_baseline_vs_hf_nd_parallelism}") + + # Run TT counterpart and generated diff between nd-paralellism TT and current hf nd-parallelism run + config_filename_tt = f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" + config_file_tt = self.generate_config(config_dir=test_dir, config=config, model_name=tt_model_name, backend="torchtitan", filename=config_filename_tt) + log_path_tt = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.log" + if not self.run_training(config_file=config_file_tt, log_file=log_path_tt, config_name=config.name, model_name=tt_model_name): + raise ValueError(f"TorchTitan training failed for {tt_model_name}") + + # generated diff between nd-paralellism TT and current hf nd-parallelism run + diff_file_tt_nd_parallelism_vs_hf_nd_parallelism = test_dir / "diff_tt_nd_parallelism_vs_hf_nd_parallelism.log" + self.generate_diff(log_path_tt, log_path_hf, diff_file_tt_nd_parallelism_vs_hf_nd_parallelism) + log_message(LogLevel.INFO, f"Diff between nd-paralellism TT and current (HF) nd-parallelism run saved to: {diff_file_tt_nd_parallelism_vs_hf_nd_parallelism}") + + # generated diff between baseline TT and current hf nd-parallelism run + diff_file_tt_baseline_vs_hf_nd_parallelism = test_dir / "diff_tt_baseline_vs_hf_nd_parallelism.log" + self.generate_diff(baseline_log_tt, log_path_hf, diff_file_tt_baseline_vs_hf_nd_parallelism) + log_message(LogLevel.INFO, f"Diff between baseline TT and current (HF) nd-parallelism run saved to: {diff_file_tt_baseline_vs_hf_nd_parallelism}") + return False + def run(self) -> int: """Main execution function. 
Runs all test suites for all models.""" parser = argparse.ArgumentParser( description="Test different parallelism configurations against a baseline FSDP model.", - ) + ) parser.add_argument("-m", "--model-filter", default="", help="Filter models by name pattern (e.g., 'llama3')") parser.add_argument("-nd", "--nd_parallel", type=str, default="2d", @@ -558,58 +610,29 @@ def run(self) -> int: if not tt_baseline_metrics.loss or not tt_baseline_metrics.grad_norm: raise ValueError(f"Could not extract TorchTitan baseline metrics for {tt_model_name}") - if not self.compare_metrics(tt_baseline_metrics, hf_baseline_metrics, "baseline (TT) vs baseline (HF)"): - raise ValueError(f"Baseline (TT) vs baseline (HF) metrics comparison failed for {tt_model_name}") + log_message(LogLevel.ERROR, f"Baseline (TT) vs baseline (HF) metrics comparison failed for {tt_model_name}") + # raise ValueError(f"Baseline (TT) vs baseline (HF) metrics comparison failed for {tt_model_name}") log_message(LogLevel.INFO, "--- Comparing other parallelism configurations (huggingface) ---") - passed_tests = 0 failed_tests = 0 test_configs = [c for c in self.parallelism_configs if c.name != "fsdp"] total_tests = len(test_configs) for config in test_configs: - # Create a subdirectory for each test configuration - test_dir_name = f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface" - test_dir = self.results_dir / test_dir_name - test_dir.mkdir(exist_ok=True) - - config_filename_hf = f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.toml" - config_file_hf = self.generate_config(config_dir=test_dir, config=config, model_name=hf_model_name, backend="huggingface", filename=config_filename_hf) - log_path_hf = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.log" - - successful_hf_run = self.run_training(config_file=config_file_hf, log_file=log_path_hf, config_name=config.name, model_name=hf_model_name) - - # Compare metrics between baseline (HF) and current (HF) nd-parallelism run - hf_metrics = self.extract_metrics(log_path_hf) - successful_hf_extract = self.compare_metrics(hf_baseline_metrics, hf_metrics, f"{config.name} (huggingface)") - - if successful_hf_run and successful_hf_extract: + passed = self._compare_one_parallelism_config( + config, + hf_model_name, + tt_model_name, + hf_baseline_metrics, + baseline_log_hf, + baseline_log_tt, + ) + if passed: passed_tests += 1 else: failed_tests += 1 - # Generate diff with baseline (HF) - diff_hf_baseline_vs_hf_nd_parallelism = test_dir / "diff_hf_baseline_vs_hf_nd_parallelism.log" - self.generate_diff(baseline_log_hf, log_path_hf, diff_hf_baseline_vs_hf_nd_parallelism) - log_message(LogLevel.INFO, f"Diff between baseline (HF) and current (HF) nd-parallelism run saved to: {diff_hf_baseline_vs_hf_nd_parallelism}") - - # Run TT counterpart and generated diff between nd-paralellism TT and current hf nd-parallelism run - config_filename_tt = f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" - config_file_tt = self.generate_config(config_dir=test_dir, config=config, model_name=tt_model_name, backend="torchtitan", filename=config_filename_tt) - log_path_tt = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.log" - if not self.run_training(config_file=config_file_tt, log_file=log_path_tt, config_name=config.name, model_name=tt_model_name): - raise ValueError(f"TorchTitan training failed for {tt_model_name}") - - # generated diff between nd-paralellism TT and current hf nd-parallelism run - diff_file_tt_nd_parallelism_vs_hf_nd_parallelism = 
test_dir / "diff_tt_nd_parallelism_vs_hf_nd_parallelism.log" - self.generate_diff(log_path_tt, log_path_hf, diff_file_tt_nd_parallelism_vs_hf_nd_parallelism) - log_message(LogLevel.INFO, f"Diff between nd-paralellism TT and current (HF) nd-parallelism run saved to: {diff_file_tt_nd_parallelism_vs_hf_nd_parallelism}") - - # generated diff between baseline TT and current hf nd-parallelism run - diff_file_tt_baseline_vs_hf_nd_parallelism = test_dir / "diff_tt_baseline_vs_hf_nd_parallelism.log" - self.generate_diff(baseline_log_tt, log_path_hf, diff_file_tt_baseline_vs_hf_nd_parallelism) - log_message(LogLevel.INFO, f"Diff between baseline TT and current (HF) nd-parallelism run saved to: {diff_file_tt_baseline_vs_hf_nd_parallelism}") print() From 5e4911fbfd64b93f2803d5f89506a7f20103f602 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 24 Sep 2025 13:27:38 +0000 Subject: [PATCH 039/129] error handling with subprocess --- .../compare_distributed_run.py | 58 +++++++++++-------- 1 file changed, 35 insertions(+), 23 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.py b/torchtitan/experiments/transformers_backend/compare_distributed_run.py index a72e2abf7f..ec58b2e729 100644 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.py +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.py @@ -144,6 +144,7 @@ class CompareDistributedRun: "deepseek-ai/DeepSeek-V3": ["debugmodel"], } + #TODO(3outeille): handle slurm later for 4D/5D. Might need to rethink the whole script for that # Available ND parallelisms <-> number of GPUs ND_PARALLEL_TO_NB_GPUS = { "0d": 1, @@ -417,7 +418,7 @@ def _filter_log(log_file: Path) -> Path: except Exception as e: log_message(LogLevel.WARNING, f"Could not generate diff: {e}") - def run_training(self, config_file: Path, log_file: Path, config_name: str, model_name: str) -> bool: + def run_training(self, config_file: Path, log_file: Path, config_name: str, model_name: str) -> Optional[subprocess.CalledProcessError]: """Run training with given configuration.""" log_message(LogLevel.INFO, f"Running training: {config_name} with model {model_name}") cmd = [ @@ -441,23 +442,33 @@ def run_training(self, config_file: Path, log_file: Path, config_name: str, mode log_message(LogLevel.INFO, f"Command: {' '.join(cmd)}") try: + # Capture output to include it in the exception, while still writing to log file + result = subprocess.run( + cmd, + cwd=self.torchtitan_root, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, # decodes stdout/stderr as text + env=env, + check=True + ) with open(log_file, 'w') as f: - result = subprocess.run( - cmd, - cwd=self.torchtitan_root, - stdout=f, - stderr=subprocess.STDOUT, - env=env, - check=True - ) + f.write(result.stdout) if self.verbose: log_message(LogLevel.SUCCESS, f"Training completed: {config_name}") - return True + return None except subprocess.CalledProcessError as e: log_message(LogLevel.ERROR, f"Training failed: {config_name}") - return False + + # Write the failed output to the log file + with open(log_file, 'w') as f: + if e.stdout: + f.write(e.stdout) + + e.add_note(f"\n--- Full output from failed process ---\n{e.stdout or ''}") + return e def _compare_one_parallelism_config( self, @@ -478,7 +489,8 @@ def _compare_one_parallelism_config( config_file_hf = self.generate_config(config_dir=test_dir, config=config, model_name=hf_model_name, backend="huggingface", filename=config_filename_hf) log_path_hf = test_dir / 
f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.log" - successful_hf_run = self.run_training(config_file=config_file_hf, log_file=log_path_hf, config_name=config.name, model_name=hf_model_name) + hf_run_error = self.run_training(config_file=config_file_hf, log_file=log_path_hf, config_name=config.name, model_name=hf_model_name) + successful_hf_run = hf_run_error is None # Compare metrics between baseline (HF) and current (HF) nd-parallelism run hf_metrics = self.extract_metrics(log_path_hf) @@ -496,8 +508,9 @@ def _compare_one_parallelism_config( config_filename_tt = f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" config_file_tt = self.generate_config(config_dir=test_dir, config=config, model_name=tt_model_name, backend="torchtitan", filename=config_filename_tt) log_path_tt = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.log" - if not self.run_training(config_file=config_file_tt, log_file=log_path_tt, config_name=config.name, model_name=tt_model_name): - raise ValueError(f"TorchTitan training failed for {tt_model_name}") + tt_run_error = self.run_training(config_file=config_file_tt, log_file=log_path_tt, config_name=config.name, model_name=tt_model_name) + if tt_run_error: + raise ValueError(f"TorchTitan training failed for {tt_model_name}") from tt_run_error # generated diff between nd-paralellism TT and current hf nd-parallelism run diff_file_tt_nd_parallelism_vs_hf_nd_parallelism = test_dir / "diff_tt_nd_parallelism_vs_hf_nd_parallelism.log" @@ -587,14 +600,13 @@ def run(self) -> int: baseline_config_filename_hf = f"baseline_{baseline_config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.toml" baseline_config_file_hf = self.generate_config(config_dir=self.results_dir, config=baseline_config, model_name=hf_model_name, backend="huggingface", filename=baseline_config_filename_hf) baseline_log_hf = self.results_dir / f"baseline_hf_{baseline_config.name}_{self.ngpu}gpu.log" - if not self.run_training(config_file=baseline_config_file_hf, log_file=baseline_log_hf, config_name=baseline_config.name, model_name=hf_model_name): - log_message(LogLevel.ERROR, f"Huggingface baseline (FSDP) training failed for {hf_model_name}") - # raise ValueError(f"Huggingface baseline (FSDP) training failed for {hf_model_name}") + hf_baseline_run_error = self.run_training(config_file=baseline_config_file_hf, log_file=baseline_log_hf, config_name=baseline_config.name, model_name=hf_model_name) + if hf_baseline_run_error: + raise ValueError(f"Huggingface baseline (FSDP) training failed for {hf_model_name}") from hf_baseline_run_error hf_baseline_metrics = self.extract_metrics(baseline_log_hf) if not hf_baseline_metrics.loss or not hf_baseline_metrics.grad_norm: - log_message(LogLevel.ERROR, f"Could not extract huggingface baseline metrics for {hf_model_name}") - # raise ValueError(f"Could not extract huggingface baseline metrics for {hf_model_name}") + raise ValueError(f"Could not extract huggingface baseline metrics for {hf_model_name}") log_message(LogLevel.INFO, "--- Running baseline (FSDP) for torchtitan backend ---") @@ -603,16 +615,16 @@ def run(self) -> int: baseline_config_filename_tt = f"baseline_{baseline_config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" baseline_config_file_tt = self.generate_config(config_dir=self.results_dir, config=baseline_config, model_name=tt_model_name, backend="torchtitan", filename=baseline_config_filename_tt) baseline_log_tt = self.results_dir / f"baseline_tt_{baseline_config.name}_{self.ngpu}gpu.log" - if not 
self.run_training(config_file=baseline_config_file_tt, log_file=baseline_log_tt, config_name=baseline_config.name, model_name=tt_model_name): - raise ValueError(f"TorchTitan baseline (FSDP) training failed for {tt_model_name}") + tt_baseline_run_error = self.run_training(config_file=baseline_config_file_tt, log_file=baseline_log_tt, config_name=baseline_config.name, model_name=tt_model_name) + if tt_baseline_run_error: + raise ValueError(f"TorchTitan baseline (FSDP) training failed for {tt_model_name}") from tt_baseline_run_error tt_baseline_metrics = self.extract_metrics(baseline_log_tt) if not tt_baseline_metrics.loss or not tt_baseline_metrics.grad_norm: raise ValueError(f"Could not extract TorchTitan baseline metrics for {tt_model_name}") if not self.compare_metrics(tt_baseline_metrics, hf_baseline_metrics, "baseline (TT) vs baseline (HF)"): - log_message(LogLevel.ERROR, f"Baseline (TT) vs baseline (HF) metrics comparison failed for {tt_model_name}") - # raise ValueError(f"Baseline (TT) vs baseline (HF) metrics comparison failed for {tt_model_name}") + raise ValueError(f"Baseline (TT) vs baseline (HF) metrics comparison failed for {tt_model_name}") log_message(LogLevel.INFO, "--- Comparing other parallelism configurations (huggingface) ---") passed_tests = 0 From 4891a4783788ea448329be0a6134de169ae1dca4 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 24 Sep 2025 14:18:10 +0000 Subject: [PATCH 040/129] FSDP for llama in 1D works --- .../compare_distributed_run.py | 11 +++++----- .../infra/parallelize_hf_transformers.py | 22 +++++++++---------- .../model/hf_transformers_args.py | 16 ++++++++++++++ 3 files changed, 33 insertions(+), 16 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.py b/torchtitan/experiments/transformers_backend/compare_distributed_run.py index ec58b2e729..44adbbd57a 100644 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.py +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.py @@ -129,7 +129,7 @@ class CompareDistributedRun: # value chosen based on diff of llama3 1GPU DEFAULT_LOSS_ATOL = 0.02 DEFAULT_LOSS_RTOL = 1e-5 - DEFAULT_GRAD_NORM_ATOL = 0.005 + DEFAULT_GRAD_NORM_ATOL = 0.02 DEFAULT_GRAD_NORM_RTOL = 1e-5 MODEL_LISTS = { @@ -392,10 +392,6 @@ def _filter_log(log_file: Path) -> Path: 'torchrun ... 
--master_port=XXXX', line) line = re.sub(r'PID [0-9]+', 'PID XXXX', line) line = re.sub(r'localhost:[0-9]+', 'localhost:XXXX', line) - line = re.sub(r'memory: [0-9]+\.[0-9]+GiB', 'memory: XX.XXGiB', line) - line = re.sub(r'tps: [0-9,]+', 'tps: XXXXX', line) - line = re.sub(r'tflops: [0-9]+\.[0-9]+', 'tflops: XX.XX', line) - line = re.sub(r'mfu: [0-9]+\.[0-9]+%', 'mfu: XX.XX%', line) outfile.write(line) return filtered_file @@ -443,6 +439,7 @@ def run_training(self, config_file: Path, log_file: Path, config_name: str, mode try: # Capture output to include it in the exception, while still writing to log file + log_message(LogLevel.INFO, f"Running command: {' '.join(cmd)}") result = subprocess.run( cmd, cwd=self.torchtitan_root, @@ -619,6 +616,10 @@ def run(self) -> int: if tt_baseline_run_error: raise ValueError(f"TorchTitan baseline (FSDP) training failed for {tt_model_name}") from tt_baseline_run_error + diff_file_tt_baseline_vs_hf_baseline = self.results_dir / "diff_tt_baseline_vs_hf_baseline.log" + self.generate_diff(baseline_log_tt, baseline_log_hf, diff_file_tt_baseline_vs_hf_baseline) + log_message(LogLevel.INFO, f"Diff between baseline (TT) and baseline (HF) saved to: {diff_file_tt_baseline_vs_hf_baseline}") + tt_baseline_metrics = self.extract_metrics(baseline_log_tt) if not tt_baseline_metrics.loss or not tt_baseline_metrics.grad_norm: raise ValueError(f"Could not extract TorchTitan baseline metrics for {tt_model_name}") diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py index 76d2d8adb4..1d2b792898 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -315,7 +315,7 @@ def apply_non_moe_tp( model, tp_mesh, { - "tok_embeddings": RowwiseParallel( + "embed_tokens": RowwiseParallel( input_layouts=Replicate(), output_layouts=Shard(1), ), @@ -437,18 +437,18 @@ def apply_fsdp( f"Invalid reshard_after_forward_policy: {reshard_after_forward_policy}." 
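# (Annotation, assumption based on torchtitan's FSDP helpers: the accepted values for
# reshard_after_forward_policy are expected to be "default", "always", and "never";
# "always"/"never" force resharding on or off for every block, while "default" lets the
# last blocks skip resharding since FSDP prefetches them right after the forward pass.)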
) - if model.tok_embeddings is not None: + if model.embed_tokens is not None: fully_shard( - model.tok_embeddings, + model.embed_tokens, **fsdp_config, reshard_after_forward=reshard_after_forward, ) - for layer_id, transformer_block in model.layers.items(): + for transformer_block in model.layers: # NOTE: When EP is enabled, In an MoE layer, we use the following FSDP wrapping # - the router and the shared experts are sharded together with the TransformerBlock # - the routed experts are sharded with the remaining dp_mod_ep_mesh - if transformer_block.moe_enabled and ep_degree > 1: + if hasattr(transformer_block, "moe_enabled") and transformer_block.moe_enabled and ep_degree > 1: fsdp_mod_ep_config = fsdp_config.copy() fsdp_mod_ep_config["mesh"] = dp_mod_ep_mesh @@ -489,9 +489,9 @@ def apply_fsdp( # As an optimization, do not reshard_after_forward the last layers by default # since FSDP would prefetch them immediately after the forward pass - if model.norm is not None and model.output is not None: + if model.norm is not None and model.model.lm_head is not None: fully_shard( - [model.norm, model.output], + [model.norm, model.model.lm_head], **fsdp_config, reshard_after_forward=reshard_after_forward_policy == "always", ) @@ -507,8 +507,8 @@ def apply_fsdp( transformer_blocks = list(model.layers.values()) next_transformer_blocks = transformer_blocks[1:] + [None] - if model.tok_embeddings is not None and model.layers is not None: - model.tok_embeddings.set_modules_to_forward_prefetch([transformer_blocks[0]]) + if model.embed_tokens is not None and model.layers is not None: + model.embed_tokens.set_modules_to_forward_prefetch([transformer_blocks[0]]) for transformer_block, next_transformer_block in zip( transformer_blocks, next_transformer_blocks @@ -546,8 +546,8 @@ def apply_fsdp( transformer_block.set_modules_to_backward_prefetch( [prev_transformer_block] ) - elif model.tok_embeddings is not None: - transformer_block.set_modules_to_backward_prefetch([model.tok_embeddings]) + elif model.embed_tokens is not None: + transformer_block.set_modules_to_backward_prefetch([model.embed_tokens]) def apply_moe_ep_tp( diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 704f83a534..3ecdbddad6 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -294,6 +294,22 @@ def layers(self): # Add more cases here if needed for other model architectures raise AttributeError("Could not find layers in the model. Please check the model structure.") + @property + def embed_tokens(self): + """Returns the model's embed_tokens, handling different Hugging Face model structures.""" + if hasattr(self.model, "model") and hasattr(self.model.model, "embed_tokens"): # Llama-like + return self.model.model.embed_tokens + else: + raise AttributeError("Could not find embed_tokens in the model. Please check the model structure.") + + @property + def norm(self): + """Returns the model's norm, handling different Hugging Face model structures.""" + if hasattr(self.model, "model") and hasattr(self.model.model, "norm"): # Llama-like + return self.model.model.norm + else: + raise AttributeError("Could not find norm in the model. 
Please check the model structure.") + def forward(self, *args, **kwargs): output = self.model(*args, **kwargs) if isinstance(output, CausalLMOutputWithPast): From 9e260a0364758e45f85aa2365984d68cff6b4c74 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 24 Sep 2025 14:37:20 +0000 Subject: [PATCH 041/129] better formatting of compare_distributed_run + display min/max grad_norm and loss --- .../compare_distributed_run.py | 307 +++++++++++++----- 1 file changed, 225 insertions(+), 82 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.py b/torchtitan/experiments/transformers_backend/compare_distributed_run.py index 44adbbd57a..1ac6f8d0da 100644 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.py +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.py @@ -52,16 +52,20 @@ from dataclasses import dataclass, field from enum import Enum import torch +from rich.console import Console +from rich.panel import Panel +from rich.progress import ( + BarColumn, + Progress, + SpinnerColumn, + TextColumn, + TimeElapsedColumn, +) +from rich.table import Table + + +console = Console() -# Configure logging with colors -class Colors: - RED = '\033[0;31m' - GREEN = '\033[0;32m' - YELLOW = '\033[1;33m' - BLUE = '\033[0;34m' - MAGENTA = '\033[0;35m' - CYAN = '\033[0;36m' - NC = '\033[0m' # No Color class LogLevel(Enum): INFO = "INFO" @@ -71,17 +75,18 @@ class LogLevel(Enum): TEST_PASS = "TEST_PASS" TEST_FAIL = "TEST_FAIL" + def log_message(level: LogLevel, message: str) -> None: """Log a message with appropriate color coding.""" - color_map = { - LogLevel.INFO: Colors.BLUE, - LogLevel.SUCCESS: Colors.GREEN, - LogLevel.WARNING: Colors.YELLOW, - LogLevel.ERROR: Colors.RED, - LogLevel.TEST_PASS: Colors.GREEN, - LogLevel.TEST_FAIL: Colors.RED, + style_map = { + LogLevel.INFO: "blue", + LogLevel.SUCCESS: "green", + LogLevel.WARNING: "yellow", + LogLevel.ERROR: "bold red", + LogLevel.TEST_PASS: "green", + LogLevel.TEST_FAIL: "bold red", } - + prefix_map = { LogLevel.INFO: "[INFO]", LogLevel.SUCCESS: "[SUCCESS]", @@ -90,10 +95,11 @@ def log_message(level: LogLevel, message: str) -> None: LogLevel.TEST_PASS: "✅ TEST PASS", LogLevel.TEST_FAIL: "❌ TEST FAIL", } - - color = color_map[level] + + style = style_map[level] prefix = prefix_map[level] - print(f"{color}{prefix}{Colors.NC} {message}") + console.print(f"[{style}]{prefix}[/] {message}") + @dataclass class ParallelismConfig: @@ -257,10 +263,37 @@ def _get_factors(n: int) -> List[int]: self.parallelism_configs = unique_configs - log_message(LogLevel.INFO, f"Generated {len(self.parallelism_configs)} parallelism configurations for {ngpu} GPUs.") - if self.verbose: - for config in self.parallelism_configs: - log_message(LogLevel.INFO, f" - {config.name}: dp_replicate={config.dp_replicate}, dp_shard={config.dp_shard}, tp={config.tp}, pp={config.pp}, cp={config.cp}, ep={config.ep}, eptp={config.eptp}") + log_message( + LogLevel.INFO, + f"Generated {len(self.parallelism_configs)} parallelism configurations for {ngpu} GPUs.", + ) + table = Table( + title="[bold]Generated Parallelism Configurations[/bold]", + show_header=True, + header_style="bold magenta", + ) + table.add_column("Name", style="cyan", no_wrap=True) + table.add_column("dp_replicate", justify="right") + table.add_column("dp_shard", justify="right") + table.add_column("tp", justify="right") + table.add_column("pp", justify="right") + table.add_column("cp", justify="right") + table.add_column("ep", justify="right") + 
table.add_column("eptp", justify="right") + + for config in self.parallelism_configs: + table.add_row( + config.name, + str(config.dp_replicate), + str(config.dp_shard), + str(config.tp), + str(config.pp), + str(config.cp), + str(config.ep), + str(config.eptp), + ) + console.print(table) + console.print() def generate_config(self, config_dir: Path, config: ParallelismConfig, model_name: str, backend: str, filename: Optional[str] = None) -> Path: """Generate configuration file for a parallelism setup.""" @@ -313,7 +346,8 @@ def generate_config(self, config_dir: Path, config: ParallelismConfig, model_nam with open(config_file, 'w') as f: toml.dump(config_data, f) - log_message(LogLevel.INFO, f"Created config file: {config_file} for config '{config.name}' (model: {model_name})") + if self.verbose: + log_message(LogLevel.INFO, f"Created config file: {config_file} for config '{config.name}' (model: {model_name})") return config_file def extract_metrics(self, log_file: Path) -> TrainingMetrics: @@ -362,18 +396,26 @@ def compare_metrics(self, baseline_metrics: TrainingMetrics, test_metrics: Train grad_pass = torch.allclose(baseline_grad_norm, test_grad_norm, atol=self.grad_norm_atol, rtol=self.grad_norm_rtol) # Calculate max absolute differences for logging - loss_diff = torch.max(torch.abs(baseline_loss - test_loss)).item() if baseline_loss.numel() > 0 and test_loss.numel() > 0 else 0.0 + loss_max_diff = torch.max(torch.abs(baseline_loss - test_loss)).item() if baseline_loss.numel() > 0 and test_loss.numel() > 0 else 0.0 grad_norm_diff = torch.max(torch.abs(baseline_grad_norm - test_grad_norm)).item() if baseline_grad_norm.numel() > 0 and test_grad_norm.numel() > 0 else 0.0 + # Calculate min absolute differences for logging + loss_min_diff = torch.min(torch.abs(baseline_loss - test_loss)).item() if baseline_loss.numel() > 0 and test_loss.numel() > 0 else 0.0 + grad_norm_min_diff = torch.min(torch.abs(baseline_grad_norm - test_grad_norm)).item() if baseline_grad_norm.numel() > 0 and test_grad_norm.numel() > 0 else 0.0 + if loss_pass and grad_pass: log_message(LogLevel.TEST_PASS, - f"{config_name} - Max loss diff: {loss_diff:.2e}, " - f"Max grad norm diff: {grad_norm_diff:.2e}") + f"{config_name} - Max loss diff: {loss_max_diff:.2e}, " + f"Min loss diff: {loss_min_diff:.2e}, " + f"Max grad norm diff: {grad_norm_diff:.2e}, " + f"Min grad norm diff: {grad_norm_min_diff:.2e}") return True else: log_message(LogLevel.TEST_FAIL, - f"{config_name} - Max loss diff: {loss_diff:.2e}, " - f"Max grad norm diff: {grad_norm_diff:.2e}") + f"{config_name} - Max loss diff: {loss_max_diff:.2e}, " + f"Min loss diff: {loss_min_diff:.2e}, " + f"Max grad norm diff: {grad_norm_diff:.2e}, " + f"Min grad norm diff: {grad_norm_min_diff:.2e}") return False def generate_diff(self, baseline_log: Path, test_log: Path, diff_file: Path) -> None: @@ -439,7 +481,6 @@ def run_training(self, config_file: Path, log_file: Path, config_name: str, mode try: # Capture output to include it in the exception, while still writing to log file - log_message(LogLevel.INFO, f"Running command: {' '.join(cmd)}") result = subprocess.run( cmd, cwd=self.torchtitan_root, @@ -497,27 +538,56 @@ def _compare_one_parallelism_config( return True else: # Generate diff with baseline (HF) - diff_hf_baseline_vs_hf_nd_parallelism = test_dir / "diff_hf_baseline_vs_hf_nd_parallelism.log" - self.generate_diff(baseline_log_hf, log_path_hf, diff_hf_baseline_vs_hf_nd_parallelism) - log_message(LogLevel.INFO, f"Diff between baseline (HF) and current (HF) 
nd-parallelism run saved to: {diff_hf_baseline_vs_hf_nd_parallelism}") + diff_hf_baseline_vs_hf_nd_parallelism = ( + test_dir / "diff_hf_baseline_vs_hf_nd_parallelism.log" + ) + self.generate_diff( + baseline_log_hf, log_path_hf, diff_hf_baseline_vs_hf_nd_parallelism + ) + log_message( + LogLevel.INFO, + f"Diff between baseline (HF) and current (HF) nd-parallelism run saved to: {diff_hf_baseline_vs_hf_nd_parallelism}", + ) # Run TT counterpart and generated diff between nd-paralellism TT and current hf nd-parallelism run - config_filename_tt = f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" + config_filename_tt = ( + test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" + ) config_file_tt = self.generate_config(config_dir=test_dir, config=config, model_name=tt_model_name, backend="torchtitan", filename=config_filename_tt) log_path_tt = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.log" tt_run_error = self.run_training(config_file=config_file_tt, log_file=log_path_tt, config_name=config.name, model_name=tt_model_name) if tt_run_error: - raise ValueError(f"TorchTitan training failed for {tt_model_name}") from tt_run_error + raise ValueError( + f"TorchTitan training failed for {tt_model_name}" + ) from tt_run_error # generated diff between nd-paralellism TT and current hf nd-parallelism run - diff_file_tt_nd_parallelism_vs_hf_nd_parallelism = test_dir / "diff_tt_nd_parallelism_vs_hf_nd_parallelism.log" - self.generate_diff(log_path_tt, log_path_hf, diff_file_tt_nd_parallelism_vs_hf_nd_parallelism) - log_message(LogLevel.INFO, f"Diff between nd-paralellism TT and current (HF) nd-parallelism run saved to: {diff_file_tt_nd_parallelism_vs_hf_nd_parallelism}") + diff_file_tt_nd_parallelism_vs_hf_nd_parallelism = ( + test_dir / "diff_tt_nd_parallelism_vs_hf_nd_parallelism.log" + ) + self.generate_diff( + log_path_tt, + log_path_hf, + diff_file_tt_nd_parallelism_vs_hf_nd_parallelism, + ) + log_message( + LogLevel.INFO, + f"Diff between nd-paralellism TT and current (HF) nd-parallelism run saved to: {diff_file_tt_nd_parallelism_vs_hf_nd_parallelism}", + ) # generated diff between baseline TT and current hf nd-parallelism run - diff_file_tt_baseline_vs_hf_nd_parallelism = test_dir / "diff_tt_baseline_vs_hf_nd_parallelism.log" - self.generate_diff(baseline_log_tt, log_path_hf, diff_file_tt_baseline_vs_hf_nd_parallelism) - log_message(LogLevel.INFO, f"Diff between baseline TT and current (HF) nd-parallelism run saved to: {diff_file_tt_baseline_vs_hf_nd_parallelism}") + diff_file_tt_baseline_vs_hf_nd_parallelism = ( + test_dir / "diff_tt_baseline_vs_hf_nd_parallelism.log" + ) + self.generate_diff( + baseline_log_tt, + log_path_hf, + diff_file_tt_baseline_vs_hf_nd_parallelism, + ) + log_message( + LogLevel.INFO, + f"Diff between baseline TT and current (HF) nd-parallelism run saved to: {diff_file_tt_baseline_vs_hf_nd_parallelism}", + ) return False def run(self) -> int: @@ -557,20 +627,29 @@ def run(self) -> int: self.loss_rtol = args.loss_rtol self.grad_norm_atol = args.grad_norm_atol self.grad_norm_rtol = args.grad_norm_rtol - - log_message(LogLevel.INFO, "=== Distributed Parallelism Comparison ===") - log_message(LogLevel.INFO, f"GPUs: {self.ngpu}") - log_message(LogLevel.INFO, f"Steps: {self.steps}") - log_message(LogLevel.INFO, f"Seed: {self.seed}") - log_message(LogLevel.INFO, f"Model filter: {self.model_filter or 'all'}") - log_message(LogLevel.INFO, f"Model flavor: {self.flavor}") - print() - + + console.print( + Panel( + ( + 
f"[bold]GPUs:[/bold] {self.ngpu}\n" + f"[bold]Steps:[/bold] {self.steps}\n" + f"[bold]Seed:[/bold] {self.seed}\n" + f"[bold]Model filter:[/bold] {self.model_filter or 'all'}\n" + f"[bold]Model flavor:[/bold] {self.flavor}" + ), + title="[bold cyan]Distributed Parallelism Comparison[/bold cyan]", + expand=False, + border_style="blue", + padding=(1, 2), + ) + ) + console.print() + self.base_results_dir.mkdir(exist_ok=True) self.generate_parallelism_configs() - - #TODO(3outeille): make it more generic later + + # TODO(3outeille): make it more generic later if self.model_filter == "llama3": hf_model_name = "meta-llama/Llama-3.2-1B" tt_model_name = "llama3" @@ -588,9 +667,14 @@ def run(self) -> int: if self.verbose: log_message(LogLevel.INFO, f"Results directory: {self.results_dir}") - log_message(LogLevel.INFO, "--- Running baseline (FSDP) for huggingface backend ---") - - log_message(LogLevel.INFO, f"Testing model {hf_model_name} (HF) for {self.nd_parallel} parallelism") + console.print( + Panel( + "[bold cyan]Comparing baseline (FSDP) for huggingface & torchtitan[/bold cyan]", + expand=False, + border_style="blue", + padding=(0, 2), + ) + ) baseline_config = next((c for c in self.parallelism_configs if c.name == "fsdp"), None) @@ -604,10 +688,6 @@ def run(self) -> int: hf_baseline_metrics = self.extract_metrics(baseline_log_hf) if not hf_baseline_metrics.loss or not hf_baseline_metrics.grad_norm: raise ValueError(f"Could not extract huggingface baseline metrics for {hf_model_name}") - - log_message(LogLevel.INFO, "--- Running baseline (FSDP) for torchtitan backend ---") - - log_message(LogLevel.INFO, f"Testing model {hf_model_name} (TT) for {self.nd_parallel} parallelism") baseline_config_filename_tt = f"baseline_{baseline_config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" baseline_config_file_tt = self.generate_config(config_dir=self.results_dir, config=baseline_config, model_name=tt_model_name, backend="torchtitan", filename=baseline_config_filename_tt) @@ -624,40 +704,103 @@ def run(self) -> int: if not tt_baseline_metrics.loss or not tt_baseline_metrics.grad_norm: raise ValueError(f"Could not extract TorchTitan baseline metrics for {tt_model_name}") - if not self.compare_metrics(tt_baseline_metrics, hf_baseline_metrics, "baseline (TT) vs baseline (HF)"): - raise ValueError(f"Baseline (TT) vs baseline (HF) metrics comparison failed for {tt_model_name}") + if not self.compare_metrics( + tt_baseline_metrics, hf_baseline_metrics, "baseline (TT) vs baseline (HF)" + ): + raise ValueError( + f"Baseline (TT) vs baseline (HF) metrics comparison failed for {tt_model_name}" + ) - log_message(LogLevel.INFO, "--- Comparing other parallelism configurations (huggingface) ---") - passed_tests = 0 + console.print() + console.print( + Panel( + "[bold cyan]Comparing ND Parallelism Configurations[/bold cyan]", + expand=False, + border_style="blue", + padding=(0, 2), + ) + ) + passed_tests = 1 # +1 for the baseline (FSDP) failed_tests = 0 test_configs = [c for c in self.parallelism_configs if c.name != "fsdp"] - total_tests = len(test_configs) - - for config in test_configs: - passed = self._compare_one_parallelism_config( - config, - hf_model_name, - tt_model_name, - hf_baseline_metrics, - baseline_log_hf, - baseline_log_tt, + total_tests = len(test_configs) + 1 # +1 for the baseline (FSDP) + results = [] + + console.print() + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + BarColumn(), + TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), + 
TimeElapsedColumn(), + console=console, + ) as progress: + task = progress.add_task( + "[cyan]Comparing configurations...", total=total_tests + ) + for config in test_configs: + progress.update( + task, description=f"[cyan]Testing [bold]{config.name}[/bold]" + ) + passed = self._compare_one_parallelism_config( + config, + hf_model_name, + tt_model_name, + hf_baseline_metrics, + baseline_log_hf, + baseline_log_tt, + ) + results.append((config.name, passed)) + if passed: + passed_tests += 1 + else: + failed_tests += 1 + progress.advance(task) + console.print() + + console.print( + Panel( + "[bold cyan]Final Summary[/bold cyan]", + expand=False, + border_style="blue", + padding=(0, 2), ) - if passed: - passed_tests += 1 - else: - failed_tests += 1 + ) + + summary_table = Table(show_header=True, header_style="bold magenta") + summary_table.add_column("Configuration", style="cyan") + summary_table.add_column("Status", justify="center") + + for name, passed in results: + status = ( + "[bold green]✅ PASS[/bold green]" + if passed + else "[bold red]❌ FAIL[/bold red]" + ) + summary_table.add_row(name, status) + + console.print(summary_table) + console.print() + + overall_summary = Table(title="Overall Test Summary") + overall_summary.add_column("Metric", style="cyan") + overall_summary.add_column("Value", justify="right") + overall_summary.add_row("Total Configurations Tested", str(total_tests)) + overall_summary.add_row("[green]Passed[/green]", str(passed_tests)) + overall_summary.add_row("[red]Failed[/red]", str(failed_tests)) + console.print(overall_summary) - print() - - log_message(LogLevel.INFO, "=== FINAL SUMMARY ===") if passed_tests == total_tests: log_message(LogLevel.SUCCESS, "All model tests passed! 🎉") return 0 else: log_message(LogLevel.TEST_FAIL, f"{failed_tests} model(s) had test failures") - log_message(LogLevel.INFO, f"Check the diff files in {self.results_dir} for details") + log_message( + LogLevel.INFO, f"Check the diff files in {self.results_dir} for details" + ) return 1 + def main(): """Entry point for the script.""" runner = CompareDistributedRun() From a604beea57ba25c446f4887eb2ded3f00c170d8b Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 25 Sep 2025 13:25:56 +0000 Subject: [PATCH 042/129] make FSDP work in a cleaner way (mapping instead of renaming) --- .../infra/parallelize_hf_transformers.py | 4 ++-- .../transformers_backend/model/hf_transformers_args.py | 9 +++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py index 1d2b792898..a97479b216 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -489,9 +489,9 @@ def apply_fsdp( # As an optimization, do not reshard_after_forward the last layers by default # since FSDP would prefetch them immediately after the forward pass - if model.norm is not None and model.model.lm_head is not None: + if model.norm is not None and model.output is not None: fully_shard( - [model.norm, model.model.lm_head], + [model.norm, model.output], **fsdp_config, reshard_after_forward=reshard_after_forward_policy == "always", ) diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 3ecdbddad6..3eb74c6b4b 100644 --- 
a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -310,6 +310,15 @@ def norm(self): else: raise AttributeError("Could not find norm in the model. Please check the model structure.") + @property + def output(self): + """Returns the model's output layer, handling different Hugging Face model structures.""" + if hasattr(self.model, "lm_head"): # For models like LlamaForCausalLM + return self.model.lm_head + else: + # Add more cases here if needed for other model architectures + raise AttributeError("Could not find output (lm_head) in the model. Please check the model structure.") + def forward(self, *args, **kwargs): output = self.model(*args, **kwargs) if isinstance(output, CausalLMOutputWithPast): From 0b38d0d0f8e605c3edd2f312264363172044c546 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Fri, 26 Sep 2025 14:32:22 +0000 Subject: [PATCH 043/129] Improve logging in compare_distributed_run --- .../compare_distributed_run.py | 162 ++++++++++++------ 1 file changed, 112 insertions(+), 50 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.py b/torchtitan/experiments/transformers_backend/compare_distributed_run.py index 1ac6f8d0da..1a432b68bd 100644 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.py +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.py @@ -68,6 +68,7 @@ class LogLevel(Enum): + COMMAND = "COMMAND" INFO = "INFO" SUCCESS = "SUCCESS" WARNING = "WARNING" @@ -76,9 +77,10 @@ class LogLevel(Enum): TEST_FAIL = "TEST_FAIL" -def log_message(level: LogLevel, message: str) -> None: +def log_message(level: LogLevel, message: str, indent: int = 0, dim: bool = False) -> None: """Log a message with appropriate color coding.""" style_map = { + LogLevel.COMMAND: "dim", LogLevel.INFO: "blue", LogLevel.SUCCESS: "green", LogLevel.WARNING: "yellow", @@ -88,6 +90,7 @@ def log_message(level: LogLevel, message: str) -> None: } prefix_map = { + LogLevel.COMMAND: "[COMMAND]", LogLevel.INFO: "[INFO]", LogLevel.SUCCESS: "[SUCCESS]", LogLevel.WARNING: "[WARNING]", @@ -98,7 +101,21 @@ def log_message(level: LogLevel, message: str) -> None: style = style_map[level] prefix = prefix_map[level] - console.print(f"[{style}]{prefix}[/] {message}") + if indent > 0: + indent_str = " " * (indent - 1) + "└─ " + else: + indent_str = "" + + output = "" + if level == LogLevel.COMMAND: + output = f"{indent_str}[{style}]{prefix} {message}[/]" + else: + output = f"{indent_str}[{style}]{prefix}[/] {message}" + + if dim: + console.print(f"[dim]{output}[/dim]") + else: + console.print(output) @dataclass @@ -196,7 +213,7 @@ def _get_factors(n: int) -> List[int]: return sorted(list(factors)) # Baseline FSDP - configs.append(ParallelismConfig(name="fsdp", dp_replicate=1, dp_shard=ngpu, tp=1, pp=1, pp_schedule="Interleaved1F1B", cp=1, ep=1, eptp=1)) + configs.append(ParallelismConfig(name="fsdp", dp_replicate=1, dp_shard=ngpu, tp=1, pp=1, pp_schedule="1F1B", cp=1, ep=1, eptp=1)) #NOTE(3outeille): No need to handle DDP (dp_replicate) as DDP is not supported > 1D parallelism" #(cf https://github.com/pytorch/torchtitan/blob/main/torchtitan/models/llama3/infra/parallelize.py#L139) @@ -228,7 +245,7 @@ def _get_factors(n: int) -> List[int]: dp_shard=dp_shard, tp=tp, pp=pp, - pp_schedule="Interleaved1F1B", + pp_schedule="1F1B", cp=cp, ep=1, eptp=1 @@ -243,7 +260,7 @@ def _get_factors(n: int) -> List[int]: dp_shard=dp_shard, tp=tp, pp=pp, - 
pp_schedule="Interleaved1F1B", + pp_schedule="1F1B", cp=cp, ep=dp_shard, eptp=1 @@ -295,7 +312,7 @@ def _get_factors(n: int) -> List[int]: console.print(table) console.print() - def generate_config(self, config_dir: Path, config: ParallelismConfig, model_name: str, backend: str, filename: Optional[str] = None) -> Path: + def generate_config(self, config_dir: Path, config: ParallelismConfig, model_name: str, backend: str, filename: Optional[str] = None, indent: int = 0, dim: bool = False) -> Path: """Generate configuration file for a parallelism setup.""" import toml @@ -322,7 +339,7 @@ def generate_config(self, config_dir: Path, config: ParallelismConfig, model_nam if self.flavor not in self.MODEL_FLAVORS[model_name]: log_message(LogLevel.WARNING, f"Flavor '{self.flavor}' not available for {model_name}. " - f"Available: {self.MODEL_FLAVORS[model_name]}") + f"Available: {self.MODEL_FLAVORS[model_name]}", indent=indent, dim=dim) # Update [training] section if "training" not in config_data: @@ -347,10 +364,10 @@ def generate_config(self, config_dir: Path, config: ParallelismConfig, model_nam toml.dump(config_data, f) if self.verbose: - log_message(LogLevel.INFO, f"Created config file: {config_file} for config '{config.name}' (model: {model_name})") + log_message(LogLevel.INFO, f"Created config file: {config_file} for config '{config.name}' (model: {model_name})", indent=indent, dim=dim) return config_file - def extract_metrics(self, log_file: Path) -> TrainingMetrics: + def extract_metrics(self, log_file: Path, indent: int = 0, dim: bool = False) -> TrainingMetrics: """Extract metrics from log file.""" metrics = TrainingMetrics() @@ -371,18 +388,18 @@ def extract_metrics(self, log_file: Path) -> TrainingMetrics: metrics.grad_norm.append(float(match.group(3))) except Exception as e: - log_message(LogLevel.WARNING, f"Could not extract metrics from {log_file}: {e}") + log_message(LogLevel.WARNING, f"Could not extract metrics from {log_file}: {e}", indent=indent, dim=dim) if not metrics.loss or not metrics.grad_norm: - log_message(LogLevel.WARNING, f"Could not extract metrics from {log_file}") + log_message(LogLevel.WARNING, f"Could not extract metrics from {log_file}", indent=indent, dim=dim) return metrics def compare_metrics(self, baseline_metrics: TrainingMetrics, test_metrics: TrainingMetrics, - config_name: str) -> bool: + config_name: str, indent: int = 0, dim: bool = False) -> bool: """Compare metrics between baseline and test configuration.""" if not baseline_metrics.loss or not test_metrics.loss: - log_message(LogLevel.TEST_FAIL, f"{config_name} - Unable to extract metrics") + log_message(LogLevel.TEST_FAIL, f"{config_name} - Unable to extract metrics", indent=indent, dim=dim) return False # Convert to tensors @@ -408,17 +425,17 @@ def compare_metrics(self, baseline_metrics: TrainingMetrics, test_metrics: Train f"{config_name} - Max loss diff: {loss_max_diff:.2e}, " f"Min loss diff: {loss_min_diff:.2e}, " f"Max grad norm diff: {grad_norm_diff:.2e}, " - f"Min grad norm diff: {grad_norm_min_diff:.2e}") + f"Min grad norm diff: {grad_norm_min_diff:.2e}", indent=indent, dim=dim) return True else: log_message(LogLevel.TEST_FAIL, f"{config_name} - Max loss diff: {loss_max_diff:.2e}, " f"Min loss diff: {loss_min_diff:.2e}, " f"Max grad norm diff: {grad_norm_diff:.2e}, " - f"Min grad norm diff: {grad_norm_min_diff:.2e}") + f"Min grad norm diff: {grad_norm_min_diff:.2e}", indent=indent, dim=dim) return False - def generate_diff(self, baseline_log: Path, test_log: Path, diff_file: Path) -> None: + 
def generate_diff(self, baseline_log: Path, test_log: Path, diff_file: Path, indent: int = 0, dim: bool = False) -> None: """Generate diff between baseline and test logs.""" def _filter_log(log_file: Path) -> Path: @@ -454,17 +471,17 @@ def _filter_log(log_file: Path) -> Path: test_filtered.unlink() except Exception as e: - log_message(LogLevel.WARNING, f"Could not generate diff: {e}") + log_message(LogLevel.WARNING, f"Could not generate diff: {e}", indent=indent, dim=dim) - def run_training(self, config_file: Path, log_file: Path, config_name: str, model_name: str) -> Optional[subprocess.CalledProcessError]: + def run_training(self, config_file: Path, log_file: Path, config_name: str, model_name: str, indent: int = 0, dim: bool = False) -> Optional[subprocess.CalledProcessError]: """Run training with given configuration.""" - log_message(LogLevel.INFO, f"Running training: {config_name} with model {model_name}") + log_message(LogLevel.INFO, f"Running training: {config_name} with model {model_name}", indent=indent, dim=dim) cmd = [ "torchrun", f"--nproc_per_node={self.ngpu}", "--rdzv_backend", "c10d", "--rdzv_endpoint=localhost:0", - "--local-ranks-filter", "0", + "--local-ranks-filter", str(self.ngpu - 1), "--role", "rank", "--tee", "3", "-m", "torchtitan.train", @@ -475,10 +492,10 @@ def run_training(self, config_file: Path, log_file: Path, config_name: str, mode env = os.environ.copy() env["SEED"] = str(self.seed) env["MODEL_TYPE"] = model_name - - if self.verbose: - log_message(LogLevel.INFO, f"Command: {' '.join(cmd)}") - + env["LOG_RANK"] = str(self.ngpu - 1) + + log_message(LogLevel.COMMAND, f"Command: {' '.join(cmd)}", indent=indent, dim=dim) + try: # Capture output to include it in the exception, while still writing to log file result = subprocess.run( @@ -494,17 +511,25 @@ def run_training(self, config_file: Path, log_file: Path, config_name: str, mode f.write(result.stdout) if self.verbose: - log_message(LogLevel.SUCCESS, f"Training completed: {config_name}") + log_message(LogLevel.SUCCESS, f"Training completed: {config_name}", indent=indent, dim=dim) return None except subprocess.CalledProcessError as e: - log_message(LogLevel.ERROR, f"Training failed: {config_name}") + log_message(LogLevel.ERROR, f"Training failed: {config_name}", indent=indent, dim=dim) # Write the failed output to the log file with open(log_file, 'w') as f: if e.stdout: f.write(e.stdout) + # Print the tail of the error log to the console for quick debugging + if e.stdout: + console.print("[bold red]--- Error Log Tail ---[/bold red]") + error_lines = e.stdout.strip().split('\n') + for line in error_lines[-15:]: + console.print(f"[red]{line}[/red]") + console.print("[bold red]--- End Error Log Tail ---[/bold red]") + e.add_note(f"\n--- Full output from failed process ---\n{e.stdout or ''}") return e @@ -514,8 +539,10 @@ def _compare_one_parallelism_config( hf_model_name: str, tt_model_name: str, hf_baseline_metrics: "TrainingMetrics", + tt_baseline_metrics: "TrainingMetrics", baseline_log_hf: Path, baseline_log_tt: Path, + indent: int = 0, ) -> bool: """Compares a single parallelism configuration against the baseline.""" # Create a subdirectory for each test configuration @@ -524,17 +551,23 @@ def _compare_one_parallelism_config( test_dir.mkdir(exist_ok=True) config_filename_hf = f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.toml" - config_file_hf = self.generate_config(config_dir=test_dir, config=config, model_name=hf_model_name, backend="huggingface", filename=config_filename_hf) + config_file_hf = 
self.generate_config(config_dir=test_dir, config=config, model_name=hf_model_name, backend="huggingface", filename=config_filename_hf, indent=indent) log_path_hf = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.log" - hf_run_error = self.run_training(config_file=config_file_hf, log_file=log_path_hf, config_name=config.name, model_name=hf_model_name) - successful_hf_run = hf_run_error is None - - # Compare metrics between baseline (HF) and current (HF) nd-parallelism run - hf_metrics = self.extract_metrics(log_path_hf) - successful_hf_extract = self.compare_metrics(hf_baseline_metrics, hf_metrics, f"{config.name} (huggingface)") + hf_run_error = self.run_training(config_file=config_file_hf, log_file=log_path_hf, config_name=config.name, model_name=hf_model_name, indent=indent) + + test_passed = True + hf_metrics = None + if hf_run_error: + log_message(LogLevel.TEST_FAIL, f"{config.name} (huggingface) - Training script failed.", indent=indent + 5, dim=True) + test_passed = False + else: + # Compare metrics only if training was successful + hf_metrics = self.extract_metrics(log_path_hf, indent=indent) + if not self.compare_metrics(hf_baseline_metrics, hf_metrics, f"{config.name} (huggingface)", indent=indent + 5, dim=True): + test_passed = False - if successful_hf_run and successful_hf_extract: + if test_passed: return True else: # Generate diff with baseline (HF) @@ -542,25 +575,29 @@ def _compare_one_parallelism_config( test_dir / "diff_hf_baseline_vs_hf_nd_parallelism.log" ) self.generate_diff( - baseline_log_hf, log_path_hf, diff_hf_baseline_vs_hf_nd_parallelism + baseline_log_hf, log_path_hf, diff_hf_baseline_vs_hf_nd_parallelism, indent=indent + 5, dim=True ) log_message( LogLevel.INFO, f"Diff between baseline (HF) and current (HF) nd-parallelism run saved to: {diff_hf_baseline_vs_hf_nd_parallelism}", + indent=indent + 5, + dim=True, ) # Run TT counterpart and generated diff between nd-paralellism TT and current hf nd-parallelism run config_filename_tt = ( test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" ) - config_file_tt = self.generate_config(config_dir=test_dir, config=config, model_name=tt_model_name, backend="torchtitan", filename=config_filename_tt) + config_file_tt = self.generate_config(config_dir=test_dir, config=config, model_name=tt_model_name, backend="torchtitan", filename=config_filename_tt, indent=indent + 5, dim=True) log_path_tt = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.log" - tt_run_error = self.run_training(config_file=config_file_tt, log_file=log_path_tt, config_name=config.name, model_name=tt_model_name) + tt_run_error = self.run_training(config_file=config_file_tt, log_file=log_path_tt, config_name=config.name, model_name=tt_model_name, indent=indent + 5, dim=True) if tt_run_error: raise ValueError( f"TorchTitan training failed for {tt_model_name}" ) from tt_run_error + tt_metrics = self.extract_metrics(log_path_tt, indent=indent + 5, dim=True) + # generated diff between nd-paralellism TT and current hf nd-parallelism run diff_file_tt_nd_parallelism_vs_hf_nd_parallelism = ( test_dir / "diff_tt_nd_parallelism_vs_hf_nd_parallelism.log" @@ -569,10 +606,22 @@ def _compare_one_parallelism_config( log_path_tt, log_path_hf, diff_file_tt_nd_parallelism_vs_hf_nd_parallelism, + indent=indent + 5, + dim=True, ) + if hf_metrics: + self.compare_metrics( + tt_metrics, + hf_metrics, + f"{config.name} (TT nd-parallel vs HF nd-parallel)", + indent=indent + 5, + dim=True, + ) log_message( LogLevel.INFO, f"Diff 
between nd-parallelism TT and current (HF) nd-parallelism run saved to: {diff_file_tt_nd_parallelism_vs_hf_nd_parallelism}", + indent=indent + 5, + dim=True, ) # generated diff between baseline TT and current hf nd-parallelism run @@ -583,10 +632,22 @@ def _compare_one_parallelism_config( baseline_log_tt, log_path_hf, diff_file_tt_baseline_vs_hf_nd_parallelism, + indent=indent + 5, + dim=True, ) + if hf_metrics: + self.compare_metrics( + tt_baseline_metrics, + hf_metrics, + f"{config.name} (TT baseline vs HF nd-parallel)", + indent=indent + 5, + dim=True, + ) log_message( LogLevel.INFO,
f"Diff between baseline (TT) and baseline (HF) saved to: {diff_file_tt_baseline_vs_hf_baseline}") - - tt_baseline_metrics = self.extract_metrics(baseline_log_tt) + tt_baseline_metrics = self.extract_metrics(baseline_log_tt, indent=0) if not tt_baseline_metrics.loss or not tt_baseline_metrics.grad_norm: raise ValueError(f"Could not extract TorchTitan baseline metrics for {tt_model_name}") if not self.compare_metrics( - tt_baseline_metrics, hf_baseline_metrics, "baseline (TT) vs baseline (HF)" + tt_baseline_metrics, hf_baseline_metrics, "baseline (TT) vs baseline (HF)", indent=0 ): raise ValueError( f"Baseline (TT) vs baseline (HF) metrics comparison failed for {tt_model_name}" @@ -738,7 +795,10 @@ def run(self) -> int: task = progress.add_task( "[cyan]Comparing configurations...", total=total_tests ) - for config in test_configs: + for i, config in enumerate(test_configs): + if i > 0: + console.rule(style="dim") + progress.update( task, description=f"[cyan]Testing [bold]{config.name}[/bold]" ) @@ -747,8 +807,10 @@ def run(self) -> int: hf_model_name, tt_model_name, hf_baseline_metrics, + tt_baseline_metrics, baseline_log_hf, baseline_log_tt, + indent=1, ) results.append((config.name, passed)) if passed: @@ -794,7 +856,7 @@ def run(self) -> int: log_message(LogLevel.SUCCESS, "All model tests passed! 🎉") return 0 else: - log_message(LogLevel.TEST_FAIL, f"{failed_tests} model(s) had test failures") + log_message(LogLevel.TEST_FAIL, f"{failed_tests} configuration(s) had test failures") log_message( LogLevel.INFO, f"Check the diff files in {self.results_dir} for details" ) From 025a86f9d411d3f2f9e8e0b27a551a1ae16c7bae Mon Sep 17 00:00:00 2001 From: 3outeille Date: Fri, 26 Sep 2025 14:33:08 +0000 Subject: [PATCH 044/129] PP for llama in 1D works --- .../transformers_backend/__init__.py | 4 +- .../infra/parallelize_hf_transformers.py | 14 +- .../transformers_backend/infra/pipeline_hf.py | 495 ++++++++++++++++++ .../model/hf_transformers_args.py | 28 +- 4 files changed, 530 insertions(+), 11 deletions(-) create mode 100644 torchtitan/experiments/transformers_backend/infra/pipeline_hf.py diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index fa8cc4c119..6e6894b109 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -13,7 +13,7 @@ from torchtitan.datasets.hf_datasets import build_hf_dataloader from torchtitan.components.tokenizer import build_hf_tokenizer -from torchtitan.models.llama3 import pipeline_llama +from .infra.pipeline_hf import pipeline_hf_transformers from torchtitan.protocols.train_spec import register_train_spec, TrainSpec from .infra.parallelize_hf_transformers import parallelize_hf_transformers @@ -143,7 +143,7 @@ class DeepSeekV3Args: model_cls=HFTransformerModel, model_args=flavors, parallelize_fn=parallelize_hf_transformers, - pipelining_fn=pipeline_llama, + pipelining_fn=pipeline_hf_transformers, build_optimizers_fn=build_optimizers, build_lr_schedulers_fn=build_lr_schedulers, build_dataloader_fn=build_hf_dataloader, diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py index a97479b216..4ac6d6cd83 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -315,7 +315,7 @@ def 
apply_non_moe_tp( model, tp_mesh, { - "embed_tokens": RowwiseParallel( + "tok_embeddings": RowwiseParallel( input_layouts=Replicate(), output_layouts=Shard(1), ), @@ -437,9 +437,9 @@ def apply_fsdp( f"Invalid reshard_after_forward_policy: {reshard_after_forward_policy}." ) - if model.embed_tokens is not None: + if model.tok_embeddings is not None: fully_shard( - model.embed_tokens, + model.tok_embeddings, **fsdp_config, reshard_after_forward=reshard_after_forward, ) @@ -507,8 +507,8 @@ def apply_fsdp( transformer_blocks = list(model.layers.values()) next_transformer_blocks = transformer_blocks[1:] + [None] - if model.embed_tokens is not None and model.layers is not None: - model.embed_tokens.set_modules_to_forward_prefetch([transformer_blocks[0]]) + if model.tok_embeddings is not None and model.layers is not None: + model.tok_embeddings.set_modules_to_forward_prefetch([transformer_blocks[0]]) for transformer_block, next_transformer_block in zip( transformer_blocks, next_transformer_blocks @@ -546,8 +546,8 @@ def apply_fsdp( transformer_block.set_modules_to_backward_prefetch( [prev_transformer_block] ) - elif model.embed_tokens is not None: - transformer_block.set_modules_to_backward_prefetch([model.embed_tokens]) + elif model.tok_embeddings is not None: + transformer_block.set_modules_to_backward_prefetch([model.tok_embeddings]) def apply_moe_ep_tp( diff --git a/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py b/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py new file mode 100644 index 0000000000..178610343a --- /dev/null +++ b/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py @@ -0,0 +1,495 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +import copy +import os +from typing import Callable + +import torch +import torch.nn as nn +from torch.distributed.device_mesh import DeviceMesh +from torch.distributed.pipelining import PipelineStage + +from torch.distributed.pipelining.schedules import ( + _PipelineSchedule, + _PipelineScheduleRuntime, + get_schedule_class, + PipelineScheduleMulti, + PipelineScheduleSingle, + ScheduleDualPipeV, + ScheduleZBVZeroBubble, +) + +from torchtitan.config import JobConfig +from torchtitan.tools.logging import logger + +from torchtitan.distributed import ParallelDims +from torchtitan.protocols.train_spec import BaseModelArgs, ParallelizeFunction +from torchtitan.components.loss import LossFunction + +import math + + +def build_pipeline_schedule( + job_config: JobConfig, stages: list[PipelineStage], loss_fn: Callable +) -> _PipelineSchedule: + """Builds a pipeline schedule for the given job configuration and stages. + + Args: + job_config (JobConfig): The job configuration. + stages (list[PipelineStage]): The stages to be scheduled. + loss_fn (Callable): The loss function. + + Returns: + _PipelineSchedule: The pipeline schedule for the given stages. + """ + pp_schedule_csv = job_config.parallelism.pipeline_parallel_schedule_csv + + # Validate that pp_schedule_csv is a valid path + if pp_schedule_csv: + if not os.path.isfile(pp_schedule_csv): + raise FileNotFoundError( + f"The specified path {pp_schedule_csv} does not exist or is not a file." 
+ ) + schedule_class = _PipelineScheduleRuntime + else: + schedule_class = get_schedule_class( + job_config.parallelism.pipeline_parallel_schedule + ) + + looped_schedule = issubclass(schedule_class, PipelineScheduleMulti) + microbatch_size = job_config.parallelism.pipeline_parallel_microbatch_size + batch_size = job_config.training.local_batch_size + # validate that the batch size is divisible by the microbatch_size otherwise we'll hang or error during training + if batch_size % microbatch_size != 0: + raise ValueError( + f"Batch size {job_config.training.local_batch_size} must be divisible by microbatch_size {microbatch_size}. " + "Update the config arguments for either batch_size or pipeline_parallel_microbatch_size." + ) + n_microbatches = batch_size // microbatch_size + # We expect that the number of local stages (`len(stages)`) is the same across all ranks + num_total_stages = job_config.parallelism.pipeline_parallel_degree * len(stages) + if n_microbatches < num_total_stages: + logger.warning( + f"Number of microbatches ({n_microbatches}) is less than the total number " + f"of stages ({num_total_stages}) which may result in a bubble in the pipeline." + ) + + schedule = schedule_class( + stages if looped_schedule else stages[0], + n_microbatches=n_microbatches, + loss_fn=loss_fn, + ) + logger.info( + f"Using pipeline schedule {job_config.parallelism.pipeline_parallel_schedule} " + f"with {n_microbatches} microbatches and {num_total_stages} stages." + ) + + if pp_schedule_csv: + assert schedule_class in [ + PipelineScheduleSingle, + PipelineScheduleMulti, + _PipelineScheduleRuntime, + ], ( + "Only PipelineScheduleSingle (single stage), PipelineScheduleMulti (multistage), " + "and _PipelineScheduleRuntime support csv schedules" + ) + schedule._load_csv(pp_schedule_csv) + + return schedule + + +# TODO(whc) should this be a utility inside torch.pipelining? +def stage_ids_this_rank( + pp_rank: int, pp_size: int, num_stages: int, style: str = "loop" +) -> tuple[int]: + """Compute the stage ids for the stages that will run on this pp rank for either a looped or V style schedule""" + assert ( + num_stages % pp_size == 0 + ), f"num_stages {num_stages} must be evenly divisible by pp_size {pp_size}" + stages_per_rank = num_stages // pp_size + if style == "loop": + return tuple(pp_rank + s * pp_size for s in range(stages_per_rank)) + elif style == "v": + assert ( + stages_per_rank == 2 + ), f"v schedules assume 2 stages per rank, got {stages_per_rank}" + stage_v_pairs = list( + zip(range(pp_size), range(num_stages - 1, pp_size - 1, -1)) + ) + return stage_v_pairs[pp_rank] + + +def generate_llm_fqn_per_model_part( + num_stages: int, + num_layers: int, + input_weight: int = 1, + output_weight: int = 1, +) -> list[list[str]]: + """ + Programmatically generates module names model part, focused on LLMs models. 
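    In other words: given a pipeline stage count and the number of transformer layers,
    it returns, for every stage, the fully-qualified module names (token embeddings,
    the assigned "model.model.layers.i" blocks, and on the last stage the final norm
    and LM head) that the stage should own.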
+ + Args: + num_stages: Number of pipeline stages + num_layers: Total number of transformer layers in the model + input_weight: Weight for input modules (embed_tokens) in layer calculation + output_weight: Weight for output modules (norm + output) in layer calculation + + Returns: + List of lists containing module names for each model part + + Example: + generate_llm_fqn_per_model_part(2, 3, input_weight=2, output_weight=2) + treats embeddings as 2 layers and norm+output as 2 layers for distribution + """ + if num_stages < 1: + raise ValueError("Number of stages must be at least 1") + + if num_stages == 1: + # Single stage gets everything + layer_names = [f"model.model.layers.{i}" for i in range(num_layers)] + return [ + ["model.model.embed_tokens"] + + layer_names + + ["model.model.norm", "model.lm_head", "model.model.rotary_emb"] + ] + + # Calculate effective layers including weights + num_effective_layers = num_layers + input_weight + output_weight + + if num_stages > num_effective_layers: + raise ValueError( + f"Number of stages ({num_stages}) cannot be greater than effective layers ({num_effective_layers})" + ) + + # Calculate layers per stage (distribute evenly) + layers_per_stage = num_effective_layers // num_stages + extra_layers = num_effective_layers % num_stages + + # Feasibility check: Ensure at least 1 layer in each PP stage + if layers_per_stage == 0: + raise ValueError( + f"Configuration would result in empty stages. " + f"With {num_stages} stages and {num_effective_layers} effective layers " + f"(num_layers={num_layers} + input_weight={input_weight} + output_weight={output_weight}), " + f"each stage would get {layers_per_stage} layers on average. " + f"Reduce num_stages or increase num_layers/weights." + ) + + # Balance check: Ensure weights don't exceed minimum layers per stage + if input_weight > layers_per_stage: + raise ValueError( + f"input_weight ({input_weight}) exceeds minimum layers per stage ({layers_per_stage})." + ) + if output_weight > layers_per_stage: + raise ValueError( + f"output_weight ({output_weight}) exceeds minimum layers per stage ({layers_per_stage})." 
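# Worked example with hypothetical numbers: num_layers=8, input_weight=1, output_weight=1
# and num_stages=5 give num_effective_layers=10, layers_per_stage=10//5=2 and
# extra_layers=0, so the first and last stages each spend one of their two slots on the
# embeddings and on norm+lm_head respectively, leaving 1+2+2+2+1=8 transformer layers.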
+ ) + + module_names_per_stage = [] + current_layer = 0 + + for stage_idx in range(num_stages): + stage_modules = [] + + # Calculate effective layers for this stage + effective_layers_for_stage = layers_per_stage + if stage_idx < extra_layers: + effective_layers_for_stage += 1 + + # First stage: handle input modules with weighting + if stage_idx == 0: + stage_modules.append("model.model.embed_tokens") + # Account for input weight in layer distribution + remaining_layers_for_stage = effective_layers_for_stage - input_weight + + # Add transformer layers + for _ in range(remaining_layers_for_stage): + if current_layer < num_layers: + stage_modules.append(f"model.model.layers.{current_layer}") + current_layer += 1 + + # Last stage: handle output modules with weighting + elif stage_idx == num_stages - 1: + # Account for output weight in layer distribution + remaining_layers_for_stage = effective_layers_for_stage - output_weight + + # Add transformer layers + for _ in range(remaining_layers_for_stage): + if current_layer < num_layers: + stage_modules.append(f"model.model.layers.{current_layer}") + current_layer += 1 + + # Add output modules + stage_modules.extend(["model.model.norm", "model.lm_head"]) + + # Middle stages: only transformer layers + else: + for _ in range(effective_layers_for_stage): + if current_layer < num_layers: + stage_modules.append(f"model.model.layers.{current_layer}") + current_layer += 1 + + stage_modules.append("model.model.rotary_emb") + module_names_per_stage.append(stage_modules) + + return module_names_per_stage + + +def pipeline_module_split( + whole_model: nn.Module, + pp_mesh: DeviceMesh, + pp_schedule: str, + device: torch.device, + module_names_per_stage: list[list[str]], +) -> tuple[list[PipelineStage], list[nn.Module]]: + """ + This API creates pipeline stages based on specified module names for each stage. + + Some model restrictions include: + - forward() method should tolerate deleted layers + - weight initialization methods should tolerate deleted layers + - Does not support nested moduledict and modulelist structures + + Args: + whole_model: The complete model to be split + pp_mesh: Pipeline parallel device mesh + pp_schedule: Name of pipeline parallelism schedule + device: Device + module_names_per_stage: List of lists, where each inner list contains the module names + that should be included in that stage. Module names should be + dot-separated paths. 
Examples: + - "embed_tokens" for token embeddings + - "layers.0", "layers.1" for specific transformer layers + - "norm" for the final normalization layer + - "output" for the output projection layer + + Returns: + Tuple of (stages, models) where stages are PipelineStage objects and models are the + corresponding model chunks + + Example usage: + module_names_per_stage = [ + ["embed_tokens", "layers.0"], # Stage 0: embeddings + first layer + ["layers.1", "layers.2"], # Stage 1: middle layers + ["norm", "output"] # Stage 2: final norm + output + ] + """ + pp_rank = pp_mesh.get_local_rank() + pp_size = pp_mesh.size() + + def _build_stage_from_modules( + stage_idx: int, module_names: list[str], num_stages: int + ) -> tuple[PipelineStage, nn.Module]: + model = copy.deepcopy(whole_model) + + # Create a set of modules to keep for faster lookup + modules_to_keep = set(module_names) + print(f"Stage {stage_idx}: Modules to keep: {modules_to_keep}") + + def _prune_modules_recursive(current_module: nn.Module, prefix: str): + for name, child in current_module.named_children(): + child_prefix = f"{prefix}{name}" + + # If the child module is a container, we need to check its children + if isinstance(child, (nn.ModuleDict, nn.ModuleList)): + layers_to_keep = { + m.split(".")[-1] + for m in modules_to_keep + if m.startswith(f"{child_prefix}.") + } + if layers_to_keep: + # This container has some layers we need to keep. + if isinstance(child, nn.ModuleDict): + for layer_name in list(child.keys()): + if layer_name not in layers_to_keep: + del child[layer_name] + elif isinstance(child, nn.ModuleList): + indices_to_keep = { + int(idx) for idx in layers_to_keep if idx.isdigit() + } + new_layers = nn.ModuleList( + [ + layer + for i, layer in enumerate(child) + if i in indices_to_keep + ] + ) + setattr(current_module, name, new_layers) + else: + # If no sub-modules are kept, replace with an empty container. 
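# Illustration (hypothetical split): a stage that keeps only
# {"model.model.norm", "model.lm_head", "model.model.rotary_emb"} ends up with
# model.model.layers replaced by an empty nn.ModuleList here, while pruned leaf
# modules such as model.model.embed_tokens are swapped for nn.Identity() below.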
+ if isinstance(child, nn.ModuleDict): + setattr(current_module, name, nn.ModuleDict()) + elif isinstance(child, nn.ModuleList): + setattr(current_module, name, nn.ModuleList()) + elif isinstance(child, nn.Module): + # For a generic nn.Module, check if it or its children should be kept + is_kept = child_prefix in modules_to_keep + is_parent_of_kept = any( + m.startswith(f"{child_prefix}.") for m in modules_to_keep + ) + + if is_kept or is_parent_of_kept: + _prune_modules_recursive(child, f"{child_prefix}.") + else: + # Handle simple module attributes (e.g., "linear", "norm") + setattr(current_module, name, nn.Identity()) + + _prune_modules_recursive(model, "") + + stage = PipelineStage( + model, + stage_idx, + num_stages, + device, + group=pp_mesh.get_group("pp"), + ) + return stage, model + + num_stages = len(module_names_per_stage) + stages = [] + models = [] + + schedule_class = get_schedule_class(pp_schedule) + style = ( + "v" if schedule_class in (ScheduleZBVZeroBubble, ScheduleDualPipeV) else "loop" + ) + + for stage_idx in stage_ids_this_rank(pp_rank, pp_size, num_stages, style=style): + module_names = module_names_per_stage[stage_idx] + stage, model_chunk = _build_stage_from_modules( + stage_idx, + module_names, + num_stages, + ) + logger.info( + f"PP rank {pp_rank} is building stage_idx {stage_idx} " + f"with modules {module_names}" + ) + stages.append(stage) + models.append(model_chunk) + + return stages, models + + +def pipeline_hf_transformers( + model: nn.Module, + parallel_dims: ParallelDims, + job_config: JobConfig, + device: torch.device, + model_args: BaseModelArgs, + parallelize_fn: ParallelizeFunction, + loss_fn: LossFunction, +) -> tuple[_PipelineSchedule, list[nn.Module], bool, bool]: + if job_config.parallelism.pipeline_parallel_split_points != []: + raise ValueError( + "pipeline_parallel_split_points is deprecated. Please use module_fqns_per_model_part instead." 
+ "You can generate module_fqns_per_model_part programmatically with generate_llm_fqn_per_model_part" + ) + + pp_mesh = parallel_dims.world_mesh["pp"] + + # Determine the number of virtual stages based on schedule type + schedule_class = get_schedule_class( + job_config.parallelism.pipeline_parallel_schedule + ) + is_single_stage_schedule = issubclass(schedule_class, PipelineScheduleSingle) + layers_per_stage = job_config.parallelism.pipeline_parallel_layers_per_stage + if hasattr(model_args, "n_layers"): + num_layers = model_args.n_layers + else: + raise ValueError("Model does not have n_layers attribute.") + + # You can adjust these weights based on the computational cost of embeddings and output layers + # Higher weights mean these modules are treated as "heavier" in the distribution + input_weight = job_config.parallelism.pipeline_parallel_first_stage_less_layers + output_weight = job_config.parallelism.pipeline_parallel_last_stage_less_layers + + # Calculate number of virtual stages + if layers_per_stage is not None: + + # Calculate number of virtual stages needed (using ceiling division) + # This allows for unequal distribution where stages can differ by at most 1 layer + num_virtual_stages = math.ceil( + (num_layers + input_weight + output_weight) / layers_per_stage + ) + + # Validation: check stages per rank based on schedule type + model_config_info = f"Model has {num_layers} layers with pipeline_parallel_layers_per_stage={layers_per_stage}" + stage_distribution_info = ( + f"resulting in {num_virtual_stages=} across {parallel_dims.pp} PP ranks" + ) + + if num_virtual_stages % parallel_dims.pp != 0: + raise ValueError( + f"Number of virtual stages ({num_virtual_stages}) must be divisible by " + f"pipeline parallel size ({parallel_dims.pp}). " + f"{model_config_info}. " + f"Please adjust pipeline_parallel_layers_per_stage to a value that results in a number of stages " + f"divisible by {parallel_dims.pp}." + ) + + stages_per_rank = num_virtual_stages // parallel_dims.pp + + if is_single_stage_schedule and stages_per_rank != 1: + raise ValueError( + f"Single stage schedule requires exactly 1 stage per rank, but got {stages_per_rank} stages per rank. " + f"{model_config_info}, {stage_distribution_info}. " + f"Please increase pipeline_parallel_layers_per_stage to {num_layers // parallel_dims.pp} or higher " + f"to achieve 1 stage per rank." + ) + + if not is_single_stage_schedule and stages_per_rank < 2: + raise ValueError( + f"Multi-stage schedule requires at least 2 stages per rank, but got {stages_per_rank} stages per rank. " + f"{model_config_info}, {stage_distribution_info}. " + f"Please decrease pipeline_parallel_layers_per_stage to achieve at least 2 stages per rank." 
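+                # (For example, with pp=4 and a looped schedule this needs
+                # num_virtual_stages >= 8, i.e. roughly
+                # layers_per_stage <= (num_layers + input_weight + output_weight) / 8.)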
+ ) + else: + # Fallback to default behavior when layers_per_stage is not provided + # For multi-stage schedules, default is 2 virtual stages per rank + # For single-stage schedules, default is 1 virtual stage per rank + stages_per_rank = 1 if is_single_stage_schedule else 2 + num_virtual_stages = parallel_dims.pp * stages_per_rank + + module_names_per_stage = job_config.parallelism.module_fqns_per_model_part + if module_names_per_stage is None: + module_names_per_stage = generate_llm_fqn_per_model_part( + num_virtual_stages, num_layers, input_weight, output_weight + ) + for i, stage_ms in enumerate(module_names_per_stage): + logger.debug(f"Stage {i}: {stage_ms}") + + stages, model_parts = pipeline_module_split( + model, + pp_mesh, + job_config.parallelism.pipeline_parallel_schedule, + device, + module_names_per_stage, + ) + + # For PP with looped schedules, each item in model_parts is one stage-model-chunk. + # We need to iterate through model_parts to apply SPMD parallelisms, compilation, + # optimizer, and checkpointing + for i, m in enumerate(model_parts): + # apply SPMD-style PT-D techniques + m = parallelize_fn(m, parallel_dims, job_config) + model_parts[i] = m + # NOTE: this is to update the model in the stage + # in case the model is modified e.g. by torch.compile + stages[i].submod = m + + pp_schedule = build_pipeline_schedule(job_config, stages, loss_fn) + + # This is used in the train loop to determine whether to pass in the input_ids and labels + has_first_stage = False + has_last_stage = False + for stage in stages: + if stage.is_first: + has_first_stage = True + if stage.is_last: + has_last_stage = True + + return pp_schedule, model_parts, has_first_stage, has_last_stage diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 3eb74c6b4b..9ac01bcb86 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -295,7 +295,7 @@ def layers(self): raise AttributeError("Could not find layers in the model. Please check the model structure.") @property - def embed_tokens(self): + def tok_embeddings(self): """Returns the model's embed_tokens, handling different Hugging Face model structures.""" if hasattr(self.model, "model") and hasattr(self.model.model, "embed_tokens"): # Llama-like return self.model.model.embed_tokens @@ -310,6 +310,13 @@ def norm(self): else: raise AttributeError("Could not find norm in the model. Please check the model structure.") + @norm.setter + def norm(self, value): + if hasattr(self.model, "model") and hasattr(self.model.model, "norm"): # Llama-like + setattr(self.model.model, "norm", value) + else: + raise AttributeError("Could not find norm in the model. 
Please check the model structure.") + @property def output(self): """Returns the model's output layer, handling different Hugging Face model structures.""" @@ -326,4 +333,21 @@ def forward(self, *args, **kwargs): return output def init_weights(self, *args, **kwargs): - self.model.post_init() \ No newline at end of file + # This method replicates the behavior of the original PreTrainedModel.init_weights, + # but with a custom weight initialization function that skips nn.Identity modules (when PP is enabled) + + if self.model.config.pruned_heads: + logger.info("Pruning heads as per model configuration.") + self.model.prune_heads(self.model.config.pruned_heads) + + original_init_weights_fn = self.model._init_weights + + def selective_init(module): + # For pipeline parallel, we need to skip nn.Identity modules + if not isinstance(module, nn.Identity): + original_init_weights_fn(module) + + logger.info("Applying selective weight initialization, skipping nn.Identity modules when PP is enabled.") + self.model.apply(selective_init) + + self.model.tie_weights() \ No newline at end of file From 590737f9dec1fd383e34afedcdc8bc892ce39a30 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Sun, 28 Sep 2025 10:49:40 +0000 Subject: [PATCH 045/129] simplify PP logic by flattening the named_children hierarchy. This will be easier for TP later --- .../transformers_backend/infra/pipeline_hf.py | 231 +++++------------- .../model/hf_transformers_args.py | 75 +++++- 2 files changed, 130 insertions(+), 176 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py b/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py index 178610343a..fb707b2509 100644 --- a/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py +++ b/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py @@ -4,123 +4,34 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. import copy -import os -from typing import Callable +import math import torch import torch.nn as nn -from torch.distributed.device_mesh import DeviceMesh -from torch.distributed.pipelining import PipelineStage - from torch.distributed.pipelining.schedules import ( _PipelineSchedule, - _PipelineScheduleRuntime, get_schedule_class, - PipelineScheduleMulti, PipelineScheduleSingle, - ScheduleDualPipeV, - ScheduleZBVZeroBubble, ) +from torchtitan.components.loss import LossFunction from torchtitan.config import JobConfig -from torchtitan.tools.logging import logger - from torchtitan.distributed import ParallelDims +from torchtitan.distributed.pipeline_parallel import ( + build_pipeline_schedule, + pipeline_module_split, + stage_ids_this_rank, +) +from torch.distributed.device_mesh import DeviceMesh +from torch.distributed.pipelining import PipelineStage from torchtitan.protocols.train_spec import BaseModelArgs, ParallelizeFunction -from torchtitan.components.loss import LossFunction - -import math - - -def build_pipeline_schedule( - job_config: JobConfig, stages: list[PipelineStage], loss_fn: Callable -) -> _PipelineSchedule: - """Builds a pipeline schedule for the given job configuration and stages. - - Args: - job_config (JobConfig): The job configuration. - stages (list[PipelineStage]): The stages to be scheduled. - loss_fn (Callable): The loss function. - - Returns: - _PipelineSchedule: The pipeline schedule for the given stages. 
- """ - pp_schedule_csv = job_config.parallelism.pipeline_parallel_schedule_csv - - # Validate that pp_schedule_csv is a valid path - if pp_schedule_csv: - if not os.path.isfile(pp_schedule_csv): - raise FileNotFoundError( - f"The specified path {pp_schedule_csv} does not exist or is not a file." - ) - schedule_class = _PipelineScheduleRuntime - else: - schedule_class = get_schedule_class( - job_config.parallelism.pipeline_parallel_schedule - ) - - looped_schedule = issubclass(schedule_class, PipelineScheduleMulti) - microbatch_size = job_config.parallelism.pipeline_parallel_microbatch_size - batch_size = job_config.training.local_batch_size - # validate that the batch size is divisible by the microbatch_size otherwise we'll hang or error during training - if batch_size % microbatch_size != 0: - raise ValueError( - f"Batch size {job_config.training.local_batch_size} must be divisible by microbatch_size {microbatch_size}. " - "Update the config arguments for either batch_size or pipeline_parallel_microbatch_size." - ) - n_microbatches = batch_size // microbatch_size - # We expect that the number of local stages (`len(stages)`) is the same across all ranks - num_total_stages = job_config.parallelism.pipeline_parallel_degree * len(stages) - if n_microbatches < num_total_stages: - logger.warning( - f"Number of microbatches ({n_microbatches}) is less than the total number " - f"of stages ({num_total_stages}) which may result in a bubble in the pipeline." - ) - - schedule = schedule_class( - stages if looped_schedule else stages[0], - n_microbatches=n_microbatches, - loss_fn=loss_fn, - ) - logger.info( - f"Using pipeline schedule {job_config.parallelism.pipeline_parallel_schedule} " - f"with {n_microbatches} microbatches and {num_total_stages} stages." - ) - - if pp_schedule_csv: - assert schedule_class in [ - PipelineScheduleSingle, - PipelineScheduleMulti, - _PipelineScheduleRuntime, - ], ( - "Only PipelineScheduleSingle (single stage), PipelineScheduleMulti (multistage), " - "and _PipelineScheduleRuntime support csv schedules" - ) - schedule._load_csv(pp_schedule_csv) - - return schedule - - -# TODO(whc) should this be a utility inside torch.pipelining? -def stage_ids_this_rank( - pp_rank: int, pp_size: int, num_stages: int, style: str = "loop" -) -> tuple[int]: - """Compute the stage ids for the stages that will run on this pp rank for either a looped or V style schedule""" - assert ( - num_stages % pp_size == 0 - ), f"num_stages {num_stages} must be evenly divisible by pp_size {pp_size}" - stages_per_rank = num_stages // pp_size - if style == "loop": - return tuple(pp_rank + s * pp_size for s in range(stages_per_rank)) - elif style == "v": - assert ( - stages_per_rank == 2 - ), f"v schedules assume 2 stages per rank, got {stages_per_rank}" - stage_v_pairs = list( - zip(range(pp_size), range(num_stages - 1, pp_size - 1, -1)) - ) - return stage_v_pairs[pp_rank] +from torchtitan.tools.logging import logger +from torch.distributed.pipelining.schedules import ( + ScheduleDualPipeV, + ScheduleZBVZeroBubble, +) +# NOTE(3outeille): the only modifications comes from replacing None to nn.Identity and adding rotary_emb per model_part def generate_llm_fqn_per_model_part( num_stages: int, @@ -130,16 +41,13 @@ def generate_llm_fqn_per_model_part( ) -> list[list[str]]: """ Programmatically generates module names model part, focused on LLMs models. 
- Args: num_stages: Number of pipeline stages num_layers: Total number of transformer layers in the model input_weight: Weight for input modules (embed_tokens) in layer calculation output_weight: Weight for output modules (norm + output) in layer calculation - Returns: List of lists containing module names for each model part - Example: generate_llm_fqn_per_model_part(2, 3, input_weight=2, output_weight=2) treats embeddings as 2 layers and norm+output as 2 layers for distribution @@ -149,11 +57,11 @@ def generate_llm_fqn_per_model_part( if num_stages == 1: # Single stage gets everything - layer_names = [f"model.model.layers.{i}" for i in range(num_layers)] + layer_names = [f"layers.{i}" for i in range(num_layers)] return [ - ["model.model.embed_tokens"] + ["tok_embeddings"] + layer_names - + ["model.model.norm", "model.lm_head", "model.model.rotary_emb"] + + ["norm", "output", "rotary_emb"] ] # Calculate effective layers including weights @@ -201,14 +109,14 @@ def generate_llm_fqn_per_model_part( # First stage: handle input modules with weighting if stage_idx == 0: - stage_modules.append("model.model.embed_tokens") + stage_modules.append("tok_embeddings") # Account for input weight in layer distribution remaining_layers_for_stage = effective_layers_for_stage - input_weight # Add transformer layers for _ in range(remaining_layers_for_stage): if current_layer < num_layers: - stage_modules.append(f"model.model.layers.{current_layer}") + stage_modules.append(f"layers.{current_layer}") current_layer += 1 # Last stage: handle output modules with weighting @@ -219,25 +127,24 @@ def generate_llm_fqn_per_model_part( # Add transformer layers for _ in range(remaining_layers_for_stage): if current_layer < num_layers: - stage_modules.append(f"model.model.layers.{current_layer}") + stage_modules.append(f"layers.{current_layer}") current_layer += 1 # Add output modules - stage_modules.extend(["model.model.norm", "model.lm_head"]) + stage_modules.extend(["norm", "output"]) # Middle stages: only transformer layers else: for _ in range(effective_layers_for_stage): if current_layer < num_layers: - stage_modules.append(f"model.model.layers.{current_layer}") + stage_modules.append(f"layers.{current_layer}") current_layer += 1 - stage_modules.append("model.model.rotary_emb") + stage_modules.append("rotary_emb") module_names_per_stage.append(stage_modules) return module_names_per_stage - def pipeline_module_split( whole_model: nn.Module, pp_mesh: DeviceMesh, @@ -261,7 +168,7 @@ def pipeline_module_split( module_names_per_stage: List of lists, where each inner list contains the module names that should be included in that stage. Module names should be dot-separated paths. 
Examples: - - "embed_tokens" for token embeddings + - "tok_embeddings" for token embeddings - "layers.0", "layers.1" for specific transformer layers - "norm" for the final normalization layer - "output" for the output projection layer @@ -272,7 +179,7 @@ def pipeline_module_split( Example usage: module_names_per_stage = [ - ["embed_tokens", "layers.0"], # Stage 0: embeddings + first layer + ["tok_embeddings", "layers.0"], # Stage 0: embeddings + first layer ["layers.1", "layers.2"], # Stage 1: middle layers ["norm", "output"] # Stage 2: final norm + output ] @@ -288,56 +195,42 @@ def _build_stage_from_modules( # Create a set of modules to keep for faster lookup modules_to_keep = set(module_names) print(f"Stage {stage_idx}: Modules to keep: {modules_to_keep}") - - def _prune_modules_recursive(current_module: nn.Module, prefix: str): - for name, child in current_module.named_children(): - child_prefix = f"{prefix}{name}" - - # If the child module is a container, we need to check its children - if isinstance(child, (nn.ModuleDict, nn.ModuleList)): - layers_to_keep = { - m.split(".")[-1] - for m in modules_to_keep - if m.startswith(f"{child_prefix}.") - } - if layers_to_keep: - # This container has some layers we need to keep. - if isinstance(child, nn.ModuleDict): - for layer_name in list(child.keys()): - if layer_name not in layers_to_keep: - del child[layer_name] - elif isinstance(child, nn.ModuleList): - indices_to_keep = { - int(idx) for idx in layers_to_keep if idx.isdigit() - } - new_layers = nn.ModuleList( - [ - layer - for i, layer in enumerate(child) - if i in indices_to_keep - ] - ) - setattr(current_module, name, new_layers) - else: - # If no sub-modules are kept, replace with an empty container. - if isinstance(child, nn.ModuleDict): - setattr(current_module, name, nn.ModuleDict()) - elif isinstance(child, nn.ModuleList): - setattr(current_module, name, nn.ModuleList()) - elif isinstance(child, nn.Module): - # For a generic nn.Module, check if it or its children should be kept - is_kept = child_prefix in modules_to_keep - is_parent_of_kept = any( - m.startswith(f"{child_prefix}.") for m in modules_to_keep - ) - - if is_kept or is_parent_of_kept: - _prune_modules_recursive(child, f"{child_prefix}.") - else: - # Handle simple module attributes (e.g., "linear", "norm") - setattr(current_module, name, nn.Identity()) - - _prune_modules_recursive(model, "") + for module_name, module_value in model.named_children(): + # Handle layer-like structures (e.g., "layers.0", "layers.1") + if isinstance(module_value, (nn.ModuleDict, nn.ModuleList)): + layers_to_keep = { + name.split(".", 1)[1] + for name in modules_to_keep + if name.startswith(f"{module_name}.") + } + if layers_to_keep: + # Keep only specified layers + if isinstance(module_value, nn.ModuleDict): + for layer_name in list(module_value.keys()): + if layer_name not in layers_to_keep: + del module_value[layer_name] + elif isinstance(module_value, nn.ModuleList): + indices_to_keep = { + int(idx) for idx in layers_to_keep if idx.isdigit() + } + new_layers = nn.ModuleList( + [ + layer + for i, layer in enumerate(module_value) + if i in indices_to_keep + ] + ) + setattr(model, module_name, new_layers) + else: + # No layers from this structure needed, set to empty structure + if isinstance(module_value, nn.ModuleDict): + setattr(model, module_name, nn.ModuleDict()) + elif isinstance(module_value, nn.ModuleList): + setattr(model, module_name, nn.ModuleList()) + # Handle simple module attributes (e.g., "linear", "norm") + elif module_name 
not in modules_to_keep: + # Replace with Identity + setattr(model, module_name, nn.Identity()) stage = PipelineStage( model, diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 9ac01bcb86..e74459760a 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -285,6 +285,21 @@ def __init__(self, model_args: HFTransformerModelArgs): ) self.model = model_cls(config=model_args) + @property + def tok_embeddings(self): + """Returns the model's embed_tokens, handling different Hugging Face model structures.""" + if hasattr(self.model, "model") and hasattr(self.model.model, "embed_tokens"): # Llama-like + return self.model.model.embed_tokens + else: + raise AttributeError("Could not find embed_tokens in the model. Please check the model structure.") + + @tok_embeddings.setter + def tok_embeddings(self, value): + if hasattr(self.model, "model") and hasattr(self.model.model, "embed_tokens"): # Llama-like + setattr(self.model.model, "embed_tokens", value) + else: + raise AttributeError("Could not find embed_tokens in the model. Please check the model structure.") + @property def layers(self): """Returns the model's layers, handling different Hugging Face model structures.""" @@ -294,13 +309,12 @@ def layers(self): # Add more cases here if needed for other model architectures raise AttributeError("Could not find layers in the model. Please check the model structure.") - @property - def tok_embeddings(self): - """Returns the model's embed_tokens, handling different Hugging Face model structures.""" - if hasattr(self.model, "model") and hasattr(self.model.model, "embed_tokens"): # Llama-like - return self.model.model.embed_tokens + @layers.setter + def layers(self, value): + if hasattr(self.model, "model") and hasattr(self.model.model, "layers"): # Llama-like + setattr(self.model.model, "layers", value) else: - raise AttributeError("Could not find embed_tokens in the model. Please check the model structure.") + raise AttributeError("Could not find layers in the model. Please check the model structure.") @property def norm(self): @@ -326,6 +340,28 @@ def output(self): # Add more cases here if needed for other model architectures raise AttributeError("Could not find output (lm_head) in the model. Please check the model structure.") + @output.setter + def output(self, value): + if hasattr(self.model, "lm_head"): # For models like LlamaForCausalLM + setattr(self.model, "lm_head", value) + else: + raise AttributeError("Could not find output (lm_head) in the model. Please check the model structure.") + + @property + def rotary_emb(self): + """Returns the model's rotary_emb, handling different Hugging Face model structures.""" + if hasattr(self.model, "model") and hasattr(self.model.model, "rotary_emb"): # Llama-like + return self.model.model.rotary_emb + else: + raise AttributeError("Could not find rotary_emb in the model. Please check the model structure.") + + @rotary_emb.setter + def rotary_emb(self, value): + if hasattr(self.model, "model") and hasattr(self.model.model, "rotary_emb"): # Llama-like + setattr(self.model.model, "rotary_emb", value) + else: + raise AttributeError("Could not find rotary_emb in the model. 
Please check the model structure.") + def forward(self, *args, **kwargs): output = self.model(*args, **kwargs) if isinstance(output, CausalLMOutputWithPast): @@ -350,4 +386,29 @@ def selective_init(module): logger.info("Applying selective weight initialization, skipping nn.Identity modules when PP is enabled.") self.model.apply(selective_init) - self.model.tie_weights() \ No newline at end of file + self.model.tie_weights() + + def named_children(self): + """ + Provides a flattened view of the model's main components, + making it compatible with TorchTitan's expectations. + """ + yield "tok_embeddings", self.tok_embeddings + yield "layers", self.layers + yield "norm", self.norm + yield "output", self.output + yield "rotary_emb", self.rotary_emb + + def __setattr__(self, name, value): + # If a property with a setter exists for this name, use it. + # This is to bypass the nn.Module.__setattr__ logic that + # directly registers modules and skips property setters. + cls = self.__class__ + if hasattr(cls, name): + prop = getattr(cls, name) + if isinstance(prop, property) and prop.fset is not None: + prop.fset(self, value) + return + + # Otherwise, fall back to the default nn.Module behavior. + super().__setattr__(name, value) \ No newline at end of file From 1a9af6884cf61ad5cd974a5156343ff9201240a5 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Sun, 28 Sep 2025 13:59:05 +0000 Subject: [PATCH 046/129] TP now works in 1D --- .../infra/parallelize_hf_transformers.py | 30 ++++---- .../model/hf_deepseek_v3_patch.py | 1 + .../model/hf_llama_patch.py | 74 ++++++++++++++++++- .../model/hf_transformers_args.py | 19 +++-- 4 files changed, 99 insertions(+), 25 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py index 4ac6d6cd83..d36bc0589a 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -351,29 +351,29 @@ def apply_non_moe_tp( ) # Apply tensor + sequence parallelism to every transformer block - for transformer_block in model.layers.values(): + for transformer_block in model.layers: layer_plan = { - "attention_norm": SequenceParallel(), - "attention": prepare_module_input( - input_layouts=(Shard(1), None), - desired_input_layouts=(Replicate(), None), + "input_layernorm": SequenceParallel(), + "self_attn": prepare_module_input( + input_kwarg_layouts={"hidden_states": Shard(1)}, + desired_input_kwarg_layouts={"hidden_states": Replicate()}, ), - "attention.wq": colwise_parallel(), - "attention.wk": colwise_parallel(), - "attention.wv": colwise_parallel(), - "attention.wo": rowwise_parallel(output_layouts=Shard(1)), - "ffn_norm": SequenceParallel(), + "self_attn.q_proj": colwise_parallel(), + "self_attn.k_proj": colwise_parallel(), + "self_attn.v_proj": colwise_parallel(), + "self_attn.o_proj": rowwise_parallel(output_layouts=Shard(1)), + "post_attention_layernorm": SequenceParallel(), } if not transformer_block.moe_enabled: layer_plan.update( { - "feed_forward": prepare_module_input( + "mlp": prepare_module_input( input_layouts=(Shard(1),), desired_input_layouts=(Replicate(),), ), - "feed_forward.w1": colwise_parallel(), - "feed_forward.w2": rowwise_parallel(output_layouts=Shard(1)), - "feed_forward.w3": colwise_parallel(), + "mlp.gate_proj": colwise_parallel(), + "mlp.up_proj": colwise_parallel(), + "mlp.down_proj": 
rowwise_parallel(output_layouts=Shard(1)), } ) @@ -557,7 +557,7 @@ def apply_moe_ep_tp( ep_tp_mesh: DeviceMesh | None, etp_enabled: bool, ): - for transformer_block in model.layers.values(): + for transformer_block in model.layers: if not transformer_block.moe_enabled: continue diff --git a/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py b/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py index 346a400260..68594dc2be 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py +++ b/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py @@ -37,6 +37,7 @@ def seeded_trunc_normal(*args, **kwargs): def _deepseek_v3_decoder_layer_init_patched(self, config: DeepseekV3Config, layer_idx: int): _original_deepseek_v3_decoder_layer_init(self, config, layer_idx) + self.layer_idx = layer_idx self.mlp.layer_idx = layer_idx if hasattr(self.mlp, 'experts'): diff --git a/torchtitan/experiments/transformers_backend/model/hf_llama_patch.py b/torchtitan/experiments/transformers_backend/model/hf_llama_patch.py index 28888f61a6..ddde904cae 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_llama_patch.py +++ b/torchtitan/experiments/transformers_backend/model/hf_llama_patch.py @@ -1,15 +1,18 @@ - - +import torch import torch.nn as nn - from transformers.models.llama.configuration_llama import LlamaConfig -from transformers.models.llama.modeling_llama import LlamaAttention, LlamaMLP, LlamaDecoderLayer +from transformers.models.llama.modeling_llama import LlamaModel, LlamaAttention, LlamaMLP, LlamaDecoderLayer from transformers.modeling_utils import PreTrainedModel +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_outputs import BaseModelOutputWithPast +from typing import Optional + _original_llama_decoder_layer_init = LlamaDecoderLayer.__init__ def _llama_decoder_layer_init_patched(self, config: LlamaConfig, layer_idx: int): _original_llama_decoder_layer_init(self, config, layer_idx) + self.layer_idx = layer_idx self.mlp.layer_idx = layer_idx def _initialize_weights_patched(self, module): @@ -83,8 +86,71 @@ def _init_weights_patched(self, module): if hasattr(module, "bias") and module.bias is not None: module.bias.data.zero_() +def _patched_forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + **kwargs, +) -> BaseModelOutputWithPast: + """ + A patched version of LlamaModel.forward that disables the causal mask. + This is a direct copy of the original method with one line changed. 
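+    The only functional change is that the causal mask is forced to None, so SDPA
+    rebuilds it from the query/key sequence lengths rather than from the
+    (possibly sequence-sharded) input embeddings.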
+ """ + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds: torch.Tensor = self.embed_tokens(input_ids) + + if use_cache and past_key_values is None: + past_key_values = DynamicCache() + + if cache_position is None: + past_seen_tokens = ( + past_key_values.get_seq_length() if past_key_values is not None else 0 + ) + cache_position: torch.Tensor = torch.arange( + past_seen_tokens, + past_seen_tokens + inputs_embeds.shape[1], + device=inputs_embeds.device, + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + # --- START OF PATCH --- + # NOTE(3outeille): When TP enabled, the causal_mask will be created based on input_embeds which has sharded seq_len. + # We set it to False so that SDPA is creating the causal mask based on query & key seq_len. + causal_mask = None + # --- END OF PATCH --- + + hidden_states = inputs_embeds + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + for decoder_layer in self.layers[: self.config.num_hidden_layers]: + hidden_states = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_value=past_key_values, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + + hidden_states = self.norm(hidden_states) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + ) def patch_hf_llama(): + LlamaModel.forward = _patched_forward LlamaDecoderLayer.__init__ = _llama_decoder_layer_init_patched PreTrainedModel._init_weights = _init_weights_patched PreTrainedModel._initialize_weights = _initialize_weights_patched \ No newline at end of file diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index e74459760a..66fa558a58 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -6,8 +6,7 @@ import importlib from dataclasses import dataclass -from typing import Optional - +import torch from torch import nn from torchtitan.config import JobConfig from torchtitan.protocols import BaseModelArgs @@ -285,6 +284,12 @@ def __init__(self, model_args: HFTransformerModelArgs): ) self.model = model_cls(config=model_args) + for layer in self.model.model.layers: + if hasattr(model_args, "first_k_dense_replace") and layer.layer_idx >= model_args.first_k_dense_replace: + layer.moe_enabled = True + else: + layer.moe_enabled = False + @property def tok_embeddings(self): """Returns the model's embed_tokens, handling different Hugging Face model structures.""" @@ -363,9 +368,10 @@ def rotary_emb(self, value): raise AttributeError("Could not find rotary_emb in the model. 
Please check the model structure.") def forward(self, *args, **kwargs): - output = self.model(*args, **kwargs) - if isinstance(output, CausalLMOutputWithPast): - return output.logits + position_ids = torch.arange(args[0].shape[1], device=args[0].device).unsqueeze(0) + kwargs["position_ids"] = position_ids + output = self.model.model(*args, **kwargs) + output = self.model.lm_head(output.last_hidden_state) return output def init_weights(self, *args, **kwargs): @@ -382,8 +388,9 @@ def selective_init(module): # For pipeline parallel, we need to skip nn.Identity modules if not isinstance(module, nn.Identity): original_init_weights_fn(module) + else: + logger.info("Skipping nn.Identity module during weight initialization.") - logger.info("Applying selective weight initialization, skipping nn.Identity modules when PP is enabled.") self.model.apply(selective_init) self.model.tie_weights() From e6b9ff5f8cc394130241b891c8d025b65d223819 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Sun, 28 Sep 2025 13:59:22 +0000 Subject: [PATCH 047/129] add test filtering in compare distributed run --- .../compare_distributed_run.py | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.py b/torchtitan/experiments/transformers_backend/compare_distributed_run.py index 1a432b68bd..5ee59e60ea 100644 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.py +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.py @@ -198,6 +198,7 @@ def __init__(self): self.grad_norm_rtol = self.DEFAULT_GRAD_NORM_RTOL self.parallelism_configs: List[ParallelismConfig] = [] self.results_dir: Optional[Path] = None + self.test_filter = "" def generate_parallelism_configs(self) -> None: """Generate parallelism configurations based on the number of GPUs.""" @@ -284,8 +285,16 @@ def _get_factors(n: int) -> List[int]: LogLevel.INFO, f"Generated {len(self.parallelism_configs)} parallelism configurations for {ngpu} GPUs.", ) + configs_to_display = self.parallelism_configs + table_title = "[bold]Generated Parallelism Configurations[/bold]" + + if self.test_filter: + # Keep fsdp baseline and anything that matches the filter + configs_to_display = [c for c in self.parallelism_configs if c.name == "fsdp" or self.test_filter in c.name] + table_title = f"[bold]Filtered Parallelism Configurations (filter: [cyan]'{self.test_filter}'[/cyan])[/bold]" + table = Table( - title="[bold]Generated Parallelism Configurations[/bold]", + title=table_title, show_header=True, header_style="bold magenta", ) @@ -298,7 +307,7 @@ def _get_factors(n: int) -> List[int]: table.add_column("ep", justify="right") table.add_column("eptp", justify="right") - for config in self.parallelism_configs: + for config in configs_to_display: table.add_row( config.name, str(config.dp_replicate), @@ -658,6 +667,8 @@ def run(self) -> int: ) parser.add_argument("-m", "--model-filter", default="", help="Filter models by name pattern (e.g., 'llama3')") + parser.add_argument("-t", "--test-filter", default="", + help="Filter parallelism configurations by name pattern (e.g., 'fsdp1_cp1_tp2_pp2')") parser.add_argument("-nd", "--nd_parallel", type=str, default="2d", help=f"Parallelism to use (default: {self.ND_PARALLEL_TO_NB_GPUS.keys()})") parser.add_argument("-s", "--steps", type=int, default=self.DEFAULT_STEPS, @@ -682,6 +693,7 @@ def run(self) -> int: self.ngpu = self.nd_parallel_to_nb_gpus[self.nd_parallel] self.steps = args.steps self.model_filter = 
args.model_filter + self.test_filter = args.test_filter self.flavor = args.flavor self.verbose = args.verbose self.loss_atol = args.loss_atol @@ -696,6 +708,7 @@ def run(self) -> int: f"[bold]Steps:[/bold] {self.steps}\n" f"[bold]Seed:[/bold] {self.seed}\n" f"[bold]Model filter:[/bold] {self.model_filter or 'all'}\n" + f"[bold]Test filter:[/bold] {self.test_filter or 'all'}\n" f"[bold]Model flavor:[/bold] {self.flavor}" ), title="[bold cyan]Distributed Parallelism Comparison[/bold cyan]", @@ -780,6 +793,11 @@ def run(self) -> int: passed_tests = 1 # +1 for the baseline (FSDP) failed_tests = 0 test_configs = [c for c in self.parallelism_configs if c.name != "fsdp"] + if self.test_filter: + filtered_configs = [c for c in test_configs if self.test_filter in c.name] + if not filtered_configs: + log_message(LogLevel.WARNING, f"Test filter '{self.test_filter}' did not match any test configurations.") + test_configs = filtered_configs total_tests = len(test_configs) + 1 # +1 for the baseline (FSDP) results = [] From a4cb8c3b39c542a3bd34d9fa53d9a49a7bc129ce Mon Sep 17 00:00:00 2001 From: 3outeille Date: Sun, 28 Sep 2025 14:06:22 +0000 Subject: [PATCH 048/129] dont generate EP config if model is not a MoE --- .../compare_distributed_run.py | 43 ++++++++++++------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.py b/torchtitan/experiments/transformers_backend/compare_distributed_run.py index 5ee59e60ea..cc8f54f51b 100644 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.py +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.py @@ -200,8 +200,18 @@ def __init__(self): self.results_dir: Optional[Path] = None self.test_filter = "" - def generate_parallelism_configs(self) -> None: + def generate_parallelism_configs(self, hf_model_name: str) -> None: """Generate parallelism configurations based on the number of GPUs.""" + from transformers import AutoConfig + + try: + model_config = AutoConfig.from_pretrained(hf_model_name) + is_moe = getattr(model_config, "num_local_experts", 0) > 1 + except Exception: + # Fallback for models not on Hub or other errors + is_moe = False + log_message(LogLevel.WARNING, f"Could not determine if {hf_model_name} is a MoE model from HuggingFace Hub. 
EP configurations will not be generated.") + ngpu = self.nd_parallel_to_nb_gpus[self.nd_parallel] configs = [] @@ -253,20 +263,21 @@ def _get_factors(n: int) -> List[int]: ) ) - # NOTE(3outeille): EP borrowing degree from dp_shard - configs.append( - ParallelismConfig( - name=f"fsdp{dp_shard}_cp{cp}_tp{tp}_pp{pp}_ep{dp_shard}", - dp_replicate=1, - dp_shard=dp_shard, - tp=tp, - pp=pp, - pp_schedule="1F1B", - cp=cp, - ep=dp_shard, - eptp=1 + if is_moe: + # NOTE(3outeille): EP borrowing degree from dp_shard + configs.append( + ParallelismConfig( + name=f"fsdp{dp_shard}_cp{cp}_tp{tp}_pp{pp}_ep{dp_shard}", + dp_replicate=1, + dp_shard=dp_shard, + tp=tp, + pp=pp, + pp_schedule="1F1B", + cp=cp, + ep=dp_shard, + eptp=1 + ) ) - ) # Remove duplicates and assign to instance @@ -721,8 +732,6 @@ def run(self) -> int: self.base_results_dir.mkdir(exist_ok=True) - self.generate_parallelism_configs() - # TODO(3outeille): make it more generic later if self.model_filter == "llama3": hf_model_name = "meta-llama/Llama-3.2-1B" @@ -733,6 +742,8 @@ def run(self) -> int: else: raise ValueError(f"Model filter {self.model_filter} not supported") + self.generate_parallelism_configs(hf_model_name) + model_owner, model_repo = hf_model_name.split("/", 1) nd_parallel_upper = self.nd_parallel.upper() self.results_dir = self.base_results_dir / model_owner / model_repo / nd_parallel_upper / self.flavor From 12c0c474a38340750afee1bf15da5c5f49720af7 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Sun, 28 Sep 2025 14:08:11 +0000 Subject: [PATCH 049/129] disable torch.utils.deterministic.fill_uninitialized_memory for Moe during testing --- torchtitan/train.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/torchtitan/train.py b/torchtitan/train.py index b15cd73e2c..881f353734 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -253,6 +253,9 @@ def __init__(self, job_config: JobConfig): del model for m in self.model_parts: + if is_torch_deterministic(): + # Otherwise, HF register buffer for ROPE (inv_freq) and this will be by default be initialized to Nan + torch.utils.deterministic.fill_uninitialized_memory = False m.to_empty(device=init_device) with torch.no_grad(): m.init_weights(buffer_device=buffer_device) From 13edc66cc4e8c1764590226fb6b1d16a1b171a1a Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 29 Sep 2025 12:37:53 +0000 Subject: [PATCH 050/129] CP is now supported --- .../infra/parallelize_hf_transformers.py | 2 +- .../model/hf_transformers_args.py | 1 - torchtitan/train.py | 13 ++++++++----- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py index d36bc0589a..56d6cf9ca6 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -194,7 +194,7 @@ def parallelize_hf_transformers( job_config.parallelism.context_parallel_degree > 1 and model.model_args.use_flex_attn ): - raise NotImplementedError("CP support for FlexAttention is still in progress.") + logger.warning("CP support for FlexAttention is still in progress.") if parallel_dims.tp_enabled: enable_float8_linear = "float8" in job_config.model.converters diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 66fa558a58..afafddd900 100644 --- 
a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -13,7 +13,6 @@ from torchtitan.tools.logging import logger from transformers import AutoConfig from transformers.configuration_utils import PretrainedConfig -from transformers.modeling_outputs import CausalLMOutputWithPast @dataclass class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): diff --git a/torchtitan/train.py b/torchtitan/train.py index 881f353734..735180ee5a 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -34,9 +34,6 @@ maybe_enable_profiling, ) -from transformers.models.llama.modeling_llama import CausalLMOutputWithPast - - class Trainer(torch.distributed.checkpoint.stateful.Stateful): # core configs job_config: JobConfig @@ -429,11 +426,17 @@ def forward_backward_step( # apply context parallelism if cp is enabled # ensure CP handles the separate freqs_cis buffer for each pp stage + cp_buffers = [inputs, labels] + cp_seq_dims = [1, 1] + if hasattr(model_parts[0], "freqs_cis"): + cp_buffers += [m.freqs_cis for m in model_parts] + cp_seq_dims += [0 for _ in model_parts] + optional_context_parallel_ctx = ( dist_utils.create_context_parallel_ctx( cp_mesh=parallel_dims.world_mesh["cp"], - cp_buffers=[inputs, labels] + [m.freqs_cis for m in model_parts], - cp_seq_dims=[1, 1] + [0 for _ in model_parts], + cp_buffers=cp_buffers, + cp_seq_dims=cp_seq_dims, cp_no_restore_buffers={inputs, labels}, cp_rotate_method=self.job_config.parallelism.context_parallel_rotate_method, ) From 52250fb4de667727b4d92eb3be1de3afe8a1f92f Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 29 Sep 2025 12:43:30 +0000 Subject: [PATCH 051/129] some cleaning --- .../compare_distributed_run.sh | 9 +- .../transformers_backend/compare_tt_hf_run.sh | 104 ----------- .../configs/debug_1_gpu_hf.toml | 62 ------- .../configs/debug_fsdp_2_gpu.toml | 65 ------- ...debug_1_gpu_tt.toml => test_template.toml} | 3 + .../reference_diff_deepseekv3_1gpu.log | 163 ------------------ .../reference_diff_llama3_1gpu.log | 133 -------------- .../transformers_backend/run_train.sh | 44 ----- 8 files changed, 9 insertions(+), 574 deletions(-) delete mode 100755 torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh delete mode 100644 torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml delete mode 100644 torchtitan/experiments/transformers_backend/configs/debug_fsdp_2_gpu.toml rename torchtitan/experiments/transformers_backend/configs/{debug_1_gpu_tt.toml => test_template.toml} (95%) delete mode 100644 torchtitan/experiments/transformers_backend/reference_diff_deepseekv3_1gpu.log delete mode 100644 torchtitan/experiments/transformers_backend/reference_diff_llama3_1gpu.log delete mode 100755 torchtitan/experiments/transformers_backend/run_train.sh diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.sh b/torchtitan/experiments/transformers_backend/compare_distributed_run.sh index d7e5b77bcb..2ca9bbee62 100755 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.sh +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.sh @@ -1,5 +1,8 @@ #!/usr/bin/bash -python compare_distributed_run.py --steps 5 --model-filter llama3 --flavor debugmodel --nd_parallel 0d --verbose - -# debugpy-run compare_distributed_run.py --steps 5 --model-filter llama3 --flavor debugmodel --nd_parallel 0d +if [[ "$1" == "--debug" ]]; then + shift + debugpy-run compare_distributed_run.py 
--steps 10 --model-filter llama3 --flavor debugmodel --nd_parallel 1d "$@" +else + python compare_distributed_run.py --steps 10 --model-filter llama3 --flavor debugmodel --nd_parallel 1d "$@" +fi diff --git a/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh b/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh deleted file mode 100755 index 703a9b55c9..0000000000 --- a/torchtitan/experiments/transformers_backend/compare_tt_hf_run.sh +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/bash - -set -ex -set -o pipefail - -# Common settings -NGPU=${NGPU:-"1"} -export LOG_RANK=${LOG_RANK:-0} - -# Parse command line arguments for model selection -MODEL_TYPE=${1:-"llama"} -export MODEL_TYPE -SEED=${SEED:-42} -export SEED -# Set model names based on argument -case $MODEL_TYPE in - "llama") - TT_MODEL_NAME="llama3" - HF_MODEL_NAME="meta-llama/Llama-3.2-1B" - ;; - "deepseek") - TT_MODEL_NAME="deepseek_v3" - HF_MODEL_NAME="deepseek-ai/DeepSeek-V3" - ;; - *) - echo "Error: Unsupported model type '$MODEL_TYPE'" - echo "Usage: $0 [llama|deepseek] [additional_args...]" - echo " llama - Uses llama3 for TT and meta-llama/Llama-3.2-1B for HF" - echo " deepseek - Uses deepseek_v3 for TT and deepseek-ai/DeepSeek-V3 for HF" - exit 1 - ;; -esac - -echo "Using model type: $MODEL_TYPE" -echo " TT model: $TT_MODEL_NAME" -echo " HF model: $HF_MODEL_NAME" - -# Shift to remove the model type argument, pass remaining args to training -shift - -run_tt() { - echo "##############################################" - echo "### Running TorchTitan (native) training ###" - echo "##############################################" - TT_CONFIG="/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.toml" - - # Use CUDA_VISIBLE_DEVICES=0 for TT run - CUDA_VISIBLE_DEVICES=0 \ - torchrun --nproc_per_node=${NGPU} --master_port 1234 --rdzv_backend c10d --rdzv_endpoint="localhost:0" \ - --local-ranks-filter ${LOG_RANK} --role rank --tee 3 \ - -m torchtitan.train --job.config_file ${TT_CONFIG} --training.seed ${SEED} --training.deterministic --model.name ${TT_MODEL_NAME} "$@" -} - -run_hf() { - echo "#######################################################" - echo "### Running TorchTitan with HF backend training ###" - echo "#######################################################" - HF_CONFIG="/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml" - - # Use CUDA_VISIBLE_DEVICES=1 for HF run - CUDA_VISIBLE_DEVICES=1 \ - torchrun --nproc_per_node=${NGPU} --master_port 1235 --rdzv_backend c10d --rdzv_endpoint="localhost:0" \ - --local-ranks-filter ${LOG_RANK} --role rank --tee 3 \ - -m torchtitan.train --job.config_file ${HF_CONFIG} --training.seed ${SEED} --training.deterministic --model.name ${HF_MODEL_NAME} "$@" -} - -TT_LOG="tt_run.log" -HF_LOG="hf_run.log" -DIFF_LOG="run_diff.log" - -export DEBUG_JSON_PATH="/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/debug_mode_hf" -run_hf "$@" 2>&1 | tee ${HF_LOG} || true -export DEBUG_JSON_PATH="/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/debug_mode_tt" -run_tt "$@" 2>&1 | tee ${TT_LOG} || true -# run_tt "$@" 2>&1 | tee ${HF_LOG} - - -# Filter logs to remove noisy differences -TT_LOG_FILTERED="${TT_LOG}.filtered" -HF_LOG_FILTERED="${HF_LOG}.filtered" - -# This sed command removes timestamps, PIDs, master ports, and other -# volatile details 
that change between runs. -# Feel free to adjust the regex patterns to better suit your log format. -sed -E \ - -e 's/([0-9]{4}-[0-9]{2}-[0-9]{2} )?[0-9]{2}:[0-9]{2}:[0-9]{2}(,[0-9]+)?/TIMESTAMP/g' \ - -e 's/torchrun.*--master_port[= ]([0-9]+)/torchrun ... --master_port=XXXX/g' \ - -e 's/PID [0-9]+/PID XXXX/g' \ - -e 's/localhost:[0-9]+/localhost:XXXX/g' \ - < "${TT_LOG}" > "${TT_LOG_FILTERED}" - -sed -E \ - -e 's/([0-9]{4}-[0-9]{2}-[0-9]{2} )?[0-9]{2}:[0-9]{2}:[0-9]{2}(,[0-9]+)?/TIMESTAMP/g' \ - -e 's/torchrun.*--master_port[= ]([0-9]+)/torchrun ... --master_port=XXXX/g' \ - -e 's/PID [0-9]+/PID XXXX/g' \ - -e 's/localhost:[0-9]+/localhost:XXXX/g' \ - < "${HF_LOG}" > "${HF_LOG_FILTERED}" - -echo "############################################" -echo "### Diff between TT and HF run logs ###" -echo "############################################" -echo "### Log diff is being saved to ${DIFF_LOG}" -echo "############################################" -git diff --no-index --color=always --word-diff=color "${TT_LOG_FILTERED}" "${HF_LOG_FILTERED}" | tee "${DIFF_LOG}" || true diff --git a/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml b/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml deleted file mode 100644 index 95aa9599b2..0000000000 --- a/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml +++ /dev/null @@ -1,62 +0,0 @@ -[job] -dump_folder = "./outputs" -description = "HF Llama 3 debug training" -print_args = false -use_for_integration_test = true - -[profiling] -enable_profiling = true -save_traces_folder = "profile_trace_hf" -profile_freq = 5 -enable_memory_snapshot = false -save_memory_snapshot_folder = "memory_snapshot" - -[metrics] -log_freq = 1 -disable_color_printing = false -enable_tensorboard = false -save_tb_folder = "tb" -enable_wandb = false - -[model] -name = "meta-llama/Llama-3.2-1B" -flavor = "debugmodel" -tokenizer_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" - -[optimizer] -name = "AdamW" -lr = 8e-4 -eps = 1e-8 - -[lr_scheduler] -warmup_steps = 2 -decay_ratio = 0.8 -decay_type = "linear" -min_lr_factor = 0.0 - -[training] -local_batch_size = 8 -seq_len = 2048 -max_norm = 1.0 -steps = 10 -compile = false -dataset = "c4_test" -dataset_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test" - -[parallelism] -data_parallel_replicate_degree = 1 -data_parallel_shard_degree = 1 -tensor_parallel_degree = 1 -pipeline_parallel_degree = 1 -context_parallel_degree = 1 -expert_parallel_degree = 1 - -[checkpoint] -enable_checkpoint = false - -[activation_checkpoint] -mode = "selective" -selective_ac_option = '2' - -[validation] -enabled = false \ No newline at end of file diff --git a/torchtitan/experiments/transformers_backend/configs/debug_fsdp_2_gpu.toml b/torchtitan/experiments/transformers_backend/configs/debug_fsdp_2_gpu.toml deleted file mode 100644 index db97c9b339..0000000000 --- a/torchtitan/experiments/transformers_backend/configs/debug_fsdp_2_gpu.toml +++ /dev/null @@ -1,65 +0,0 @@ -# FSDP-only configuration for a 2-GPU setup. -# Model is sharded across GPUs. 
- -[job] -dump_folder = "./outputs" -description = "Llama 3 debug training with FSDP on 2 GPUs" -print_args = false -use_for_integration_test = true - -[profiling] -enable_profiling = false -save_traces_folder = "profile_trace" -profile_freq = 10 -enable_memory_snapshot = false -save_memory_snapshot_folder = "memory_snapshot" - -[metrics] -log_freq = 1 -disable_color_printing = false -enable_tensorboard = false -save_tb_folder = "tb" -enable_wandb = false - -[model] -name = "llama3" -flavor = "debugmodel" -tokenizer_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" - -[optimizer] -name = "AdamW" -lr = 8e-4 -eps = 1e-8 - -[lr_scheduler] -warmup_steps = 2 -decay_ratio = 0.8 -decay_type = "linear" -min_lr_factor = 0.0 - -[training] -local_batch_size = 8 -seq_len = 2048 -max_norm = 1.0 -steps = 10 -compile = false -dataset = "c4_test" -dataset_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test" - -[parallelism] -data_parallel_replicate_degree = 1 -data_parallel_shard_degree = 2 -tensor_parallel_degree = 1 -pipeline_parallel_degree = 1 -context_parallel_degree = 1 -expert_parallel_degree = 1 - -[checkpoint] -enable_checkpoint = false - -[activation_checkpoint] -mode = "selective" -selective_ac_option = '2' - -[validation] -enabled = false \ No newline at end of file diff --git a/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.toml b/torchtitan/experiments/transformers_backend/configs/test_template.toml similarity index 95% rename from torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.toml rename to torchtitan/experiments/transformers_backend/configs/test_template.toml index b153a98f21..f56a0332d7 100644 --- a/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.toml +++ b/torchtitan/experiments/transformers_backend/configs/test_template.toml @@ -53,7 +53,10 @@ fsdp_reshard_after_forward = "default" # default / never / always tensor_parallel_degree = 1 enable_async_tensor_parallel = false pipeline_parallel_degree = 1 +pipeline_parallel_schedule = "1F1B" context_parallel_degree = 1 +expert_parallel_degree = 1 +expert_tensor_parallel_degree = 1 [checkpoint] enable = false diff --git a/torchtitan/experiments/transformers_backend/reference_diff_deepseekv3_1gpu.log b/torchtitan/experiments/transformers_backend/reference_diff_deepseekv3_1gpu.log deleted file mode 100644 index 1155c9a5db..0000000000 --- a/torchtitan/experiments/transformers_backend/reference_diff_deepseekv3_1gpu.log +++ /dev/null @@ -1,163 +0,0 @@ -diff --git a/tt_run.log.filtered b/hf_run.log.filtered -index 9726db6..84b6138 100644 ---- a/tt_run.log.filtered -+++ b/hf_run.log.filtered -@@ -1,85 +1,153 @@ -+ echo '##############################################' -##############################################'#######################################################' -####################################################### -+ echo '### Running TorchTitan (native)with HF backend training ###' -### Running TorchTitan (native)with HF backend training ### -+ echo '##############################################' -##############################################'#######################################################' -####################################################### -+ 
TT_CONFIG=/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.tomlHF_CONFIG=/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml -+ CUDA_VISIBLE_DEVICES=0CUDA_VISIBLE_DEVICES=1 -+ torchrun ... --master_port=XXXX --rdzv_backend c10d --rdzv_endpoint=localhost:XXXX --local-ranks-filter 0 --role rank --tee 3 -m torchtitan.train --job.config_file /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.toml/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml --training.seed 42 --training.deterministic --model.name deepseek_v3deepseek-ai/DeepSeek-V3 -[rank0]:/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/transformers/src/transformers/utils/hub.py:111: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. -[rank0]: warnings.warn( -[rank0]:[titan] TIMESTAMP - root - WARNING - tokenizer_path is deprecated, use model.hf_assets_path instead. Setting hf_assets_path to tokenizer_path temporarily. -[rank0]:[titan] TIMESTAMP - root - INFO - Starting job: HF Llama 3 debug training -[rank0]:[titan] TIMESTAMP - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config -[rank0]:[titan] TIMESTAMP - root - INFO - Building 0-D device mesh with [], [] -[rank0]:[titan] TIMESTAMP - root - INFO - [GC] Initial GC collection 0.00 seconds -[rank0]:[titan] TIMESTAMP - root - INFO - Deterministic algorithm enabled (expect perf degradation). -[rank0]:[titan] TIMESTAMP - root - INFO - Loading tokenizer from tokenizer.json -[rank0]:[titan] TIMESTAMP - root - INFO - Preparing c4_test dataset from /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test -[rank0]:[titan] TIMESTAMP - root - INFO - Building deepseek_v3deepseek-ai/DeepSeek-V3 debugmodel with DeepSeekV3ModelArgs(_enforced='This field is used to enforce all fields have defaults.', max_batch_size=8, max_seq_len=2048, vocab_size=2000, dim=256, inter_dim=1024, moe_inter_dim=256, n_layers=2, n_dense_layers=1, n_heads=16, norm_eps=1e-05, moe_args=MoEArgs(num_experts=8,HFTransformerModelArgs( -[rank0]:attn_implementation='sdpa' -[rank0]:attn_mask_type='causal' -[rank0]:beta_fast=None -[rank0]:beta_slow=None -[rank0]:depth_init=True -[rank0]:dim=256 -[rank0]:eos_id=0 -[rank0]:ffn_dim_multiplier=None -[rank0]:inter_dim=1024 -[rank0]:kv_lora_rank=512 -[rank0]:max_seq_len=2048 -[rank0]:moe_args=MoEArgs(num_experts=8, num_shared_experts=2, score_func='softmax', route_norm=True, route_scale=1.0, score_before_experts=False, top_k=3, use_grouped_mm=True, load_balance_coeff=0.001), n_expert_groups=1, n_limited_groups=1, q_lora_rank=0, kv_lora_rank=512, qk_nope_head_dim=128, qk_rope_head_dim=64, v_head_dim=128, use_flex_attn=False, attn_mask_type='causal', original_seq_len=4096, rope_theta=10000.0, rope_factor=40, beta_fast=32, beta_slow=1, mscale=0.7)load_balance_coeff=0.001) -[rank0]:moe_inter_dim=256 -[rank0]:moe_intermediate_size=256 -[rank0]:mscale=0.7 -[rank0]:multiple_of=256 -[rank0]:n_dense_layers=1 -[rank0]:n_expert_groups=None -[rank0]:n_group=2 -[rank0]:n_heads=16 -[rank0]:n_kv_heads=16 -[rank0]:n_layers=2 -[rank0]:n_limited_groups=None -[rank0]:n_routed_experts=8 -[rank0]:n_shared_experts=2 -[rank0]:norm_eps=1e-05 -[rank0]:num_experts_per_tok=3 
-[rank0]:original_seq_len=None -[rank0]:partial_rotary_factor=4.0 -[rank0]:q_lora_rank=None -[rank0]:qk_nope_head_dim=128 -[rank0]:qk_rope_head_dim=64 -[rank0]:rope_factor=None -[rank0]:rope_theta=10000 -[rank0]:topk_group=1 -[rank0]:use_flex_attn=False -[rank0]:v_head_dim=128 -[rank0]:vocab_size=2000 -[rank0]:) -[rank0]:[titan] TIMESTAMP - root - INFO - CUDA capacity: NVIDIA H100 80GB HBM3 with 79.44GiB memory -[rank0]:[titan] TIMESTAMP - root - INFO - Total parameter count: dense 8,923,392, sparse 1,968,128, active 9,908,480 -[rank0]:[titan] TIMESTAMP - root - INFO - Model Structure Parameter Breakdown: -[rank0]:[titan] TIMESTAMP - root - INFO - DeepSeekV3ModelHFTransformerModel - 10,891,520 params -[rank0]:[titan] TIMESTAMP - root - INFO - (tok_embeddings):(embed_tokens): Embedding - 512,000 params -[rank0]:[titan] TIMESTAMP - root - INFO - (layers): ModuleDictModuleList - 9,867,264 params -[rank0]:[titan] TIMESTAMP - root - INFO - (0): TransformerBlockDeepseekV3DecoderLayer - 4,342,784 params -[rank0]:[titan] TIMESTAMP - root - INFO - (attention): Attention(self_attn): DeepseekV3Attention - 3,555,840 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wq):(q_proj): Linear - 786,432 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wkv_a):(kv_a_proj_with_mqa): Linear - 147,456 params -[rank0]:[titan] TIMESTAMP - root - INFO - (kv_norm): RMSNorm(kv_a_layernorm): DeepseekV3RMSNorm - 512 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wkv_b):(kv_b_proj): Linear - 2,097,152 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wo):(o_proj): Linear - 524,288 params -[rank0]:[titan] TIMESTAMP - root - INFO - (attention_norm): RMSNorm(mlp): DeepseekV3MLP - 256786,432 params -[rank0]:[titan] TIMESTAMP - root - INFO - (ffn_norm): RMSNorm(gate_proj): Linear - 256262,144 params -[rank0]:[titan] TIMESTAMP - root - INFO - (feed_forward): FeedForward(up_proj): Linear - 786,432262,144 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w1):(down_proj): Linear - 262,144 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w2): Linear(input_layernorm): DeepseekV3RMSNorm - 262,144256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w3): Linear(post_attention_layernorm): DeepseekV3RMSNorm - 262,144256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (1): TransformerBlockDeepseekV3DecoderLayer - 5,524,480 params -[rank0]:[titan] TIMESTAMP - root - INFO - (attention): Attention(self_attn): DeepseekV3Attention - 3,555,840 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wq):(q_proj): Linear - 786,432 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wkv_a):(kv_a_proj_with_mqa): Linear - 147,456 params -[rank0]:[titan] TIMESTAMP - root - INFO - (kv_norm): RMSNorm(kv_a_layernorm): DeepseekV3RMSNorm - 512 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wkv_b):(kv_b_proj): Linear - 2,097,152 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wo):(o_proj): Linear - 524,288 params -[rank0]:[titan] TIMESTAMP - root - INFO - (attention_norm): RMSNorm(mlp): DeepseekV3MoE - 2561,968,128 params -[rank0]:[titan] TIMESTAMP - root - INFO - (ffn_norm): RMSNorm(experts): ModuleList - 2561,572,864 params -[rank0]:[titan] TIMESTAMP - root - INFO - (moe): MoE(0): DeepseekV3MLP - 1,968,128196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (experts): GroupedExperts(gate_proj): Linear - 1,572,86465,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (router): TokenChoiceTopKRouter(up_proj): Linear - 2,04865,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (gate):(down_proj): Linear - 65,536 params 
-[rank0]:[titan] TIMESTAMP - root - INFO - (1): DeepseekV3MLP - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (gate_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (up_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (down_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (2): DeepseekV3MLP - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (gate_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (up_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (down_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (3): DeepseekV3MLP - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (gate_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (up_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (down_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (4): DeepseekV3MLP - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (gate_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (up_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (down_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (5): DeepseekV3MLP - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (gate_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (up_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (down_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (6): DeepseekV3MLP - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (gate_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (up_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (down_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (7): DeepseekV3MLP - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (gate_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (up_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (down_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (gate): DeepseekV3TopkRouter - 2,048 params -[rank0]:[titan] TIMESTAMP - root - INFO - (shared_experts): FeedForwardDeepseekV3MLP - 393,216 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w1):(gate_proj): Linear - 131,072 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w2):(up_proj): Linear - 131,072 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w3):(down_proj): Linear - 131,072 params -[rank0]:[titan] TIMESTAMP - root - INFO - (input_layernorm): DeepseekV3RMSNorm - 256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (post_attention_layernorm): DeepseekV3RMSNorm - 256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (norm): RMSNormDeepseekV3RMSNorm - 256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (output):(lm_head): Linear - 512,000 params -[rank0]:[titan] TIMESTAMP - root - INFO - Model deepseek_v3deepseek-ai/DeepSeek-V3 debugmodel size: 10,891,520 total parameters -[rank0]:[titan] TIMESTAMP - root - INFO - Applied selective activation checkpointing to the model -[rank0]:[titan] TIMESTAMP - root - INFO - Peak FLOPS used for computing MFU: 9.890e+14 -[rank0]:[titan] TIMESTAMP - root - INFO - CUDA memory usage for model: 0.05GiB(0.06%) -[rank0]:[titan] TIMESTAMP - root - INFO - Mixed precision training is handled by AMP -[rank0]:[titan] TIMESTAMP - root - INFO - Trainer is 
initialized with local batch size 8, global batch size 8, gradient accumulation steps 1, sequence length 2048, total steps 10 (warmup 2) -[rank0]:[titan] TIMESTAMP - root - INFO - Training starts at step 1 -[rank0]:[titan] TIMESTAMP - root - INFO - Profiling active. Traces will be saved at ./outputs/profile_trace -[rank0]:/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/env_torchtitan_official/lib/python3.12/site-packages/torch/nn/functional.py:2920: UserWarning: Mismatch dtype between input and weight: input dtype = c10::BFloat16, weight dtype = float, Cannot dispatch to fused implementation. (Triggered internally at /pytorch/aten/src/ATen/native/layer_norm.cpp:344.) -[rank0]: return torch.rms_norm(input, normalized_shape, weight, eps)./outputs/profile_trace_hf -[rank0]:[titan] TIMESTAMP - root - INFO - step: 1 loss: 8.13818.1218 grad_norm: 2.73742.7807 memory: 2.14GiB(2.70%)2.48GiB(3.13%) tps: 18,02411,445 tflops: 1.240.89 mfu: 0.13%0.09% -[rank0]:[titan] TIMESTAMP - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40 -[rank0]:[titan] TIMESTAMP - root - INFO - step: 2 loss: 7.02086.8905 grad_norm: 3.26153.2709 memory: 2.15GiB(2.71%)2.49GiB(3.13%) tps: 20,23217,755 tflops: 1.401.38 mfu: 0.14% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 3 loss: 5.26425.1682 grad_norm: 2.87352.8229 memory: 2.15GiB(2.71%)2.49GiB(3.13%) tps: 325,066119,606 tflops: 22.429.32 mfu: 2.27%0.94% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 4 loss: 4.82864.7719 grad_norm: 2.18852.2433 memory: 2.15GiB(2.71%)2.51GiB(3.15%) tps: 345,536135,937 tflops: 23.8310.59 mfu: 2.41%1.07% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 5 loss: 4.43704.3827 grad_norm: 2.30532.3779 memory: 2.15GiB(2.71%)2.51GiB(3.15%) tps: 296,009133,266 tflops: 20.4110.39 mfu: 2.06%1.05% -[rank0]:[titan] TIMESTAMP - root - INFO - Dumping profiler traces at step 5 -[rank0]:[titan] TIMESTAMP - root - INFO - Finished dumping profiler traces in 0.030.05 seconds -[rank0]:[titan] TIMESTAMP - root - INFO - step: 6 loss: 4.30634.2368 grad_norm: 2.24452.2557 memory: 2.15GiB(2.71%)2.71GiB(3.41%) tps: 136,06566,465 tflops: 9.385.18 mfu: 0.95%0.52% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 7 loss: 4.12534.0403 grad_norm: 1.96261.9132 memory: 2.15GiB(2.71%)2.71GiB(3.41%) tps: 299,863131,077 tflops: 20.6810.22 mfu: 2.09%1.03% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 8 loss: 4.06453.9796 grad_norm: 1.82991.8154 memory: 2.15GiB(2.71%)2.71GiB(3.41%) tps: 343,855147,955 tflops: 23.7111.53 mfu: 2.40%1.17% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 9 loss: 4.47584.4010 grad_norm: 1.47431.4965 memory: 2.15GiB(2.71%)2.71GiB(3.41%) tps: 346,707139,416 tflops: 23.9110.87 mfu: 2.42%1.10% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 10 loss: 3.94833.8448 grad_norm: 1.62401.6185 memory: 2.15GiB(2.71%)2.71GiB(3.41%) tps: 303,029139,581 tflops: 20.9010.88 mfu: 2.11%1.10% -[rank0]:[titan] TIMESTAMP - root - INFO - Dumping profiler traces at step 10 -[rank0]:[titan] TIMESTAMP - root - INFO - Finished dumping profiler traces in 0.020.04 seconds -[rank0]:[titan] TIMESTAMP - root - INFO - Sleeping 2 seconds for other ranks to complete -[rank0]:[titan] TIMESTAMP - root - INFO - Training completed -[rank0]:[titan] TIMESTAMP - root - INFO - Process group destroyed diff --git a/torchtitan/experiments/transformers_backend/reference_diff_llama3_1gpu.log b/torchtitan/experiments/transformers_backend/reference_diff_llama3_1gpu.log deleted file mode 100644 index 84eff10ff8..0000000000 --- 
a/torchtitan/experiments/transformers_backend/reference_diff_llama3_1gpu.log +++ /dev/null @@ -1,133 +0,0 @@ -diff --git a/tt_run.log.filtered b/hf_run.log.filtered -index 1f72d39..c1856a6 100644 ---- a/tt_run.log.filtered -+++ b/hf_run.log.filtered -@@ -1,125 +1,125 @@ -+ echo '##############################################' -##############################################'#######################################################' -####################################################### -+ echo '### Running TorchTitan (native)with HF backend training ###' -### Running TorchTitan (native)with HF backend training ### -+ echo '##############################################' -##############################################'#######################################################' -####################################################### -+ TT_CONFIG=/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.tomlHF_CONFIG=/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml -+ CUDA_VISIBLE_DEVICES=0CUDA_VISIBLE_DEVICES=1 -+ torchrun ... --master_port=XXXX --rdzv_backend c10d --rdzv_endpoint=localhost:XXXX --local-ranks-filter 0 --role rank --tee 3 -m torchtitan.train --job.config_file /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_tt.toml/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/experiments/transformers_backend/configs/debug_1_gpu_hf.toml --training.seed 42 --training.deterministic --model.name llama3meta-llama/Llama-3.2-1B -[rank0]:/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/transformers/src/transformers/utils/hub.py:111: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead. -[rank0]: warnings.warn( -[rank0]:[titan] TIMESTAMP - root - WARNING - tokenizer_path is deprecated, use model.hf_assets_path instead. Setting hf_assets_path to tokenizer_path temporarily. -[rank0]:[titan] TIMESTAMP - root - INFO - Starting job: HF Llama 3 debug training -[rank0]:[titan] TIMESTAMP - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config -[rank0]:[titan] TIMESTAMP - root - INFO - Building 0-D device mesh with [], [] -[rank0]:[titan] TIMESTAMP - root - INFO - [GC] Initial GC collection 0.00 seconds -[rank0]:[titan] TIMESTAMP - root - INFO - Deterministic algorithm enabled (expect perf degradation). 
-[rank0]:[titan] TIMESTAMP - root - INFO - Loading tokenizer from tokenizer.json -[rank0]:[titan] TIMESTAMP - root - INFO - Preparing c4_test dataset from /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test -[rank0]:[titan] TIMESTAMP - root - INFO - Building llama3meta-llama/Llama-3.2-1B debugmodel with TransformerModelArgs(_enforced='This field is used to enforce all fields have defaults.', dim=256,HFTransformerModelArgs(dim=256, n_layers=6, n_heads=16, n_kv_heads=None,n_kv_heads=16, vocab_size=2000, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, rope_theta=500000, max_seq_len=2048, depth_init=True, use_flex_attn=False, attn_mask_type='causal', eos_id=0)eos_id=0, attn_implementation='sdpa') -[rank0]:[titan] TIMESTAMP - root - INFO - CUDA capacity: NVIDIA H100 80GB HBM3 with 79.44GiB memory -[rank0]:[titan] TIMESTAMP - root - INFO - Model Structure Parameter Breakdown: -[rank0]:[titan] TIMESTAMP - root - INFO - TransformerHFTransformerModel - 6,139,136 params -[rank0]:[titan] TIMESTAMP - root - INFO - (tok_embeddings):(embed_tokens): Embedding - 512,000 params -[rank0]:[titan] TIMESTAMP - root - INFO - (layers): ModuleDictModuleList - 5,114,880 params -[rank0]:[titan] TIMESTAMP - root - INFO - (0): TransformerBlockLlamaDecoderLayer - 852,480 params -[rank0]:[titan] TIMESTAMP - root - INFO - (attention): Attention(self_attn): LlamaAttention - 262,144 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wq):(q_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wk):(k_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wv):(v_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wo):(o_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (feed_forward): FeedForward(mlp): LlamaMLP - 589,824 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w1):(gate_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w2):(up_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w3):(down_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (attention_norm): RMSNorm(input_layernorm): LlamaRMSNorm - 256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (ffn_norm): RMSNorm(post_attention_layernorm): LlamaRMSNorm - 256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (1): TransformerBlockLlamaDecoderLayer - 852,480 params -[rank0]:[titan] TIMESTAMP - root - INFO - (attention): Attention(self_attn): LlamaAttention - 262,144 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wq):(q_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wk):(k_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wv):(v_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wo):(o_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (feed_forward): FeedForward(mlp): LlamaMLP - 589,824 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w1):(gate_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w2):(up_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w3):(down_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (attention_norm): RMSNorm(input_layernorm): LlamaRMSNorm - 256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (ffn_norm): RMSNorm(post_attention_layernorm): LlamaRMSNorm - 256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (2): TransformerBlockLlamaDecoderLayer - 852,480 params 
-[rank0]:[titan] TIMESTAMP - root - INFO - (attention): Attention(self_attn): LlamaAttention - 262,144 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wq):(q_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wk):(k_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wv):(v_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wo):(o_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (feed_forward): FeedForward(mlp): LlamaMLP - 589,824 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w1):(gate_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w2):(up_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w3):(down_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (attention_norm): RMSNorm(input_layernorm): LlamaRMSNorm - 256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (ffn_norm): RMSNorm(post_attention_layernorm): LlamaRMSNorm - 256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (3): TransformerBlockLlamaDecoderLayer - 852,480 params -[rank0]:[titan] TIMESTAMP - root - INFO - (attention): Attention(self_attn): LlamaAttention - 262,144 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wq):(q_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wk):(k_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wv):(v_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wo):(o_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (feed_forward): FeedForward(mlp): LlamaMLP - 589,824 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w1):(gate_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w2):(up_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w3):(down_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (attention_norm): RMSNorm(input_layernorm): LlamaRMSNorm - 256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (ffn_norm): RMSNorm(post_attention_layernorm): LlamaRMSNorm - 256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (4): TransformerBlockLlamaDecoderLayer - 852,480 params -[rank0]:[titan] TIMESTAMP - root - INFO - (attention): Attention(self_attn): LlamaAttention - 262,144 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wq):(q_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wk):(k_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wv):(v_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wo):(o_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (feed_forward): FeedForward(mlp): LlamaMLP - 589,824 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w1):(gate_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w2):(up_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w3):(down_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (attention_norm): RMSNorm(input_layernorm): LlamaRMSNorm - 256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (ffn_norm): RMSNorm(post_attention_layernorm): LlamaRMSNorm - 256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (5): TransformerBlockLlamaDecoderLayer - 852,480 params -[rank0]:[titan] TIMESTAMP - root - INFO - (attention): Attention(self_attn): LlamaAttention - 262,144 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wq):(q_proj): Linear - 65,536 params 
-[rank0]:[titan] TIMESTAMP - root - INFO - (wk):(k_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wv):(v_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (wo):(o_proj): Linear - 65,536 params -[rank0]:[titan] TIMESTAMP - root - INFO - (feed_forward): FeedForward(mlp): LlamaMLP - 589,824 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w1):(gate_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w2):(up_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (w3):(down_proj): Linear - 196,608 params -[rank0]:[titan] TIMESTAMP - root - INFO - (attention_norm): RMSNorm(input_layernorm): LlamaRMSNorm - 256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (ffn_norm): RMSNorm(post_attention_layernorm): LlamaRMSNorm - 256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (norm): RMSNormLlamaRMSNorm - 256 params -[rank0]:[titan] TIMESTAMP - root - INFO - (output):(lm_head): Linear - 512,000 params -[rank0]:[titan] TIMESTAMP - root - INFO - Model llama3meta-llama/Llama-3.2-1B debugmodel size: 6,139,136 total parameters -[rank0]:[titan] TIMESTAMP - root - INFO - Applied selective activation checkpointing to the model -[rank0]:[titan] TIMESTAMP - root - INFO - Peak FLOPS used for computing MFU: 9.890e+14 -[rank0]:[titan] TIMESTAMP - root - INFO - CUDA memory usage for model: 0.04GiB(0.05%) -[rank0]:[titan] TIMESTAMP - root - WARNING - model.safetensors.index.json not found at hf_assets_path: /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer/model.safetensors.index.json. Defaulting to saving a single safetensors file if checkpoint is saved in HF format -[rank0]:[titan] TIMESTAMP - root - INFO - Mixed precision training is handled by AMP -[rank0]:[titan] TIMESTAMP - root - INFO - Trainer is initialized with local batch size 8, global batch size 8, gradient accumulation steps 1, sequence length 2048, total steps 10 (warmup 2) -[rank0]:[titan] TIMESTAMP - root - INFO - Training starts at step 1 -[rank0]:[titan] TIMESTAMP - root - INFO - Profiling active. 
Traces will be saved at ./outputs/profile_trace./outputs/profile_trace_hf -[rank0]:[titan] TIMESTAMP - root - INFO - step: 1 loss: 7.87237.8704 grad_norm: 1.51671.5185 memory: 1.39GiB(1.75%)1.67GiB(2.10%) tps: 43,37532,685 tflops: 3.102.44 mfu: 0.31%0.25% -[rank0]:[titan] TIMESTAMP - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40 -[rank0]:[titan] TIMESTAMP - root - INFO - step: 2 loss: 7.52467.5209 grad_norm: 1.63591.6373 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 20,83419,798 tflops: 1.491.48 mfu: 0.15% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 3 loss: 6.79006.7789 grad_norm: 2.03452.0390 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 338,323199,161 tflops: 24.1914.85 mfu: 2.45%1.50% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 4 loss: 5.98295.9673 grad_norm: 2.41292.4176 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 362,741207,198 tflops: 25.9415.45 mfu: 2.62%1.56% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 5 loss: 5.05365.0388 grad_norm: 2.53052.5275 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 287,315187,882 tflops: 20.5514.01 mfu: 2.08%1.42% -[rank0]:[titan] TIMESTAMP - root - INFO - Dumping profiler traces at step 5 -[rank0]:[titan] TIMESTAMP - root - INFO - Finished dumping profiler traces in 0.030.04 seconds -[rank0]:[titan] TIMESTAMP - root - INFO - step: 6 loss: 4.63704.6283 grad_norm: 2.28262.2818 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 130,12183,115 tflops: 9.316.20 mfu: 0.94%0.63% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 7 loss: 4.31334.3077 grad_norm: 2.10192.1023 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 295,546174,068 tflops: 21.1312.98 mfu: 2.14%1.31% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 8 loss: 4.13984.1349 grad_norm: 1.93421.9334 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 361,129206,837 tflops: 25.8215.43 mfu: 2.61%1.56% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 9 loss: 4.53264.5289 grad_norm: 1.51111.5103 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 363,728208,233 tflops: 26.0115.53 mfu: 2.63%1.57% -[rank0]:[titan] TIMESTAMP - root - INFO - step: 10 loss: 3.98593.9828 grad_norm: 1.77991.7849 memory: 1.52GiB(1.91%)1.79GiB(2.26%) tps: 294,013188,295 tflops: 21.0314.04 mfu: 2.13%1.42% -[rank0]:[titan] TIMESTAMP - root - INFO - Dumping profiler traces at step 10 -[rank0]:[titan] TIMESTAMP - root - INFO - Finished dumping profiler traces in 0.030.04 seconds -[rank0]:[titan] TIMESTAMP - root - INFO - Sleeping 2 seconds for other ranks to complete -[rank0]:[titan] TIMESTAMP - root - INFO - Training completed -[rank0]:[titan] TIMESTAMP - root - INFO - Process group destroyed diff --git a/torchtitan/experiments/transformers_backend/run_train.sh b/torchtitan/experiments/transformers_backend/run_train.sh deleted file mode 100755 index 6151fcda64..0000000000 --- a/torchtitan/experiments/transformers_backend/run_train.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/bash -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. - -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -set -ex - -# use envs as local overwrites for convenience -# e.g. 
-# BACKEND=tt LOG_RANK=0,1 NGPU=4 ./run_train.sh -NGPU=${NGPU:-"8"} -export LOG_RANK=${LOG_RANK:-0} - -DEBUG_PORT=${DEBUG_PORT:-5678} -# Option to switch between debug and train -MODE=${MODE:-"train"} # Set MODE=debug or MODE=train - -# Option to switch between hf and tt backend -BACKEND=${BACKEND:-"hf"} - -if [ "$BACKEND" = "tt" ]; then - CONFIG_FILE=${CONFIG_FILE:-"/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/torchtitan/models/llama3/train_configs/my_debug_model.toml"} -elif [ "$BACKEND" = "hf" ]; then - CONFIG_FILE=${CONFIG_FILE:-"configs/debug_1_gpu_hf.toml"} -else - echo "Invalid BACKEND set: ${BACKEND}" - exit 1 -fi - -if [ "$MODE" = "debug" ]; then - PYTHON_CMD="debugpy-run -p ${DEBUG_PORT} -m torch.distributed.run --" -else - PYTHON_CMD="torchrun" -fi - -TORCHFT_LIGHTHOUSE=${TORCHFT_LIGHTHOUSE:-"http://localhost:29510"} - -PYTORCH_ALLOC_CONF="expandable_segments:True" \ -TORCHFT_LIGHTHOUSE=${TORCHFT_LIGHTHOUSE} \ -$PYTHON_CMD --nproc_per_node=${NGPU} --rdzv_backend c10d --rdzv_endpoint="localhost:0" \ ---local-ranks-filter ${LOG_RANK} --role rank --tee 3 \ --m torchtitan.train --job.config_file ${CONFIG_FILE} "$@" \ No newline at end of file From c523ede6e930d30a84553b4f2233f8fd0691d1d6 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 29 Sep 2025 14:09:06 +0000 Subject: [PATCH 052/129] cleaner way to make create_causal_mask = None --- .../infra/parallelize_hf_transformers.py | 1 - .../model/hf_llama_patch.py | 69 +------------------ .../model/hf_transformers_args.py | 6 +- 3 files changed, 6 insertions(+), 70 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py index 56d6cf9ca6..469c3407a8 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -192,7 +192,6 @@ def parallelize_hf_transformers( if ( job_config.parallelism.context_parallel_degree > 1 - and model.model_args.use_flex_attn ): logger.warning("CP support for FlexAttention is still in progress.") diff --git a/torchtitan/experiments/transformers_backend/model/hf_llama_patch.py b/torchtitan/experiments/transformers_backend/model/hf_llama_patch.py index ddde904cae..c3557f6973 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_llama_patch.py +++ b/torchtitan/experiments/transformers_backend/model/hf_llama_patch.py @@ -1,11 +1,8 @@ import torch import torch.nn as nn from transformers.models.llama.configuration_llama import LlamaConfig -from transformers.models.llama.modeling_llama import LlamaModel, LlamaAttention, LlamaMLP, LlamaDecoderLayer +from transformers.models.llama.modeling_llama import LlamaAttention, LlamaMLP, LlamaDecoderLayer from transformers.modeling_utils import PreTrainedModel -from transformers.cache_utils import Cache, DynamicCache -from transformers.modeling_outputs import BaseModelOutputWithPast -from typing import Optional _original_llama_decoder_layer_init = LlamaDecoderLayer.__init__ @@ -86,71 +83,7 @@ def _init_weights_patched(self, module): if hasattr(module, "bias") and module.bias is not None: module.bias.data.zero_() -def _patched_forward( - self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Cache] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - cache_position: 
Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - **kwargs, -) -> BaseModelOutputWithPast: - """ - A patched version of LlamaModel.forward that disables the causal mask. - This is a direct copy of the original method with one line changed. - """ - if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError("You must specify exactly one of input_ids or inputs_embeds") - - if inputs_embeds is None: - inputs_embeds: torch.Tensor = self.embed_tokens(input_ids) - - if use_cache and past_key_values is None: - past_key_values = DynamicCache() - - if cache_position is None: - past_seen_tokens = ( - past_key_values.get_seq_length() if past_key_values is not None else 0 - ) - cache_position: torch.Tensor = torch.arange( - past_seen_tokens, - past_seen_tokens + inputs_embeds.shape[1], - device=inputs_embeds.device, - ) - - if position_ids is None: - position_ids = cache_position.unsqueeze(0) - - # --- START OF PATCH --- - # NOTE(3outeille): When TP enabled, the causal_mask will be created based on input_embeds which has sharded seq_len. - # We set it to False so that SDPA is creating the causal mask based on query & key seq_len. - causal_mask = None - # --- END OF PATCH --- - - hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) - - for decoder_layer in self.layers[: self.config.num_hidden_layers]: - hidden_states = decoder_layer( - hidden_states, - attention_mask=causal_mask, - position_ids=position_ids, - past_key_value=past_key_values, - cache_position=cache_position, - position_embeddings=position_embeddings, - **kwargs, - ) - - hidden_states = self.norm(hidden_states) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=past_key_values, - ) - def patch_hf_llama(): - LlamaModel.forward = _patched_forward LlamaDecoderLayer.__init__ = _llama_decoder_layer_init_patched PreTrainedModel._init_weights = _init_weights_patched PreTrainedModel._initialize_weights = _initialize_weights_patched \ No newline at end of file diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index afafddd900..21fe8f1786 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -13,6 +13,8 @@ from torchtitan.tools.logging import logger from transformers import AutoConfig from transformers.configuration_utils import PretrainedConfig +from transformers.modeling_utils import AttentionInterface +from transformers.integrations.sdpa_attention import sdpa_attention_forward @dataclass class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): @@ -47,7 +49,7 @@ def __init__( titan_args, deepseek_v3_args=None, # HuggingFace specific args - attn_implementation: str = "sdpa", + attn_implementation: str = "sdpa_torchtitan", **kwargs, ): assert titan_args is not None, "titan_args is required" @@ -72,6 +74,8 @@ def __init__( # HuggingFace specific args self.attn_implementation = attn_implementation + #NOTE:(3outeille):This will force create_causal_mask to return None + AttentionInterface._global_mapping[attn_implementation] = sdpa_attention_forward # Start with passed_args as just titan_args self._passed_args = {**titan_args.__dict__, "attn_implementation": attn_implementation} From f9f5c66b1ceea0d49de22344078dfeaa724c7dc9 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 29 Sep 2025 14:58:41 +0000 
Subject: [PATCH 053/129] uniformize llama and moe args passing --- .../transformers_backend/__init__.py | 106 +++++++----------- .../compare_distributed_run.py | 10 +- .../model/hf_transformers_args.py | 12 +- 3 files changed, 61 insertions(+), 67 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 6e6894b109..ac0431ec3f 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -20,10 +20,6 @@ from .model.hf_transformers_args import HFTransformerModelArgs, HFTransformerModel from torchtitan.models.moe import MoEArgs -from .model.hf_llama_patch import patch_hf_llama -from .model.hf_deepseek_v3_patch import patch_hf_deepseek_v3 - - __all__ = [ "HFTransformerModelArgs", @@ -74,69 +70,49 @@ class DeepSeekV3Args: mscale: Optional[float] = None partial_rotary_factor: Optional[float] = None -# #TODO(3outeille): identify that if MoE model is used, we add a moe_args field - -if os.environ.get("MODEL_TYPE") == "llama3" or os.environ.get("MODEL_TYPE") == "meta-llama/Llama-3.2-1B": - print("Using llama model") - patch_hf_llama() - flavors = { - "debugmodel": HFTransformerModelArgs( - titan_args=TitanModelArgs( - max_seq_len=2048, - dim=256, - n_layers=6, - n_heads=16, - n_kv_heads=16, - vocab_size=2000, - rope_theta=500000 - ), - ), - "medium": HFTransformerModelArgs( - titan_args=TitanModelArgs( - dim=1024, - n_layers=12, - ), - ), - "full": HFTransformerModelArgs( - titan_args=TitanModelArgs(), +flavors = { + "debugmodel": HFTransformerModelArgs( + titan_args=TitanModelArgs( + vocab_size=2000, + dim=256, + n_layers=6, + n_heads=16, + n_kv_heads=16, ), - } -else: - print("Using deepseek model") - patch_hf_deepseek_v3() - flavors = { - "debugmodel": HFTransformerModelArgs( - titan_args=TitanModelArgs( - vocab_size=2000, - dim=256, - n_layers=2, - n_heads=16, - n_kv_heads=16, - ), - deepseek_v3_args=DeepSeekV3Args( - partial_rotary_factor=4.0, - inter_dim=1024, - moe_inter_dim=256, - n_dense_layers=1, - n_group=2, - topk_group=1, - kv_lora_rank=512, - q_lora_rank=0, - qk_nope_head_dim=128, - qk_rope_head_dim=64, - v_head_dim=128, - mscale=0.70, - moe_args=MoEArgs( - num_experts=8, - num_shared_experts=2, - top_k=3, - score_func="softmax", - route_norm=True, - score_before_experts=False, - ), - ) + deepseek_v3_args=None + # deepseek_v3_args=DeepSeekV3Args( + # partial_rotary_factor=4.0, + # inter_dim=1024, + # moe_inter_dim=256, + # n_dense_layers=1, + # n_group=2, + # topk_group=1, + # kv_lora_rank=512, + # q_lora_rank=0, + # qk_nope_head_dim=128, + # qk_rope_head_dim=64, + # v_head_dim=128, + # mscale=0.70, + # moe_args=MoEArgs( + # num_experts=8, + # num_shared_experts=2, + # top_k=3, + # score_func="softmax", + # route_norm=True, + # score_before_experts=False, + # ), + # ) + ), + "medium": HFTransformerModelArgs( + titan_args=TitanModelArgs( + dim=1024, + n_layers=12, ), - } + ), + "full": HFTransformerModelArgs( + titan_args=TitanModelArgs(), + ), +} hf_train_spec = TrainSpec( name="hf_auto_model", diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.py b/torchtitan/experiments/transformers_backend/compare_distributed_run.py index cc8f54f51b..3211326caf 100644 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.py +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.py @@ -511,7 +511,6 @@ def run_training(self, config_file: Path, log_file: Path, config_name: 
str, mode ] env = os.environ.copy() env["SEED"] = str(self.seed) - env["MODEL_TYPE"] = model_name env["LOG_RANK"] = str(self.ngpu - 1) log_message(LogLevel.COMMAND, f"Command: {' '.join(cmd)}", indent=indent, dim=dim) @@ -788,6 +787,15 @@ def run(self) -> int: if not self.compare_metrics( tt_baseline_metrics, hf_baseline_metrics, "baseline (TT) vs baseline (HF)", indent=0 ): + # generate diff between baseline TT and baseline HF + diff_file_tt_baseline_vs_hf_baseline = ( + self.results_dir / "diff_tt_baseline_vs_hf_baseline.log" + ) + self.generate_diff( + baseline_log_tt, baseline_log_hf, diff_file_tt_baseline_vs_hf_baseline, indent=0 + ) + log_message(LogLevel.INFO, f"Diff between baseline TT and baseline HF saved to: {diff_file_tt_baseline_vs_hf_baseline}", indent=0) + raise ValueError( f"Baseline (TT) vs baseline (HF) metrics comparison failed for {tt_model_name}" ) diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 21fe8f1786..2b9cec5678 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -15,6 +15,8 @@ from transformers.configuration_utils import PretrainedConfig from transformers.modeling_utils import AttentionInterface from transformers.integrations.sdpa_attention import sdpa_attention_forward +from torchtitan.experiments.transformers_backend.model.hf_llama_patch import patch_hf_llama +from torchtitan.experiments.transformers_backend.model.hf_deepseek_v3_patch import patch_hf_deepseek_v3 @dataclass class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): @@ -81,7 +83,7 @@ def __init__( self._passed_args = {**titan_args.__dict__, "attn_implementation": attn_implementation} self._passed_args.update(kwargs) - # If DeepSeekV3 args are provided, fill the rest + #NOTE(3outeille): Wait for transformers uniformization of MoE args if deepseek_v3_args is not None: # For DeepSeekV3, setting q_lora_rank to 0 in TorchTitan is equivalent to # setting it to None in HuggingFace. @@ -285,6 +287,14 @@ def __init__(self, model_args: HFTransformerModelArgs): f"Could not find model class '{model_class_name}' in globals or transformers. " f"Make sure the class is available. 
Original error: {e}" ) + + if model_args.architectures[0] == "DeepseekV3Model": + print("Patching deepseek") + patch_hf_deepseek_v3() + else: + print("Patching llama") + patch_hf_llama() + self.model = model_cls(config=model_args) for layer in self.model.model.layers: From 5a875b66a0947f87053369a2b565f731e11777be Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 29 Sep 2025 15:24:10 +0000 Subject: [PATCH 054/129] cleaning code --- .../model/hf_deepseek_v3_patch.py | 29 +---------- .../model/hf_transformers_args.py | 25 +--------- torchtitan/models/deepseek_v3/__init__.py | 2 +- torchtitan/models/deepseek_v3/model/args.py | 16 ------ torchtitan/models/deepseek_v3/model/model.py | 32 +----------- torchtitan/models/llama3/infra/parallelize.py | 1 + torchtitan/models/llama3/model/args.py | 16 ------ torchtitan/models/moe.py | 31 +----------- torchtitan/train.py | 11 +---- torchtitan/utils/test_utils.py | 49 +++++++++++++++++++ 10 files changed, 58 insertions(+), 154 deletions(-) create mode 100644 torchtitan/utils/test_utils.py diff --git a/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py b/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py index 68594dc2be..c2cb960ac5 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py +++ b/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py @@ -1,38 +1,13 @@ import os -import torch import torch.nn as nn -import functools +from torchtitan.utils.test_utils import seeded_init_decorator_for_test from transformers.models.deepseek_v3.configuration_deepseek_v3 import DeepseekV3Config from transformers.models.deepseek_v3.modeling_deepseek_v3 import DeepseekV3Attention, DeepseekV3MLP, DeepseekV3MoE, DeepseekV3DecoderLayer from transformers.modeling_utils import PreTrainedModel -_original_deepseek_v3_decoder_layer_init = DeepseekV3DecoderLayer.__init__ - -def seeded_init_decorator_for_test(seed): - """ - Decorator that adds torch.manual_seed before every nn.init.trunc_normal_ call - and prints layer weights after initialization. 
- """ - import lovely_tensors as lt; lt.monkey_patch() - def decorator(func): - @functools.wraps(func) - def wrapper(self, module): - original_trunc_normal = nn.init.trunc_normal_ - def seeded_trunc_normal(*args, **kwargs): - torch.manual_seed(seed) - tensor = args[0] # First argument is always the tensor - result = original_trunc_normal(*args, **kwargs) - # module_name = getattr(module, "__class__", type(module)).__name__ - # print(f"Module: {module_name}, Tensor value: {tensor}") - return result - - nn.init.trunc_normal_ = seeded_trunc_normal - return func(self, module) - - return wrapper - return decorator +_original_deepseek_v3_decoder_layer_init = DeepseekV3DecoderLayer.__init__ def _deepseek_v3_decoder_layer_init_patched(self, config: DeepseekV3Config, layer_idx: int): _original_deepseek_v3_decoder_layer_init(self, config, layer_idx) diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 2b9cec5678..917d50a43f 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -12,6 +12,7 @@ from torchtitan.protocols import BaseModelArgs from torchtitan.tools.logging import logger from transformers import AutoConfig +from transformers.utils import is_torch_deterministic from transformers.configuration_utils import PretrainedConfig from transformers.modeling_utils import AttentionInterface from transformers.integrations.sdpa_attention import sdpa_attention_forward @@ -247,30 +248,6 @@ def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, in return nparams, num_flops_per_token - def debug_structure_param(self, model: nn.Module): - logger.info("Model Structure Parameter Breakdown:") - - def _format_module(module: nn.Module, prefix: str = ""): - for name, sub_module in module.named_children(): - sub_module_params = sum(p.numel() for p in sub_module.parameters()) - if sub_module_params == 0: - continue - - # For HF models, we want to "unwrap" the ".model" attribute - # to get a view comparable to the native TorchTitan models. 
- if name == "model": - _format_module(sub_module, prefix) - else: - logger.info( - f"{prefix}({name}): {sub_module.__class__.__name__} - {sub_module_params:,} params" - ) - _format_module(sub_module, prefix + " ") - - total_params = sum(p.numel() for p in model.parameters()) - logger.info(f"{model.__class__.__name__} - {total_params:,} params") - _format_module(model, " ") - - class HFTransformerModel(nn.Module): def __init__(self, model_args: HFTransformerModelArgs): super().__init__() diff --git a/torchtitan/models/deepseek_v3/__init__.py b/torchtitan/models/deepseek_v3/__init__.py index 3322ad0a83..1c3d2b19d2 100644 --- a/torchtitan/models/deepseek_v3/__init__.py +++ b/torchtitan/models/deepseek_v3/__init__.py @@ -35,7 +35,7 @@ dim=256, inter_dim=1024, moe_inter_dim=256, - n_layers=2, + n_layers=3, n_dense_layers=1, n_heads=16, moe_args=MoEArgs( diff --git a/torchtitan/models/deepseek_v3/model/args.py b/torchtitan/models/deepseek_v3/model/args.py index 9451f01b01..d6afedfa34 100644 --- a/torchtitan/models/deepseek_v3/model/args.py +++ b/torchtitan/models/deepseek_v3/model/args.py @@ -159,19 +159,3 @@ def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, in ) return nparams, num_flops_per_token - - def debug_structure_param(self, model: nn.Module): - logger.info("Model Structure Parameter Breakdown:") - - def _format_module(module: nn.Module, prefix: str = ""): - for name, sub_module in module.named_children(): - sub_module_params = sum(p.numel() for p in sub_module.parameters()) - if sub_module_params > 0: - logger.info( - f"{prefix}({name}): {sub_module.__class__.__name__} - {sub_module_params:,} params" - ) - _format_module(sub_module, prefix + " ") - - total_params = sum(p.numel() for p in model.parameters()) - logger.info(f"{model.__class__.__name__} - {total_params:,} params") - _format_module(model, " ") diff --git a/torchtitan/models/deepseek_v3/model/model.py b/torchtitan/models/deepseek_v3/model/model.py index 5547840e27..260c7bf49a 100644 --- a/torchtitan/models/deepseek_v3/model/model.py +++ b/torchtitan/models/deepseek_v3/model/model.py @@ -6,7 +6,6 @@ import math import os -import functools from typing import Tuple import torch @@ -15,39 +14,10 @@ from torchtitan.models.attention import build_attention from torchtitan.models.moe import FeedForward, MoE from torchtitan.protocols.train_spec import ModelProtocol - +from torchtitan.utils.test_utils import seeded_init_decorator_for_test from .args import DeepSeekV3ModelArgs -def seeded_init_decorator_for_test(seed): - """ - Decorator that adds torch.manual_seed before every nn.init.trunc_normal_ call - and prints layer weights after initialization. 
- """ - import lovely_tensors as lt; lt.monkey_patch() - def decorator(func): - @functools.wraps(func) - def wrapper(*args, **kwargs): - original_trunc_normal = nn.init.trunc_normal_ - - def seeded_trunc_normal(*trunc_args, **trunc_kwargs): - torch.manual_seed(seed) - tensor = trunc_args[0] # First argument is always the tensor - result = original_trunc_normal(*trunc_args, **trunc_kwargs) - # # Try to get module info from the calling context - # module_name = "Unknown" - # if len(args) > 0 and hasattr(args[0], "__class__"): - # module_name = args[0].__class__.__name__ - # print(f"Module: {module_name}, Tensor value: {tensor}") - return result - - nn.init.trunc_normal_ = seeded_trunc_normal - return func(*args, **kwargs) - - return wrapper - return decorator - - # Adapted from https://github.com/DeepSeek-ai/DeepSeek-V3/blob/main/inference/model.py#L294 def precompute_freqs_cis(args: DeepSeekV3ModelArgs) -> torch.Tensor: """ diff --git a/torchtitan/models/llama3/infra/parallelize.py b/torchtitan/models/llama3/infra/parallelize.py index 1a2528be6d..7d0b5de92b 100644 --- a/torchtitan/models/llama3/infra/parallelize.py +++ b/torchtitan/models/llama3/infra/parallelize.py @@ -34,6 +34,7 @@ from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp from torchtitan.tools.logging import logger + def parallelize_llama( model: nn.Module, parallel_dims: ParallelDims, diff --git a/torchtitan/models/llama3/model/args.py b/torchtitan/models/llama3/model/args.py index 5aaf3839ed..e2f698f8b1 100644 --- a/torchtitan/models/llama3/model/args.py +++ b/torchtitan/models/llama3/model/args.py @@ -75,19 +75,3 @@ def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, in num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t return nparams, num_flops_per_token - - def debug_structure_param(self, model: nn.Module): - logger.info("Model Structure Parameter Breakdown:") - - def _format_module(module: nn.Module, prefix: str = ""): - for name, sub_module in module.named_children(): - sub_module_params = sum(p.numel() for p in sub_module.parameters()) - if sub_module_params > 0: - logger.info( - f"{prefix}({name}): {sub_module.__class__.__name__} - {sub_module_params:,} params" - ) - _format_module(sub_module, prefix + " ") - - total_params = sum(p.numel() for p in model.parameters()) - logger.info(f"{model.__class__.__name__} - {total_params:,} params") - _format_module(model, " ") \ No newline at end of file diff --git a/torchtitan/models/moe.py b/torchtitan/models/moe.py index 5ba63b9157..e2e3981625 100644 --- a/torchtitan/models/moe.py +++ b/torchtitan/models/moe.py @@ -13,36 +13,7 @@ from torchtitan.distributed.expert_parallel import expert_parallel import os -import functools - - -def seeded_init_decorator_for_test(seed): - """ - Decorator that adds torch.manual_seed before every nn.init.trunc_normal_ call - and prints layer weights after initialization. 
- """ - import lovely_tensors as lt; lt.monkey_patch() - def decorator(func): - @functools.wraps(func) - def wrapper(*args, **kwargs): - original_trunc_normal = nn.init.trunc_normal_ - - def seeded_trunc_normal(*trunc_args, **trunc_kwargs): - torch.manual_seed(seed) - tensor = trunc_args[0] # First argument is always the tensor - result = original_trunc_normal(*trunc_args, **trunc_kwargs) - # # Try to get module info from the calling context - # module_name = "Unknown" - # if len(args) > 0 and hasattr(args[0], "__class__"): - # module_name = args[0].__class__.__name__ - # print(f"Module: {module_name}, Tensor value: {tensor}") - return result - - nn.init.trunc_normal_ = seeded_trunc_normal - return func(*args, **kwargs) - - return wrapper - return decorator +from torchtitan.utils.test_utils import seeded_init_decorator_for_test @dataclass class MoEArgs: diff --git a/torchtitan/train.py b/torchtitan/train.py index 735180ee5a..6fee3d587f 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -8,12 +8,11 @@ import os import time from datetime import timedelta -from transformers.utils import is_torch_deterministic from typing import Any, Generator, Iterable, Optional import torch from torch.distributed.elastic.multiprocessing.errors import record - +from torchtitan.utils.test_utils import debug_structure_param import torchtitan.protocols.train_spec as train_spec_module from torchtitan.components.checkpoint import CheckpointManager from torchtitan.components.dataloader import DataloaderExhaustedError @@ -178,7 +177,7 @@ def __init__(self, job_config: JobConfig): self.metrics_processor.num_flops_per_token, ) = model_args.get_nparams_and_flops(model, job_config.training.seq_len) - model_args.debug_structure_param(model) + debug_structure_param(model) logger.info( f"{color.blue}Model {self.train_spec.name} {job_config.model.flavor} " @@ -250,9 +249,6 @@ def __init__(self, job_config: JobConfig): del model for m in self.model_parts: - if is_torch_deterministic(): - # Otherwise, HF register buffer for ROPE (inv_freq) and this will be by default be initialized to Nan - torch.utils.deterministic.fill_uninitialized_memory = False m.to_empty(device=init_device) with torch.no_grad(): m.init_weights(buffer_device=buffer_device) @@ -263,9 +259,6 @@ def __init__(self, job_config: JobConfig): else: # apply PT-D Tensor Parallel, activation checkpointing, torch.compile, Data Parallel model = self.train_spec.parallelize_fn(model, parallel_dims, job_config) - if is_torch_deterministic(): - # Otherwise, HF register buffer for ROPE (inv_freq) and this will be by default be initialized to Nan - torch.utils.deterministic.fill_uninitialized_memory = False model.to_empty(device=init_device) with torch.no_grad(): model.init_weights(buffer_device=buffer_device) diff --git a/torchtitan/utils/test_utils.py b/torchtitan/utils/test_utils.py new file mode 100644 index 0000000000..77db8bcfe6 --- /dev/null +++ b/torchtitan/utils/test_utils.py @@ -0,0 +1,49 @@ +import torch +import functools +import torch.nn as nn +from torchtitan.tools.logging import logger +from transformers.utils import is_torch_deterministic +import lovely_tensors as lt; lt.monkey_patch() + +def debug_structure_param(model: nn.Module): + """Print a breakdown of model parameters by module structure.""" + logger.info("Model Structure Parameter Breakdown:") + + if is_torch_deterministic(): + # Otherwise, HF register buffer for ROPE (inv_freq) and this will be by default be initialized to Nan + torch.utils.deterministic.fill_uninitialized_memory = False 
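A minimal usage sketch of the helper being added in torchtitan/utils/test_utils.py (illustrative only — TinyBlock and its init_weights are hypothetical and not part of this patch): wrapping an init routine with seeded_init_decorator_for_test swaps nn.init.trunc_normal_ for a version that reseeds the RNG on every call, so the TorchTitan and HF backends draw identical weights regardless of how many tensors were initialized earlier.

    import torch
    import torch.nn as nn
    from torchtitan.utils.test_utils import seeded_init_decorator_for_test

    class TinyBlock(nn.Module):  # hypothetical module, for illustration only
        def __init__(self, dim: int = 256):
            super().__init__()
            self.proj = nn.Linear(dim, dim, bias=False)

        @seeded_init_decorator_for_test(seed=42)
        def init_weights(self):
            # Inside the wrapped call, torch.manual_seed(42) runs before sampling,
            # making this draw reproducible across backends.
            nn.init.trunc_normal_(self.proj.weight, mean=0.0, std=0.02)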
+ + def _format_module(module: nn.Module, prefix: str = ""): + for name, sub_module in module.named_children(): + sub_module_params = sum(p.numel() for p in sub_module.parameters()) + if sub_module_params > 0: + logger.info( + f"{prefix}({name}): {sub_module.__class__.__name__} - {sub_module_params:,} params" + ) + _format_module(sub_module, prefix + " ") + + total_params = sum(p.numel() for p in model.parameters()) + logger.info(f"{model.__class__.__name__} - {total_params:,} params") + _format_module(model, " ") + +def seeded_init_decorator_for_test(seed): + """ + Decorator that adds torch.manual_seed before every nn.init.trunc_normal_ call + and prints layer weights after initialization. + """ + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + original_trunc_normal = nn.init.trunc_normal_ + + def seeded_trunc_normal(*trunc_args, **trunc_kwargs): + torch.manual_seed(seed) + tensor = trunc_args[0] # First argument is always the tensor + result = original_trunc_normal(*trunc_args, **trunc_kwargs) + return result + + nn.init.trunc_normal_ = seeded_trunc_normal + return func(*args, **kwargs) + + return wrapper + return decorator From e4d963c5bfff9b66cc3c1569447cfcd8381cc4df Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 30 Sep 2025 13:40:02 +0000 Subject: [PATCH 055/129] fix same global_batch_size across training + fix float32 for test (even for fsdp) --- .../compare_distributed_run.py | 45 +++++++++++++++---- .../configs/test_template.toml | 5 ++- 2 files changed, 40 insertions(+), 10 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.py b/torchtitan/experiments/transformers_backend/compare_distributed_run.py index 3211326caf..345dc91d33 100644 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.py +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.py @@ -11,6 +11,7 @@ - train the nd-// TT counterpart - diff between TT nd-// and HF nd-// - diff between TT FSDP (baseline) and HF nd-// + - diff between TT FSDP (baseline) and TF nd-// results/ |_ meta-llama |_ Llama-3.2-1B @@ -668,6 +669,32 @@ def _compare_one_parallelism_config( indent=indent + 5, dim=True, ) + + # generated diff between baseline TT and current tt nd-parallelism run + diff_file_tt_baseline_vs_tt_nd_parallelism = ( + test_dir / "diff_tt_baseline_vs_tt_nd_parallelism.log" + ) + self.generate_diff( + baseline_log_tt, + log_path_tt, + diff_file_tt_baseline_vs_tt_nd_parallelism, + indent=indent + 5, + dim=True, + ) + if tt_metrics: + self.compare_metrics( + tt_baseline_metrics, + tt_metrics, + f"{config.name} (TT baseline vs TT nd-parallel)", + indent=indent + 5, + dim=True, + ) + log_message( + LogLevel.INFO, + f"Diff between baseline TT and current (TT) nd-parallelism run saved to: {diff_file_tt_baseline_vs_tt_nd_parallelism}", + indent=indent + 5, + dim=True, + ) return False def run(self) -> int: @@ -784,18 +811,18 @@ def run(self) -> int: if not tt_baseline_metrics.loss or not tt_baseline_metrics.grad_norm: raise ValueError(f"Could not extract TorchTitan baseline metrics for {tt_model_name}") + # generate diff between baseline TT and baseline HF + diff_file_tt_baseline_vs_hf_baseline = ( + self.results_dir / "diff_tt_baseline_vs_hf_baseline.log" + ) + self.generate_diff( + baseline_log_tt, baseline_log_hf, diff_file_tt_baseline_vs_hf_baseline, indent=0 + ) + log_message(LogLevel.INFO, f"Diff between baseline TT and baseline HF saved to: {diff_file_tt_baseline_vs_hf_baseline}", indent=0) + if not 
self.compare_metrics( tt_baseline_metrics, hf_baseline_metrics, "baseline (TT) vs baseline (HF)", indent=0 ): - # generate diff between baseline TT and baseline HF - diff_file_tt_baseline_vs_hf_baseline = ( - self.results_dir / "diff_tt_baseline_vs_hf_baseline.log" - ) - self.generate_diff( - baseline_log_tt, baseline_log_hf, diff_file_tt_baseline_vs_hf_baseline, indent=0 - ) - log_message(LogLevel.INFO, f"Diff between baseline TT and baseline HF saved to: {diff_file_tt_baseline_vs_hf_baseline}", indent=0) - raise ValueError( f"Baseline (TT) vs baseline (HF) metrics comparison failed for {tt_model_name}" ) diff --git a/torchtitan/experiments/transformers_backend/configs/test_template.toml b/torchtitan/experiments/transformers_backend/configs/test_template.toml index f56a0332d7..238f325ba2 100644 --- a/torchtitan/experiments/transformers_backend/configs/test_template.toml +++ b/torchtitan/experiments/transformers_backend/configs/test_template.toml @@ -39,12 +39,15 @@ decay_type = "linear" min_lr_factor = 0.0 [training] -local_batch_size = 8 +global_batch_size = 4 +local_batch_size = 2 seq_len = 2048 max_norm = 1.0 # grad norm clipping steps = 10 dataset = "c4_test" # supported datasets: c4_test (2K), c4 (177M) dataset_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test" +mixed_precision_param = "float32" # force float32 for comparison +mixed_precision_reduce = "float32" [parallelism] data_parallel_replicate_degree = 1 From 957cc4a90007e1822430e435acad8456f6104b49 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 30 Sep 2025 15:29:12 +0000 Subject: [PATCH 056/129] refactor compare_distributed_run to make it slurm compatible --- .../compare_distributed_run.py | 281 ++++++++++-------- 1 file changed, 158 insertions(+), 123 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.py b/torchtitan/experiments/transformers_backend/compare_distributed_run.py index 345dc91d33..8ec761fda2 100644 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.py +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.py @@ -514,7 +514,7 @@ def run_training(self, config_file: Path, log_file: Path, config_name: str, mode env["SEED"] = str(self.seed) env["LOG_RANK"] = str(self.ngpu - 1) - log_message(LogLevel.COMMAND, f"Command: {' '.join(cmd)}", indent=indent, dim=dim) + log_message(LogLevel.COMMAND, f"{' '.join(cmd)}", indent=indent, dim=dim) try: # Capture output to include it in the exception, while still writing to log file @@ -565,137 +565,134 @@ def _compare_one_parallelism_config( indent: int = 0, ) -> bool: """Compares a single parallelism configuration against the baseline.""" - # Create a subdirectory for each test configuration + # New flow: launch all training, then all diff, then all extract/compare metrics + + # --- 1. 
Setup directories and config files --- test_dir_name = f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface" test_dir = self.results_dir / test_dir_name test_dir.mkdir(exist_ok=True) config_filename_hf = f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.toml" - config_file_hf = self.generate_config(config_dir=test_dir, config=config, model_name=hf_model_name, backend="huggingface", filename=config_filename_hf, indent=indent) + config_file_hf = self.generate_config( + config_dir=test_dir, + config=config, + model_name=hf_model_name, + backend="huggingface", + filename=config_filename_hf, + indent=indent, + ) log_path_hf = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.log" - hf_run_error = self.run_training(config_file=config_file_hf, log_file=log_path_hf, config_name=config.name, model_name=hf_model_name, indent=indent) - - test_passed = True - hf_metrics = None - if hf_run_error: - log_message(LogLevel.TEST_FAIL, f"{config.name} (huggingface) - Training script failed.", indent=indent + 5, dim=True) - test_passed = False - else: - # Compare metrics only if training was successful - hf_metrics = self.extract_metrics(log_path_hf, indent=indent) - if not self.compare_metrics(hf_baseline_metrics, hf_metrics, f"{config.name} (huggingface)", indent=indent + 5, dim=True): - test_passed = False + config_filename_tt = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" + config_file_tt = self.generate_config( + config_dir=test_dir, + config=config, + model_name=tt_model_name, + backend="torchtitan", + filename=config_filename_tt, + indent=indent + 5, + dim=True, + ) + log_path_tt = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.log" + + # --- 2. Launch all training (HF and TT) --- + hf_run_error = self.run_training( + config_file=config_file_hf, + log_file=log_path_hf, + config_name=config.name, + model_name=hf_model_name, + indent=indent, + ) + tt_run_error = self.run_training( + config_file=config_file_tt, + log_file=log_path_tt, + config_name=config.name, + model_name=tt_model_name, + indent=indent + 5, + dim=True, + ) - if test_passed: - return True - else: - # Generate diff with baseline (HF) - diff_hf_baseline_vs_hf_nd_parallelism = ( - test_dir / "diff_hf_baseline_vs_hf_nd_parallelism.log" - ) - self.generate_diff( - baseline_log_hf, log_path_hf, diff_hf_baseline_vs_hf_nd_parallelism, indent=indent + 5, dim=True - ) + # If either training failed, log and skip further steps for this config + if hf_run_error: log_message( - LogLevel.INFO, - f"Diff between baseline (HF) and current (HF) nd-parallelism run saved to: {diff_hf_baseline_vs_hf_nd_parallelism}", + LogLevel.TEST_FAIL, + f"{config.name} (huggingface) - Training script failed.", indent=indent + 5, dim=True, ) + return False - # Run TT counterpart and generated diff between nd-paralellism TT and current hf nd-parallelism run - config_filename_tt = ( - test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" - ) - config_file_tt = self.generate_config(config_dir=test_dir, config=config, model_name=tt_model_name, backend="torchtitan", filename=config_filename_tt, indent=indent + 5, dim=True) - log_path_tt = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.log" - tt_run_error = self.run_training(config_file=config_file_tt, log_file=log_path_tt, config_name=config.name, model_name=tt_model_name, indent=indent + 5, dim=True) - if tt_run_error: - raise ValueError( - f"TorchTitan training failed for {tt_model_name}" - ) from tt_run_error - 
- tt_metrics = self.extract_metrics(log_path_tt, indent=indent + 5, dim=True) - - # generated diff between nd-paralellism TT and current hf nd-parallelism run - diff_file_tt_nd_parallelism_vs_hf_nd_parallelism = ( - test_dir / "diff_tt_nd_parallelism_vs_hf_nd_parallelism.log" - ) - self.generate_diff( - log_path_tt, - log_path_hf, - diff_file_tt_nd_parallelism_vs_hf_nd_parallelism, + if tt_run_error: + log_message( + LogLevel.TEST_FAIL, + f"{config.name} (torchtitan) - Training script failed.", indent=indent + 5, dim=True, ) - if hf_metrics: - self.compare_metrics( + return False + + # --- 3. Generate all diffs --- + list_of_diffs = { + "HF baseline vs HF nd-parallel": (baseline_log_hf, log_path_hf, test_dir / "diff_hf_baseline_vs_hf_nd_parallelism.log"), + "TT nd-parallel vs HF nd-parallel": (log_path_tt, log_path_hf, test_dir / "diff_tt_nd_parallelism_vs_hf_nd_parallelism.log"), + "TT baseline vs HF nd-parallel": (baseline_log_tt, log_path_hf, test_dir / "diff_tt_baseline_vs_hf_nd_parallelism.log"), + "TT baseline vs TT nd-parallel": (baseline_log_tt, log_path_tt, test_dir / "diff_tt_baseline_vs_tt_nd_parallelism.log"), + } + for src, dst, output in list_of_diffs.values(): + self.generate_diff(src, dst, output, indent=indent + 5, dim=True) + + # --- 4. Extract all metrics --- + hf_metrics = self.extract_metrics(log_path_hf, indent=indent) + tt_metrics = self.extract_metrics(log_path_tt, indent=indent + 5, dim=True) + + # --- 5. Compare metrics and determine pass/fail --- + test_passed = True + + for diff_name, (src, dst, output) in list_of_diffs.items(): + if "TT nd-parallel vs HF nd-parallel" == diff_name: + metrics_passed = self.compare_metrics( tt_metrics, hf_metrics, - f"{config.name} (TT nd-parallel vs HF nd-parallel)", + diff_name, indent=indent + 5, dim=True, ) - log_message( - LogLevel.INFO, - f"Diff between nd-paralellism TT and current (HF) nd-parallelism run saved to: {diff_file_tt_nd_parallelism_vs_hf_nd_parallelism}", - indent=indent + 5, - dim=True, - ) - - # generated diff between baseline TT and current hf nd-parallelism run - diff_file_tt_baseline_vs_hf_nd_parallelism = ( - test_dir / "diff_tt_baseline_vs_hf_nd_parallelism.log" - ) - self.generate_diff( - baseline_log_tt, - log_path_hf, - diff_file_tt_baseline_vs_hf_nd_parallelism, - indent=indent + 5, - dim=True, - ) - if hf_metrics: - self.compare_metrics( + elif "TT baseline vs TT nd-parallel" == diff_name: + metrics_passed = self.compare_metrics( tt_baseline_metrics, - hf_metrics, - f"{config.name} (TT baseline vs HF nd-parallel)", + tt_metrics, + diff_name, indent=indent + 5, dim=True, ) - log_message( - LogLevel.INFO, - f"Diff between baseline TT and current (HF) nd-parallelism run saved to: {diff_file_tt_baseline_vs_hf_nd_parallelism}", - indent=indent + 5, - dim=True, - ) - - # generated diff between baseline TT and current tt nd-parallelism run - diff_file_tt_baseline_vs_tt_nd_parallelism = ( - test_dir / "diff_tt_baseline_vs_tt_nd_parallelism.log" - ) - self.generate_diff( - baseline_log_tt, - log_path_tt, - diff_file_tt_baseline_vs_tt_nd_parallelism, - indent=indent + 5, - dim=True, - ) - if tt_metrics: - self.compare_metrics( + elif "TT baseline vs HF nd-parallel" == diff_name: + metrics_passed = self.compare_metrics( tt_baseline_metrics, - tt_metrics, - f"{config.name} (TT baseline vs TT nd-parallel)", + hf_metrics, + diff_name, indent=indent + 5, dim=True, ) + else: # HF baseline vs HF nd-parallel == diff_name + metrics_passed = self.compare_metrics( + hf_baseline_metrics, + hf_metrics, + diff_name, + 
indent=indent + 5, + dim=True, + ) + + if not metrics_passed: + test_passed = False + log_message( LogLevel.INFO, - f"Diff between baseline TT and current (TT) nd-parallelism run saved to: {diff_file_tt_baseline_vs_tt_nd_parallelism}", - indent=indent + 5, + f"Diff between {diff_name} saved to: {output}", + indent=indent + 10, dim=True, ) - return False + + return test_passed def run(self) -> int: """Main execution function. Runs all test suites for all models.""" @@ -788,44 +785,82 @@ def run(self) -> int: ) baseline_config = next((c for c in self.parallelism_configs if c.name == "fsdp"), None) - + # --- 1. Generate configs --- baseline_config_filename_hf = f"baseline_{baseline_config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.toml" - baseline_config_file_hf = self.generate_config(config_dir=self.results_dir, config=baseline_config, model_name=hf_model_name, backend="huggingface", filename=baseline_config_filename_hf, indent=0) + baseline_config_file_hf = self.generate_config( + config_dir=self.results_dir, + config=baseline_config, + model_name=hf_model_name, + backend="huggingface", + filename=baseline_config_filename_hf, + indent=0 + ) baseline_log_hf = self.results_dir / f"baseline_hf_{baseline_config.name}_{self.ngpu}gpu.log" - hf_baseline_run_error = self.run_training(config_file=baseline_config_file_hf, log_file=baseline_log_hf, config_name=baseline_config.name, model_name=hf_model_name, indent=0) + + baseline_config_filename_tt = f"baseline_{baseline_config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" + baseline_config_file_tt = self.generate_config( + config_dir=self.results_dir, + config=baseline_config, + model_name=tt_model_name, + backend="torchtitan", + filename=baseline_config_filename_tt, + indent=0 + ) + baseline_log_tt = self.results_dir / f"baseline_tt_{baseline_config.name}_{self.ngpu}gpu.log" + + # --- 2. Launch all training --- + hf_baseline_run_error = self.run_training( + config_file=baseline_config_file_hf, + log_file=baseline_log_hf, + config_name=baseline_config.name, + model_name=hf_model_name, + indent=0 + ) if hf_baseline_run_error: raise ValueError(f"Huggingface baseline (FSDP) training failed for {hf_model_name}") from hf_baseline_run_error + tt_baseline_run_error = self.run_training( + config_file=baseline_config_file_tt, + log_file=baseline_log_tt, + config_name=baseline_config.name, + model_name=tt_model_name, + indent=0 + ) + if tt_baseline_run_error: + raise ValueError(f"TorchTitan baseline (FSDP) training failed for {tt_model_name}") from tt_baseline_run_error + + # --- 3. Generate diff --- + diff_file_tt_baseline_vs_hf_baseline = self.results_dir / "diff_tt_baseline_vs_hf_baseline.log" + self.generate_diff( + baseline_log_tt, + baseline_log_hf, + diff_file_tt_baseline_vs_hf_baseline, + indent=0 + ) + log_message( + LogLevel.INFO, + f"Diff between baseline TT and baseline HF saved to: {diff_file_tt_baseline_vs_hf_baseline}", + indent=5, + dim=True + ) + + # --- 4. 
Extract metrics --- hf_baseline_metrics = self.extract_metrics(baseline_log_hf, indent=0) if not hf_baseline_metrics.loss or not hf_baseline_metrics.grad_norm: raise ValueError(f"Could not extract huggingface baseline metrics for {hf_model_name}") - baseline_config_filename_tt = f"baseline_{baseline_config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" - baseline_config_file_tt = self.generate_config(config_dir=self.results_dir, config=baseline_config, model_name=tt_model_name, backend="torchtitan", filename=baseline_config_filename_tt, indent=0) - baseline_log_tt = self.results_dir / f"baseline_tt_{baseline_config.name}_{self.ngpu}gpu.log" - tt_baseline_run_error = self.run_training(config_file=baseline_config_file_tt, log_file=baseline_log_tt, config_name=baseline_config.name, model_name=tt_model_name, indent=0) - if tt_baseline_run_error: - raise ValueError(f"TorchTitan baseline (FSDP) training failed for {tt_model_name}") from tt_baseline_run_error - tt_baseline_metrics = self.extract_metrics(baseline_log_tt, indent=0) if not tt_baseline_metrics.loss or not tt_baseline_metrics.grad_norm: raise ValueError(f"Could not extract TorchTitan baseline metrics for {tt_model_name}") - - # generate diff between baseline TT and baseline HF - diff_file_tt_baseline_vs_hf_baseline = ( - self.results_dir / "diff_tt_baseline_vs_hf_baseline.log" - ) - self.generate_diff( - baseline_log_tt, baseline_log_hf, diff_file_tt_baseline_vs_hf_baseline, indent=0 - ) - log_message(LogLevel.INFO, f"Diff between baseline TT and baseline HF saved to: {diff_file_tt_baseline_vs_hf_baseline}", indent=0) - + + # --- 5. Compare metrics --- if not self.compare_metrics( - tt_baseline_metrics, hf_baseline_metrics, "baseline (TT) vs baseline (HF)", indent=0 + tt_baseline_metrics, + hf_baseline_metrics, + "baseline (TT) vs baseline (HF)", + indent=5 ): - raise ValueError( - f"Baseline (TT) vs baseline (HF) metrics comparison failed for {tt_model_name}" - ) + raise ValueError(f"Baseline (TT) vs baseline (HF) metrics comparison failed for {tt_model_name}") console.print() console.print( From a317c53dcddc0e0685c63bd2e21af0cfa13631c1 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 1 Oct 2025 12:31:03 +0000 Subject: [PATCH 057/129] breaking test --- .../compare_distributed_run.py | 206 ++++++++++++++---- 1 file changed, 169 insertions(+), 37 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.py b/torchtitan/experiments/transformers_backend/compare_distributed_run.py index 8ec761fda2..b42e8b0138 100644 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.py +++ b/torchtitan/experiments/transformers_backend/compare_distributed_run.py @@ -21,7 +21,7 @@ |_ baseline_tt_fsdp_4gpu.log |_ baseline_fsdp_debugmodel_4gpu_huggingface.toml |_ baseline_fsdp_debugmodel_4gpu_torchtitan.toml - |_ fsdp1_cp1_tp2_pp2_debugmodel_4gpu_huggingface/ + |_ fsdp1_cp1_tp2_pp2_debugmodel_4gpu/ |_ fsdp1_cp1_tp2_pp2_debugmodel_4gpu_huggingface.toml |_ fsdp1_cp1_tp2_pp2_debugmodel_4gpu_torchtitan.toml |_ fsdp1_cp1_tp2_pp2_debugmodel_4gpu_huggingface.log @@ -33,7 +33,7 @@ |_ baseline_tt_fsdp_4gpu.log |_ baseline_fsdp_full_4gpu_huggingface.toml |_ baseline_fsdp_full_4gpu_torchtitan.toml - |_ fsdp1_cp1_tp2_pp2_full_4gpu_huggingface/ + |_ fsdp1_cp1_tp2_pp2_full_4gpu/ |_ fsdp1_cp1_tp2_pp2_full_4gpu_huggingface.toml |_ fsdp1_cp1_tp2_pp2_full_4gpu_torchtitan.toml |_ fsdp1_cp1_tp2_pp2_full_4gpu_huggingface.log @@ -494,7 +494,7 @@ def _filter_log(log_file: Path) -> Path: except Exception as e: 
log_message(LogLevel.WARNING, f"Could not generate diff: {e}", indent=indent, dim=dim) - def run_training(self, config_file: Path, log_file: Path, config_name: str, model_name: str, indent: int = 0, dim: bool = False) -> Optional[subprocess.CalledProcessError]: + def run_training_local(self, config_file: Path, log_file: Path, config_name: str, model_name: str, indent: int = 0, dim: bool = False) -> Optional[subprocess.CalledProcessError]: """Run training with given configuration.""" log_message(LogLevel.INFO, f"Running training: {config_name} with model {model_name}", indent=indent, dim=dim) cmd = [ @@ -553,6 +553,9 @@ def run_training(self, config_file: Path, log_file: Path, config_name: str, mode e.add_note(f"\n--- Full output from failed process ---\n{e.stdout or ''}") return e + def run_training_slurm(self): + pass + def _compare_one_parallelism_config( self, config: "ParallelismConfig", @@ -568,7 +571,7 @@ def _compare_one_parallelism_config( # New flow: launch all training, then all diff, then all extract/compare metrics # --- 1. Setup directories and config files --- - test_dir_name = f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface" + test_dir_name = f"{config.name}_{self.flavor}_{self.ngpu}gpu" test_dir = self.results_dir / test_dir_name test_dir.mkdir(exist_ok=True) @@ -596,14 +599,14 @@ def _compare_one_parallelism_config( log_path_tt = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.log" # --- 2. Launch all training (HF and TT) --- - hf_run_error = self.run_training( + hf_run_error = self.run_training_local( config_file=config_file_hf, log_file=log_path_hf, config_name=config.name, model_name=hf_model_name, indent=indent, ) - tt_run_error = self.run_training( + tt_run_error = self.run_training_local( config_file=config_file_tt, log_file=log_path_tt, config_name=config.name, @@ -694,35 +697,8 @@ def _compare_one_parallelism_config( return test_passed - def run(self) -> int: + def run_local(self, args: argparse.Namespace) -> int: """Main execution function. Runs all test suites for all models.""" - parser = argparse.ArgumentParser( - description="Test different parallelism configurations against a baseline FSDP model.", - ) - parser.add_argument("-m", "--model-filter", default="", - help="Filter models by name pattern (e.g., 'llama3')") - parser.add_argument("-t", "--test-filter", default="", - help="Filter parallelism configurations by name pattern (e.g., 'fsdp1_cp1_tp2_pp2')") - parser.add_argument("-nd", "--nd_parallel", type=str, default="2d", - help=f"Parallelism to use (default: {self.ND_PARALLEL_TO_NB_GPUS.keys()})") - parser.add_argument("-s", "--steps", type=int, default=self.DEFAULT_STEPS, - help=f"Training steps (default: {self.DEFAULT_STEPS})") - parser.add_argument("--flavor", default=self.DEFAULT_FLAVOR, - help=f"Model flavor/size (default: {self.DEFAULT_FLAVOR}). 
" - f"Available: llama3=[debugmodel, medium, full], deepseek_v3=[debugmodel]") - parser.add_argument("-v", "--verbose", action="store_true", - help="Verbose output") - parser.add_argument("--loss-atol", type=float, default=self.DEFAULT_LOSS_ATOL, - help=f"Absolute tolerance for loss comparison (default: {self.DEFAULT_LOSS_ATOL})") - parser.add_argument("--loss-rtol", type=float, default=self.DEFAULT_LOSS_RTOL, - help=f"Relative tolerance for loss comparison (default: {self.DEFAULT_LOSS_RTOL})") - parser.add_argument("--grad-norm-atol", type=float, default=self.DEFAULT_GRAD_NORM_ATOL, - help=f"Absolute tolerance for grad norm comparison (default: {self.DEFAULT_GRAD_NORM_ATOL})") - parser.add_argument("--grad-norm-rtol", type=float, default=self.DEFAULT_GRAD_NORM_RTOL, - help=f"Relative tolerance for grad norm comparison (default: {self.DEFAULT_GRAD_NORM_RTOL})") - - args = parser.parse_args() - self.nd_parallel = args.nd_parallel self.ngpu = self.nd_parallel_to_nb_gpus[self.nd_parallel] self.steps = args.steps @@ -809,7 +785,7 @@ def run(self) -> int: baseline_log_tt = self.results_dir / f"baseline_tt_{baseline_config.name}_{self.ngpu}gpu.log" # --- 2. Launch all training --- - hf_baseline_run_error = self.run_training( + hf_baseline_run_error = self.run_training_local( config_file=baseline_config_file_hf, log_file=baseline_log_hf, config_name=baseline_config.name, @@ -819,7 +795,7 @@ def run(self) -> int: if hf_baseline_run_error: raise ValueError(f"Huggingface baseline (FSDP) training failed for {hf_model_name}") from hf_baseline_run_error - tt_baseline_run_error = self.run_training( + tt_baseline_run_error = self.run_training_local( config_file=baseline_config_file_tt, log_file=baseline_log_tt, config_name=baseline_config.name, @@ -960,12 +936,168 @@ def run(self) -> int: LogLevel.INFO, f"Check the diff files in {self.results_dir} for details" ) return 1 + + def run_slurm(self, args: argparse.Namespace) -> int: + """Main execution function. 
Runs all test suites for all models.""" + self.nd_parallel = args.nd_parallel + self.ngpu = self.nd_parallel_to_nb_gpus[self.nd_parallel] + self.steps = args.steps + self.model_filter = args.model_filter + self.test_filter = args.test_filter + self.flavor = args.flavor + self.verbose = args.verbose + self.loss_atol = args.loss_atol + self.loss_rtol = args.loss_rtol + self.grad_norm_atol = args.grad_norm_atol + self.grad_norm_rtol = args.grad_norm_rtol + + console.print( + Panel( + ( + f"[bold]GPUs:[/bold] {self.ngpu}\n" + f"[bold]Steps:[/bold] {self.steps}\n" + f"[bold]Seed:[/bold] {self.seed}\n" + f"[bold]Model filter:[/bold] {self.model_filter or 'all'}\n" + f"[bold]Test filter:[/bold] {self.test_filter or 'all'}\n" + f"[bold]Model flavor:[/bold] {self.flavor}" + ), + title="[bold cyan]Distributed Parallelism Comparison[/bold cyan]", + expand=False, + border_style="blue", + padding=(1, 2), + ) + ) + console.print() + + self.base_results_dir.mkdir(exist_ok=True) + + # TODO(3outeille): make it more generic later + if self.model_filter == "llama3": + hf_model_name = "meta-llama/Llama-3.2-1B" + tt_model_name = "llama3" + elif self.model_filter == "deepseek_v3": + hf_model_name = "deepseek-ai/DeepSeek-V3" + tt_model_name = "deepseek_v3" + else: + raise ValueError(f"Model filter {self.model_filter} not supported") + + self.generate_parallelism_configs(hf_model_name) + + model_owner, model_repo = hf_model_name.split("/", 1) + nd_parallel_upper = self.nd_parallel.upper() + self.results_dir = self.base_results_dir / model_owner / model_repo / nd_parallel_upper / self.flavor + self.results_dir.mkdir(parents=True, exist_ok=True) + + if self.verbose: + log_message(LogLevel.INFO, f"Results directory: {self.results_dir}") + + console.print( + Panel( + "[bold cyan]Comparing baseline (FSDP) for huggingface & torchtitan[/bold cyan]", + expand=False, + border_style="blue", + padding=(0, 2), + ) + ) + + # --- 1. 
Generate configs --- + + L = [] + + for config in self.parallelism_configs: + + config_dir = self.results_dir if config.name == "fsdp" else self.results_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu" + config_dir.mkdir(exist_ok=True) + + config_filename_hf = f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.toml" + config_file_hf = self.generate_config( + config_dir=config_dir, + config=config, + model_name=hf_model_name, + backend="huggingface", + filename=config_filename_hf, + indent=0 + ) + config_filename_tt = f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" + config_file_tt = self.generate_config( + config_dir=config_dir, + config=config, + model_name=tt_model_name, + backend="torchtitan", + filename=config_filename_tt, + indent=0 + ) + log_path_hf = config_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.log" + log_path_tt = config_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.log" + L.append((config_file_hf, config_file_tt, log_path_hf, log_path_tt)) + + + # Launch slurm training + jobs = [] + from slurm_utils import Job, Status + for config_file_hf, config_file_tt, log_path_hf, log_path_tt in L: + job_hf = Job(config_file_hf, log_path_hf, qos="high") + job_tt = Job(config_file_tt, log_path_tt, qos="high") + + job_tt.set_status(Status.INIT) + job_hf.set_status(Status.INIT) + jobs.append(job_hf) + jobs.append(job_tt) + + scheduler = Scheduler() + + scheduler.create_slurm_script(jobs) + # submit in subprocess + scheduler.submit_jobs(jobs) # -> job.set_status(Status.PENDING) + + scheduler.wait_for_all_jobs_to_complete() # spawn tmux to monitor jobs + #NOTE(3outeille): run_slurm() should not be run if + + def run_tests_slurm(self, args: argparse.Namespace) -> int: + # TODO(3outeille): do diff + compare metrics + pass def main(): """Entry point for the script.""" + parser = argparse.ArgumentParser( + description="Test different parallelism configurations against a baseline FSDP model.", + ) + parser.add_argument("--use_slurm", action="store_true", + help="Use SLURM for job submission") + parser.add_argument("--run_tests_slurm", action="store_true", + help="Run tests with SLURM") + parser.add_argument("-m", "--model-filter", default="", + help="Filter models by name pattern (e.g., 'llama3')") + parser.add_argument("-t", "--test-filter", default="", + help="Filter parallelism configurations by name pattern (e.g., 'fsdp1_cp1_tp2_pp2')") + parser.add_argument("-nd", "--nd_parallel", type=str, default="2d", + help=f"Parallelism to use (default: {CompareDistributedRun.ND_PARALLEL_TO_NB_GPUS.keys()})") + parser.add_argument("-s", "--steps", type=int, default=CompareDistributedRun.DEFAULT_STEPS, + help=f"Training steps (default: {CompareDistributedRun.DEFAULT_STEPS})") + parser.add_argument("--flavor", default=CompareDistributedRun.DEFAULT_FLAVOR, + help=f"Model flavor/size (default: {CompareDistributedRun.DEFAULT_FLAVOR}). 
" + f"Available: llama3=[debugmodel, medium, full], deepseek_v3=[debugmodel]") + parser.add_argument("-v", "--verbose", action="store_true", + help="Verbose output") + parser.add_argument("--loss-atol", type=float, default=CompareDistributedRun.DEFAULT_LOSS_ATOL, + help=f"Absolute tolerance for loss comparison (default: {CompareDistributedRun.DEFAULT_LOSS_ATOL})") + parser.add_argument("--loss-rtol", type=float, default=CompareDistributedRun.DEFAULT_LOSS_RTOL, + help=f"Relative tolerance for loss comparison (default: {CompareDistributedRun.DEFAULT_LOSS_RTOL})") + parser.add_argument("--grad-norm-atol", type=float, default=CompareDistributedRun.DEFAULT_GRAD_NORM_ATOL, + help=f"Absolute tolerance for grad norm comparison (default: {CompareDistributedRun.DEFAULT_GRAD_NORM_ATOL})") + parser.add_argument("--grad-norm-rtol", type=float, default=CompareDistributedRun.DEFAULT_GRAD_NORM_RTOL, + help=f"Relative tolerance for grad norm comparison (default: {CompareDistributedRun.DEFAULT_GRAD_NORM_RTOL})") + + args = parser.parse_args() + runner = CompareDistributedRun() - return runner.run() + if args.use_slurm: + return runner.run_slurm(args) + elif args.run_tests_slurm: + return runner.run_tests_slurm(args) + else: + return runner.run_local(args) if __name__ == "__main__": sys.exit(main()) From d2f80a213564bf64d86bb9c48c446d95a7bb4692 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Sat, 4 Oct 2025 10:41:44 +0000 Subject: [PATCH 058/129] refactor test --- .../configs/template.slurm | 54 +++ .../configs/test_template.toml | 5 +- .../test_hf_integration.py | 340 ++++++++++++++++++ 3 files changed, 397 insertions(+), 2 deletions(-) create mode 100644 torchtitan/experiments/transformers_backend/configs/template.slurm create mode 100644 torchtitan/experiments/transformers_backend/test_hf_integration.py diff --git a/torchtitan/experiments/transformers_backend/configs/template.slurm b/torchtitan/experiments/transformers_backend/configs/template.slurm new file mode 100644 index 0000000000..3d4d5d587d --- /dev/null +++ b/torchtitan/experiments/transformers_backend/configs/template.slurm @@ -0,0 +1,54 @@ +#!/bin/bash +#SBATCH --job-name={{ name }} +#SBATCH --output={{ root_path }}/slurm_%j.out +#SBATCH --error={{ root_path }}/slurm_%j.out +#SBATCH --nodes={{ nodes }} +#SBATCH --ntasks-per-node={{ n_proc_per_node }} +#SBATCH --gpus-per-task=1 +#SBATCH --qos={{ qos }} +#SBATCH --cpus-per-task=12 + +# Misc initializations. 
+echo "========================" +echo "START TIME: $(date)" +source /etc/profile.d/modules.sh +source /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/env_torchtitan_official/bin/activate +echo python3 version = $(python3 --version) +echo "===========" + +# Slurm stuff +export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") +export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) +export MASTER_PORT=$((1024 + RANDOM % 64511)) + +export TMPDIR=/scratch +export TORCH_HOME="/fsx/ferdinandmom/cache/torch" +export HF_HOME="/fsx/ferdinandmom/cache/huggingface" +export HF_DATASETS_CACHE="/fsx/ferdinandmom/cache/huggingface/datasets" +export TRANSFORMERS_CACHE="/fsx/ferdinandmom/cache/huggingface/transformers" +export CUBLAS_WORKSPACE_CONFIG=":4096:8" +export CUDA_DEVICE_MAX_CONNECTIONS="1" +export UV_CACHE_DIR="/fsx/ferdinandmom/.cache/uv" + +module load cuda/12.4 + +echo "Running training job: {{ name }}" +echo "Config file: {{ config_path }}" + +{% if name == "seed_checkpoint" %} +python /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/scripts/download_hf_assets.py --repo_id {{ repo_id }} --local_dir {{ root_path }} --assets tokenizer +{% endif %} + +torchrun \ + --nproc_per_node {{ n_proc_per_node }} \ + --nnodes {{ nodes }} \ + --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ + --rdzv_backend c10d \ + --max_restarts 0 \ + --tee 3 \ + -m torchtitan.train \ + --checkpoint.enable \ + {% if name == "seed_checkpoint" %} --checkpoint.create_seed_checkpoint {% else %} --checkpoint.initial_load_path {{ initial_load_path }} {% endif %} \ + --training.seed 42 \ + --training.deterministic \ + --job.config_file {{ config_path }} diff --git a/torchtitan/experiments/transformers_backend/configs/test_template.toml b/torchtitan/experiments/transformers_backend/configs/test_template.toml index 238f325ba2..8521b351a6 100644 --- a/torchtitan/experiments/transformers_backend/configs/test_template.toml +++ b/torchtitan/experiments/transformers_backend/configs/test_template.toml @@ -4,7 +4,7 @@ dump_folder = "./outputs" description = "Llama 3 debug training" print_args = false -use_for_integration_test = true +use_for_integration_test = false [profiling] enable_profiling = true @@ -24,7 +24,8 @@ enable_wandb = false name = "llama3" flavor = "debugmodel" # test folder with tokenizer.json, for debug purpose only -hf_assets_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" +#hf_assets_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" +hf_assets_path = "" # converters = ["float8"] [optimizer] diff --git a/torchtitan/experiments/transformers_backend/test_hf_integration.py b/torchtitan/experiments/transformers_backend/test_hf_integration.py new file mode 100644 index 0000000000..ef645eaac7 --- /dev/null +++ b/torchtitan/experiments/transformers_backend/test_hf_integration.py @@ -0,0 +1,340 @@ +import toml +from argparse import ArgumentParser +from pathlib import Path +import re +import os +import subprocess +from enum import Enum +from jinja2 import Template + + +def _create_slurm_script( + config: dict, + config_path: Path, + script_path: Path, + job_name: str, + initial_load_path: str = None, + repo_id: str = None, +): + with open(config_path, "r") as file: + config = toml.load(file) + + pp = config["parallelism"]["pipeline_parallel_degree"] + dp = config["parallelism"]["data_parallel_shard_degree"] + tp = config["parallelism"]["tensor_parallel_degree"] + cp = 
config["parallelism"]["context_parallel_degree"] + world_size = pp * dp * tp * cp + + nodes = max(1, world_size // 8) + n_proc_per_node = min(8, world_size // nodes) + + print(f"world_size: {world_size}, nodes: {nodes}, n_proc_per_node: {n_proc_per_node}") + + # Read the SLURM script template from the file + template_path = Path(__file__).parent / "configs/template.slurm" + with open(template_path, "r") as f: + slurm_script_template = f.read() + base_bench_template = Template(slurm_script_template) + + context_bench = { + "name": job_name, + "nodes": nodes, + "n_proc_per_node": n_proc_per_node, + "root_path": script_path.parent, + "config_path": config_path, + "initial_load_path": initial_load_path, + "repo_id": repo_id, + "qos": "high" if nodes > 1 else "normal", # Example logic for qos + } + + with open(script_path, "w") as file: + file.write(base_bench_template.render(context_bench)) + + print(f"Slurm script created at {script_path}") + + +def create_configs(model_name: str, out_dir: str, flavor: str): + """ + results/ + |_ meta-llama + |_ Llama-3.2-1B + |_ debugmodel/ + |_ seed_checkpoint/ + |_ config.toml + |_ seed.slurm + |_ step-0/ + |_ .... + |_baseline_fsdp2/ + |_ config.toml + |_ nd_parallelism.slurm + |_ nd_parallelism.log + |_ fsdp2_tp2_cp1_pp1/ + |_ config.toml + |_ nd_parallelism.slurm + |_ nd_parallelism.log + |_ diff_baseline_vs_nd_parallelism.log + |_ fsdp2_tp1_cp1_pp2/ + |_ config.toml + |_ nd_parallelism.slurm + |_ nd_parallelism.log + |_ diff_baseline_vs_nd_parallelism.log + |_ fsdp2_tp1_cp2_pp1/ + |_ config.toml + |_ nd_parallelism.slurm + |_ nd_parallelism.log + |_ diff_baseline_vs_nd_parallelism.log + |_ fsdp2_tp1_cp2_pp2/ + |_ config.toml + |_ nd_parallelism.slurm + |_ nd_parallelism.log + |_ diff_baseline_vs_nd_parallelism.log + |_ fsdp2_tp2_cp2_pp1/ + |_ config.toml + |_ nd_parallelism.slurm + |_ nd_parallelism.log + |_ diff_baseline_vs_nd_parallelism.log + |_ fsdp2_tp2_cp2_pp2/ + |_ config.toml + |_ nd_parallelism.slurm + |_ nd_parallelism.log + |_ diff_baseline_vs_nd_parallelism.log` + |_ full/ + ... 
+ |_ llama3 #torchtitan model + """ + + base_config = "configs/test_template.toml" + with open(base_config, "r") as f: + config = toml.load(f) + + config["model"]["name"] = model_name + config["model"]["flavor"] = flavor + + parallelism_configs = [ + "fsdp2_tp1_cp1_pp1", # baseline + "fsdp2_tp2_cp1_pp1", + "fsdp2_tp1_cp1_pp2", + "fsdp2_tp1_cp2_pp1", + "fsdp2_tp1_cp2_pp2", + "fsdp2_tp2_cp2_pp1", + "fsdp2_tp2_cp2_pp2", + ] + + out_path = Path(out_dir) / model_name / flavor + out_path.mkdir(parents=True, exist_ok=True) + + # Create seed checkpoint + seed_config = toml.loads(toml.dumps(config)) + seed_config["parallelism"]["data_parallel_shard_degree"] = 1 + seed_config["parallelism"]["tensor_parallel_degree"] = 1 + seed_config["parallelism"]["pipeline_parallel_degree"] = 1 + seed_config["parallelism"]["context_parallel_degree"] = 1 + seed_checkpoint_dir = out_path / "seed_checkpoint" + seed_checkpoint_dir.mkdir(exist_ok=True) + seed_config["model"]["hf_assets_path"] = str(seed_checkpoint_dir / Path(model_name).name) + seed_config["model"]["tokenizer_path"] = str(seed_checkpoint_dir / Path(model_name).name) + seed_config_path = seed_checkpoint_dir / "config.toml" + with open(seed_config_path, "w") as f: + toml.dump(seed_config, f) + print(f"Created {seed_config_path}") + _create_slurm_script( + seed_config, + seed_config_path, + seed_checkpoint_dir / "seed.slurm", + "seed_checkpoint", + repo_id=model_name, + ) + + # Create parallelism configs + for pc in parallelism_configs: + iter_config = toml.loads(toml.dumps(config)) + + m = re.match(r"fsdp(\d+)_tp(\d+)_cp(\d+)_pp(\d+)", pc) + if not m: + print(f"Skipping invalid config string: {pc}") + continue + + fsdp, tp, cp, pp = map(int, m.groups()) + + pc_dir = out_path / pc + pc_dir.mkdir(exist_ok=True) + + iter_config["parallelism"]["data_parallel_shard_degree"] = fsdp + iter_config["parallelism"]["tensor_parallel_degree"] = tp + iter_config["parallelism"]["context_parallel_degree"] = cp + iter_config["parallelism"]["pipeline_parallel_degree"] = pp + iter_config["parallelism"]["pipeline_parallel_schedule"] = "1F1B" + iter_config["model"]["hf_assets_path"] = str(seed_checkpoint_dir / Path(model_name).name) + + config_path = pc_dir / "config.toml" + with open(config_path, "w") as f: + toml.dump(iter_config, f) + print(f"Created {config_path}") + _create_slurm_script( + iter_config, + config_path, + pc_dir / "nd_parallelism.slurm", + pc, + initial_load_path=str(seed_checkpoint_dir / "step-0"), + repo_id=model_name, + ) + +class Status(Enum): + # INIT -> PENDING -> [RUNNING | FAIL] -> COMPLETED + INIT = "init" # Job is created + PENDING = "pending" # Job is waiting for ressources + RUNNING = "running" # Job is running + FAIL = "fail" # Job failed + COMPLETED = "completed" # Job is completed + +class Job: + def __init__(self, root_path: str, qos: str) -> None: + self.root_path = root_path + self.name = os.path.basename(root_path) + if self.name == os.path.basename(os.path.normpath(args.inp_dir)): + self.name = "baseline_fsdp2" + self.config = os.path.join(root_path, "baseline_fsdp2_config.toml") + self.slurm_script = os.path.join(root_path, "baseline_fsdp2.slurm") + else: + self.config = os.path.join(root_path, "config.toml") + self.slurm_script = os.path.join(root_path, "nd_parallelism.slurm") + + self.qos = qos + + # Check if the status.txt file exists + status_file_path = os.path.join(self.root_path, "status.txt") + if not os.path.exists(status_file_path): + # Create the status.txt file with INIT status + with open(status_file_path, "w") as f: + 
f.write(Status.INIT.value) + self.status = self.get_status() + + def get_status(self) -> Status: + """ + Read the status of the job from `status.txt` and return it + """ + is_existing = lambda value_to_check: any( + value.value == value_to_check for value in Status.__members__.values() + ) + + status_file_path = os.path.join(self.root_path, "status.txt") + with open(status_file_path, "r") as f: + status = f.read().strip() + if not is_existing(status): + raise ValueError(f"Invalid status: {status}") + return Status(status) + + def set_status(self, status: Status) -> Status: + """ + Update the status of the job in `status.txt` and return the new status + """ + status_file_path = os.path.join(self.root_path, "status.txt") + with open(status_file_path, "w") as f: + f.write(status.value) + return status + +class Scheduler: + def __init__(self, inp_dir: str, qos: str) -> None: + # Find all leaf directories, and the top-level directory if it contains a config. + jobs_directory_paths = [] + for root, dirs, files in os.walk(inp_dir): + is_job_dir = any(f.endswith(".toml") for f in files) + if is_job_dir: + if not dirs: # leaf node + jobs_directory_paths.append(os.path.abspath(root)) + # also capture baseline job in root + elif root == inp_dir: + jobs_directory_paths.append(os.path.abspath(root)) + + self.job_lists = [Job(job_path, qos) for job_path in jobs_directory_paths] + + def keep_only_jobs(self, status: Status): + return [job for job in self.job_lists if job.status == status] + + def filter_out_jobs(self, status: Status): + return [job for job in self.job_lists if job.status != status] + + def check_status(self): + status_files = [os.path.join(job.root_path, "status.txt") for job in self.job_lists] + + status_counts = {status.value: 0 for status in Status} + + for status_file in status_files: + with open(status_file, "r") as f: + status = f.read().strip() + if status in status_counts: + status_counts[status] += 1 + else: + raise ValueError(f"Invalid status: {status}") + + total = sum(status_counts.values()) + + print(f"{'Status':<10} | {'Count':<6}") + print(f"{'-'*10}-|-{'-'*6}") + for status, count in status_counts.items(): + print(f"{status.capitalize():<10} | {count:<6}") + + print(f"{'-'*10}-|-{'-'*6}") + print(f"{'Total':<10} | {total:<6}") + + +def submit_jobs(inp_dir, qos, only: str = None, seed_checkpoint: str = None): + scheduler = Scheduler(inp_dir, qos) + + env_vars = os.environ.copy() + total_jobs = len(scheduler.job_lists) + + if only: + try: + status_to_filter = Status(only) + scheduler.job_lists = scheduler.keep_only_jobs(status_to_filter) + except ValueError: + print(f"Invalid status for --only: {only}") + return + + if only is not None: + filtered_jobs = len(scheduler.job_lists) + if filtered_jobs == 0: + print(f"No '{only}' jobs to resubmit") + return + print( + f"Only {filtered_jobs}/{total_jobs} jobs with status '{only}' will be resubmitted" + ) + + scheduler.job_lists = scheduler.filter_out_jobs(Status.COMPLETED) + + for job in scheduler.job_lists: + subprocess.run(["sbatch", job.slurm_script], env=env_vars) + job.set_status(Status.PENDING) + + +def report(inp_dir: str): + scheduler = Scheduler(inp_dir, qos="N/A") + scheduler.check_status() + + +if __name__ == "__main__": + parser = ArgumentParser() + subparsers = parser.add_subparsers(dest="action") + + create_configs_parser = subparsers.add_parser("create_configs") + create_configs_parser.add_argument("--model_name", type=str, required=True) + create_configs_parser.add_argument("--out_dir", type=str, required=True) + 
create_configs_parser.add_argument("--flavor", type=str, required=True) + submit_jobs_parser = subparsers.add_parser("submit_jobs") + submit_jobs_parser.add_argument("--inp_dir", type=str, required=True) + submit_jobs_parser.add_argument("--seed_checkpoint", type=str, default=None) + submit_jobs_parser.add_argument("--qos", type=str, required=True, choices=["low", "normal", "high", "prod"]) + submit_jobs_parser.add_argument("--only", type=str, default=None, choices=[s.value for s in Status]) + + report_parser = subparsers.add_parser("report") + report_parser.add_argument("--inp_dir", type=str, required=True) + + args = parser.parse_args() + + if args.action == "create_configs": + create_configs(args.model_name, args.out_dir, args.flavor) + elif args.action == "submit_jobs": + submit_jobs(args.inp_dir, args.qos, args.only, args.seed_checkpoint) + elif args.action == "report": + report(args.inp_dir) \ No newline at end of file From 6454e40a93ef8a41b39b81893b2a7f4081e7ed03 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Sun, 5 Oct 2025 20:40:04 +0000 Subject: [PATCH 059/129] fix running job to slurm --- .../transformers_backend/configs/template.slurm | 6 ++---- .../transformers_backend/configs/test_template.toml | 3 +-- .../transformers_backend/test_hf_integration.py | 7 +++---- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/configs/template.slurm b/torchtitan/experiments/transformers_backend/configs/template.slurm index 3d4d5d587d..31016c37f2 100644 --- a/torchtitan/experiments/transformers_backend/configs/template.slurm +++ b/torchtitan/experiments/transformers_backend/configs/template.slurm @@ -35,16 +35,14 @@ module load cuda/12.4 echo "Running training job: {{ name }}" echo "Config file: {{ config_path }}" -{% if name == "seed_checkpoint" %} -python /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/scripts/download_hf_assets.py --repo_id {{ repo_id }} --local_dir {{ root_path }} --assets tokenizer -{% endif %} - torchrun \ --nproc_per_node {{ n_proc_per_node }} \ --nnodes {{ nodes }} \ --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ --rdzv_backend c10d \ --max_restarts 0 \ + --local-ranks-filter {{ n_proc_per_node - 1 }} \ + --role rank \ --tee 3 \ -m torchtitan.train \ --checkpoint.enable \ diff --git a/torchtitan/experiments/transformers_backend/configs/test_template.toml b/torchtitan/experiments/transformers_backend/configs/test_template.toml index 8521b351a6..fa0c763ed7 100644 --- a/torchtitan/experiments/transformers_backend/configs/test_template.toml +++ b/torchtitan/experiments/transformers_backend/configs/test_template.toml @@ -24,8 +24,7 @@ enable_wandb = false name = "llama3" flavor = "debugmodel" # test folder with tokenizer.json, for debug purpose only -#hf_assets_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" -hf_assets_path = "" +hf_assets_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" # converters = ["float8"] [optimizer] diff --git a/torchtitan/experiments/transformers_backend/test_hf_integration.py b/torchtitan/experiments/transformers_backend/test_hf_integration.py index ef645eaac7..0e0dbe3e78 100644 --- a/torchtitan/experiments/transformers_backend/test_hf_integration.py +++ b/torchtitan/experiments/transformers_backend/test_hf_integration.py @@ -131,8 +131,7 @@ def create_configs(model_name: str, out_dir: str, flavor: str): seed_config["parallelism"]["context_parallel_degree"] = 1 seed_checkpoint_dir = out_path / 
"seed_checkpoint" seed_checkpoint_dir.mkdir(exist_ok=True) - seed_config["model"]["hf_assets_path"] = str(seed_checkpoint_dir / Path(model_name).name) - seed_config["model"]["tokenizer_path"] = str(seed_checkpoint_dir / Path(model_name).name) + seed_config["job"]["dump_folder"] = str(seed_checkpoint_dir) seed_config_path = seed_checkpoint_dir / "config.toml" with open(seed_config_path, "w") as f: toml.dump(seed_config, f) @@ -164,7 +163,7 @@ def create_configs(model_name: str, out_dir: str, flavor: str): iter_config["parallelism"]["context_parallel_degree"] = cp iter_config["parallelism"]["pipeline_parallel_degree"] = pp iter_config["parallelism"]["pipeline_parallel_schedule"] = "1F1B" - iter_config["model"]["hf_assets_path"] = str(seed_checkpoint_dir / Path(model_name).name) + iter_config["job"]["dump_folder"] = str(pc_dir) config_path = pc_dir / "config.toml" with open(config_path, "w") as f: @@ -175,7 +174,7 @@ def create_configs(model_name: str, out_dir: str, flavor: str): config_path, pc_dir / "nd_parallelism.slurm", pc, - initial_load_path=str(seed_checkpoint_dir / "step-0"), + initial_load_path=str(seed_checkpoint_dir / "checkpoint/step-0"), repo_id=model_name, ) From b99a4d2c082ffd44e2c949c929229276a2f3f778 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Sun, 5 Oct 2025 21:22:34 +0000 Subject: [PATCH 060/129] finally have a better testing xp with slurm --- .../test_hf_integration.py | 491 ++++++++++++++++-- 1 file changed, 443 insertions(+), 48 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/test_hf_integration.py b/torchtitan/experiments/transformers_backend/test_hf_integration.py index 0e0dbe3e78..38c4982319 100644 --- a/torchtitan/experiments/transformers_backend/test_hf_integration.py +++ b/torchtitan/experiments/transformers_backend/test_hf_integration.py @@ -6,6 +6,54 @@ import subprocess from enum import Enum from jinja2 import Template +from rich.console import Console +from rich.panel import Panel +from rich.table import Table +from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn + +console = Console() + +class LogLevel(Enum): + INFO = "INFO" + SUCCESS = "SUCCESS" + WARNING = "WARNING" + ERROR = "ERROR" + TEST_PASS = "TEST_PASS" + TEST_FAIL = "TEST_FAIL" + +def log_message(level: LogLevel, message: str, indent: int = 0, dim: bool = False) -> None: + """Log a message with appropriate color coding.""" + style_map = { + LogLevel.INFO: "blue", + LogLevel.SUCCESS: "green", + LogLevel.WARNING: "yellow", + LogLevel.ERROR: "bold red", + LogLevel.TEST_PASS: "green", + LogLevel.TEST_FAIL: "bold red", + } + + prefix_map = { + LogLevel.INFO: "[INFO]", + LogLevel.SUCCESS: "[SUCCESS]", + LogLevel.WARNING: "[WARNING]", + LogLevel.ERROR: "[ERROR]", + LogLevel.TEST_PASS: "✅ TEST PASS", + LogLevel.TEST_FAIL: "❌ TEST FAIL", + } + + style = style_map[level] + prefix = prefix_map[level] + if indent > 0: + indent_str = " " * (indent - 1) + "└─ " + else: + indent_str = "" + + output = f"{indent_str}[{style}]{prefix}[/] {message}" + + if dim: + console.print(f"[dim]{output}[/dim]") + else: + console.print(output) def _create_slurm_script( @@ -64,7 +112,7 @@ def create_configs(model_name: str, out_dir: str, flavor: str): |_ seed.slurm |_ step-0/ |_ .... 
- |_baseline_fsdp2/ + |_ fsdp2_tp1_cp1_pp1/ |_ config.toml |_ nd_parallelism.slurm |_ nd_parallelism.log @@ -84,16 +132,6 @@ def create_configs(model_name: str, out_dir: str, flavor: str): |_ nd_parallelism.log |_ diff_baseline_vs_nd_parallelism.log |_ fsdp2_tp1_cp2_pp2/ - |_ config.toml - |_ nd_parallelism.slurm - |_ nd_parallelism.log - |_ diff_baseline_vs_nd_parallelism.log - |_ fsdp2_tp2_cp2_pp1/ - |_ config.toml - |_ nd_parallelism.slurm - |_ nd_parallelism.log - |_ diff_baseline_vs_nd_parallelism.log - |_ fsdp2_tp2_cp2_pp2/ |_ config.toml |_ nd_parallelism.slurm |_ nd_parallelism.log @@ -187,15 +225,15 @@ class Status(Enum): COMPLETED = "completed" # Job is completed class Job: - def __init__(self, root_path: str, qos: str) -> None: + def __init__(self, root_path: str, qos: str, inp_dir: str = None) -> None: self.root_path = root_path self.name = os.path.basename(root_path) - if self.name == os.path.basename(os.path.normpath(args.inp_dir)): - self.name = "baseline_fsdp2" - self.config = os.path.join(root_path, "baseline_fsdp2_config.toml") - self.slurm_script = os.path.join(root_path, "baseline_fsdp2.slurm") + + self.config = os.path.join(root_path, "config.toml") + seed_slurm = os.path.join(root_path, "seed.slurm") + if os.path.exists(seed_slurm): + self.slurm_script = seed_slurm else: - self.config = os.path.join(root_path, "config.toml") self.slurm_script = os.path.join(root_path, "nd_parallelism.slurm") self.qos = qos @@ -245,7 +283,7 @@ def __init__(self, inp_dir: str, qos: str) -> None: elif root == inp_dir: jobs_directory_paths.append(os.path.abspath(root)) - self.job_lists = [Job(job_path, qos) for job_path in jobs_directory_paths] + self.job_lists = [Job(job_path, qos, inp_dir) for job_path in jobs_directory_paths] def keep_only_jobs(self, status: Status): return [job for job in self.job_lists if job.status == status] @@ -253,31 +291,8 @@ def keep_only_jobs(self, status: Status): def filter_out_jobs(self, status: Status): return [job for job in self.job_lists if job.status != status] - def check_status(self): - status_files = [os.path.join(job.root_path, "status.txt") for job in self.job_lists] - - status_counts = {status.value: 0 for status in Status} - - for status_file in status_files: - with open(status_file, "r") as f: - status = f.read().strip() - if status in status_counts: - status_counts[status] += 1 - else: - raise ValueError(f"Invalid status: {status}") - - total = sum(status_counts.values()) - - print(f"{'Status':<10} | {'Count':<6}") - print(f"{'-'*10}-|-{'-'*6}") - for status, count in status_counts.items(): - print(f"{status.capitalize():<10} | {count:<6}") - - print(f"{'-'*10}-|-{'-'*6}") - print(f"{'Total':<10} | {total:<6}") - -def submit_jobs(inp_dir, qos, only: str = None, seed_checkpoint: str = None): +def submit_jobs(inp_dir, qos, only: str = None): scheduler = Scheduler(inp_dir, qos) env_vars = os.environ.copy() @@ -307,10 +322,385 @@ def submit_jobs(inp_dir, qos, only: str = None, seed_checkpoint: str = None): job.set_status(Status.PENDING) -def report(inp_dir: str): - scheduler = Scheduler(inp_dir, qos="N/A") - scheduler.check_status() +def check_status(inp_dir: str): + """ + Display a table showing the count of jobs in each status. + Reads status.txt from all job directories found in inp_dir. 
+ """ + # Find all directories with status.txt files + jobs_directory_paths = [] + for root, dirs, files in os.walk(inp_dir): + if "status.txt" in files: + jobs_directory_paths.append(os.path.abspath(root)) + + if not jobs_directory_paths: + print(f"No jobs found in {inp_dir}") + return + + # Count jobs by status + status_counts = {status: 0 for status in Status} + for job_path in jobs_directory_paths: + job = Job(job_path, qos="N/A") + status_counts[job.status] += 1 + + total = len(jobs_directory_paths) + + # Print table + print("\nJob Status Summary") + print("=" * 30) + print(f"{'Status':<12} | {'Count':>5}") + print("-" * 30) + print(f"{'Init':<12} | {status_counts[Status.INIT]:>5}") + print(f"{'Pending':<12} | {status_counts[Status.PENDING]:>5}") + print(f"{'Running':<12} | {status_counts[Status.RUNNING]:>5}") + print(f"{'Fail':<12} | {status_counts[Status.FAIL]:>5}") + print(f"{'Completed':<12} | {status_counts[Status.COMPLETED]:>5}") + print("-" * 30) + print(f"{'Total':<12} | {total:>5}") + print("=" * 30) + +def report(inp_dir: str): + """ + Generate diff reports between baseline (fsdp2_tp1_cp1_pp1) and all other parallelism configs. + Creates diff_baseline_vs_nd_parallelism.log in each non-baseline config directory. + Automatically discovers all model/flavor combinations under inp_dir. + """ + # Add imports + import torch + from dataclasses import dataclass, field + from typing import List + + @dataclass + class TrainingMetrics: + """Training metrics extracted from logs.""" + steps: List[int] = field(default_factory=list) + loss: List[float] = field(default_factory=list) + grad_norm: List[float] = field(default_factory=list) + + # Default tolerance values (matching compare_distributed_run.py) + DEFAULT_LOSS_ATOL = 0.02 + DEFAULT_LOSS_RTOL = 1e-5 + DEFAULT_GRAD_NORM_ATOL = 0.02 + DEFAULT_GRAD_NORM_RTOL = 1e-5 + + def _extract_metrics(log_file: Path) -> TrainingMetrics: + """Extract metrics from log file.""" + metrics = TrainingMetrics() + + try: + with open(log_file, 'r') as f: + content = f.read() + + # Regex to capture all metrics from a log line, ignoring ANSI color codes + pattern = re.compile( + r"step:\s*(\d+)\s*" + r".*?loss:\s*([0-9]+\.?[0-9]*)\s*" + r".*?grad_norm:\s*([0-9]+\.?[0-9]*)\s*" + ) + + for match in pattern.finditer(content): + metrics.steps.append(int(match.group(1))) + metrics.loss.append(float(match.group(2))) + metrics.grad_norm.append(float(match.group(3))) + + except Exception as e: + log_message(LogLevel.WARNING, f"Could not extract metrics: {e}", indent=3, dim=True) + + return metrics + + def _compare_metrics(baseline_metrics: TrainingMetrics, test_metrics: TrainingMetrics, + config_name: str) -> tuple[bool, str]: + """Compare metrics between baseline and test configuration. 
+ + Returns: + tuple[bool, str]: (passed, summary_message) + """ + if not baseline_metrics.loss or not test_metrics.loss: + return False, f"Unable to extract metrics" + + # Convert to tensors + baseline_loss = torch.tensor(baseline_metrics.loss) + test_loss = torch.tensor(test_metrics.loss) + baseline_grad_norm = torch.tensor(baseline_metrics.grad_norm) + test_grad_norm = torch.tensor(test_metrics.grad_norm) + + # Check if tensors are close + loss_pass = torch.allclose(baseline_loss, test_loss, atol=DEFAULT_LOSS_ATOL, rtol=DEFAULT_LOSS_RTOL) + grad_pass = torch.allclose(baseline_grad_norm, test_grad_norm, atol=DEFAULT_GRAD_NORM_ATOL, rtol=DEFAULT_GRAD_NORM_RTOL) + + # Calculate max absolute differences for logging + loss_max_diff = torch.max(torch.abs(baseline_loss - test_loss)).item() if baseline_loss.numel() > 0 and test_loss.numel() > 0 else 0.0 + grad_norm_diff = torch.max(torch.abs(baseline_grad_norm - test_grad_norm)).item() if baseline_grad_norm.numel() > 0 and test_grad_norm.numel() > 0 else 0.0 + + # Calculate min absolute differences for logging + loss_min_diff = torch.min(torch.abs(baseline_loss - test_loss)).item() if baseline_loss.numel() > 0 and test_loss.numel() > 0 else 0.0 + grad_norm_min_diff = torch.min(torch.abs(baseline_grad_norm - test_grad_norm)).item() if baseline_grad_norm.numel() > 0 and test_grad_norm.numel() > 0 else 0.0 + + summary = (f"Max loss diff: {loss_max_diff:.2e}, " + f"Min loss diff: {loss_min_diff:.2e}, " + f"Max grad norm diff: {grad_norm_diff:.2e}, " + f"Min grad norm diff: {grad_norm_min_diff:.2e}") + + return (loss_pass and grad_pass), summary + + def _filter_log(log_file: Path) -> Path: + """Filter log file to normalize volatile information (timestamps, PIDs, ports).""" + filtered_file = log_file.with_suffix(log_file.suffix + '.filtered') + + with open(log_file, 'r') as infile, open(filtered_file, 'w') as outfile: + for line in infile: + # Apply filtering patterns to remove volatile information + line = re.sub(r'([0-9]{4}-[0-9]{2}-[0-9]{2} )?[0-9]{2}:[0-9]{2}:[0-9]{2}(,[0-9]+)?', + 'TIMESTAMP', line) + line = re.sub(r'torchrun.*--master_port[= ]([0-9]+)', + 'torchrun ... --master_port=XXXX', line) + line = re.sub(r'PID [0-9]+', 'PID XXXX', line) + line = re.sub(r'localhost:[0-9]+', 'localhost:XXXX', line) + outfile.write(line) + + return filtered_file + + def _generate_diff(baseline_log: Path, test_log: Path, diff_file: Path) -> tuple[bool, str]: + """Generate diff between baseline and test logs using git diff. 
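        Both logs are first passed through _filter_log so that timestamps, PIDs, and
        rendezvous ports do not show up as spurious differences.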
+ + Returns: + tuple[bool, str]: (success, diff_output or error_message) + """ + # Filter logs to remove timestamps and volatile information + baseline_filtered = _filter_log(baseline_log) + test_filtered = _filter_log(test_log) + + try: + # Generate colored diff using git diff + cmd = ["git", "diff", "--no-index", "--color=always", "--word-diff=color", + str(baseline_filtered), str(test_filtered)] + + result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + + # git diff returns exit code 1 when files differ (which is expected), not an error + if result.returncode not in [0, 1]: + error_msg = f"git diff failed with code {result.returncode}\n{result.stderr}" + return False, error_msg + + # Write diff to file + with open(diff_file, 'w') as f: + f.write(result.stdout) + + return True, result.stdout + + finally: + # Clean up filtered files + if baseline_filtered.exists(): + baseline_filtered.unlink() + if test_filtered.exists(): + test_filtered.unlink() + + def _process_flavor_dir(flavor_dir: Path) -> tuple[int, int]: + """Process a single model/flavor directory. + + Returns: + tuple[int, int]: (passed_count, failed_count) + """ + # Find baseline directory + baseline_dir = flavor_dir / "fsdp2_tp1_cp1_pp1" + if not baseline_dir.exists(): + log_message(LogLevel.WARNING, f"No baseline directory found in {flavor_dir.relative_to(inp_path)}, skipping", indent=1) + return 0, 0 + + # Find baseline .out file + baseline_out_files = list(baseline_dir.glob("*.out")) + if not baseline_out_files: + log_message(LogLevel.WARNING, f"No .out file found in baseline {baseline_dir.relative_to(inp_path)}, skipping", indent=1) + return 0, 0 + baseline_out = baseline_out_files[0] + + # Extract baseline metrics + log_message(LogLevel.INFO, f"Extracting baseline metrics from {baseline_out.name}...", indent=1) + baseline_metrics = _extract_metrics(baseline_out) + if not baseline_metrics.loss or not baseline_metrics.grad_norm: + log_message(LogLevel.WARNING, "Could not extract baseline metrics, skipping comparisons", indent=1) + return 0, 0 + + # Find all parallelism config directories (excluding seed_checkpoint and baseline) + config_dirs = [] + for item in flavor_dir.iterdir(): + if item.is_dir() and item.name.startswith("fsdp2_") and item.name != "fsdp2_tp1_cp1_pp1": + config_dirs.append(item) + + if not config_dirs: + log_message(LogLevel.INFO, f"No test configurations found in {flavor_dir.relative_to(inp_path)}", indent=1) + return 0, 0 + + console.print() + console.print( + Panel( + f"[cyan]Baseline:[/cyan] {baseline_out.relative_to(flavor_dir)}\n" + f"[cyan]Configurations to compare:[/cyan] {len(config_dirs)}", + title=f"[bold cyan]Processing {flavor_dir.relative_to(inp_path)}[/bold cyan]", + expand=False, + border_style="cyan", + padding=(0, 2), + ) + ) + + # Track results for summary + results = [] + + # Generate diffs for each config + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + BarColumn(), + TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), + TimeElapsedColumn(), + console=console, + ) as progress: + task = progress.add_task("[cyan]Processing configurations...", total=len(config_dirs)) + + for i, config_dir in enumerate(sorted(config_dirs)): + if i > 0: + console.rule(style="dim") + + progress.update(task, description=f"[cyan]Testing [bold]{config_dir.name}[/bold]") + + # Find .out file in config directory + test_out_files = list(config_dir.glob("*.out")) + if not test_out_files: + log_message(LogLevel.WARNING, 
f"{config_dir.name}: No .out file found, skipping", indent=1) + results.append((config_dir.name, False, "No .out file found")) + progress.advance(task) + continue + + test_out = test_out_files[0] + diff_file = config_dir / "diff_baseline_vs_nd_parallelism.log" + + # Extract test metrics + test_metrics = _extract_metrics(test_out) + + # Compare metrics + if test_metrics.loss and test_metrics.grad_norm: + test_passed, metrics_summary = _compare_metrics(baseline_metrics, test_metrics, config_dir.name) + + if test_passed: + log_message(LogLevel.TEST_PASS, f"{config_dir.name} - {metrics_summary}", indent=1) + results.append((config_dir.name, True, metrics_summary)) + else: + log_message(LogLevel.TEST_FAIL, f"{config_dir.name} - {metrics_summary}", indent=1) + results.append((config_dir.name, False, metrics_summary)) + else: + log_message(LogLevel.TEST_FAIL, f"{config_dir.name} - Unable to extract metrics", indent=1) + results.append((config_dir.name, False, "Unable to extract metrics")) + + # Generate diff + try: + success, output = _generate_diff(baseline_out, test_out, diff_file) + + if success: + log_message(LogLevel.INFO, f"Diff between baseline vs HF nd-parallel saved to:", indent=5, dim=True) + console.print(f" [dim]{diff_file.relative_to(flavor_dir)}[/dim]") + else: + log_message(LogLevel.WARNING, f"Failed to generate diff: {output}", indent=5, dim=True) + + except Exception as e: + log_message(LogLevel.WARNING, f"Failed to generate diff - {e}", indent=5, dim=True) + + progress.advance(task) + + console.print() + # Create summary table + summary_table = Table( + title=f"[bold]Summary for {flavor_dir.relative_to(inp_path)}[/bold]", + show_header=True, + header_style="bold magenta" + ) + summary_table.add_column("Configuration", style="cyan") + summary_table.add_column("Status", justify="center") + summary_table.add_column("Metrics", style="dim") + + for name, passed, summary in results: + status = "[bold green]✅ PASS[/bold green]" if passed else "[bold red]❌ FAIL[/bold red]" + # Truncate summary if too long + display_summary = summary if len(summary) < 60 else summary[:57] + "..." 
+ summary_table.add_row(name, status, display_summary) + + console.print(summary_table) + console.print() + + passed_count = sum(1 for _, passed, _ in results if passed) + failed_count = len(results) - passed_count + + return passed_count, failed_count + + inp_path = Path(inp_dir) + + if not inp_path.exists(): + console.print(f"[bold red]Error:[/bold red] Directory not found: {inp_path}") + return + + console.print( + Panel( + "[bold cyan]HuggingFace Integration Test Report Generator[/bold cyan]", + expand=False, + border_style="blue", + padding=(1, 2), + ) + ) + console.print() + + # Find all directories that contain a baseline (fsdp2_tp1_cp1_pp1) subdirectory + flavor_dirs = [] + for root, dirs, files in os.walk(inp_path): + if "fsdp2_tp1_cp1_pp1" in dirs: + flavor_dirs.append(Path(root)) + + if not flavor_dirs: + log_message(LogLevel.ERROR, f"No directories with baseline configuration found under {inp_path}") + console.print("[yellow]Expected to find directories containing 'fsdp2_tp1_cp1_pp1' subdirectory[/yellow]") + return + + log_message(LogLevel.INFO, f"Found {len(flavor_dirs)} model/flavor combination(s) to process:") + for flavor_dir in flavor_dirs: + console.print(f" [cyan]•[/cyan] {flavor_dir.relative_to(inp_path)}") + + # Process each flavor directory + total_passed = 0 + total_failed = 0 + + for flavor_dir in flavor_dirs: + passed, failed = _process_flavor_dir(flavor_dir) + total_passed += passed + total_failed += failed + + # Final summary + console.print() + console.print( + Panel( + "[bold cyan]Overall Summary[/bold cyan]", + expand=False, + border_style="blue", + padding=(0, 2), + ) + ) + + overall_table = Table(show_header=True, header_style="bold magenta") + overall_table.add_column("Metric", style="cyan") + overall_table.add_column("Value", justify="right") + + total_tests = total_passed + total_failed + overall_table.add_row("Total Configurations Tested", str(total_tests)) + overall_table.add_row("[green]Passed[/green]", str(total_passed)) + overall_table.add_row("[red]Failed[/red]", str(total_failed)) + + console.print(overall_table) + console.print() + + if total_failed == 0 and total_tests > 0: + log_message(LogLevel.SUCCESS, "All tests passed! 
🎉") + elif total_tests > 0: + log_message(LogLevel.WARNING, f"{total_failed} configuration(s) had test failures") + + log_message(LogLevel.SUCCESS, "Diff generation complete!") if __name__ == "__main__": parser = ArgumentParser() @@ -320,20 +710,25 @@ def report(inp_dir: str): create_configs_parser.add_argument("--model_name", type=str, required=True) create_configs_parser.add_argument("--out_dir", type=str, required=True) create_configs_parser.add_argument("--flavor", type=str, required=True) + submit_jobs_parser = subparsers.add_parser("submit_jobs") submit_jobs_parser.add_argument("--inp_dir", type=str, required=True) - submit_jobs_parser.add_argument("--seed_checkpoint", type=str, default=None) submit_jobs_parser.add_argument("--qos", type=str, required=True, choices=["low", "normal", "high", "prod"]) submit_jobs_parser.add_argument("--only", type=str, default=None, choices=[s.value for s in Status]) report_parser = subparsers.add_parser("report") report_parser.add_argument("--inp_dir", type=str, required=True) + check_status_parser = subparsers.add_parser("check_status") + check_status_parser.add_argument("--inp_dir", type=str, required=True) + args = parser.parse_args() if args.action == "create_configs": create_configs(args.model_name, args.out_dir, args.flavor) elif args.action == "submit_jobs": - submit_jobs(args.inp_dir, args.qos, args.only, args.seed_checkpoint) + submit_jobs(args.inp_dir, args.qos, args.only) elif args.action == "report": - report(args.inp_dir) \ No newline at end of file + report(args.inp_dir) + elif args.action == "check_status": + check_status(args.inp_dir) \ No newline at end of file From 218f40071d77ad471869d033c3ce6e3cdb0ef215 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 9 Oct 2025 12:09:50 +0000 Subject: [PATCH 061/129] now everything works (1D/2D/3D/4D). need to fix correctness with PP --- .../configs/template.slurm | 73 +++++++++++++++++-- .../infra/parallelize_hf_transformers.py | 41 ++++++++--- .../model/hf_transformers_args.py | 21 +++++- .../test_hf_integration.py | 35 +++++++-- 4 files changed, 143 insertions(+), 27 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/configs/template.slurm b/torchtitan/experiments/transformers_backend/configs/template.slurm index 31016c37f2..493b569e95 100644 --- a/torchtitan/experiments/transformers_backend/configs/template.slurm +++ b/torchtitan/experiments/transformers_backend/configs/template.slurm @@ -3,8 +3,8 @@ #SBATCH --output={{ root_path }}/slurm_%j.out #SBATCH --error={{ root_path }}/slurm_%j.out #SBATCH --nodes={{ nodes }} -#SBATCH --ntasks-per-node={{ n_proc_per_node }} -#SBATCH --gpus-per-task=1 +#SBATCH --gres=gpu:{{ n_proc_per_node }} +#SBATCH --ntasks-per-node=1 #SBATCH --qos={{ qos }} #SBATCH --cpus-per-task=12 @@ -30,23 +30,86 @@ export CUBLAS_WORKSPACE_CONFIG=":4096:8" export CUDA_DEVICE_MAX_CONNECTIONS="1" export UV_CACHE_DIR="/fsx/ferdinandmom/.cache/uv" +# EFA settings +export FI_PROVIDER=efa +export FI_EFA_FORK_SAFE=1 +export FI_EFA_ENABLE_SHM_TRANSFER=1 +export NCCL_PROTO=simple +export NCCL_SOCKET_IFNAME=enp + module load cuda/12.4 echo "Running training job: {{ name }}" echo "Config file: {{ config_path }}" -torchrun \ +# Function to update status based on squeue output +update_status() { + job_id=$1 + status_file=$2 + # For unknown reasons, it doenst update status for pending. 
It only works for running + while true; do + job_status=$(squeue --job $job_id --noheader --format=%T) + echo "Job status: $job_status" + if [ -z "$job_status" ]; then + # Job has finished or is not found + break + elif [ "$job_status" = "RUNNING" ]; then + printf "running" > $status_file + break + fi + sleep 10 + done +} + +# Update status to "pending" or "running" in the background +update_status $job_id {{ root_path }}/status.txt & + +# LOG_DIR="{{ root_path }}/logs" +# mkdir -p ${LOG_DIR} + +# CMD="torchrun \ +# --nproc_per_node {{ n_proc_per_node }} \ +# --nnodes {{ nodes }} \ +# --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ +# --rdzv_backend c10d \ +# --max_restarts 0 \ +# --log-dir ${LOG_DIR} \ +# --role rank \ +# --tee 3 \ +# -m torchtitan.train \ +# --checkpoint.enable \ +# {% if name == "seed_checkpoint" %} --checkpoint.create_seed_checkpoint {% else %} --checkpoint.initial_load_path {{ initial_load_path }} {% endif %} \ +# --training.seed 42 \ +# --training.deterministic \ +# --training.steps 1 \ +# --job.config_file {{ config_path }}" + + +CMD="torchrun \ --nproc_per_node {{ n_proc_per_node }} \ --nnodes {{ nodes }} \ --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ --rdzv_backend c10d \ --max_restarts 0 \ - --local-ranks-filter {{ n_proc_per_node - 1 }} \ --role rank \ + --local_ranks_filter {{ n_proc_per_node - 1 }} \ --tee 3 \ -m torchtitan.train \ --checkpoint.enable \ {% if name == "seed_checkpoint" %} --checkpoint.create_seed_checkpoint {% else %} --checkpoint.initial_load_path {{ initial_load_path }} {% endif %} \ --training.seed 42 \ --training.deterministic \ - --job.config_file {{ config_path }} + --job.config_file {{ config_path }}" + +# Run the main command +echo "Running command: srun -u $CMD" +srun -u $CMD +exit_status=$? + + +# Update status based on the exit status of `srun` +if [ $exit_status -eq 0 ]; then + printf "completed" > {{ root_path }}/status.txt +else + printf "fail" > {{ root_path }}/status.txt +fi diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py index 469c3407a8..b512ca026c 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -196,6 +196,7 @@ def parallelize_hf_transformers( logger.warning("CP support for FlexAttention is still in progress.") if parallel_dims.tp_enabled: + model.set_tp_mesh(world_mesh["tp"]) enable_float8_linear = "float8" in job_config.model.converters float8_is_rowwise = job_config.float8.recipe_name in ( "rowwise", @@ -281,6 +282,7 @@ def parallelize_hf_transformers( logger.info("Applied FSDP to the model") if parallel_dims.cp_enabled: + model.set_cp_mesh(world_mesh["cp"]) logger.info("Applied Context Parallel to the model") if job_config.training.enable_cpu_offload: @@ -296,6 +298,9 @@ def parallelize_hf_transformers( enable_compiled_autograd=job_config.parallelism.enable_compiled_autograd, ) + if parallel_dims.pp_enabled: + model.set_pp_mesh(world_mesh["pp"]) + return model @@ -310,22 +315,36 @@ def apply_non_moe_tp( # transformer block's inputs) # 2. Parallelize the root norm layer over the sequence dim # 3. 
Parallelize the final linear output layer - parallelize_module( - model, - tp_mesh, - { - "tok_embeddings": RowwiseParallel( + + # skipping nn.Identity modules (which are added by pipeline parallelism for unused modules) + root_plan = {} + + if hasattr(model, 'tok_embeddings'): + if isinstance(model.tok_embeddings, nn.Identity): + root_plan["tok_embeddings"] = NoParallel() + else: + root_plan["tok_embeddings"] = RowwiseParallel( input_layouts=Replicate(), output_layouts=Shard(1), - ), - "norm": SequenceParallel(), - "output": ColwiseParallel( + ) + + if hasattr(model, 'norm'): + if isinstance(model.norm, nn.Identity): + root_plan["norm"] = NoParallel() + else: + root_plan["norm"] = SequenceParallel() + + if hasattr(model, 'output'): + if isinstance(model.output, nn.Identity): + root_plan["output"] = NoParallel() + else: + root_plan["output"] = ColwiseParallel( input_layouts=Shard(1), output_layouts=Shard(-1) if loss_parallel else Replicate(), use_local_output=not loss_parallel, - ), - }, - ) + ) + if root_plan: # Only call if there's something to parallelize + parallelize_module(model, tp_mesh, root_plan) # Parallel styles used for transformer block linear weights and their # inputs may be different for float8 linears with tensorwise scaling. diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 917d50a43f..7bc444f1eb 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -273,13 +273,27 @@ def __init__(self, model_args: HFTransformerModelArgs): patch_hf_llama() self.model = model_cls(config=model_args) - + self.max_seq_len = model_args.max_seq_len + for layer in self.model.model.layers: if hasattr(model_args, "first_k_dense_replace") and layer.layer_idx >= model_args.first_k_dense_replace: layer.moe_enabled = True else: layer.moe_enabled = False + self.cp_mesh = None + self.tp_mesh = None + self.pp_mesh = None + + def set_cp_mesh(self, mesh): + self.cp_mesh = mesh + + def set_tp_mesh(self, mesh): + self.tp_mesh = mesh + + def set_pp_mesh(self, mesh): + self.pp_mesh = mesh + @property def tok_embeddings(self): """Returns the model's embed_tokens, handling different Hugging Face model structures.""" @@ -358,8 +372,9 @@ def rotary_emb(self, value): raise AttributeError("Could not find rotary_emb in the model. 
Please check the model structure.") def forward(self, *args, **kwargs): - position_ids = torch.arange(args[0].shape[1], device=args[0].device).unsqueeze(0) - kwargs["position_ids"] = position_ids + local_seq_len = self.max_seq_len + local_seq_len //= self.cp_mesh.size() if self.cp_mesh is not None and self.cp_mesh.size() > 1 else 1 + kwargs["position_ids"] = torch.arange(local_seq_len, device=args[0].device).unsqueeze(0) output = self.model.model(*args, **kwargs) output = self.model.lm_head(output.last_hidden_state) return output diff --git a/torchtitan/experiments/transformers_backend/test_hf_integration.py b/torchtitan/experiments/transformers_backend/test_hf_integration.py index 38c4982319..d886549ff0 100644 --- a/torchtitan/experiments/transformers_backend/test_hf_integration.py +++ b/torchtitan/experiments/transformers_backend/test_hf_integration.py @@ -11,6 +11,9 @@ from rich.table import Table from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn +BASELINE = "fsdp2_tp1_cp1_pp1" +# BASELINE = "fsdp1_tp1_cp1_pp1" + console = Console() class LogLevel(Enum): @@ -149,15 +152,28 @@ def create_configs(model_name: str, out_dir: str, flavor: str): config["model"]["flavor"] = flavor parallelism_configs = [ - "fsdp2_tp1_cp1_pp1", # baseline + BASELINE, # baseline "fsdp2_tp2_cp1_pp1", "fsdp2_tp1_cp1_pp2", "fsdp2_tp1_cp2_pp1", "fsdp2_tp1_cp2_pp2", "fsdp2_tp2_cp2_pp1", + "fsdp2_tp2_cp1_pp2", "fsdp2_tp2_cp2_pp2", ] + # parallelism_configs = [ + # BASELINE, # baseline + # "fsdp1_tp2_cp1_pp1", + # "fsdp1_tp1_cp1_pp2", + # "fsdp1_tp1_cp2_pp1", + # "fsdp1_tp1_cp2_pp2", + # "fsdp1_tp2_cp2_pp1", + # "fsdp1_tp2_cp1_pp2", + # "fsdp1_tp2_cp2_pp2", + # ] + + out_path = Path(out_dir) / model_name / flavor out_path.mkdir(parents=True, exist_ok=True) @@ -184,6 +200,7 @@ def create_configs(model_name: str, out_dir: str, flavor: str): # Create parallelism configs for pc in parallelism_configs: + iter_config = toml.loads(toml.dumps(config)) m = re.match(r"fsdp(\d+)_tp(\d+)_cp(\d+)_pp(\d+)", pc) @@ -200,8 +217,10 @@ def create_configs(model_name: str, out_dir: str, flavor: str): iter_config["parallelism"]["tensor_parallel_degree"] = tp iter_config["parallelism"]["context_parallel_degree"] = cp iter_config["parallelism"]["pipeline_parallel_degree"] = pp - iter_config["parallelism"]["pipeline_parallel_schedule"] = "1F1B" + iter_config["parallelism"]["pipeline_parallel_schedule"] = "GPipe" iter_config["job"]["dump_folder"] = str(pc_dir) + if pc == BASELINE or pc == "fsdp2_tp1_cp1_pp2": + iter_config["training"]["local_batch_size"] = 2 config_path = pc_dir / "config.toml" with open(config_path, "w") as f: @@ -379,9 +398,9 @@ class TrainingMetrics: grad_norm: List[float] = field(default_factory=list) # Default tolerance values (matching compare_distributed_run.py) - DEFAULT_LOSS_ATOL = 0.02 + DEFAULT_LOSS_ATOL = 5e-2 DEFAULT_LOSS_RTOL = 1e-5 - DEFAULT_GRAD_NORM_ATOL = 0.02 + DEFAULT_GRAD_NORM_ATOL = 7e-1 DEFAULT_GRAD_NORM_RTOL = 1e-5 def _extract_metrics(log_file: Path) -> TrainingMetrics: @@ -503,7 +522,7 @@ def _process_flavor_dir(flavor_dir: Path) -> tuple[int, int]: tuple[int, int]: (passed_count, failed_count) """ # Find baseline directory - baseline_dir = flavor_dir / "fsdp2_tp1_cp1_pp1" + baseline_dir = flavor_dir / BASELINE if not baseline_dir.exists(): log_message(LogLevel.WARNING, f"No baseline directory found in {flavor_dir.relative_to(inp_path)}, skipping", indent=1) return 0, 0 @@ -525,7 +544,7 @@ def _process_flavor_dir(flavor_dir: Path) -> tuple[int, int]: # Find all 
parallelism config directories (excluding seed_checkpoint and baseline) config_dirs = [] for item in flavor_dir.iterdir(): - if item.is_dir() and item.name.startswith("fsdp2_") and item.name != "fsdp2_tp1_cp1_pp1": + if item.is_dir() and item.name not in {BASELINE, "seed_checkpoint"}: config_dirs.append(item) if not config_dirs: @@ -598,7 +617,7 @@ def _process_flavor_dir(flavor_dir: Path) -> tuple[int, int]: if success: log_message(LogLevel.INFO, f"Diff between baseline vs HF nd-parallel saved to:", indent=5, dim=True) - console.print(f" [dim]{diff_file.relative_to(flavor_dir)}[/dim]") + console.print(f" [dim]{diff_file}[/dim]") else: log_message(LogLevel.WARNING, f"Failed to generate diff: {output}", indent=5, dim=True) @@ -651,7 +670,7 @@ def _process_flavor_dir(flavor_dir: Path) -> tuple[int, int]: # Find all directories that contain a baseline (fsdp2_tp1_cp1_pp1) subdirectory flavor_dirs = [] for root, dirs, files in os.walk(inp_path): - if "fsdp2_tp1_cp1_pp1" in dirs: + if BASELINE in dirs: flavor_dirs.append(Path(root)) if not flavor_dirs: From bb080ad7187e3322d163196dbb110be55c50ebec Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 14 Oct 2025 16:23:20 +0000 Subject: [PATCH 062/129] fix and uniformize weight init of llama-like model + various fix --- .../transformers_backend/__init__.py | 10 +- .../infra/parallelize_hf_transformers.py | 39 +++-- .../model/hf_llama_like_patch.py | 165 ++++++++++++++++++ .../model/hf_transformers_args.py | 73 ++++++-- .../test_hf_integration.py | 65 ++++--- torchtitan/utils/test_utils.py | 7 +- 6 files changed, 305 insertions(+), 54 deletions(-) create mode 100644 torchtitan/experiments/transformers_backend/model/hf_llama_like_patch.py diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index ac0431ec3f..c29b3a5aa1 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -35,7 +35,7 @@ class TitanModelArgs: n_layers: int = 32 n_heads: int = 32 n_kv_heads: Optional[int] = None - vocab_size: int = 128256 + vocab_size: Optional[int] = None multiple_of: int = 256 ffn_dim_multiplier: Optional[float] = None norm_eps: float = 1e-5 @@ -69,17 +69,19 @@ class DeepSeekV3Args: beta_slow: Optional[int] = None mscale: Optional[float] = None partial_rotary_factor: Optional[float] = None + rope_interleave: bool = True + flavors = { "debugmodel": HFTransformerModelArgs( titan_args=TitanModelArgs( - vocab_size=2000, + vocab_size=51200, dim=256, - n_layers=6, + n_layers=1, n_heads=16, n_kv_heads=16, ), - deepseek_v3_args=None + pad_token_id=None, # deepseek_v3_args=DeepSeekV3Args( # partial_rotary_factor=4.0, # inter_dim=1024, diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py index b512ca026c..16e33251ae 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -43,12 +43,12 @@ torch.ops.aten.mm.default, torch.ops.aten._scaled_dot_product_efficient_attention.default, torch.ops.aten._scaled_dot_product_flash_attention.default, + torch._higher_order_ops.flex_attention, torch.ops._c10d_functional.reduce_scatter_tensor.default, # for low precision training, it's useful to always save # the result of max, since the absolute maximum is # used to compute the scaling factor 
for quantization. torch.ops.aten.max.default, - torch._higher_order_ops.flex_attention, } def _apply_ac_to_transformer_block( @@ -379,21 +379,34 @@ def apply_non_moe_tp( "self_attn.q_proj": colwise_parallel(), "self_attn.k_proj": colwise_parallel(), "self_attn.v_proj": colwise_parallel(), - "self_attn.o_proj": rowwise_parallel(output_layouts=Shard(1)), "post_attention_layernorm": SequenceParallel(), } + + # Handle different names for the output projection layer, e.g. o_proj vs dense + o_proj_name = "o_proj" if hasattr(transformer_block.self_attn, "o_proj") else "dense" + layer_plan[f"self_attn.{o_proj_name}"] = rowwise_parallel(output_layouts=Shard(1)) + if not transformer_block.moe_enabled: - layer_plan.update( - { - "mlp": prepare_module_input( - input_layouts=(Shard(1),), - desired_input_layouts=(Replicate(),), - ), - "mlp.gate_proj": colwise_parallel(), - "mlp.up_proj": colwise_parallel(), - "mlp.down_proj": rowwise_parallel(output_layouts=Shard(1)), - } - ) + mlp_plan = { + "mlp": prepare_module_input( + input_layouts=(Shard(1),), + desired_input_layouts=(Replicate(),), + ), + } + # Handle different names for MLP layers, e.g. gate_proj vs fc1 + gate_proj_name = "gate_proj" if hasattr(transformer_block.mlp, "gate_proj") else "fc1" + mlp_plan[f"mlp.{gate_proj_name}"] = colwise_parallel() + + if hasattr(transformer_block.mlp, "up_proj"): + mlp_plan["mlp.up_proj"] = colwise_parallel() + + down_proj_name = "down_proj" if hasattr(transformer_block.mlp, "down_proj") else "fc2" + mlp_plan[f"mlp.{down_proj_name}"] = rowwise_parallel(output_layouts=Shard(1)) + layer_plan.update(mlp_plan) + + # Some models like Phi-2 don't have post_attention_layernorm + if not hasattr(transformer_block, "post_attention_layernorm"): + layer_plan.pop("post_attention_layernorm") parallelize_module( module=transformer_block, diff --git a/torchtitan/experiments/transformers_backend/model/hf_llama_like_patch.py b/torchtitan/experiments/transformers_backend/model/hf_llama_like_patch.py new file mode 100644 index 0000000000..563c5e289b --- /dev/null +++ b/torchtitan/experiments/transformers_backend/model/hf_llama_like_patch.py @@ -0,0 +1,165 @@ +import torch +import torch.nn as nn +from transformers.configuration_utils import PretrainedConfig +from transformers.modeling_utils import PreTrainedModel +import math +from torch.nn import init + + +def patch_hf_llama_like(decoder_layer_cls, attention_cls, mlp_cls=None): + """ + This patch modifies a Hugging Face Llama-like model's weight initialization to match + the initialization scheme used in TorchTitan. This is crucial for ensuring + bit-for-bit reproducibility when converting checkpoints between the native + TorchTitan format and the Hugging Face format. + + The patch targets the following aspects of the model: + - `PreTrainedModel._initialize_weights`: Handles meta device initialization correctly. + - `PreTrainedModel._init_weights`: Implements TorchTitan's specific initialization + for attention, MLP, embedding, and layer norm layers. This includes depth-dependent + initialization for attention and MLP layers. + - `DecoderLayer.__init__`: Adds `layer_idx` to attention and MLP modules within + each decoder layer, which is required for the depth-dependent initialization. + + By applying this patch, we can ensure that a model loaded in the transformers + backend will have the exact same weights as a model trained with the native + TorchTitan backend, which is essential for seamless conversion and debugging. 
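+
+    A minimal usage sketch (the class names are the stock Llama ones from
+    transformers.models.llama.modeling_llama; any Llama-like trio works, and
+    mlp_cls may be left as None for models without a per-layer MLP class):
+
+        patch_hf_llama_like(
+            decoder_layer_cls=LlamaDecoderLayer,
+            attention_cls=LlamaAttention,
+            mlp_cls=LlamaMLP,
+        )
+
+    The patch must run before the model is instantiated so that the overridden
+    DecoderLayer.__init__ and PreTrainedModel._init_weights take effect.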
+ """ + + _original_decoder_layer_init = decoder_layer_cls.__init__ + + def _decoder_layer_init_patched(self, config: PretrainedConfig, layer_idx: int): + _original_decoder_layer_init(self, config, layer_idx) + self.layer_idx = layer_idx + # Ensure both attention and mlp modules have layer_idx for depth-based init + if hasattr(self, "self_attn"): + self.self_attn.layer_idx = layer_idx + # some models might not have mlp in each layer + if hasattr(self, "mlp") and self.mlp is not None: + self.mlp.layer_idx = layer_idx + + def _initialize_weights_patched(self, module): + # NOTE(3outeille): monkey-patch PreTrainedModel to handle meta device initialization correctly + # The default _initialize_weights sets _is_hf_initialized = True even on a meta device, + # which prevents subsequent proper initialization. + if getattr(module, "_is_hf_initialized", False): + return + + for param in module.parameters(recurse=True): + if param.device.type == "meta": + return + + # If not on a meta device, call the original weight initialization + self._init_weights(module) + module._is_hf_initialized = True + + def _init_weights_patched(self, module): + """ + Patched version of _init_weights to match TorchTitan's initialization for Llama-like models. + `self` is a PreTrainedModel instance. + """ + config = self.config + + # check if layer is (resid_dropout): Dropout(p=0.1, inplace=False) + if hasattr(module, "resid_dropout"): + print() + + # Build tuple of classes to check for layer_idx-based init_std calculation + layer_idx_classes = [attention_cls] + if mlp_cls: + layer_idx_classes.append(mlp_cls) + layer_idx_classes = tuple(layer_idx_classes) + + if isinstance(module, layer_idx_classes): + if not hasattr(module, "layer_idx"): + return + layer_idx = module.layer_idx + + if hasattr(config, "depth_init") and config.depth_init: + init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 + else: + init_std = 0.02 / (2 * config.num_hidden_layers) ** 0.5 + + if isinstance(module, attention_cls): + # Initialize weights and biases for q, k, v projections + for proj_name in ["q_proj", "k_proj", "v_proj"]: + proj = getattr(module, proj_name) + nn.init.trunc_normal_(proj.weight, mean=0.0, std=0.02) + if proj.bias is not None: + fan_in, _ = init._calculate_fan_in_and_fan_out(proj.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + init.uniform_(proj.bias, -bound, bound) + + # Handle different names for the output projection layer + o_proj = getattr(module, "o_proj", getattr(module, "dense", None)) + if o_proj is not None: + nn.init.trunc_normal_(o_proj.weight, mean=0.0, std=init_std) + if o_proj.bias is not None: + fan_in, _ = init._calculate_fan_in_and_fan_out(o_proj.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + init.uniform_(o_proj.bias, -bound, bound) + + elif mlp_cls and isinstance(module, mlp_cls): + # Handle different names for MLP layers + gate_proj = getattr(module, "gate_proj", getattr(module, "fc1", None)) + up_proj = getattr(module, "up_proj", None) + down_proj = getattr(module, "down_proj", getattr(module, "fc2", None)) + + # gate_proj (or fc1) should always use std=0.02 for numerical stability. + if gate_proj is not None: + nn.init.trunc_normal_(gate_proj.weight, mean=0.0, std=0.02) + if gate_proj.bias is not None: + fan_in, _ = init._calculate_fan_in_and_fan_out(gate_proj.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + init.uniform_(gate_proj.bias, -bound, bound) + # up_proj and down_proj (or fc2) use the depth-dependent init_std. 
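+            # (with depth_init, init_std = 0.02 / sqrt(2 * (layer_idx + 1)), as
+            # computed at the top of this function)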
+ if up_proj is not None: + nn.init.trunc_normal_(up_proj.weight, mean=0.0, std=init_std) + if up_proj.bias is not None: + fan_in, _ = init._calculate_fan_in_and_fan_out(up_proj.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + init.uniform_(up_proj.bias, -bound, bound) + if down_proj is not None: + nn.init.trunc_normal_(down_proj.weight, mean=0.0, std=init_std) + if down_proj.bias is not None: + fan_in, _ = init._calculate_fan_in_and_fan_out(down_proj.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + init.uniform_(down_proj.bias, -bound, bound) + + elif module is getattr( + self, "lm_head", None + ): # TODO(3outeille): find a better way to detect lm_head + final_out_std = config.hidden_size**-0.5 + cutoff_factor = 3 + nn.init.trunc_normal_( + module.weight, + mean=0.0, + std=final_out_std, + a=-cutoff_factor * final_out_std, + b=cutoff_factor * final_out_std, + ) + if module.bias is not None: + module.bias.data.zero_() + + elif isinstance(module, nn.Embedding): + std = config.initializer_range + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + elif ( + isinstance( + module, (nn.GroupNorm, nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d) + ) + or "LayerNorm" in module.__class__.__name__ + or "RMSNorm" in module.__class__.__name__ + ): + # Norms can exist without weights (in which case they are None from torch primitives) + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(1.0) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.zero_() + + decoder_layer_cls.__init__ = _decoder_layer_init_patched + PreTrainedModel._init_weights = _init_weights_patched + PreTrainedModel._initialize_weights = _initialize_weights_patched diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 7bc444f1eb..db1880b1dc 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -16,8 +16,7 @@ from transformers.configuration_utils import PretrainedConfig from transformers.modeling_utils import AttentionInterface from transformers.integrations.sdpa_attention import sdpa_attention_forward -from torchtitan.experiments.transformers_backend.model.hf_llama_patch import patch_hf_llama -from torchtitan.experiments.transformers_backend.model.hf_deepseek_v3_patch import patch_hf_deepseek_v3 +from torchtitan.experiments.transformers_backend.model.hf_llama_like_patch import patch_hf_llama_like @dataclass class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): @@ -55,6 +54,7 @@ def __init__( attn_implementation: str = "sdpa_torchtitan", **kwargs, ): + super().__init__(attn_implementation=attn_implementation, **kwargs) assert titan_args is not None, "titan_args is required" active_mappings = {} @@ -68,6 +68,11 @@ def __init__( self._create_dynamic_properties() + # Set HF attributes from titan_args based on mappings + for titan_name, hf_name in self._active_mappings.items(): + if hasattr(titan_args, titan_name): + setattr(self, hf_name, getattr(titan_args, titan_name)) + # Fill all TorchTitan-specific args (no HF equivalent) self.multiple_of = titan_args.multiple_of self.ffn_dim_multiplier = titan_args.ffn_dim_multiplier @@ -95,6 +100,7 @@ def __init__( self._passed_args.update(**deepseek_v3_args.__dict__) + self.rope_interleave = 
deepseek_v3_args.rope_interleave self.partial_rotary_factor = deepseek_v3_args.partial_rotary_factor if deepseek_v3_args.moe_args is not None: @@ -132,7 +138,7 @@ def __repr__(self) -> str: # doesn't work well with how HFTransformerModelArgs is initialized. # This custom __repr__ provides a dataclass-like representation that correctly # displays the arguments passed during initialization. - args_lines = [f"{k}={v!r}" for k, v in sorted(self._passed_args.items())] + args_lines = [f"{k}={getattr(self, k)!r}" for k in sorted(self._passed_args.keys())] args_str = "\n".join(args_lines) return f"{self.__class__.__name__}(\n{args_str}\n)" @@ -141,15 +147,24 @@ def update_from_config(self, job_config: JobConfig): hf_model_config = AutoConfig.from_pretrained( job_config.model.name, attn_implementation=self.attn_implementation, + trust_remote_code=True ) - self.__dict__.update(hf_model_config.__dict__) - + # Explicitly update attributes based on mappings + for titan_name, hf_name in self._active_mappings.items(): + if hasattr(hf_model_config, hf_name): + setattr(self, titan_name, getattr(hf_model_config, hf_name)) + + # Copy any other attributes that might not be in the mapping + # This is safer than a direct __dict__ update + for key, value in hf_model_config.to_dict().items(): + setattr(self, key, value) + # Update our attributes with the passed args from flavors for key, value in self._passed_args.items(): - if hasattr(self, key): + if hasattr(self, key) and value is not None: setattr(self, key, value) - + # MoE if hasattr(self, "qk_nope_head_dim") and hasattr(self, "qk_rope_head_dim"): self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim @@ -265,12 +280,40 @@ def __init__(self, model_args: HFTransformerModelArgs): f"Make sure the class is available. Original error: {e}" ) - if model_args.architectures[0] == "DeepseekV3Model": - print("Patching deepseek") - patch_hf_deepseek_v3() - else: - print("Patching llama") - patch_hf_llama() + # agnostic weight initialization patching + try: + model_name_prefix = model_class_name.replace("ForCausalLM", "") + model_module = importlib.import_module(model_cls.__module__) + + attention_cls = getattr(model_module, f"{model_name_prefix}Attention", None) + mlp_cls = getattr(model_module, f"{model_name_prefix}MLP", None) + decoder_layer_cls = getattr(model_module, f"{model_name_prefix}DecoderLayer", None) + + if all([attention_cls, decoder_layer_cls]): + logger.info(f"Applying Llama-like patch for {model_name_prefix}") + patch_hf_llama_like( + decoder_layer_cls=decoder_layer_cls, + attention_cls=attention_cls, + mlp_cls=mlp_cls, # mlp_cls can be None + ) + else: + missing = [ + cls_name + for cls, cls_name in [ + (attention_cls, "Attention"), + (decoder_layer_cls, "DecoderLayer"), + ] + if not cls + ] + logger.warning( + f"Could not find required classes ({', '.join(missing)}) for {model_name_prefix}. " + "Skipping Llama-like patch." + ) + except Exception as e: + logger.warning( + f"Failed to apply agnostic patch for {model_class_name} due to: {e}. " + "Weight initialization might not match TorchTitan." 
+ ) self.model = model_cls(config=model_args) self.max_seq_len = model_args.max_seq_len @@ -330,6 +373,8 @@ def norm(self): """Returns the model's norm, handling different Hugging Face model structures.""" if hasattr(self.model, "model") and hasattr(self.model.model, "norm"): # Llama-like return self.model.model.norm + elif hasattr(self.model, "model") and hasattr(self.model.model, "final_layernorm"): # Phi-like + return self.model.model.final_layernorm else: raise AttributeError("Could not find norm in the model. Please check the model structure.") @@ -337,6 +382,8 @@ def norm(self): def norm(self, value): if hasattr(self.model, "model") and hasattr(self.model.model, "norm"): # Llama-like setattr(self.model.model, "norm", value) + elif hasattr(self.model, "model") and hasattr(self.model.model, "final_layernorm"): # Phi-like + setattr(self.model.model, "final_layernorm", value) else: raise AttributeError("Could not find norm in the model. Please check the model structure.") diff --git a/torchtitan/experiments/transformers_backend/test_hf_integration.py b/torchtitan/experiments/transformers_backend/test_hf_integration.py index d886549ff0..4838133618 100644 --- a/torchtitan/experiments/transformers_backend/test_hf_integration.py +++ b/torchtitan/experiments/transformers_backend/test_hf_integration.py @@ -11,8 +11,8 @@ from rich.table import Table from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn -BASELINE = "fsdp2_tp1_cp1_pp1" -# BASELINE = "fsdp1_tp1_cp1_pp1" +# BASELINE = "fsdp2_tp1_cp1_pp1" +BASELINE = "fsdp1_tp1_cp1_pp1" console = Console() @@ -151,28 +151,32 @@ def create_configs(model_name: str, out_dir: str, flavor: str): config["model"]["name"] = model_name config["model"]["flavor"] = flavor - parallelism_configs = [ - BASELINE, # baseline - "fsdp2_tp2_cp1_pp1", - "fsdp2_tp1_cp1_pp2", - "fsdp2_tp1_cp2_pp1", - "fsdp2_tp1_cp2_pp2", - "fsdp2_tp2_cp2_pp1", - "fsdp2_tp2_cp1_pp2", - "fsdp2_tp2_cp2_pp2", - ] + # parallelism_configs = [ + # BASELINE, # baseline + # "fsdp2_tp2_cp1_pp1", + # # "fsdp2_tp1_cp1_pp2", + # # "fsdp2_tp1_cp2_pp1", + # # "fsdp2_tp1_cp2_pp2", + # # "fsdp2_tp2_cp2_pp1", + # # "fsdp2_tp2_cp1_pp2", + # # "fsdp2_tp2_cp2_pp2", + # ] # parallelism_configs = [ # BASELINE, # baseline - # "fsdp1_tp2_cp1_pp1", - # "fsdp1_tp1_cp1_pp2", - # "fsdp1_tp1_cp2_pp1", - # "fsdp1_tp1_cp2_pp2", - # "fsdp1_tp2_cp2_pp1", - # "fsdp1_tp2_cp1_pp2", - # "fsdp1_tp2_cp2_pp2", + # # "fsdp1_tp2_cp1_pp1", + # # "fsdp1_tp1_cp1_pp2", + # # "fsdp1_tp1_cp2_pp1", + # # "fsdp1_tp1_cp2_pp2", + # # "fsdp1_tp2_cp2_pp1", + # # "fsdp1_tp2_cp1_pp2", + # # "fsdp1_tp2_cp2_pp2", # ] + parallelism_configs = [ + BASELINE, # baseline + "fsdp1_tp2_cp1_pp1", + ] out_path = Path(out_dir) / model_name / flavor out_path.mkdir(parents=True, exist_ok=True) @@ -219,6 +223,11 @@ def create_configs(model_name: str, out_dir: str, flavor: str): iter_config["parallelism"]["pipeline_parallel_degree"] = pp iter_config["parallelism"]["pipeline_parallel_schedule"] = "GPipe" iter_config["job"]["dump_folder"] = str(pc_dir) + + # if pc == "fsdp1_tp1_cp1_pp2" or pc == BASELINE: + # iter_config["training"]["global_batch_size"] = 1 + # iter_config["training"]["local_batch_size"] = 1 + if pc == BASELINE or pc == "fsdp2_tp1_cp1_pp2": iter_config["training"]["local_batch_size"] = 2 @@ -379,7 +388,7 @@ def check_status(inp_dir: str): print("=" * 30) -def report(inp_dir: str): +def report(inp_dir: str, only: str = None): """ Generate diff reports between baseline (fsdp2_tp1_cp1_pp1) and all other 
parallelism configs. Creates diff_baseline_vs_nd_parallelism.log in each non-baseline config directory. @@ -673,9 +682,20 @@ def _process_flavor_dir(flavor_dir: Path) -> tuple[int, int]: if BASELINE in dirs: flavor_dirs.append(Path(root)) + # Filter by --only if provided + if only: + original_count = len(flavor_dirs) + flavor_dirs = [ + d for d in flavor_dirs if only in str(d.relative_to(inp_path)) + ] + log_message( + LogLevel.INFO, + f"Filtered from {original_count} to {len(flavor_dirs)} director{'ies' if len(flavor_dirs) != 1 else 'y'} matching '[bold]{only}[/bold]'", + ) + if not flavor_dirs: log_message(LogLevel.ERROR, f"No directories with baseline configuration found under {inp_path}") - console.print("[yellow]Expected to find directories containing 'fsdp2_tp1_cp1_pp1' subdirectory[/yellow]") + console.print("[yellow]Expected to find directories containing 'fsdp2_tp1_cp1' subdirectory[/yellow]") return log_message(LogLevel.INFO, f"Found {len(flavor_dirs)} model/flavor combination(s) to process:") @@ -737,6 +757,7 @@ def _process_flavor_dir(flavor_dir: Path) -> tuple[int, int]: report_parser = subparsers.add_parser("report") report_parser.add_argument("--inp_dir", type=str, required=True) + report_parser.add_argument("--only", type=str, default=None) check_status_parser = subparsers.add_parser("check_status") check_status_parser.add_argument("--inp_dir", type=str, required=True) @@ -748,6 +769,6 @@ def _process_flavor_dir(flavor_dir: Path) -> tuple[int, int]: elif args.action == "submit_jobs": submit_jobs(args.inp_dir, args.qos, args.only) elif args.action == "report": - report(args.inp_dir) + report(args.inp_dir, args.only) elif args.action == "check_status": check_status(args.inp_dir) \ No newline at end of file diff --git a/torchtitan/utils/test_utils.py b/torchtitan/utils/test_utils.py index 77db8bcfe6..efb8ac478d 100644 --- a/torchtitan/utils/test_utils.py +++ b/torchtitan/utils/test_utils.py @@ -42,8 +42,11 @@ def seeded_trunc_normal(*trunc_args, **trunc_kwargs): result = original_trunc_normal(*trunc_args, **trunc_kwargs) return result - nn.init.trunc_normal_ = seeded_trunc_normal - return func(*args, **kwargs) + try: + nn.init.trunc_normal_ = seeded_trunc_normal + return func(*args, **kwargs) + finally: + nn.init.trunc_normal_ = original_trunc_normal return wrapper return decorator From 3168f9e2785cc5589226059ea2dc01481577641c Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 15 Oct 2025 08:50:13 +0000 Subject: [PATCH 063/129] support moe init and fix with moe layer (TP for lora layers) --- .../infra/parallelize_hf_transformers.py | 32 ++++- .../model/hf_moe_like_patch.py | 135 ++++++++++++++++++ .../model/hf_transformers_args.py | 74 +++++++--- 3 files changed, 215 insertions(+), 26 deletions(-) create mode 100644 torchtitan/experiments/transformers_backend/model/hf_moe_like_patch.py diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py index 16e33251ae..422b307cd4 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -370,18 +370,44 @@ def apply_non_moe_tp( # Apply tensor + sequence parallelism to every transformer block for transformer_block in model.layers: + is_deepseek_v3 = "deepseek_v3" in transformer_block.self_attn.__class__.__module__ layer_plan = { "input_layernorm": SequenceParallel(), "self_attn": 
prepare_module_input( input_kwarg_layouts={"hidden_states": Shard(1)}, desired_input_kwarg_layouts={"hidden_states": Replicate()}, ), - "self_attn.q_proj": colwise_parallel(), - "self_attn.k_proj": colwise_parallel(), - "self_attn.v_proj": colwise_parallel(), "post_attention_layernorm": SequenceParallel(), } + if is_deepseek_v3: + if getattr(transformer_block.self_attn, "q_lora_rank", None) is None: + layer_plan["self_attn.q_proj"] = colwise_parallel() + else: + layer_plan.update({ + "self_attn.q_a_proj": NoParallel(), + "self_attn.q_a_layernorm": NoParallel(), + "self_attn.q_b_proj": colwise_parallel(), + }) + + if getattr(transformer_block.self_attn, "kv_lora_rank", None) is None: + layer_plan.update({ + "self_attn.k_proj": colwise_parallel(), + "self_attn.v_proj": colwise_parallel(), + }) + else: + layer_plan.update({ + "self_attn.kv_a_proj_with_mqa": NoParallel(), + "self_attn.kv_a_layernorm": NoParallel(), + "self_attn.kv_b_proj": colwise_parallel(), + }) + else: + layer_plan.update({ + "self_attn.q_proj": colwise_parallel(), + "self_attn.k_proj": colwise_parallel(), + "self_attn.v_proj": colwise_parallel(), + }) + # Handle different names for the output projection layer, e.g. o_proj vs dense o_proj_name = "o_proj" if hasattr(transformer_block.self_attn, "o_proj") else "dense" layer_plan[f"self_attn.{o_proj_name}"] = rowwise_parallel(output_layouts=Shard(1)) diff --git a/torchtitan/experiments/transformers_backend/model/hf_moe_like_patch.py b/torchtitan/experiments/transformers_backend/model/hf_moe_like_patch.py new file mode 100644 index 0000000000..dc18e0b455 --- /dev/null +++ b/torchtitan/experiments/transformers_backend/model/hf_moe_like_patch.py @@ -0,0 +1,135 @@ +import torch.nn as nn +from transformers.configuration_utils import PretrainedConfig +from transformers.modeling_utils import PreTrainedModel + + +def patch_hf_moe_like(decoder_layer_cls, attention_cls, mlp_cls, moe_cls): + """ + This patch modifies a Hugging Face MoE (Mixture-of-Experts) model's weight + initialization to match the initialization scheme used in TorchTitan, + drawing from patterns in models like DeepseekV3. + + The patch targets: + - `PreTrainedModel._initialize_weights`: For correct meta device initialization. + - `PreTrainedModel._init_weights`: To implement TorchTitan's specific initialization + for attention, MLP, MoE, embedding, and layer norm layers. + - `DecoderLayer.__init__`: Adds `layer_idx` to attention, MLP, and MoE expert + modules, required for depth-dependent initialization. + """ + + _original_decoder_layer_init = decoder_layer_cls.__init__ + + def _decoder_layer_init_patched(self, config: PretrainedConfig, layer_idx: int): + _original_decoder_layer_init(self, config, layer_idx) + self.layer_idx = layer_idx + + if hasattr(self, "self_attn"): + self.self_attn.layer_idx = layer_idx + + if hasattr(self, "mlp"): + self.mlp.layer_idx = layer_idx + if hasattr(self.mlp, "experts"): + for expert in self.mlp.experts: + expert.layer_idx = layer_idx + if hasattr(self.mlp, "shared_experts"): + # Not all MoE models have shared experts + if self.mlp.shared_experts is not None: + self.mlp.shared_experts.layer_idx = layer_idx + + def _initialize_weights_patched(self, module): + if getattr(module, "_is_hf_initialized", False): + return + for param in module.parameters(recurse=True): + if param.device.type == "meta": + return + self._init_weights(module) + module._is_hf_initialized = True + + def _init_weights_patched(self, module): + """ + Patched version of _init_weights for MoE models. 
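+        Covers attention projections (including the q/kv LoRA variants),
+        dense MLPs, MoE gates/experts/shared experts, lm_head, embeddings,
+        and norm layers; see the branches below.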
+ """ + config = self.config + init_std = None + + if isinstance(module, (attention_cls, mlp_cls, moe_cls)): + if hasattr(module, "layer_idx"): + layer_idx = module.layer_idx + if hasattr(config, "depth_init") and config.depth_init: + init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 + else: + # Fallback for models without depth_init + init_std = 0.02 / (2 * config.num_hidden_layers) ** 0.5 + + if isinstance(module, attention_cls): + # Handle different attention projection layer names by initializing if they exist + if hasattr(module, "q_proj"): + nn.init.trunc_normal_(module.q_proj.weight, mean=0.0, std=0.02) + if hasattr(module, "k_proj"): + nn.init.trunc_normal_(module.k_proj.weight, mean=0.0, std=0.02) + if hasattr(module, "v_proj"): + nn.init.trunc_normal_(module.v_proj.weight, mean=0.0, std=0.02) + + if hasattr(module, "q_a_proj"): + nn.init.trunc_normal_(module.q_a_proj.weight, mean=0.0, std=0.02) + if hasattr(module, "q_b_proj"): + nn.init.trunc_normal_(module.q_b_proj.weight, mean=0.0, std=0.02) + + if hasattr(module, "kv_a_proj_with_mqa"): + nn.init.trunc_normal_(module.kv_a_proj_with_mqa.weight, mean=0.0, std=0.02) + if hasattr(module, "kv_b_proj"): + nn.init.trunc_normal_(module.kv_b_proj.weight, mean=0.0, std=0.02) + + if hasattr(module, "o_proj") and init_std is not None: + nn.init.trunc_normal_(module.o_proj.weight, mean=0.0, std=init_std) + + elif isinstance(module, mlp_cls): + nn.init.trunc_normal_(module.gate_proj.weight, mean=0.0, std=0.02) + # DeepseekV3 uses std=0.02 for up_proj, unlike Llama + nn.init.trunc_normal_(module.up_proj.weight, mean=0.0, std=0.02) + if init_std is not None: + nn.init.trunc_normal_(module.down_proj.weight, mean=0.0, std=init_std) + + elif isinstance(module, moe_cls): + if hasattr(module, "gate") and init_std is not None: + nn.init.trunc_normal_(module.gate.weight, mean=0.0, std=init_std) + if hasattr(module, "experts"): + for expert in module.experts: + nn.init.trunc_normal_(expert.gate_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(expert.up_proj.weight, mean=0.0, std=0.02) + if init_std is not None: + nn.init.trunc_normal_(expert.down_proj.weight, mean=0.0, std=init_std) + if hasattr(module, "shared_experts") and module.shared_experts is not None: + nn.init.trunc_normal_(module.shared_experts.gate_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(module.shared_experts.up_proj.weight, mean=0.0, std=0.02) + if init_std is not None: + nn.init.trunc_normal_(module.shared_experts.down_proj.weight, mean=0.0, std=init_std) + + elif module is getattr(self, "lm_head", None): + final_out_std = config.hidden_size**-0.5 + cutoff_factor = 3 + nn.init.trunc_normal_( + module.weight, + mean=0.0, + std=final_out_std, + a=-cutoff_factor * final_out_std, + b=cutoff_factor * final_out_std, + ) + if module.bias is not None: + module.bias.data.zero_() + + elif isinstance(module, nn.Embedding): + std = config.initializer_range + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + elif "LayerNorm" in module.__class__.__name__ or "RMSNorm" in module.__class__.__name__: + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(1.0) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.zero_() + + decoder_layer_cls.__init__ = _decoder_layer_init_patched + PreTrainedModel._init_weights = _init_weights_patched + PreTrainedModel._initialize_weights = _initialize_weights_patched diff --git 
a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index db1880b1dc..4bc65aa0d2 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -17,6 +17,7 @@ from transformers.modeling_utils import AttentionInterface from transformers.integrations.sdpa_attention import sdpa_attention_forward from torchtitan.experiments.transformers_backend.model.hf_llama_like_patch import patch_hf_llama_like +from torchtitan.experiments.transformers_backend.model.hf_moe_like_patch import patch_hf_moe_like @dataclass class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): @@ -138,7 +139,11 @@ def __repr__(self) -> str: # doesn't work well with how HFTransformerModelArgs is initialized. # This custom __repr__ provides a dataclass-like representation that correctly # displays the arguments passed during initialization. - args_lines = [f"{k}={getattr(self, k)!r}" for k in sorted(self._passed_args.keys())] + args_lines = [ + f"{k}={getattr(self, k)!r}" + for k in sorted(self._passed_args.keys()) + if hasattr(self, k) + ] args_str = "\n".join(args_lines) return f"{self.__class__.__name__}(\n{args_str}\n)" @@ -156,7 +161,6 @@ def update_from_config(self, job_config: JobConfig): setattr(self, titan_name, getattr(hf_model_config, hf_name)) # Copy any other attributes that might not be in the mapping - # This is safer than a direct __dict__ update for key, value in hf_model_config.to_dict().items(): setattr(self, key, value) @@ -191,7 +195,7 @@ def update_from_config(self, job_config: JobConfig): def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: # Check if this is a MoE model by looking for MoE attributes - is_moe = hasattr(self, 'n_routed_experts') and hasattr(self, 'num_experts_per_tok') + is_moe = hasattr(self, 'n_routed_experts') if is_moe: # MoE parameter counting (adapted from DeepSeek V3 implementation) @@ -280,7 +284,7 @@ def __init__(self, model_args: HFTransformerModelArgs): f"Make sure the class is available. 
Original error: {e}" ) - # agnostic weight initialization patching + # Attempt to patch model weight initialization based on architecture type try: model_name_prefix = model_class_name.replace("ForCausalLM", "") model_module = importlib.import_module(model_cls.__module__) @@ -289,26 +293,50 @@ def __init__(self, model_args: HFTransformerModelArgs): mlp_cls = getattr(model_module, f"{model_name_prefix}MLP", None) decoder_layer_cls = getattr(model_module, f"{model_name_prefix}DecoderLayer", None) - if all([attention_cls, decoder_layer_cls]): - logger.info(f"Applying Llama-like patch for {model_name_prefix}") - patch_hf_llama_like( - decoder_layer_cls=decoder_layer_cls, - attention_cls=attention_cls, - mlp_cls=mlp_cls, # mlp_cls can be None - ) + is_moe = hasattr(model_args, "n_routed_experts") #TODO(3outeille): check if this is the most reliable to detect a moe model + if is_moe: + moe_cls = getattr(model_module, f"{model_name_prefix}MoE", None) + required_classes = { + "Attention": attention_cls, + "MLP": mlp_cls, + "DecoderLayer": decoder_layer_cls, + "MoE": moe_cls + } + + if all(required_classes.values()): + logger.info(f"Applying MoE-like patch for {model_name_prefix}") + patch_hf_moe_like( + decoder_layer_cls=decoder_layer_cls, + attention_cls=attention_cls, + mlp_cls=mlp_cls, + moe_cls=moe_cls + ) + else: + missing = [name for name, cls in required_classes.items() if not cls] + logger.warning( + f"Could not find required classes ({', '.join(missing)}) for MoE patching of {model_name_prefix}. " + "Skipping MoE-like patch." + ) else: - missing = [ - cls_name - for cls, cls_name in [ - (attention_cls, "Attention"), - (decoder_layer_cls, "DecoderLayer"), - ] - if not cls - ] - logger.warning( - f"Could not find required classes ({', '.join(missing)}) for {model_name_prefix}. " - "Skipping Llama-like patch." - ) + required_classes = { + "Attention": attention_cls, + "DecoderLayer": decoder_layer_cls + } + + if all(required_classes.values()): + logger.info(f"Applying Llama-like patch for {model_name_prefix}") + patch_hf_llama_like( + decoder_layer_cls=decoder_layer_cls, + attention_cls=attention_cls, + mlp_cls=mlp_cls # mlp_cls can be None + ) + else: + missing = [name for name, cls in required_classes.items() if not cls] + logger.warning( + f"Could not find required classes ({', '.join(missing)}) for {model_name_prefix}. " + "Skipping Llama-like patch." + ) + except Exception as e: logger.warning( f"Failed to apply agnostic patch for {model_class_name} due to: {e}. 
" From a9a65b7b95188cab173056271f03aa6a70fa9d8a Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 15 Oct 2025 13:17:13 +0000 Subject: [PATCH 064/129] begin TP + EP with MoE model --- .../infra/parallelize_hf_transformers.py | 119 +++++++++--------- 1 file changed, 63 insertions(+), 56 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py index 422b307cd4..1bfe6ab779 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -370,7 +370,6 @@ def apply_non_moe_tp( # Apply tensor + sequence parallelism to every transformer block for transformer_block in model.layers: - is_deepseek_v3 = "deepseek_v3" in transformer_block.self_attn.__class__.__module__ layer_plan = { "input_layernorm": SequenceParallel(), "self_attn": prepare_module_input( @@ -380,37 +379,32 @@ def apply_non_moe_tp( "post_attention_layernorm": SequenceParallel(), } - if is_deepseek_v3: - if getattr(transformer_block.self_attn, "q_lora_rank", None) is None: - layer_plan["self_attn.q_proj"] = colwise_parallel() - else: - layer_plan.update({ - "self_attn.q_a_proj": NoParallel(), - "self_attn.q_a_layernorm": NoParallel(), - "self_attn.q_b_proj": colwise_parallel(), - }) - - if getattr(transformer_block.self_attn, "kv_lora_rank", None) is None: - layer_plan.update({ - "self_attn.k_proj": colwise_parallel(), - "self_attn.v_proj": colwise_parallel(), - }) - else: - layer_plan.update({ - "self_attn.kv_a_proj_with_mqa": NoParallel(), - "self_attn.kv_a_layernorm": NoParallel(), - "self_attn.kv_b_proj": colwise_parallel(), - }) - else: + if getattr(transformer_block.self_attn, "q_lora_rank", None) is None: layer_plan.update({ "self_attn.q_proj": colwise_parallel(), "self_attn.k_proj": colwise_parallel(), "self_attn.v_proj": colwise_parallel(), }) + else: + layer_plan.update({ + "self_attn.q_a_proj": NoParallel(), + "self_attn.q_a_layernorm": NoParallel(), + "self_attn.q_b_proj": colwise_parallel(), + "self_attn.kv_a_proj_with_mqa": NoParallel(), + "self_attn.kv_a_layernorm": NoParallel(), + "self_attn.kv_b_proj": colwise_parallel(), + }) # Handle different names for the output projection layer, e.g. o_proj vs dense o_proj_name = "o_proj" if hasattr(transformer_block.self_attn, "o_proj") else "dense" layer_plan[f"self_attn.{o_proj_name}"] = rowwise_parallel(output_layouts=Shard(1)) + + # For Qwen3 RMSNorm on Q and K + # TODO(3outeille): we should probably shard(1) then replicate => then use SequenceParallel but for now I am fed up + if hasattr(transformer_block.self_attn, "q_norm"): + layer_plan["self_attn.q_norm"] = NoParallel() + if hasattr(transformer_block.self_attn, "k_norm"): + layer_plan["self_attn.k_norm"] = NoParallel() if not transformer_block.moe_enabled: mlp_plan = { @@ -508,7 +502,7 @@ def apply_fsdp( if hasattr(transformer_block, "moe_enabled") and transformer_block.moe_enabled and ep_degree > 1: fsdp_mod_ep_config = fsdp_config.copy() fsdp_mod_ep_config["mesh"] = dp_mod_ep_mesh - + moe_block = transformer_block.mlp # NOTE: EP alreadys shards the routed experts on dim 0 (num_experts). # When dp_mod_ep * ep > num_experts, FSDP default dim-0 sharding # causes inefficiency, so we choose to do FSDP sharding on dim-1. @@ -517,15 +511,14 @@ def apply_fsdp( # shard_placement_fn on the outer TransformerBlock-level FSDP. 
_experts_shard_placement_fn = None assert dp_mod_ep_mesh is not None - assert hasattr(transformer_block, "moe") if ( dp_mod_ep_mesh.size() * ep_degree - > transformer_block.moe.experts.num_experts + > moe_block.experts.num_experts ): _experts_shard_placement_fn = lambda param: Shard(1) fully_shard( - transformer_block.moe.experts, + moe_block.experts, **fsdp_mod_ep_config, reshard_after_forward=reshard_after_forward, shard_placement_fn=_experts_shard_placement_fn, @@ -534,7 +527,7 @@ def apply_fsdp( # NOTE: # Although the FSDP sharding of experts is done on a mesh of # a different size than other parameters, the gradient division # factor should be consistent with data. - transformer_block.moe.experts.set_gradient_divide_factor( + moe_block.experts.set_gradient_divide_factor( gradient_divide_factor, ) @@ -573,7 +566,7 @@ def apply_fsdp( if next_transformer_block is not None: if next_transformer_block.moe_enabled: transformer_block.set_modules_to_forward_prefetch( - [next_transformer_block, next_transformer_block.moe.experts] + [next_transformer_block, next_transformer_block.mlp.experts] ) else: transformer_block.set_modules_to_forward_prefetch( @@ -597,7 +590,7 @@ def apply_fsdp( if prev_transformer_block is not None: if prev_transformer_block.moe_enabled: transformer_block.set_modules_to_backward_prefetch( - [prev_transformer_block, prev_transformer_block.moe.experts] + [prev_transformer_block, prev_transformer_block.mlp.experts] ) else: transformer_block.set_modules_to_backward_prefetch( @@ -618,11 +611,12 @@ def apply_moe_ep_tp( if not transformer_block.moe_enabled: continue + moe_block = transformer_block.mlp if tp_mesh is not None: moe_layer_plan = { # input / output sharding on the seqlen dim # all-gather for input, reduce-scatter for output - "moe": PrepareModuleInputOutput( + "mlp": PrepareModuleInputOutput( input_layouts=(Shard(1),), desired_input_layouts=(Replicate(),), use_local_input=True, @@ -630,22 +624,22 @@ def apply_moe_ep_tp( desired_output_layouts=(Shard(1),), ), # replicate computation for the router - "moe.router.gate": NoParallel(), + "mlp.gate": NoParallel(), } if ep_mesh is not None and not etp_enabled: # If TP is borrowed for EP, then split the tokens across TP ranks so that # the reorderer, the all-to-all comms, and routed experts computation # are effectively running Sequence Parallel (split along the folded bs*slen dim) - moe_layer_plan.update({"moe.reorderer": ReordererSequenceParallel()}) - if transformer_block.moe.shared_experts is not None: + moe_layer_plan.update({"mlp.reorderer": ReordererSequenceParallel()}) + if moe_block.shared_experts is not None: # input Replicate, output Partial moe_layer_plan.update( { - "moe.shared_experts.w1": ColwiseParallel(), - "moe.shared_experts.w2": RowwiseParallel( + "mlp.shared_experts.gate_proj": ColwiseParallel(), + "mlp.shared_experts.up_proj": ColwiseParallel(), + "mlp.shared_experts.down_proj": RowwiseParallel( output_layouts=Partial() ), - "moe.shared_experts.w3": ColwiseParallel(), } ) parallelize_module( @@ -654,27 +648,40 @@ def apply_moe_ep_tp( parallelize_plan=moe_layer_plan, ) - experts_mesh, experts_plan = None, None - if ep_mesh is None: + if ep_mesh is None: # This is the TP-only case for experts experts_mesh = tp_mesh - # input Replicate, output Partial - experts_plan = TensorParallel() - elif tp_mesh is None: - experts_mesh = ep_mesh - # input / output sharding on the batch / tokens dim - experts_plan = ExpertParallel() - elif etp_enabled: - experts_mesh = ep_tp_mesh - experts_plan = 
ExpertTensorParallel(tp_mesh=tp_mesh, ep_mesh=ep_mesh) - else: - experts_mesh = ep_mesh - experts_plan = ExpertParallel() + expert_tp_plan = {} + for i in range(len(moe_block.experts)): + expert_tp_plan.update( + { + f"{i}.gate_proj": ColwiseParallel(), + f"{i}.up_proj": ColwiseParallel(), + f"{i}.down_proj": RowwiseParallel(output_layouts=Partial()), + } + ) + parallelize_module( + module=moe_block.experts, + device_mesh=experts_mesh, + parallelize_plan=expert_tp_plan, + ) + else: # EP or ETP enabled + experts_mesh, experts_plan = None, None + if tp_mesh is None: + experts_mesh = ep_mesh + # input / output sharding on the batch / tokens dim + experts_plan = ExpertParallel() + elif etp_enabled: + experts_mesh = ep_tp_mesh + experts_plan = ExpertTensorParallel(tp_mesh=tp_mesh, ep_mesh=ep_mesh) + else: + experts_mesh = ep_mesh + experts_plan = ExpertParallel() - parallelize_module( - module=transformer_block.moe.experts, - device_mesh=experts_mesh, - parallelize_plan=experts_plan, - ) + parallelize_module( + module=moe_block.experts, + device_mesh=experts_mesh, + parallelize_plan=experts_plan, + ) def apply_compile(model: nn.Module): From b4a1b8882da64a81e49f0d1067619c3babb8eb62 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 15 Oct 2025 13:22:34 +0000 Subject: [PATCH 065/129] cleaning --- .../transformers_backend/__init__.py | 48 +- .../compare_distributed_run.py | 1103 ----------------- .../compare_distributed_run.sh | 8 - .../model/hf_deepseek_v3_patch.py | 113 -- .../model/hf_llama_like_patch.py | 4 - .../model/hf_llama_patch.py | 89 -- torchtitan/models/attention.py | 4 +- torchtitan/models/deepseek_v3/model/model.py | 6 +- torchtitan/models/moe.py | 7 +- 9 files changed, 28 insertions(+), 1354 deletions(-) delete mode 100644 torchtitan/experiments/transformers_backend/compare_distributed_run.py delete mode 100755 torchtitan/experiments/transformers_backend/compare_distributed_run.sh delete mode 100644 torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py delete mode 100644 torchtitan/experiments/transformers_backend/model/hf_llama_patch.py diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index c29b3a5aa1..fb21837a6b 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -75,35 +75,35 @@ class DeepSeekV3Args: flavors = { "debugmodel": HFTransformerModelArgs( titan_args=TitanModelArgs( - vocab_size=51200, dim=256, - n_layers=1, + n_layers=6, n_heads=16, n_kv_heads=16, ), pad_token_id=None, - # deepseek_v3_args=DeepSeekV3Args( - # partial_rotary_factor=4.0, - # inter_dim=1024, - # moe_inter_dim=256, - # n_dense_layers=1, - # n_group=2, - # topk_group=1, - # kv_lora_rank=512, - # q_lora_rank=0, - # qk_nope_head_dim=128, - # qk_rope_head_dim=64, - # v_head_dim=128, - # mscale=0.70, - # moe_args=MoEArgs( - # num_experts=8, - # num_shared_experts=2, - # top_k=3, - # score_func="softmax", - # route_norm=True, - # score_before_experts=False, - # ), - # ) + #TODO(3outeille): use os.environ to switch between models + deepseek_v3_args=DeepSeekV3Args( + partial_rotary_factor=4.0, + inter_dim=1024, + moe_inter_dim=256, + n_dense_layers=1, + n_group=2, + topk_group=1, + kv_lora_rank=512, + q_lora_rank=0, + qk_nope_head_dim=128, + qk_rope_head_dim=64, + v_head_dim=128, + mscale=0.70, + moe_args=MoEArgs( + num_experts=8, + num_shared_experts=2, + top_k=3, + score_func="softmax", + route_norm=True, + 
score_before_experts=False, + ) + ) if os.environ.get("USE_MOE", "0") == "1" else None, ), "medium": HFTransformerModelArgs( titan_args=TitanModelArgs( diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.py b/torchtitan/experiments/transformers_backend/compare_distributed_run.py deleted file mode 100644 index b42e8b0138..0000000000 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.py +++ /dev/null @@ -1,1103 +0,0 @@ -""" -python compare_distributed_run.py --steps 5 --model-filter llama3 --flavor debugmodel --nd_parallel 2d --verbose -python compare_distributed_run.py --steps 5 --model-filter llama3 --flavor flavor --nd_parallel 2d --verbose - -Methodology: - - train on FSDP with TT (baseline) - - train on FSDP with HF (baseline) - - For all parallelism, train with nd-// with HF - - If one train fails: - - generated diff between HF FSDP (baseline) HF nd-// - - train the nd-// TT counterpart - - diff between TT nd-// and HF nd-// - - diff between TT FSDP (baseline) and HF nd-// - - diff between TT FSDP (baseline) and TF nd-// -results/ -|_ meta-llama - |_ Llama-3.2-1B - |_ 2D - |_ debugmodel - |_ baseline_hf_fsdp_4gpu.log - |_ baseline_tt_fsdp_4gpu.log - |_ baseline_fsdp_debugmodel_4gpu_huggingface.toml - |_ baseline_fsdp_debugmodel_4gpu_torchtitan.toml - |_ fsdp1_cp1_tp2_pp2_debugmodel_4gpu/ - |_ fsdp1_cp1_tp2_pp2_debugmodel_4gpu_huggingface.toml - |_ fsdp1_cp1_tp2_pp2_debugmodel_4gpu_torchtitan.toml - |_ fsdp1_cp1_tp2_pp2_debugmodel_4gpu_huggingface.log - |_ diff_hf_baseline_vs_hf_nd_parallelism.log - |_ diff_tt_nd_parallelism_vs_hf_nd_parallelism.log - |_ diff_tt_baseline_vs_hf_nd_parallelism.log - |_ full - |_ baseline_hf_fsdp_4gpu.log - |_ baseline_tt_fsdp_4gpu.log - |_ baseline_fsdp_full_4gpu_huggingface.toml - |_ baseline_fsdp_full_4gpu_torchtitan.toml - |_ fsdp1_cp1_tp2_pp2_full_4gpu/ - |_ fsdp1_cp1_tp2_pp2_full_4gpu_huggingface.toml - |_ fsdp1_cp1_tp2_pp2_full_4gpu_torchtitan.toml - |_ fsdp1_cp1_tp2_pp2_full_4gpu_huggingface.log - |_ diff_hf_baseline_vs_hf_nd_parallelism.log - |_ diff_tt_nd_parallelism_vs_hf_nd_parallelism.log - |_ diff_tt_baseline_vs_hf_nd_parallelism.log - -""" -import argparse -import os -import re -import shutil -import subprocess -import sys -from pathlib import Path -from typing import List, Optional -from dataclasses import dataclass, field -from enum import Enum -import torch -from rich.console import Console -from rich.panel import Panel -from rich.progress import ( - BarColumn, - Progress, - SpinnerColumn, - TextColumn, - TimeElapsedColumn, -) -from rich.table import Table - - -console = Console() - - -class LogLevel(Enum): - COMMAND = "COMMAND" - INFO = "INFO" - SUCCESS = "SUCCESS" - WARNING = "WARNING" - ERROR = "ERROR" - TEST_PASS = "TEST_PASS" - TEST_FAIL = "TEST_FAIL" - - -def log_message(level: LogLevel, message: str, indent: int = 0, dim: bool = False) -> None: - """Log a message with appropriate color coding.""" - style_map = { - LogLevel.COMMAND: "dim", - LogLevel.INFO: "blue", - LogLevel.SUCCESS: "green", - LogLevel.WARNING: "yellow", - LogLevel.ERROR: "bold red", - LogLevel.TEST_PASS: "green", - LogLevel.TEST_FAIL: "bold red", - } - - prefix_map = { - LogLevel.COMMAND: "[COMMAND]", - LogLevel.INFO: "[INFO]", - LogLevel.SUCCESS: "[SUCCESS]", - LogLevel.WARNING: "[WARNING]", - LogLevel.ERROR: "[ERROR]", - LogLevel.TEST_PASS: "✅ TEST PASS", - LogLevel.TEST_FAIL: "❌ TEST FAIL", - } - - style = style_map[level] - prefix = prefix_map[level] - if indent > 0: - indent_str = " " * (indent - 1) + "└─ " 
- else: - indent_str = "" - - output = "" - if level == LogLevel.COMMAND: - output = f"{indent_str}[{style}]{prefix} {message}[/]" - else: - output = f"{indent_str}[{style}]{prefix}[/] {message}" - - if dim: - console.print(f"[dim]{output}[/dim]") - else: - console.print(output) - - -@dataclass -class ParallelismConfig: - """Configuration for a parallelism setup.""" - name: str - dp_replicate: int - dp_shard: int - tp: int - pp: int - pp_schedule: str - cp: int - ep: int - eptp: int - -@dataclass -class TrainingMetrics: - """Training metrics extracted from logs.""" - steps: List[int] = field(default_factory=list) - loss: List[float] = field(default_factory=list) - grad_norm: List[float] = field(default_factory=list) - memory: List[float] = field(default_factory=list) - tps: List[int] = field(default_factory=list) - tflops: List[float] = field(default_factory=list) - mfu: List[float] = field(default_factory=list) - -class CompareDistributedRun: - """Main class for running distributed parallelism comparison tests.""" - - # Default values - DEFAULT_STEPS = 10 - DEFAULT_SEED = 42 - DEFAULT_FLAVOR = "debugmodel" - # value chosen based on diff of llama3 1GPU - DEFAULT_LOSS_ATOL = 0.02 - DEFAULT_LOSS_RTOL = 1e-5 - DEFAULT_GRAD_NORM_ATOL = 0.02 - DEFAULT_GRAD_NORM_RTOL = 1e-5 - - MODEL_LISTS = { - "torchtitan": ["llama3", "deepseek_v3"], - "huggingface": ["meta-llama/Llama-3.2-1B", "deepseek-ai/DeepSeek-V3"] - } - - MODEL_FLAVORS = { - "llama3": ["debugmodel", "medium", "full"], - "deepseek_v3": ["debugmodel"], - "meta-llama/Llama-3.2-1B": ["debugmodel", "medium", "full"], - "deepseek-ai/DeepSeek-V3": ["debugmodel"], - } - - #TODO(3outeille): handle slurm later for 4D/5D. Might need to rethink the whole script for that - # Available ND parallelisms <-> number of GPUs - ND_PARALLEL_TO_NB_GPUS = { - "0d": 1, - "1d": 2, - "2d": 4, - "3d": 8, - "4d": 16, - "5d": 32, - } - - def __init__(self): - self.script_dir = Path(__file__).parent.absolute() - self.torchtitan_root = self.script_dir.parent.parent - self.base_results_dir = self.script_dir / "results" - - # Configuration parameters - self.nd_parallel_to_nb_gpus = self.ND_PARALLEL_TO_NB_GPUS - self.steps = self.DEFAULT_STEPS - self.seed = self.DEFAULT_SEED - self.model_filter = "" - self.flavor = self.DEFAULT_FLAVOR - self.verbose = False - self.use_slurm = False - self.slurm_options = [] - self.loss_atol = self.DEFAULT_LOSS_ATOL - self.loss_rtol = self.DEFAULT_LOSS_RTOL - self.grad_norm_atol = self.DEFAULT_GRAD_NORM_ATOL - self.grad_norm_rtol = self.DEFAULT_GRAD_NORM_RTOL - self.parallelism_configs: List[ParallelismConfig] = [] - self.results_dir: Optional[Path] = None - self.test_filter = "" - - def generate_parallelism_configs(self, hf_model_name: str) -> None: - """Generate parallelism configurations based on the number of GPUs.""" - from transformers import AutoConfig - - try: - model_config = AutoConfig.from_pretrained(hf_model_name) - is_moe = getattr(model_config, "num_local_experts", 0) > 1 - except Exception: - # Fallback for models not on Hub or other errors - is_moe = False - log_message(LogLevel.WARNING, f"Could not determine if {hf_model_name} is a MoE model from HuggingFace Hub. 
EP configurations will not be generated.") - - ngpu = self.nd_parallel_to_nb_gpus[self.nd_parallel] - configs = [] - - def _get_factors(n: int) -> List[int]: - factors = set() - for i in range(1, int(n**0.5) + 1): - if n % i == 0: - factors.add(i) - factors.add(n // i) - return sorted(list(factors)) - - # Baseline FSDP - configs.append(ParallelismConfig(name="fsdp", dp_replicate=1, dp_shard=ngpu, tp=1, pp=1, pp_schedule="1F1B", cp=1, ep=1, eptp=1)) - - #NOTE(3outeille): No need to handle DDP (dp_replicate) as DDP is not supported > 1D parallelism" - #(cf https://github.com/pytorch/torchtitan/blob/main/torchtitan/models/llama3/infra/parallelize.py#L139) - possible_fsdp = _get_factors(ngpu) # dp_shard - possible_cp = _get_factors(ngpu) - possible_tp = _get_factors(ngpu) - possible_pp = _get_factors(ngpu) - - #TODO(3outeille): handle HSDP later - - for dp_shard in possible_fsdp: - for cp in possible_cp: - for tp in possible_tp: - for pp in possible_pp: - - if dp_shard * cp * tp * pp != ngpu: - continue - - num_parallelisms_used = sum(parallel_degree > 1 for parallel_degree in [dp_shard, cp, tp, pp]) - ndims_required = int(self.nd_parallel[0]) - #NOTE(3outeille): if 2D//, we need at least 2 parallelisms to be active (> 1). For 3D //, least 3 parallelisms > 1 etc. - if ndims_required > 1 and num_parallelisms_used < ndims_required: - continue - - configs.append( - ParallelismConfig( - name=f"fsdp{dp_shard}_cp{cp}_tp{tp}_pp{pp}", - dp_replicate=1, - dp_shard=dp_shard, - tp=tp, - pp=pp, - pp_schedule="1F1B", - cp=cp, - ep=1, - eptp=1 - ) - ) - - if is_moe: - # NOTE(3outeille): EP borrowing degree from dp_shard - configs.append( - ParallelismConfig( - name=f"fsdp{dp_shard}_cp{cp}_tp{tp}_pp{pp}_ep{dp_shard}", - dp_replicate=1, - dp_shard=dp_shard, - tp=tp, - pp=pp, - pp_schedule="1F1B", - cp=cp, - ep=dp_shard, - eptp=1 - ) - ) - - - # Remove duplicates and assign to instance - unique_configs = [] - seen_configs = set() - for config in configs: - # Create a tuple of the config values to check for duplicates - config_tuple = (config.dp_replicate, config.dp_shard, config.tp, config.pp, config.cp, config.ep, config.eptp) - if config_tuple not in seen_configs: - unique_configs.append(config) - seen_configs.add(config_tuple) - - self.parallelism_configs = unique_configs - - log_message( - LogLevel.INFO, - f"Generated {len(self.parallelism_configs)} parallelism configurations for {ngpu} GPUs.", - ) - configs_to_display = self.parallelism_configs - table_title = "[bold]Generated Parallelism Configurations[/bold]" - - if self.test_filter: - # Keep fsdp baseline and anything that matches the filter - configs_to_display = [c for c in self.parallelism_configs if c.name == "fsdp" or self.test_filter in c.name] - table_title = f"[bold]Filtered Parallelism Configurations (filter: [cyan]'{self.test_filter}'[/cyan])[/bold]" - - table = Table( - title=table_title, - show_header=True, - header_style="bold magenta", - ) - table.add_column("Name", style="cyan", no_wrap=True) - table.add_column("dp_replicate", justify="right") - table.add_column("dp_shard", justify="right") - table.add_column("tp", justify="right") - table.add_column("pp", justify="right") - table.add_column("cp", justify="right") - table.add_column("ep", justify="right") - table.add_column("eptp", justify="right") - - for config in configs_to_display: - table.add_row( - config.name, - str(config.dp_replicate), - str(config.dp_shard), - str(config.tp), - str(config.pp), - str(config.cp), - str(config.ep), - str(config.eptp), - ) - console.print(table) - 
console.print() - - def generate_config(self, config_dir: Path, config: ParallelismConfig, model_name: str, backend: str, filename: Optional[str] = None, indent: int = 0, dim: bool = False) -> Path: - """Generate configuration file for a parallelism setup.""" - import toml - - if filename: - config_file = config_dir / filename - else: - config_file = config_dir / f"{config.name}_{self.flavor}_{self.nd_parallel_to_nb_gpus[self.nd_parallel]}gpu_{backend}.toml" - - base_config = self.script_dir / "configs" / "test_template.toml" - shutil.copy2(base_config, config_file) - - # Load the TOML file as a dict - with open(config_file, 'r') as f: - config_data = toml.load(f) - - # Update [model] section - if "model" not in config_data: - config_data["model"] = {} - config_data["model"]["name"] = model_name - config_data["model"]["flavor"] = self.flavor - - # Validate flavor for model type - if model_name in self.MODEL_FLAVORS: - if self.flavor not in self.MODEL_FLAVORS[model_name]: - log_message(LogLevel.WARNING, - f"Flavor '{self.flavor}' not available for {model_name}. " - f"Available: {self.MODEL_FLAVORS[model_name]}", indent=indent, dim=dim) - - # Update [training] section - if "training" not in config_data: - config_data["training"] = {} - config_data["training"]["steps"] = self.steps - config_data["training"]["seed"] = self.seed - - # Update [parallelism] section - if "parallelism" not in config_data: - config_data["parallelism"] = {} - config_data["parallelism"]["data_parallel_replicate_degree"] = config.dp_replicate - config_data["parallelism"]["data_parallel_shard_degree"] = config.dp_shard - config_data["parallelism"]["tensor_parallel_degree"] = config.tp - config_data["parallelism"]["pipeline_parallel_degree"] = config.pp - config_data["parallelism"]["pipeline_parallel_schedule"] = config.pp_schedule - config_data["parallelism"]["context_parallel_degree"] = config.cp - config_data["parallelism"]["expert_parallel_degree"] = config.ep - config_data["parallelism"]["expert_tensor_parallel_degree"] = config.eptp - - # Write back the modified TOML - with open(config_file, 'w') as f: - toml.dump(config_data, f) - - if self.verbose: - log_message(LogLevel.INFO, f"Created config file: {config_file} for config '{config.name}' (model: {model_name})", indent=indent, dim=dim) - return config_file - - def extract_metrics(self, log_file: Path, indent: int = 0, dim: bool = False) -> TrainingMetrics: - """Extract metrics from log file.""" - metrics = TrainingMetrics() - - try: - with open(log_file, 'r') as f: - content = f.read() - - # Regex to capture all metrics from a log line, ignoring ANSI color codes - pattern = re.compile( - r"step:\s*(\d+)\s*" - r".*?loss:\s*([0-9]+\.?[0-9]*)\s*" - r".*?grad_norm:\s*([0-9]+\.?[0-9]*)\s*" - ) - - for match in pattern.finditer(content): - metrics.steps.append(int(match.group(1))) - metrics.loss.append(float(match.group(2))) - metrics.grad_norm.append(float(match.group(3))) - - except Exception as e: - log_message(LogLevel.WARNING, f"Could not extract metrics from {log_file}: {e}", indent=indent, dim=dim) - - if not metrics.loss or not metrics.grad_norm: - log_message(LogLevel.WARNING, f"Could not extract metrics from {log_file}", indent=indent, dim=dim) - - return metrics - - def compare_metrics(self, baseline_metrics: TrainingMetrics, test_metrics: TrainingMetrics, - config_name: str, indent: int = 0, dim: bool = False) -> bool: - """Compare metrics between baseline and test configuration.""" - if not baseline_metrics.loss or not test_metrics.loss: - 
log_message(LogLevel.TEST_FAIL, f"{config_name} - Unable to extract metrics", indent=indent, dim=dim) - return False - - # Convert to tensors - baseline_loss = torch.tensor(baseline_metrics.loss) - test_loss = torch.tensor(test_metrics.loss) - baseline_grad_norm = torch.tensor(baseline_metrics.grad_norm) - test_grad_norm = torch.tensor(test_metrics.grad_norm) - - # Check if tensors are close - loss_pass = torch.allclose(baseline_loss, test_loss, atol=self.loss_atol, rtol=self.loss_rtol) - grad_pass = torch.allclose(baseline_grad_norm, test_grad_norm, atol=self.grad_norm_atol, rtol=self.grad_norm_rtol) - - # Calculate max absolute differences for logging - loss_max_diff = torch.max(torch.abs(baseline_loss - test_loss)).item() if baseline_loss.numel() > 0 and test_loss.numel() > 0 else 0.0 - grad_norm_diff = torch.max(torch.abs(baseline_grad_norm - test_grad_norm)).item() if baseline_grad_norm.numel() > 0 and test_grad_norm.numel() > 0 else 0.0 - - # Calculate min absolute differences for logging - loss_min_diff = torch.min(torch.abs(baseline_loss - test_loss)).item() if baseline_loss.numel() > 0 and test_loss.numel() > 0 else 0.0 - grad_norm_min_diff = torch.min(torch.abs(baseline_grad_norm - test_grad_norm)).item() if baseline_grad_norm.numel() > 0 and test_grad_norm.numel() > 0 else 0.0 - - if loss_pass and grad_pass: - log_message(LogLevel.TEST_PASS, - f"{config_name} - Max loss diff: {loss_max_diff:.2e}, " - f"Min loss diff: {loss_min_diff:.2e}, " - f"Max grad norm diff: {grad_norm_diff:.2e}, " - f"Min grad norm diff: {grad_norm_min_diff:.2e}", indent=indent, dim=dim) - return True - else: - log_message(LogLevel.TEST_FAIL, - f"{config_name} - Max loss diff: {loss_max_diff:.2e}, " - f"Min loss diff: {loss_min_diff:.2e}, " - f"Max grad norm diff: {grad_norm_diff:.2e}, " - f"Min grad norm diff: {grad_norm_min_diff:.2e}", indent=indent, dim=dim) - return False - - def generate_diff(self, baseline_log: Path, test_log: Path, diff_file: Path, indent: int = 0, dim: bool = False) -> None: - """Generate diff between baseline and test logs.""" - - def _filter_log(log_file: Path) -> Path: - """Filter log file to normalize volatile information.""" - filtered_file = log_file.with_suffix(log_file.suffix + '.filtered') - - with open(log_file, 'r') as infile, open(filtered_file, 'w') as outfile: - for line in infile: - # Apply filtering patterns - line = re.sub(r'([0-9]{4}-[0-9]{2}-[0-9]{2} )?[0-9]{2}:[0-9]{2}:[0-9]{2}(,[0-9]+)?', - 'TIMESTAMP', line) - line = re.sub(r'torchrun.*--master_port[= ]([0-9]+)', - 'torchrun ... 
--master_port=XXXX', line) - line = re.sub(r'PID [0-9]+', 'PID XXXX', line) - line = re.sub(r'localhost:[0-9]+', 'localhost:XXXX', line) - outfile.write(line) - - return filtered_file - try: - # Filter logs to remove timestamps and volatile information - baseline_filtered = _filter_log(baseline_log) - test_filtered = _filter_log(test_log) - - # Generate colored diff using git diff - cmd = ["git", "diff", "--no-index", "--color=always", "--word-diff=color", - str(baseline_filtered), str(test_filtered)] - - with open(diff_file, 'w') as f: - subprocess.run(cmd, stdout=f, stderr=subprocess.DEVNULL) - - # Clean up filtered files - baseline_filtered.unlink() - test_filtered.unlink() - - except Exception as e: - log_message(LogLevel.WARNING, f"Could not generate diff: {e}", indent=indent, dim=dim) - - def run_training_local(self, config_file: Path, log_file: Path, config_name: str, model_name: str, indent: int = 0, dim: bool = False) -> Optional[subprocess.CalledProcessError]: - """Run training with given configuration.""" - log_message(LogLevel.INFO, f"Running training: {config_name} with model {model_name}", indent=indent, dim=dim) - cmd = [ - "torchrun", - f"--nproc_per_node={self.ngpu}", - "--rdzv_backend", "c10d", - "--rdzv_endpoint=localhost:0", - "--local-ranks-filter", str(self.ngpu - 1), - "--role", "rank", - "--tee", "3", - "-m", "torchtitan.train", - "--training.seed", str(self.seed), - "--training.deterministic", - "--job.config_file", str(config_file) - ] - env = os.environ.copy() - env["SEED"] = str(self.seed) - env["LOG_RANK"] = str(self.ngpu - 1) - - log_message(LogLevel.COMMAND, f"{' '.join(cmd)}", indent=indent, dim=dim) - - try: - # Capture output to include it in the exception, while still writing to log file - result = subprocess.run( - cmd, - cwd=self.torchtitan_root, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, # decodes stdout/stderr as text - env=env, - check=True - ) - with open(log_file, 'w') as f: - f.write(result.stdout) - - if self.verbose: - log_message(LogLevel.SUCCESS, f"Training completed: {config_name}", indent=indent, dim=dim) - return None - - except subprocess.CalledProcessError as e: - log_message(LogLevel.ERROR, f"Training failed: {config_name}", indent=indent, dim=dim) - - # Write the failed output to the log file - with open(log_file, 'w') as f: - if e.stdout: - f.write(e.stdout) - - # Print the tail of the error log to the console for quick debugging - if e.stdout: - console.print("[bold red]--- Error Log Tail ---[/bold red]") - error_lines = e.stdout.strip().split('\n') - for line in error_lines[-15:]: - console.print(f"[red]{line}[/red]") - console.print("[bold red]--- End Error Log Tail ---[/bold red]") - - e.add_note(f"\n--- Full output from failed process ---\n{e.stdout or ''}") - return e - - def run_training_slurm(self): - pass - - def _compare_one_parallelism_config( - self, - config: "ParallelismConfig", - hf_model_name: str, - tt_model_name: str, - hf_baseline_metrics: "TrainingMetrics", - tt_baseline_metrics: "TrainingMetrics", - baseline_log_hf: Path, - baseline_log_tt: Path, - indent: int = 0, - ) -> bool: - """Compares a single parallelism configuration against the baseline.""" - # New flow: launch all training, then all diff, then all extract/compare metrics - - # --- 1. 
Setup directories and config files --- - test_dir_name = f"{config.name}_{self.flavor}_{self.ngpu}gpu" - test_dir = self.results_dir / test_dir_name - test_dir.mkdir(exist_ok=True) - - config_filename_hf = f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.toml" - config_file_hf = self.generate_config( - config_dir=test_dir, - config=config, - model_name=hf_model_name, - backend="huggingface", - filename=config_filename_hf, - indent=indent, - ) - log_path_hf = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.log" - - config_filename_tt = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" - config_file_tt = self.generate_config( - config_dir=test_dir, - config=config, - model_name=tt_model_name, - backend="torchtitan", - filename=config_filename_tt, - indent=indent + 5, - dim=True, - ) - log_path_tt = test_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.log" - - # --- 2. Launch all training (HF and TT) --- - hf_run_error = self.run_training_local( - config_file=config_file_hf, - log_file=log_path_hf, - config_name=config.name, - model_name=hf_model_name, - indent=indent, - ) - tt_run_error = self.run_training_local( - config_file=config_file_tt, - log_file=log_path_tt, - config_name=config.name, - model_name=tt_model_name, - indent=indent + 5, - dim=True, - ) - - # If either training failed, log and skip further steps for this config - if hf_run_error: - log_message( - LogLevel.TEST_FAIL, - f"{config.name} (huggingface) - Training script failed.", - indent=indent + 5, - dim=True, - ) - return False - - if tt_run_error: - log_message( - LogLevel.TEST_FAIL, - f"{config.name} (torchtitan) - Training script failed.", - indent=indent + 5, - dim=True, - ) - return False - - # --- 3. Generate all diffs --- - list_of_diffs = { - "HF baseline vs HF nd-parallel": (baseline_log_hf, log_path_hf, test_dir / "diff_hf_baseline_vs_hf_nd_parallelism.log"), - "TT nd-parallel vs HF nd-parallel": (log_path_tt, log_path_hf, test_dir / "diff_tt_nd_parallelism_vs_hf_nd_parallelism.log"), - "TT baseline vs HF nd-parallel": (baseline_log_tt, log_path_hf, test_dir / "diff_tt_baseline_vs_hf_nd_parallelism.log"), - "TT baseline vs TT nd-parallel": (baseline_log_tt, log_path_tt, test_dir / "diff_tt_baseline_vs_tt_nd_parallelism.log"), - } - for src, dst, output in list_of_diffs.values(): - self.generate_diff(src, dst, output, indent=indent + 5, dim=True) - - # --- 4. Extract all metrics --- - hf_metrics = self.extract_metrics(log_path_hf, indent=indent) - tt_metrics = self.extract_metrics(log_path_tt, indent=indent + 5, dim=True) - - # --- 5. 
Compare metrics and determine pass/fail --- - test_passed = True - - for diff_name, (src, dst, output) in list_of_diffs.items(): - if "TT nd-parallel vs HF nd-parallel" == diff_name: - metrics_passed = self.compare_metrics( - tt_metrics, - hf_metrics, - diff_name, - indent=indent + 5, - dim=True, - ) - elif "TT baseline vs TT nd-parallel" == diff_name: - metrics_passed = self.compare_metrics( - tt_baseline_metrics, - tt_metrics, - diff_name, - indent=indent + 5, - dim=True, - ) - elif "TT baseline vs HF nd-parallel" == diff_name: - metrics_passed = self.compare_metrics( - tt_baseline_metrics, - hf_metrics, - diff_name, - indent=indent + 5, - dim=True, - ) - else: # HF baseline vs HF nd-parallel == diff_name - metrics_passed = self.compare_metrics( - hf_baseline_metrics, - hf_metrics, - diff_name, - indent=indent + 5, - dim=True, - ) - - if not metrics_passed: - test_passed = False - - log_message( - LogLevel.INFO, - f"Diff between {diff_name} saved to: {output}", - indent=indent + 10, - dim=True, - ) - - return test_passed - - def run_local(self, args: argparse.Namespace) -> int: - """Main execution function. Runs all test suites for all models.""" - self.nd_parallel = args.nd_parallel - self.ngpu = self.nd_parallel_to_nb_gpus[self.nd_parallel] - self.steps = args.steps - self.model_filter = args.model_filter - self.test_filter = args.test_filter - self.flavor = args.flavor - self.verbose = args.verbose - self.loss_atol = args.loss_atol - self.loss_rtol = args.loss_rtol - self.grad_norm_atol = args.grad_norm_atol - self.grad_norm_rtol = args.grad_norm_rtol - - console.print( - Panel( - ( - f"[bold]GPUs:[/bold] {self.ngpu}\n" - f"[bold]Steps:[/bold] {self.steps}\n" - f"[bold]Seed:[/bold] {self.seed}\n" - f"[bold]Model filter:[/bold] {self.model_filter or 'all'}\n" - f"[bold]Test filter:[/bold] {self.test_filter or 'all'}\n" - f"[bold]Model flavor:[/bold] {self.flavor}" - ), - title="[bold cyan]Distributed Parallelism Comparison[/bold cyan]", - expand=False, - border_style="blue", - padding=(1, 2), - ) - ) - console.print() - - self.base_results_dir.mkdir(exist_ok=True) - - # TODO(3outeille): make it more generic later - if self.model_filter == "llama3": - hf_model_name = "meta-llama/Llama-3.2-1B" - tt_model_name = "llama3" - elif self.model_filter == "deepseek_v3": - hf_model_name = "deepseek-ai/DeepSeek-V3" - tt_model_name = "deepseek_v3" - else: - raise ValueError(f"Model filter {self.model_filter} not supported") - - self.generate_parallelism_configs(hf_model_name) - - model_owner, model_repo = hf_model_name.split("/", 1) - nd_parallel_upper = self.nd_parallel.upper() - self.results_dir = self.base_results_dir / model_owner / model_repo / nd_parallel_upper / self.flavor - self.results_dir.mkdir(parents=True, exist_ok=True) - - if self.verbose: - log_message(LogLevel.INFO, f"Results directory: {self.results_dir}") - - console.print( - Panel( - "[bold cyan]Comparing baseline (FSDP) for huggingface & torchtitan[/bold cyan]", - expand=False, - border_style="blue", - padding=(0, 2), - ) - ) - - baseline_config = next((c for c in self.parallelism_configs if c.name == "fsdp"), None) - # --- 1. 
Generate configs --- - baseline_config_filename_hf = f"baseline_{baseline_config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.toml" - baseline_config_file_hf = self.generate_config( - config_dir=self.results_dir, - config=baseline_config, - model_name=hf_model_name, - backend="huggingface", - filename=baseline_config_filename_hf, - indent=0 - ) - baseline_log_hf = self.results_dir / f"baseline_hf_{baseline_config.name}_{self.ngpu}gpu.log" - - baseline_config_filename_tt = f"baseline_{baseline_config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" - baseline_config_file_tt = self.generate_config( - config_dir=self.results_dir, - config=baseline_config, - model_name=tt_model_name, - backend="torchtitan", - filename=baseline_config_filename_tt, - indent=0 - ) - baseline_log_tt = self.results_dir / f"baseline_tt_{baseline_config.name}_{self.ngpu}gpu.log" - - # --- 2. Launch all training --- - hf_baseline_run_error = self.run_training_local( - config_file=baseline_config_file_hf, - log_file=baseline_log_hf, - config_name=baseline_config.name, - model_name=hf_model_name, - indent=0 - ) - if hf_baseline_run_error: - raise ValueError(f"Huggingface baseline (FSDP) training failed for {hf_model_name}") from hf_baseline_run_error - - tt_baseline_run_error = self.run_training_local( - config_file=baseline_config_file_tt, - log_file=baseline_log_tt, - config_name=baseline_config.name, - model_name=tt_model_name, - indent=0 - ) - if tt_baseline_run_error: - raise ValueError(f"TorchTitan baseline (FSDP) training failed for {tt_model_name}") from tt_baseline_run_error - - # --- 3. Generate diff --- - diff_file_tt_baseline_vs_hf_baseline = self.results_dir / "diff_tt_baseline_vs_hf_baseline.log" - self.generate_diff( - baseline_log_tt, - baseline_log_hf, - diff_file_tt_baseline_vs_hf_baseline, - indent=0 - ) - log_message( - LogLevel.INFO, - f"Diff between baseline TT and baseline HF saved to: {diff_file_tt_baseline_vs_hf_baseline}", - indent=5, - dim=True - ) - - # --- 4. Extract metrics --- - hf_baseline_metrics = self.extract_metrics(baseline_log_hf, indent=0) - if not hf_baseline_metrics.loss or not hf_baseline_metrics.grad_norm: - raise ValueError(f"Could not extract huggingface baseline metrics for {hf_model_name}") - - tt_baseline_metrics = self.extract_metrics(baseline_log_tt, indent=0) - if not tt_baseline_metrics.loss or not tt_baseline_metrics.grad_norm: - raise ValueError(f"Could not extract TorchTitan baseline metrics for {tt_model_name}") - - # --- 5. 
Compare metrics --- - if not self.compare_metrics( - tt_baseline_metrics, - hf_baseline_metrics, - "baseline (TT) vs baseline (HF)", - indent=5 - ): - raise ValueError(f"Baseline (TT) vs baseline (HF) metrics comparison failed for {tt_model_name}") - - console.print() - console.print( - Panel( - "[bold cyan]Comparing ND Parallelism Configurations[/bold cyan]", - expand=False, - border_style="blue", - padding=(0, 2), - ) - ) - passed_tests = 1 # +1 for the baseline (FSDP) - failed_tests = 0 - test_configs = [c for c in self.parallelism_configs if c.name != "fsdp"] - if self.test_filter: - filtered_configs = [c for c in test_configs if self.test_filter in c.name] - if not filtered_configs: - log_message(LogLevel.WARNING, f"Test filter '{self.test_filter}' did not match any test configurations.") - test_configs = filtered_configs - total_tests = len(test_configs) + 1 # +1 for the baseline (FSDP) - results = [] - - console.print() - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), - TimeElapsedColumn(), - console=console, - ) as progress: - task = progress.add_task( - "[cyan]Comparing configurations...", total=total_tests - ) - for i, config in enumerate(test_configs): - if i > 0: - console.rule(style="dim") - - progress.update( - task, description=f"[cyan]Testing [bold]{config.name}[/bold]" - ) - passed = self._compare_one_parallelism_config( - config, - hf_model_name, - tt_model_name, - hf_baseline_metrics, - tt_baseline_metrics, - baseline_log_hf, - baseline_log_tt, - indent=1, - ) - results.append((config.name, passed)) - if passed: - passed_tests += 1 - else: - failed_tests += 1 - progress.advance(task) - console.print() - - console.print( - Panel( - "[bold cyan]Final Summary[/bold cyan]", - expand=False, - border_style="blue", - padding=(0, 2), - ) - ) - - summary_table = Table(show_header=True, header_style="bold magenta") - summary_table.add_column("Configuration", style="cyan") - summary_table.add_column("Status", justify="center") - - for name, passed in results: - status = ( - "[bold green]✅ PASS[/bold green]" - if passed - else "[bold red]❌ FAIL[/bold red]" - ) - summary_table.add_row(name, status) - - console.print(summary_table) - console.print() - - overall_summary = Table(title="Overall Test Summary") - overall_summary.add_column("Metric", style="cyan") - overall_summary.add_column("Value", justify="right") - overall_summary.add_row("Total Configurations Tested", str(total_tests)) - overall_summary.add_row("[green]Passed[/green]", str(passed_tests)) - overall_summary.add_row("[red]Failed[/red]", str(failed_tests)) - console.print(overall_summary) - - if passed_tests == total_tests: - log_message(LogLevel.SUCCESS, "All model tests passed! 🎉") - return 0 - else: - log_message(LogLevel.TEST_FAIL, f"{failed_tests} configuration(s) had test failures") - log_message( - LogLevel.INFO, f"Check the diff files in {self.results_dir} for details" - ) - return 1 - - def run_slurm(self, args: argparse.Namespace) -> int: - """Main execution function. 
Runs all test suites for all models.""" - self.nd_parallel = args.nd_parallel - self.ngpu = self.nd_parallel_to_nb_gpus[self.nd_parallel] - self.steps = args.steps - self.model_filter = args.model_filter - self.test_filter = args.test_filter - self.flavor = args.flavor - self.verbose = args.verbose - self.loss_atol = args.loss_atol - self.loss_rtol = args.loss_rtol - self.grad_norm_atol = args.grad_norm_atol - self.grad_norm_rtol = args.grad_norm_rtol - - console.print( - Panel( - ( - f"[bold]GPUs:[/bold] {self.ngpu}\n" - f"[bold]Steps:[/bold] {self.steps}\n" - f"[bold]Seed:[/bold] {self.seed}\n" - f"[bold]Model filter:[/bold] {self.model_filter or 'all'}\n" - f"[bold]Test filter:[/bold] {self.test_filter or 'all'}\n" - f"[bold]Model flavor:[/bold] {self.flavor}" - ), - title="[bold cyan]Distributed Parallelism Comparison[/bold cyan]", - expand=False, - border_style="blue", - padding=(1, 2), - ) - ) - console.print() - - self.base_results_dir.mkdir(exist_ok=True) - - # TODO(3outeille): make it more generic later - if self.model_filter == "llama3": - hf_model_name = "meta-llama/Llama-3.2-1B" - tt_model_name = "llama3" - elif self.model_filter == "deepseek_v3": - hf_model_name = "deepseek-ai/DeepSeek-V3" - tt_model_name = "deepseek_v3" - else: - raise ValueError(f"Model filter {self.model_filter} not supported") - - self.generate_parallelism_configs(hf_model_name) - - model_owner, model_repo = hf_model_name.split("/", 1) - nd_parallel_upper = self.nd_parallel.upper() - self.results_dir = self.base_results_dir / model_owner / model_repo / nd_parallel_upper / self.flavor - self.results_dir.mkdir(parents=True, exist_ok=True) - - if self.verbose: - log_message(LogLevel.INFO, f"Results directory: {self.results_dir}") - - console.print( - Panel( - "[bold cyan]Comparing baseline (FSDP) for huggingface & torchtitan[/bold cyan]", - expand=False, - border_style="blue", - padding=(0, 2), - ) - ) - - # --- 1. 
Generate configs --- - - L = [] - - for config in self.parallelism_configs: - - config_dir = self.results_dir if config.name == "fsdp" else self.results_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu" - config_dir.mkdir(exist_ok=True) - - config_filename_hf = f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.toml" - config_file_hf = self.generate_config( - config_dir=config_dir, - config=config, - model_name=hf_model_name, - backend="huggingface", - filename=config_filename_hf, - indent=0 - ) - config_filename_tt = f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.toml" - config_file_tt = self.generate_config( - config_dir=config_dir, - config=config, - model_name=tt_model_name, - backend="torchtitan", - filename=config_filename_tt, - indent=0 - ) - log_path_hf = config_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_huggingface.log" - log_path_tt = config_dir / f"{config.name}_{self.flavor}_{self.ngpu}gpu_torchtitan.log" - - L.append((config_file_hf, config_file_tt, log_path_hf, log_path_tt)) - - - # Launch slurm training - jobs = [] - from slurm_utils import Job, Status - for config_file_hf, config_file_tt, log_path_hf, log_path_tt in L: - job_hf = Job(config_file_hf, log_path_hf, qos="high") - job_tt = Job(config_file_tt, log_path_tt, qos="high") - - job_tt.set_status(Status.INIT) - job_hf.set_status(Status.INIT) - jobs.append(job_hf) - jobs.append(job_tt) - - scheduler = Scheduler() - - scheduler.create_slurm_script(jobs) - # submit in subprocess - scheduler.submit_jobs(jobs) # -> job.set_status(Status.PENDING) - - scheduler.wait_for_all_jobs_to_complete() # spawn tmux to monitor jobs - #NOTE(3outeille): run_slurm() should not be run if - - def run_tests_slurm(self, args: argparse.Namespace) -> int: - # TODO(3outeille): do diff + compare metrics - pass - -def main(): - """Entry point for the script.""" - parser = argparse.ArgumentParser( - description="Test different parallelism configurations against a baseline FSDP model.", - ) - parser.add_argument("--use_slurm", action="store_true", - help="Use SLURM for job submission") - parser.add_argument("--run_tests_slurm", action="store_true", - help="Run tests with SLURM") - parser.add_argument("-m", "--model-filter", default="", - help="Filter models by name pattern (e.g., 'llama3')") - parser.add_argument("-t", "--test-filter", default="", - help="Filter parallelism configurations by name pattern (e.g., 'fsdp1_cp1_tp2_pp2')") - parser.add_argument("-nd", "--nd_parallel", type=str, default="2d", - help=f"Parallelism to use (default: {CompareDistributedRun.ND_PARALLEL_TO_NB_GPUS.keys()})") - parser.add_argument("-s", "--steps", type=int, default=CompareDistributedRun.DEFAULT_STEPS, - help=f"Training steps (default: {CompareDistributedRun.DEFAULT_STEPS})") - parser.add_argument("--flavor", default=CompareDistributedRun.DEFAULT_FLAVOR, - help=f"Model flavor/size (default: {CompareDistributedRun.DEFAULT_FLAVOR}). 
" - f"Available: llama3=[debugmodel, medium, full], deepseek_v3=[debugmodel]") - parser.add_argument("-v", "--verbose", action="store_true", - help="Verbose output") - parser.add_argument("--loss-atol", type=float, default=CompareDistributedRun.DEFAULT_LOSS_ATOL, - help=f"Absolute tolerance for loss comparison (default: {CompareDistributedRun.DEFAULT_LOSS_ATOL})") - parser.add_argument("--loss-rtol", type=float, default=CompareDistributedRun.DEFAULT_LOSS_RTOL, - help=f"Relative tolerance for loss comparison (default: {CompareDistributedRun.DEFAULT_LOSS_RTOL})") - parser.add_argument("--grad-norm-atol", type=float, default=CompareDistributedRun.DEFAULT_GRAD_NORM_ATOL, - help=f"Absolute tolerance for grad norm comparison (default: {CompareDistributedRun.DEFAULT_GRAD_NORM_ATOL})") - parser.add_argument("--grad-norm-rtol", type=float, default=CompareDistributedRun.DEFAULT_GRAD_NORM_RTOL, - help=f"Relative tolerance for grad norm comparison (default: {CompareDistributedRun.DEFAULT_GRAD_NORM_RTOL})") - - args = parser.parse_args() - - runner = CompareDistributedRun() - if args.use_slurm: - return runner.run_slurm(args) - elif args.run_tests_slurm: - return runner.run_tests_slurm(args) - else: - return runner.run_local(args) - -if __name__ == "__main__": - sys.exit(main()) diff --git a/torchtitan/experiments/transformers_backend/compare_distributed_run.sh b/torchtitan/experiments/transformers_backend/compare_distributed_run.sh deleted file mode 100755 index 2ca9bbee62..0000000000 --- a/torchtitan/experiments/transformers_backend/compare_distributed_run.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/bash - -if [[ "$1" == "--debug" ]]; then - shift - debugpy-run compare_distributed_run.py --steps 10 --model-filter llama3 --flavor debugmodel --nd_parallel 1d "$@" -else - python compare_distributed_run.py --steps 10 --model-filter llama3 --flavor debugmodel --nd_parallel 1d "$@" -fi diff --git a/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py b/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py deleted file mode 100644 index c2cb960ac5..0000000000 --- a/torchtitan/experiments/transformers_backend/model/hf_deepseek_v3_patch.py +++ /dev/null @@ -1,113 +0,0 @@ -import os -import torch.nn as nn -from torchtitan.utils.test_utils import seeded_init_decorator_for_test - -from transformers.models.deepseek_v3.configuration_deepseek_v3 import DeepseekV3Config -from transformers.models.deepseek_v3.modeling_deepseek_v3 import DeepseekV3Attention, DeepseekV3MLP, DeepseekV3MoE, DeepseekV3DecoderLayer -from transformers.modeling_utils import PreTrainedModel - - -_original_deepseek_v3_decoder_layer_init = DeepseekV3DecoderLayer.__init__ - -def _deepseek_v3_decoder_layer_init_patched(self, config: DeepseekV3Config, layer_idx: int): - _original_deepseek_v3_decoder_layer_init(self, config, layer_idx) - - self.layer_idx = layer_idx - self.mlp.layer_idx = layer_idx - - if hasattr(self.mlp, 'experts'): - for expert in self.mlp.experts: - expert.layer_idx = layer_idx - self.mlp.shared_experts.layer_idx = layer_idx - -def _initialize_weights_patched(self, module): - # NOTE(3outeille): monkey-patch PreTrainedModel to handle meta device initialization correctly - # The default _initialize_weights sets _is_hf_initialized = True even on a meta device, - # which prevents subsequent proper initialization. 
- if getattr(module, "_is_hf_initialized", False): - return - - for param in module.parameters(recurse=True): - if param.device.type == "meta": - return - - # If not on a meta device, call the original weight initialization - self._init_weights(module) - module._is_hf_initialized = True - -@seeded_init_decorator_for_test(seed=os.environ.get("SEED")) -def _init_weights_patched(self, module): - """ - Patched version of _init_weights to match TorchTitan's initialization for Llama. - `self` is a LlamaPreTrainedModel instance. - """ - config = self.config - - #TODO(3outeille): only out_proj/down_proj needs std=init_std. so we can refactor to loop over module and only init last layer with std=init_std - if isinstance(module, (DeepseekV3Attention, DeepseekV3MLP, DeepseekV3MoE)): - layer_idx = module.layer_idx - init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 - - if isinstance(module, DeepseekV3Attention): - if hasattr(module, 'q_proj'): - nn.init.trunc_normal_(module.q_proj.weight, mean=0.0, std=0.02) - else: - nn.init.trunc_normal_(module.q_a_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(module.q_b_proj.weight, mean=0.0, std=0.02) - - nn.init.trunc_normal_(module.kv_a_proj_with_mqa.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(module.kv_b_proj.weight, mean=0.0, std=0.02) - - nn.init.trunc_normal_(module.o_proj.weight, mean=0.0, std=init_std) - - elif isinstance(module, DeepseekV3MLP): - nn.init.trunc_normal_(module.gate_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(module.up_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(module.down_proj.weight, mean=0.0, std=init_std) - - elif isinstance(module, DeepseekV3MoE): - nn.init.trunc_normal_(module.gate.weight, mean=0.0, std=init_std) - for expert in module.experts: - nn.init.trunc_normal_(expert.gate_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(expert.up_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(expert.down_proj.weight, mean=0.0, std=init_std) - - nn.init.trunc_normal_(module.shared_experts.gate_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(module.shared_experts.up_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(module.shared_experts.down_proj.weight, mean=0.0, std=init_std) - - elif module is getattr(self, "lm_head", None): #TODO(3outeille): find a better way to detect lm_head - final_out_std = config.hidden_size**-0.5 - cutoff_factor = 3 - nn.init.trunc_normal_( - module.weight, - mean=0.0, - std=final_out_std, - a=-cutoff_factor * final_out_std, - b=cutoff_factor * final_out_std, - ) - if module.bias is not None: - module.bias.data.zero_() - - elif isinstance(module, nn.Embedding): - std = config.initializer_range - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - elif ( - isinstance(module, (nn.GroupNorm, nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)) - or "LayerNorm" in module.__class__.__name__ - or "RMSNorm" in module.__class__.__name__ - ): - # Norms can exist without weights (in which case they are None from torch primitives) - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(1.0) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.zero_() - - -def patch_hf_deepseek_v3(): - DeepseekV3DecoderLayer.__init__ = _deepseek_v3_decoder_layer_init_patched - PreTrainedModel._init_weights = _init_weights_patched - PreTrainedModel._initialize_weights = _initialize_weights_patched diff --git 
a/torchtitan/experiments/transformers_backend/model/hf_llama_like_patch.py b/torchtitan/experiments/transformers_backend/model/hf_llama_like_patch.py index 563c5e289b..f1ada96928 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_llama_like_patch.py +++ b/torchtitan/experiments/transformers_backend/model/hf_llama_like_patch.py @@ -20,10 +20,6 @@ def patch_hf_llama_like(decoder_layer_cls, attention_cls, mlp_cls=None): initialization for attention and MLP layers. - `DecoderLayer.__init__`: Adds `layer_idx` to attention and MLP modules within each decoder layer, which is required for the depth-dependent initialization. - - By applying this patch, we can ensure that a model loaded in the transformers - backend will have the exact same weights as a model trained with the native - TorchTitan backend, which is essential for seamless conversion and debugging. """ _original_decoder_layer_init = decoder_layer_cls.__init__ diff --git a/torchtitan/experiments/transformers_backend/model/hf_llama_patch.py b/torchtitan/experiments/transformers_backend/model/hf_llama_patch.py deleted file mode 100644 index c3557f6973..0000000000 --- a/torchtitan/experiments/transformers_backend/model/hf_llama_patch.py +++ /dev/null @@ -1,89 +0,0 @@ -import torch -import torch.nn as nn -from transformers.models.llama.configuration_llama import LlamaConfig -from transformers.models.llama.modeling_llama import LlamaAttention, LlamaMLP, LlamaDecoderLayer -from transformers.modeling_utils import PreTrainedModel - - -_original_llama_decoder_layer_init = LlamaDecoderLayer.__init__ - -def _llama_decoder_layer_init_patched(self, config: LlamaConfig, layer_idx: int): - _original_llama_decoder_layer_init(self, config, layer_idx) - self.layer_idx = layer_idx - self.mlp.layer_idx = layer_idx - -def _initialize_weights_patched(self, module): - # NOTE(3outeille): monkey-patch PreTrainedModel to handle meta device initialization correctly - # The default _initialize_weights sets _is_hf_initialized = True even on a meta device, - # which prevents subsequent proper initialization. - if getattr(module, "_is_hf_initialized", False): - return - - for param in module.parameters(recurse=True): - if param.device.type == "meta": - return - - # If not on a meta device, call the original weight initialization - self._init_weights(module) - module._is_hf_initialized = True - -def _init_weights_patched(self, module): - """ - Patched version of _init_weights to match TorchTitan's initialization for Llama. - `self` is a LlamaPreTrainedModel instance. 
- """ - config = self.config - - if isinstance(module, (LlamaAttention, LlamaMLP)): - layer_idx = module.layer_idx - - if config.depth_init: - init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 - else: - init_std = 0.02 / (2 * config.num_hidden_layers) ** 0.5 - - if isinstance(module, LlamaAttention): - nn.init.trunc_normal_(module.q_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(module.k_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(module.v_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(module.o_proj.weight, mean=0.0, std=init_std) - - elif isinstance(module, LlamaMLP): - nn.init.trunc_normal_(module.gate_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(module.up_proj.weight, mean=0.0, std=init_std) - nn.init.trunc_normal_(module.down_proj.weight, mean=0.0, std=init_std) - - elif module is getattr(self, "lm_head", None): #TODO(3outeille): find a better way to detect lm_head - final_out_std = config.hidden_size**-0.5 - cutoff_factor = 3 - nn.init.trunc_normal_( - module.weight, - mean=0.0, - std=final_out_std, - a=-cutoff_factor * final_out_std, - b=cutoff_factor * final_out_std, - ) - if module.bias is not None: - module.bias.data.zero_() - - elif isinstance(module, nn.Embedding): - std = config.initializer_range - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - elif ( - isinstance(module, (nn.GroupNorm, nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)) - or "LayerNorm" in module.__class__.__name__ - or "RMSNorm" in module.__class__.__name__ - ): - # Norms can exist without weights (in which case they are None from torch primitives) - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(1.0) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.zero_() - -def patch_hf_llama(): - LlamaDecoderLayer.__init__ = _llama_decoder_layer_init_patched - PreTrainedModel._init_weights = _init_weights_patched - PreTrainedModel._initialize_weights = _initialize_weights_patched \ No newline at end of file diff --git a/torchtitan/models/attention.py b/torchtitan/models/attention.py index 9d99622cc1..f66361a6d2 100644 --- a/torchtitan/models/attention.py +++ b/torchtitan/models/attention.py @@ -205,9 +205,9 @@ def _init_backend(cls) -> None: # Add CuDNN on B200 w/ highest priority cls.backends = [ - # SDPBackend.FLASH_ATTENTION, + SDPBackend.FLASH_ATTENTION, SDPBackend.EFFICIENT_ATTENTION, - # SDPBackend.MATH, + SDPBackend.MATH, ] if has_cuda_capability(10, 0): cls.backends.insert(0, SDPBackend.CUDNN_ATTENTION) diff --git a/torchtitan/models/deepseek_v3/model/model.py b/torchtitan/models/deepseek_v3/model/model.py index 260c7bf49a..e2c4bbeda9 100644 --- a/torchtitan/models/deepseek_v3/model/model.py +++ b/torchtitan/models/deepseek_v3/model/model.py @@ -5,7 +5,6 @@ # LICENSE file in the root directory of this source tree. 
import math -import os from typing import Tuple import torch @@ -14,7 +13,7 @@ from torchtitan.models.attention import build_attention from torchtitan.models.moe import FeedForward, MoE from torchtitan.protocols.train_spec import ModelProtocol -from torchtitan.utils.test_utils import seeded_init_decorator_for_test + from .args import DeepSeekV3ModelArgs @@ -241,7 +240,6 @@ def forward( output = output.view(bsz, seqlen, -1) # (bsz, seqlen, n_heads * v_head_dim) return self.wo(output) # (bsz, seqlen, dim) - @seeded_init_decorator_for_test(seed=os.environ.get("SEED")) def init_weights(self, init_std: float): linear_list = [ self.wkv_a, @@ -304,7 +302,6 @@ def forward(self, x: torch.Tensor, freqs_cis: torch.Tensor): x = x + self.feed_forward(self.ffn_norm(x)) return x - @seeded_init_decorator_for_test(seed=os.environ.get("SEED")) def init_weights(self, buffer_device: torch.device): for norm in (self.attention_norm, self.ffn_norm): norm.reset_parameters() @@ -342,7 +339,6 @@ def __init__(self, model_args: DeepSeekV3ModelArgs): self.model_args = model_args self.init_weights() - @seeded_init_decorator_for_test(seed=os.environ.get("SEED")) def init_weights(self, buffer_device: torch.device | None = None) -> None: buffer_device = buffer_device or self.freqs_cis.device with torch.device(buffer_device): diff --git a/torchtitan/models/moe.py b/torchtitan/models/moe.py index e2e3981625..8be14ecbf0 100644 --- a/torchtitan/models/moe.py +++ b/torchtitan/models/moe.py @@ -12,8 +12,7 @@ from torch import nn from torchtitan.distributed.expert_parallel import expert_parallel -import os -from torchtitan.utils.test_utils import seeded_init_decorator_for_test + @dataclass class MoEArgs: @@ -58,7 +57,6 @@ def __init__( def forward(self, x: torch.Tensor) -> torch.Tensor: return self.w2(F.silu(self.w1(x)) * self.w3(x)) - @seeded_init_decorator_for_test(seed=os.environ.get("SEED")) def init_weights(self, init_std: float = 0.02): nn.init.trunc_normal_(self.w1.weight, mean=0.0, std=0.02) for linear in (self.w2, self.w3): @@ -155,7 +153,6 @@ def forward( self.w1, self.w2, self.w3, x, num_tokens_per_expert ) - @seeded_init_decorator_for_test(seed=os.environ.get("SEED")) def init_weights(self, init_std: float): nn.init.trunc_normal_(self.w1, mean=0.0, std=0.02) nn.init.trunc_normal_(self.w2, mean=0.0, std=init_std) @@ -249,7 +246,6 @@ def forward( return top_scores, selected_experts_indices, num_tokens_per_expert - @seeded_init_decorator_for_test(seed=os.environ.get("SEED")) def init_weights(self, init_std: float): nn.init.trunc_normal_(self.gate.weight, mean=0.0, std=init_std) @@ -439,7 +435,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: out = out.reshape(bs, slen, dim) return out - @seeded_init_decorator_for_test(seed=os.environ.get("SEED")) def init_weights( self, init_std: float, From 5f1075b372bb0a0ed12e0c0928bc3ed7eda8bc5d Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 15 Oct 2025 17:29:33 +0000 Subject: [PATCH 066/129] add small example scripts --- .../configs/qwen3_fsdp2_tp2_pp2.toml | 89 +++++++++++++++++++ .../transformers_backend/run_train.sh | 33 +++++++ 2 files changed, 122 insertions(+) create mode 100644 torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml create mode 100755 torchtitan/experiments/transformers_backend/run_train.sh diff --git a/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml b/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml new file mode 100644 index 0000000000..5f40ec41b3 --- /dev/null +++ 
b/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml @@ -0,0 +1,89 @@ +# torchtitan Config.toml + +[job] +dump_folder = "./outputs" +description = "Qwen 3 debug training" +print_args = false +use_for_integration_test = false + +[profiling] +enable_profiling = true +save_traces_folder = "profile_trace" +profile_freq = 5 +enable_memory_snapshot = false +save_memory_snapshot_folder = "memory_snapshot" + +[metrics] +log_freq = 1 +disable_color_printing = false +enable_tensorboard = false +save_tb_folder = "tb" +enable_wandb = false + +[model] +name = "Qwen/Qwen3-4B-Instruct-2507" +flavor = "debugmodel" +# test folder with tokenizer.json, for debug purpose only +hf_assets_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" +# converters = ["float8"] + +[optimizer] +name = "AdamW" +lr = 8e-4 +eps = 1e-8 + +[lr_scheduler] +warmup_steps = 2 # lr scheduler warm up, normally 20% of the train steps +decay_ratio = 0.8 # lr scheduler decay ratio, 80% of the train steps +decay_type = "linear" +min_lr_factor = 0.0 + +[training] +global_batch_size = 4 +local_batch_size = 2 +seq_len = 2048 +max_norm = 1.0 # grad norm clipping +steps = 10 +dataset = "c4_test" # supported datasets: c4_test (2K), c4 (177M) +dataset_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test" +mixed_precision_param = "float32" # force float32 for comparison +mixed_precision_reduce = "float32" + +[parallelism] +data_parallel_replicate_degree = 1 +data_parallel_shard_degree = 2 +fsdp_reshard_after_forward = "default" # default / never / always +tensor_parallel_degree = 2 +enable_async_tensor_parallel = false +pipeline_parallel_degree = 2 +pipeline_parallel_schedule = "1F1B" +context_parallel_degree = 1 +expert_parallel_degree = 1 +expert_tensor_parallel_degree = 1 + +[checkpoint] +enable = false +folder = "checkpoint" +interval = 10 +last_save_model_only = false +export_dtype = "float32" +async_mode = "disabled" # ["disabled", "async", "async_with_pinned_mem"] + +[activation_checkpoint] +mode = "selective" # ["none", "selective", "full"] +selective_ac_option = '2' # 'int' = ac every positive int layer or 'op', ac based on ops policy + +[compile] +enable=false +components = ["model", "loss"] + +[float8] +enable_fsdp_float8_all_gather = false +precompute_float8_dynamic_scale_for_fsdp = false +filter_fqns = ["output"] + +[validation] +enable = false +dataset = "c4_validation" +freq = 5 +steps = 10 diff --git a/torchtitan/experiments/transformers_backend/run_train.sh b/torchtitan/experiments/transformers_backend/run_train.sh new file mode 100755 index 0000000000..3b82ad07f3 --- /dev/null +++ b/torchtitan/experiments/transformers_backend/run_train.sh @@ -0,0 +1,33 @@ +#!/usr/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -ex + +# use envs as local overwrites for convenience +# e.g. 
+# LOG_RANK=0,1 NGPU=4 ./run_train.sh +NGPU=${NGPU:-"8"} +export LOG_RANK=${LOG_RANK:-0} + +# Option to switch between debug and train +MODE=${MODE:-"train"} # Set MODE=debug or MODE=train + +CONFIG_FILE=${CONFIG_FILE:-"configs/qwen3_fsdp2_tp2_pp2.toml"} + +if [ "$MODE" = "debug" ]; then + PYTHON_CMD="debugpy-run -m torch.distributed.run --" +else + PYTHON_CMD="torchrun" +fi + +TORCHFT_LIGHTHOUSE=${TORCHFT_LIGHTHOUSE:-"http://localhost:29510"} + +PYTORCH_ALLOC_CONF="expandable_segments:True" \ +TORCHFT_LIGHTHOUSE=${TORCHFT_LIGHTHOUSE} \ +$PYTHON_CMD --nproc_per_node=${NGPU} --rdzv_backend c10d --rdzv_endpoint="localhost:0" \ +--local-ranks-filter ${LOG_RANK} --role rank --tee 3 \ +-m torchtitan.train --job.config_file ${CONFIG_FILE} "$@" \ No newline at end of file From c35ccfcc74a4ef72b0f1585b3353f92ab940dc52 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 20 Oct 2025 11:24:02 +0000 Subject: [PATCH 067/129] fix all the merge issues --- .../transformers_backend/__init__.py | 9 +---- .../configs/test_template.toml | 5 +-- .../infra/parallelize_hf_transformers.py | 5 +-- .../transformers_backend/infra/pipeline_hf.py | 38 ++++++++++++------- torchtitan/protocols/train_spec.py | 19 +++------- torchtitan/train.py | 1 + 6 files changed, 36 insertions(+), 41 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index fb21837a6b..34892cfcc2 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -105,19 +105,12 @@ class DeepSeekV3Args: ) ) if os.environ.get("USE_MOE", "0") == "1" else None, ), - "medium": HFTransformerModelArgs( - titan_args=TitanModelArgs( - dim=1024, - n_layers=12, - ), - ), "full": HFTransformerModelArgs( titan_args=TitanModelArgs(), ), } hf_train_spec = TrainSpec( - name="hf_auto_model", model_cls=HFTransformerModel, model_args=flavors, parallelize_fn=parallelize_hf_transformers, @@ -129,4 +122,4 @@ class DeepSeekV3Args: build_loss_fn=build_cross_entropy_loss, ) -register_train_spec(hf_train_spec) \ No newline at end of file +register_train_spec("hf_placeholder_name", hf_train_spec) \ No newline at end of file diff --git a/torchtitan/experiments/transformers_backend/configs/test_template.toml b/torchtitan/experiments/transformers_backend/configs/test_template.toml index fa0c763ed7..0964cf640e 100644 --- a/torchtitan/experiments/transformers_backend/configs/test_template.toml +++ b/torchtitan/experiments/transformers_backend/configs/test_template.toml @@ -3,8 +3,7 @@ [job] dump_folder = "./outputs" description = "Llama 3 debug training" -print_args = false -use_for_integration_test = false +print_config = true [profiling] enable_profiling = true @@ -77,7 +76,7 @@ selective_ac_option = '2' # 'int' = ac every positive int layer or 'op', ac bas enable=false components = ["model", "loss"] -[float8] +[quantize.linear.float8] enable_fsdp_float8_all_gather = false precompute_float8_dynamic_scale_for_fsdp = false filter_fqns = ["output"] diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py index 1bfe6ab779..32e122ab75 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -25,12 +25,11 @@ SequenceParallel, ) from torchtitan.config import JobConfig, TORCH_DTYPE_MAP -from 
torchtitan.distributed import ParallelDims +from torchtitan.distributed import ParallelDims, NoParallel from torchtitan.distributed.expert_parallel import ( ExpertParallel, ExpertTensorParallel, - NoParallel, ReordererSequenceParallel, TensorParallel, ) @@ -198,7 +197,7 @@ def parallelize_hf_transformers( if parallel_dims.tp_enabled: model.set_tp_mesh(world_mesh["tp"]) enable_float8_linear = "float8" in job_config.model.converters - float8_is_rowwise = job_config.float8.recipe_name in ( + float8_is_rowwise = job_config.quantize.linear.float8.recipe_name in ( "rowwise", "rowwise_with_gw_hp", ) diff --git a/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py b/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py index fb707b2509..cd599ac2a5 100644 --- a/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py +++ b/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py @@ -19,8 +19,7 @@ from torchtitan.distributed import ParallelDims from torchtitan.distributed.pipeline_parallel import ( build_pipeline_schedule, - pipeline_module_split, - stage_ids_this_rank, + pipeline_module_split ) from torch.distributed.device_mesh import DeviceMesh from torch.distributed.pipelining import PipelineStage @@ -145,6 +144,7 @@ def generate_llm_fqn_per_model_part( return module_names_per_stage + def pipeline_module_split( whole_model: nn.Module, pp_mesh: DeviceMesh, @@ -185,7 +185,7 @@ def pipeline_module_split( ] """ pp_rank = pp_mesh.get_local_rank() - pp_size = pp_mesh.size() + pp_degree = pp_mesh.size() def _build_stage_from_modules( stage_idx: int, module_names: list[str], num_stages: int @@ -194,7 +194,6 @@ def _build_stage_from_modules( # Create a set of modules to keep for faster lookup modules_to_keep = set(module_names) - print(f"Stage {stage_idx}: Modules to keep: {modules_to_keep}") for module_name, module_value in model.named_children(): # Handle layer-like structures (e.g., "layers.0", "layers.1") if isinstance(module_value, (nn.ModuleDict, nn.ModuleList)): @@ -250,7 +249,27 @@ def _build_stage_from_modules( "v" if schedule_class in (ScheduleZBVZeroBubble, ScheduleDualPipeV) else "loop" ) - for stage_idx in stage_ids_this_rank(pp_rank, pp_size, num_stages, style=style): + def _get_stage_indices() -> tuple[int]: + """ + Compute the stage ids for the stages that will run on this pp rank + for either a looped or V style schedule + """ + assert ( + num_stages % pp_degree == 0 + ), f"num_stages {num_stages} must be evenly divisible by pp_degree {pp_degree}" + stages_per_rank = num_stages // pp_degree + if style == "loop": + return tuple(pp_rank + s * pp_degree for s in range(stages_per_rank)) + elif style == "v": + assert ( + stages_per_rank == 2 + ), f"v schedules assume 2 stages per rank, got {stages_per_rank}" + stage_v_pairs = list( + zip(range(pp_degree), range(num_stages - 1, pp_degree - 1, -1)) + ) + return stage_v_pairs[pp_rank] + + for stage_idx in _get_stage_indices(): module_names = module_names_per_stage[stage_idx] stage, model_chunk = _build_stage_from_modules( stage_idx, @@ -266,7 +285,6 @@ def _build_stage_from_modules( return stages, models - def pipeline_hf_transformers( model: nn.Module, parallel_dims: ParallelDims, @@ -276,12 +294,6 @@ def pipeline_hf_transformers( parallelize_fn: ParallelizeFunction, loss_fn: LossFunction, ) -> tuple[_PipelineSchedule, list[nn.Module], bool, bool]: - if job_config.parallelism.pipeline_parallel_split_points != []: - raise ValueError( - "pipeline_parallel_split_points is deprecated. 
Please use module_fqns_per_model_part instead." - "You can generate module_fqns_per_model_part programmatically with generate_llm_fqn_per_model_part" - ) - pp_mesh = parallel_dims.world_mesh["pp"] # Determine the number of virtual stages based on schedule type @@ -385,4 +397,4 @@ def pipeline_hf_transformers( if stage.is_last: has_last_stage = True - return pp_schedule, model_parts, has_first_stage, has_last_stage + return pp_schedule, model_parts, has_first_stage, has_last_stage \ No newline at end of file diff --git a/torchtitan/protocols/train_spec.py b/torchtitan/protocols/train_spec.py index bc81c9928e..f04d6ac269 100644 --- a/torchtitan/protocols/train_spec.py +++ b/torchtitan/protocols/train_spec.py @@ -52,6 +52,7 @@ class TrainSpec: build_dataloader_fn: DataLoaderBuilder build_tokenizer_fn: TokenizerBuilder | None build_loss_fn: LossFunctionBuilder + name: str | None = None build_validator_fn: ValidatorBuilder | None = None build_metrics_processor_fn: MetricsProcessorBuilder | None = None state_dict_adapter: type[BaseStateDictAdapter] | None = None @@ -70,23 +71,13 @@ def register_train_spec(name: str, train_spec: TrainSpec) -> None: def get_train_spec(name: str) -> TrainSpec: -<<<<<<< HEAD - global _train_specs - - if "/" in name: # HF model (dynamic loading) - hf_spec = _train_specs["hf_auto_model"] - new_spec = dataclasses.replace(hf_spec, name=name) - _train_specs[name] = new_spec - elif name not in _train_specs: # Torchtitan - raise ValueError(f"Model {name} is not registered.") - - return _train_specs[name] -======= # user-defined TrainSpec has higher priority global _extra_train_specs - if name in _extra_train_specs: + if "/" in name: # HF model (dynamic loading) + hf_spec = _extra_train_specs["hf_placeholder_name"] + return dataclasses.replace(hf_spec, name=name) + elif name in _extra_train_specs: return _extra_train_specs[name] ->>>>>>> main from torchtitan.experiments import _supported_experiments from torchtitan.models import _supported_models diff --git a/torchtitan/train.py b/torchtitan/train.py index 3ef25e2886..6bb28d4a8d 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -31,6 +31,7 @@ maybe_enable_memory_snapshot, maybe_enable_profiling, ) +import torchtitan.experiments.transformers_backend # noqa: F401 class Trainer(torch.distributed.checkpoint.stateful.Stateful): # core configs From d5ce2e9132d2380c1867aaef0184980e6dccc787 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 20 Oct 2025 11:29:49 +0000 Subject: [PATCH 068/129] get rid of hf patches files and put it in hf_transformer_args --- .../model/hf_llama_like_patch.py | 161 ---------- .../model/hf_moe_like_patch.py | 135 -------- .../model/hf_transformers_args.py | 291 +++++++++++++++++- 3 files changed, 286 insertions(+), 301 deletions(-) delete mode 100644 torchtitan/experiments/transformers_backend/model/hf_llama_like_patch.py delete mode 100644 torchtitan/experiments/transformers_backend/model/hf_moe_like_patch.py diff --git a/torchtitan/experiments/transformers_backend/model/hf_llama_like_patch.py b/torchtitan/experiments/transformers_backend/model/hf_llama_like_patch.py deleted file mode 100644 index f1ada96928..0000000000 --- a/torchtitan/experiments/transformers_backend/model/hf_llama_like_patch.py +++ /dev/null @@ -1,161 +0,0 @@ -import torch -import torch.nn as nn -from transformers.configuration_utils import PretrainedConfig -from transformers.modeling_utils import PreTrainedModel -import math -from torch.nn import init - - -def patch_hf_llama_like(decoder_layer_cls, attention_cls, 
mlp_cls=None): - """ - This patch modifies a Hugging Face Llama-like model's weight initialization to match - the initialization scheme used in TorchTitan. This is crucial for ensuring - bit-for-bit reproducibility when converting checkpoints between the native - TorchTitan format and the Hugging Face format. - - The patch targets the following aspects of the model: - - `PreTrainedModel._initialize_weights`: Handles meta device initialization correctly. - - `PreTrainedModel._init_weights`: Implements TorchTitan's specific initialization - for attention, MLP, embedding, and layer norm layers. This includes depth-dependent - initialization for attention and MLP layers. - - `DecoderLayer.__init__`: Adds `layer_idx` to attention and MLP modules within - each decoder layer, which is required for the depth-dependent initialization. - """ - - _original_decoder_layer_init = decoder_layer_cls.__init__ - - def _decoder_layer_init_patched(self, config: PretrainedConfig, layer_idx: int): - _original_decoder_layer_init(self, config, layer_idx) - self.layer_idx = layer_idx - # Ensure both attention and mlp modules have layer_idx for depth-based init - if hasattr(self, "self_attn"): - self.self_attn.layer_idx = layer_idx - # some models might not have mlp in each layer - if hasattr(self, "mlp") and self.mlp is not None: - self.mlp.layer_idx = layer_idx - - def _initialize_weights_patched(self, module): - # NOTE(3outeille): monkey-patch PreTrainedModel to handle meta device initialization correctly - # The default _initialize_weights sets _is_hf_initialized = True even on a meta device, - # which prevents subsequent proper initialization. - if getattr(module, "_is_hf_initialized", False): - return - - for param in module.parameters(recurse=True): - if param.device.type == "meta": - return - - # If not on a meta device, call the original weight initialization - self._init_weights(module) - module._is_hf_initialized = True - - def _init_weights_patched(self, module): - """ - Patched version of _init_weights to match TorchTitan's initialization for Llama-like models. - `self` is a PreTrainedModel instance. 
- """ - config = self.config - - # check if layer is (resid_dropout): Dropout(p=0.1, inplace=False) - if hasattr(module, "resid_dropout"): - print() - - # Build tuple of classes to check for layer_idx-based init_std calculation - layer_idx_classes = [attention_cls] - if mlp_cls: - layer_idx_classes.append(mlp_cls) - layer_idx_classes = tuple(layer_idx_classes) - - if isinstance(module, layer_idx_classes): - if not hasattr(module, "layer_idx"): - return - layer_idx = module.layer_idx - - if hasattr(config, "depth_init") and config.depth_init: - init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 - else: - init_std = 0.02 / (2 * config.num_hidden_layers) ** 0.5 - - if isinstance(module, attention_cls): - # Initialize weights and biases for q, k, v projections - for proj_name in ["q_proj", "k_proj", "v_proj"]: - proj = getattr(module, proj_name) - nn.init.trunc_normal_(proj.weight, mean=0.0, std=0.02) - if proj.bias is not None: - fan_in, _ = init._calculate_fan_in_and_fan_out(proj.weight) - bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 - init.uniform_(proj.bias, -bound, bound) - - # Handle different names for the output projection layer - o_proj = getattr(module, "o_proj", getattr(module, "dense", None)) - if o_proj is not None: - nn.init.trunc_normal_(o_proj.weight, mean=0.0, std=init_std) - if o_proj.bias is not None: - fan_in, _ = init._calculate_fan_in_and_fan_out(o_proj.weight) - bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 - init.uniform_(o_proj.bias, -bound, bound) - - elif mlp_cls and isinstance(module, mlp_cls): - # Handle different names for MLP layers - gate_proj = getattr(module, "gate_proj", getattr(module, "fc1", None)) - up_proj = getattr(module, "up_proj", None) - down_proj = getattr(module, "down_proj", getattr(module, "fc2", None)) - - # gate_proj (or fc1) should always use std=0.02 for numerical stability. - if gate_proj is not None: - nn.init.trunc_normal_(gate_proj.weight, mean=0.0, std=0.02) - if gate_proj.bias is not None: - fan_in, _ = init._calculate_fan_in_and_fan_out(gate_proj.weight) - bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 - init.uniform_(gate_proj.bias, -bound, bound) - # up_proj and down_proj (or fc2) use the depth-dependent init_std. 
- if up_proj is not None: - nn.init.trunc_normal_(up_proj.weight, mean=0.0, std=init_std) - if up_proj.bias is not None: - fan_in, _ = init._calculate_fan_in_and_fan_out(up_proj.weight) - bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 - init.uniform_(up_proj.bias, -bound, bound) - if down_proj is not None: - nn.init.trunc_normal_(down_proj.weight, mean=0.0, std=init_std) - if down_proj.bias is not None: - fan_in, _ = init._calculate_fan_in_and_fan_out(down_proj.weight) - bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 - init.uniform_(down_proj.bias, -bound, bound) - - elif module is getattr( - self, "lm_head", None - ): # TODO(3outeille): find a better way to detect lm_head - final_out_std = config.hidden_size**-0.5 - cutoff_factor = 3 - nn.init.trunc_normal_( - module.weight, - mean=0.0, - std=final_out_std, - a=-cutoff_factor * final_out_std, - b=cutoff_factor * final_out_std, - ) - if module.bias is not None: - module.bias.data.zero_() - - elif isinstance(module, nn.Embedding): - std = config.initializer_range - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - elif ( - isinstance( - module, (nn.GroupNorm, nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d) - ) - or "LayerNorm" in module.__class__.__name__ - or "RMSNorm" in module.__class__.__name__ - ): - # Norms can exist without weights (in which case they are None from torch primitives) - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(1.0) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.zero_() - - decoder_layer_cls.__init__ = _decoder_layer_init_patched - PreTrainedModel._init_weights = _init_weights_patched - PreTrainedModel._initialize_weights = _initialize_weights_patched diff --git a/torchtitan/experiments/transformers_backend/model/hf_moe_like_patch.py b/torchtitan/experiments/transformers_backend/model/hf_moe_like_patch.py deleted file mode 100644 index dc18e0b455..0000000000 --- a/torchtitan/experiments/transformers_backend/model/hf_moe_like_patch.py +++ /dev/null @@ -1,135 +0,0 @@ -import torch.nn as nn -from transformers.configuration_utils import PretrainedConfig -from transformers.modeling_utils import PreTrainedModel - - -def patch_hf_moe_like(decoder_layer_cls, attention_cls, mlp_cls, moe_cls): - """ - This patch modifies a Hugging Face MoE (Mixture-of-Experts) model's weight - initialization to match the initialization scheme used in TorchTitan, - drawing from patterns in models like DeepseekV3. - - The patch targets: - - `PreTrainedModel._initialize_weights`: For correct meta device initialization. - - `PreTrainedModel._init_weights`: To implement TorchTitan's specific initialization - for attention, MLP, MoE, embedding, and layer norm layers. - - `DecoderLayer.__init__`: Adds `layer_idx` to attention, MLP, and MoE expert - modules, required for depth-dependent initialization. 
- """ - - _original_decoder_layer_init = decoder_layer_cls.__init__ - - def _decoder_layer_init_patched(self, config: PretrainedConfig, layer_idx: int): - _original_decoder_layer_init(self, config, layer_idx) - self.layer_idx = layer_idx - - if hasattr(self, "self_attn"): - self.self_attn.layer_idx = layer_idx - - if hasattr(self, "mlp"): - self.mlp.layer_idx = layer_idx - if hasattr(self.mlp, "experts"): - for expert in self.mlp.experts: - expert.layer_idx = layer_idx - if hasattr(self.mlp, "shared_experts"): - # Not all MoE models have shared experts - if self.mlp.shared_experts is not None: - self.mlp.shared_experts.layer_idx = layer_idx - - def _initialize_weights_patched(self, module): - if getattr(module, "_is_hf_initialized", False): - return - for param in module.parameters(recurse=True): - if param.device.type == "meta": - return - self._init_weights(module) - module._is_hf_initialized = True - - def _init_weights_patched(self, module): - """ - Patched version of _init_weights for MoE models. - """ - config = self.config - init_std = None - - if isinstance(module, (attention_cls, mlp_cls, moe_cls)): - if hasattr(module, "layer_idx"): - layer_idx = module.layer_idx - if hasattr(config, "depth_init") and config.depth_init: - init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 - else: - # Fallback for models without depth_init - init_std = 0.02 / (2 * config.num_hidden_layers) ** 0.5 - - if isinstance(module, attention_cls): - # Handle different attention projection layer names by initializing if they exist - if hasattr(module, "q_proj"): - nn.init.trunc_normal_(module.q_proj.weight, mean=0.0, std=0.02) - if hasattr(module, "k_proj"): - nn.init.trunc_normal_(module.k_proj.weight, mean=0.0, std=0.02) - if hasattr(module, "v_proj"): - nn.init.trunc_normal_(module.v_proj.weight, mean=0.0, std=0.02) - - if hasattr(module, "q_a_proj"): - nn.init.trunc_normal_(module.q_a_proj.weight, mean=0.0, std=0.02) - if hasattr(module, "q_b_proj"): - nn.init.trunc_normal_(module.q_b_proj.weight, mean=0.0, std=0.02) - - if hasattr(module, "kv_a_proj_with_mqa"): - nn.init.trunc_normal_(module.kv_a_proj_with_mqa.weight, mean=0.0, std=0.02) - if hasattr(module, "kv_b_proj"): - nn.init.trunc_normal_(module.kv_b_proj.weight, mean=0.0, std=0.02) - - if hasattr(module, "o_proj") and init_std is not None: - nn.init.trunc_normal_(module.o_proj.weight, mean=0.0, std=init_std) - - elif isinstance(module, mlp_cls): - nn.init.trunc_normal_(module.gate_proj.weight, mean=0.0, std=0.02) - # DeepseekV3 uses std=0.02 for up_proj, unlike Llama - nn.init.trunc_normal_(module.up_proj.weight, mean=0.0, std=0.02) - if init_std is not None: - nn.init.trunc_normal_(module.down_proj.weight, mean=0.0, std=init_std) - - elif isinstance(module, moe_cls): - if hasattr(module, "gate") and init_std is not None: - nn.init.trunc_normal_(module.gate.weight, mean=0.0, std=init_std) - if hasattr(module, "experts"): - for expert in module.experts: - nn.init.trunc_normal_(expert.gate_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(expert.up_proj.weight, mean=0.0, std=0.02) - if init_std is not None: - nn.init.trunc_normal_(expert.down_proj.weight, mean=0.0, std=init_std) - if hasattr(module, "shared_experts") and module.shared_experts is not None: - nn.init.trunc_normal_(module.shared_experts.gate_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(module.shared_experts.up_proj.weight, mean=0.0, std=0.02) - if init_std is not None: - nn.init.trunc_normal_(module.shared_experts.down_proj.weight, mean=0.0, std=init_std) - - elif 
module is getattr(self, "lm_head", None): - final_out_std = config.hidden_size**-0.5 - cutoff_factor = 3 - nn.init.trunc_normal_( - module.weight, - mean=0.0, - std=final_out_std, - a=-cutoff_factor * final_out_std, - b=cutoff_factor * final_out_std, - ) - if module.bias is not None: - module.bias.data.zero_() - - elif isinstance(module, nn.Embedding): - std = config.initializer_range - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - elif "LayerNorm" in module.__class__.__name__ or "RMSNorm" in module.__class__.__name__: - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(1.0) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.zero_() - - decoder_layer_cls.__init__ = _decoder_layer_init_patched - PreTrainedModel._init_weights = _init_weights_patched - PreTrainedModel._initialize_weights = _initialize_weights_patched diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py index 4bc65aa0d2..883c282dc0 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -8,16 +8,16 @@ from dataclasses import dataclass import torch from torch import nn +import math +from torch.nn import init from torchtitan.config import JobConfig from torchtitan.protocols import BaseModelArgs from torchtitan.tools.logging import logger from transformers import AutoConfig from transformers.utils import is_torch_deterministic from transformers.configuration_utils import PretrainedConfig -from transformers.modeling_utils import AttentionInterface +from transformers.modeling_utils import AttentionInterface, PreTrainedModel from transformers.integrations.sdpa_attention import sdpa_attention_forward -from torchtitan.experiments.transformers_backend.model.hf_llama_like_patch import patch_hf_llama_like -from torchtitan.experiments.transformers_backend.model.hf_moe_like_patch import patch_hf_moe_like @dataclass class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): @@ -305,7 +305,7 @@ def __init__(self, model_args: HFTransformerModelArgs): if all(required_classes.values()): logger.info(f"Applying MoE-like patch for {model_name_prefix}") - patch_hf_moe_like( + self._patch_hf_moe_like( decoder_layer_cls=decoder_layer_cls, attention_cls=attention_cls, mlp_cls=mlp_cls, @@ -325,7 +325,7 @@ def __init__(self, model_args: HFTransformerModelArgs): if all(required_classes.values()): logger.info(f"Applying Llama-like patch for {model_name_prefix}") - patch_hf_llama_like( + self._patch_hf_llama_like( decoder_layer_cls=decoder_layer_cls, attention_cls=attention_cls, mlp_cls=mlp_cls # mlp_cls can be None @@ -365,6 +365,287 @@ def set_tp_mesh(self, mesh): def set_pp_mesh(self, mesh): self.pp_mesh = mesh + def _patch_hf_llama_like(self, decoder_layer_cls, attention_cls, mlp_cls=None): + """ + This patch modifies a Hugging Face Llama-like model's weight initialization to match + the initialization scheme used in TorchTitan. This is crucial for ensuring + bit-for-bit reproducibility when converting checkpoints between the native + TorchTitan format and the Hugging Face format. + + The patch targets the following aspects of the model: + - `PreTrainedModel._initialize_weights`: Handles meta device initialization correctly. 
+ - `PreTrainedModel._init_weights`: Implements TorchTitan's specific initialization + for attention, MLP, embedding, and layer norm layers. This includes depth-dependent + initialization for attention and MLP layers. + - `DecoderLayer.__init__`: Adds `layer_idx` to attention and MLP modules within + each decoder layer, which is required for the depth-dependent initialization. + """ + + _original_decoder_layer_init = decoder_layer_cls.__init__ + + def _decoder_layer_init_patched(self, config: PretrainedConfig, layer_idx: int): + _original_decoder_layer_init(self, config, layer_idx) + self.layer_idx = layer_idx + # Ensure both attention and mlp modules have layer_idx for depth-based init + if hasattr(self, "self_attn"): + self.self_attn.layer_idx = layer_idx + # some models might not have mlp in each layer + if hasattr(self, "mlp") and self.mlp is not None: + self.mlp.layer_idx = layer_idx + + def _initialize_weights_patched(self, module): + # NOTE(3outeille): monkey-patch PreTrainedModel to handle meta device initialization correctly + # The default _initialize_weights sets _is_hf_initialized = True even on a meta device, + # which prevents subsequent proper initialization. + if getattr(module, "_is_hf_initialized", False): + return + + for param in module.parameters(recurse=True): + if param.device.type == "meta": + return + + # If not on a meta device, call the original weight initialization + self._init_weights(module) + module._is_hf_initialized = True + + def _init_weights_patched(self, module): + """ + Patched version of _init_weights to match TorchTitan's initialization for Llama-like models. + `self` is a PreTrainedModel instance. + """ + config = self.config + + # Build tuple of classes to check for layer_idx-based init_std calculation + layer_idx_classes = [attention_cls] + if mlp_cls: + layer_idx_classes.append(mlp_cls) + layer_idx_classes = tuple(layer_idx_classes) + + if isinstance(module, layer_idx_classes): + if not hasattr(module, "layer_idx"): + return + layer_idx = module.layer_idx + + if hasattr(config, "depth_init") and config.depth_init: + init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 + else: + init_std = 0.02 / (2 * config.num_hidden_layers) ** 0.5 + + if isinstance(module, attention_cls): + # Initialize weights and biases for q, k, v projections + for proj_name in ["q_proj", "k_proj", "v_proj"]: + proj = getattr(module, proj_name) + nn.init.trunc_normal_(proj.weight, mean=0.0, std=0.02) + if proj.bias is not None: + fan_in, _ = init._calculate_fan_in_and_fan_out(proj.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + init.uniform_(proj.bias, -bound, bound) + + # Handle different names for the output projection layer + o_proj = getattr(module, "o_proj", getattr(module, "dense", None)) + if o_proj is not None: + nn.init.trunc_normal_(o_proj.weight, mean=0.0, std=init_std) + if o_proj.bias is not None: + fan_in, _ = init._calculate_fan_in_and_fan_out(o_proj.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + init.uniform_(o_proj.bias, -bound, bound) + + elif mlp_cls and isinstance(module, mlp_cls): + # Handle different names for MLP layers + gate_proj = getattr(module, "gate_proj", getattr(module, "fc1", None)) + up_proj = getattr(module, "up_proj", None) + down_proj = getattr(module, "down_proj", getattr(module, "fc2", None)) + + # gate_proj (or fc1) should always use std=0.02 for numerical stability. 
+ if gate_proj is not None: + nn.init.trunc_normal_(gate_proj.weight, mean=0.0, std=0.02) + if gate_proj.bias is not None: + fan_in, _ = init._calculate_fan_in_and_fan_out(gate_proj.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + init.uniform_(gate_proj.bias, -bound, bound) + # up_proj and down_proj (or fc2) use the depth-dependent init_std. + if up_proj is not None: + nn.init.trunc_normal_(up_proj.weight, mean=0.0, std=init_std) + if up_proj.bias is not None: + fan_in, _ = init._calculate_fan_in_and_fan_out(up_proj.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + init.uniform_(up_proj.bias, -bound, bound) + if down_proj is not None: + nn.init.trunc_normal_(down_proj.weight, mean=0.0, std=init_std) + if down_proj.bias is not None: + fan_in, _ = init._calculate_fan_in_and_fan_out(down_proj.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + init.uniform_(down_proj.bias, -bound, bound) + + elif module is getattr( + self, "lm_head", None + ): # TODO(3outeille): find a better way to detect lm_head + final_out_std = config.hidden_size**-0.5 + cutoff_factor = 3 + nn.init.trunc_normal_( + module.weight, + mean=0.0, + std=final_out_std, + a=-cutoff_factor * final_out_std, + b=cutoff_factor * final_out_std, + ) + if module.bias is not None: + module.bias.data.zero_() + + elif isinstance(module, nn.Embedding): + std = config.initializer_range + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + elif ( + isinstance( + module, (nn.GroupNorm, nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d) + ) + or "LayerNorm" in module.__class__.__name__ + or "RMSNorm" in module.__class__.__name__ + ): + # Norms can exist without weights (in which case they are None from torch primitives) + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(1.0) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.zero_() + + decoder_layer_cls.__init__ = _decoder_layer_init_patched + PreTrainedModel._init_weights = _init_weights_patched + PreTrainedModel._initialize_weights = _initialize_weights_patched + + def _patch_hf_moe_like(self, decoder_layer_cls, attention_cls, mlp_cls, moe_cls): + """ + This patch modifies a Hugging Face MoE (Mixture-of-Experts) model's weight + initialization to match the initialization scheme used in TorchTitan, + drawing from patterns in models like DeepseekV3. + + The patch targets: + - `PreTrainedModel._initialize_weights`: For correct meta device initialization. + - `PreTrainedModel._init_weights`: To implement TorchTitan's specific initialization + for attention, MLP, MoE, embedding, and layer norm layers. + - `DecoderLayer.__init__`: Adds `layer_idx` to attention, MLP, and MoE expert + modules, required for depth-dependent initialization. 
+ """ + + _original_decoder_layer_init = decoder_layer_cls.__init__ + + def _decoder_layer_init_patched(self, config: PretrainedConfig, layer_idx: int): + _original_decoder_layer_init(self, config, layer_idx) + self.layer_idx = layer_idx + + if hasattr(self, "self_attn"): + self.self_attn.layer_idx = layer_idx + + if hasattr(self, "mlp"): + self.mlp.layer_idx = layer_idx + if hasattr(self.mlp, "experts"): + for expert in self.mlp.experts: + expert.layer_idx = layer_idx + if hasattr(self.mlp, "shared_experts"): + # Not all MoE models have shared experts + if self.mlp.shared_experts is not None: + self.mlp.shared_experts.layer_idx = layer_idx + + def _initialize_weights_patched(self, module): + if getattr(module, "_is_hf_initialized", False): + return + for param in module.parameters(recurse=True): + if param.device.type == "meta": + return + self._init_weights(module) + module._is_hf_initialized = True + + def _init_weights_patched(self, module): + """ + Patched version of _init_weights for MoE models. + """ + config = self.config + init_std = None + + if isinstance(module, (attention_cls, mlp_cls, moe_cls)): + if hasattr(module, "layer_idx"): + layer_idx = module.layer_idx + if hasattr(config, "depth_init") and config.depth_init: + init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 + else: + # Fallback for models without depth_init + init_std = 0.02 / (2 * config.num_hidden_layers) ** 0.5 + + if isinstance(module, attention_cls): + # Handle different attention projection layer names by initializing if they exist + if hasattr(module, "q_proj"): + nn.init.trunc_normal_(module.q_proj.weight, mean=0.0, std=0.02) + if hasattr(module, "k_proj"): + nn.init.trunc_normal_(module.k_proj.weight, mean=0.0, std=0.02) + if hasattr(module, "v_proj"): + nn.init.trunc_normal_(module.v_proj.weight, mean=0.0, std=0.02) + + if hasattr(module, "q_a_proj"): + nn.init.trunc_normal_(module.q_a_proj.weight, mean=0.0, std=0.02) + if hasattr(module, "q_b_proj"): + nn.init.trunc_normal_(module.q_b_proj.weight, mean=0.0, std=0.02) + + if hasattr(module, "kv_a_proj_with_mqa"): + nn.init.trunc_normal_(module.kv_a_proj_with_mqa.weight, mean=0.0, std=0.02) + if hasattr(module, "kv_b_proj"): + nn.init.trunc_normal_(module.kv_b_proj.weight, mean=0.0, std=0.02) + + if hasattr(module, "o_proj") and init_std is not None: + nn.init.trunc_normal_(module.o_proj.weight, mean=0.0, std=init_std) + + elif isinstance(module, mlp_cls): + nn.init.trunc_normal_(module.gate_proj.weight, mean=0.0, std=0.02) + # DeepseekV3 uses std=0.02 for up_proj, unlike Llama + nn.init.trunc_normal_(module.up_proj.weight, mean=0.0, std=0.02) + if init_std is not None: + nn.init.trunc_normal_(module.down_proj.weight, mean=0.0, std=init_std) + + elif isinstance(module, moe_cls): + if hasattr(module, "gate") and init_std is not None: + nn.init.trunc_normal_(module.gate.weight, mean=0.0, std=init_std) + if hasattr(module, "experts"): + for expert in module.experts: + nn.init.trunc_normal_(expert.gate_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(expert.up_proj.weight, mean=0.0, std=0.02) + if init_std is not None: + nn.init.trunc_normal_(expert.down_proj.weight, mean=0.0, std=init_std) + if hasattr(module, "shared_experts") and module.shared_experts is not None: + nn.init.trunc_normal_(module.shared_experts.gate_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(module.shared_experts.up_proj.weight, mean=0.0, std=0.02) + if init_std is not None: + nn.init.trunc_normal_(module.shared_experts.down_proj.weight, mean=0.0, std=init_std) + + elif 
module is getattr(self, "lm_head", None): + final_out_std = config.hidden_size**-0.5 + cutoff_factor = 3 + nn.init.trunc_normal_( + module.weight, + mean=0.0, + std=final_out_std, + a=-cutoff_factor * final_out_std, + b=cutoff_factor * final_out_std, + ) + if module.bias is not None: + module.bias.data.zero_() + + elif isinstance(module, nn.Embedding): + std = config.initializer_range + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + elif "LayerNorm" in module.__class__.__name__ or "RMSNorm" in module.__class__.__name__: + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(1.0) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.zero_() + + decoder_layer_cls.__init__ = _decoder_layer_init_patched + PreTrainedModel._init_weights = _init_weights_patched + PreTrainedModel._initialize_weights = _initialize_weights_patched + @property def tok_embeddings(self): """Returns the model's embed_tokens, handling different Hugging Face model structures.""" From 8d46723147543e7eb6fa4e451a65bb17ea05f0ac Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 20 Oct 2025 11:33:49 +0000 Subject: [PATCH 069/129] remove eos_id + refactor Optional[int] to comply with torchtitan convention --- .../transformers_backend/__init__.py | 46 +++++++++---------- .../transformers_backend/run_train.sh | 33 ------------- 2 files changed, 22 insertions(+), 57 deletions(-) delete mode 100755 torchtitan/experiments/transformers_backend/run_train.sh diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 34892cfcc2..0b50ce2027 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -5,7 +5,6 @@ # LICENSE file in the root directory of this source tree. 
import os from dataclasses import dataclass -from typing import Optional from torchtitan.components.loss import build_cross_entropy_loss from torchtitan.components.lr_scheduler import build_lr_schedulers @@ -34,41 +33,40 @@ class TitanModelArgs: dim: int = 4096 n_layers: int = 32 n_heads: int = 32 - n_kv_heads: Optional[int] = None - vocab_size: Optional[int] = None + n_kv_heads: int | None = None + vocab_size: int | None = None multiple_of: int = 256 - ffn_dim_multiplier: Optional[float] = None + ffn_dim_multiplier: float | None = None norm_eps: float = 1e-5 rope_theta: float = 10000 max_seq_len: int = 2048 depth_init: bool = True use_flex_attn: bool = False attn_mask_type: str = "causal" - eos_id: int = 0 @dataclass class DeepSeekV3Args: """Arguments specific to DeepSeekV3 models.""" - moe_args: Optional[MoEArgs] = None - n_group: Optional[int] = None - topk_group: Optional[int] = None - inter_dim: Optional[int] = None - moe_inter_dim: Optional[int] = None - n_dense_layers: Optional[int] = None - n_expert_groups: Optional[int] = None - n_limited_groups: Optional[int] = None - q_lora_rank: Optional[int] = None - kv_lora_rank: Optional[int] = None - qk_nope_head_dim: Optional[int] = None - qk_rope_head_dim: Optional[int] = None - v_head_dim: Optional[int] = None - original_seq_len: Optional[int] = None - rope_factor: Optional[float] = None - beta_fast: Optional[int] = None - beta_slow: Optional[int] = None - mscale: Optional[float] = None - partial_rotary_factor: Optional[float] = None + moe_args: MoEArgs | None = None + n_group: int | None = None + topk_group: int | None = None + inter_dim: int | None = None + moe_inter_dim: int | None = None + n_dense_layers: int | None = None + n_expert_groups: int | None = None + n_limited_groups: int | None = None + q_lora_rank: int | None = None + kv_lora_rank: int | None = None + qk_nope_head_dim: int | None = None + qk_rope_head_dim: int | None = None + v_head_dim: int | None = None + original_seq_len: int | None = None + rope_factor: float | None = None + beta_fast: int | None = None + beta_slow: int | None = None + mscale: float | None = None + partial_rotary_factor: float | None = None rope_interleave: bool = True diff --git a/torchtitan/experiments/transformers_backend/run_train.sh b/torchtitan/experiments/transformers_backend/run_train.sh deleted file mode 100755 index 3b82ad07f3..0000000000 --- a/torchtitan/experiments/transformers_backend/run_train.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/bash -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. - -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -set -ex - -# use envs as local overwrites for convenience -# e.g. 
-# LOG_RANK=0,1 NGPU=4 ./run_train.sh -NGPU=${NGPU:-"8"} -export LOG_RANK=${LOG_RANK:-0} - -# Option to switch between debug and train -MODE=${MODE:-"train"} # Set MODE=debug or MODE=train - -CONFIG_FILE=${CONFIG_FILE:-"configs/qwen3_fsdp2_tp2_pp2.toml"} - -if [ "$MODE" = "debug" ]; then - PYTHON_CMD="debugpy-run -m torch.distributed.run --" -else - PYTHON_CMD="torchrun" -fi - -TORCHFT_LIGHTHOUSE=${TORCHFT_LIGHTHOUSE:-"http://localhost:29510"} - -PYTORCH_ALLOC_CONF="expandable_segments:True" \ -TORCHFT_LIGHTHOUSE=${TORCHFT_LIGHTHOUSE} \ -$PYTHON_CMD --nproc_per_node=${NGPU} --rdzv_backend c10d --rdzv_endpoint="localhost:0" \ ---local-ranks-filter ${LOG_RANK} --role rank --tee 3 \ --m torchtitan.train --job.config_file ${CONFIG_FILE} "$@" \ No newline at end of file From 087f8411f5594fd1ee2bf350ff686ed30b859923 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 20 Oct 2025 11:38:05 +0000 Subject: [PATCH 070/129] move torch.utils.deterministic.fill_uninitialized_memory = False to utils + remove test_utils --- torchtitan/distributed/utils.py | 2 ++ torchtitan/train.py | 3 -- torchtitan/utils/test_utils.py | 52 --------------------------------- 3 files changed, 2 insertions(+), 55 deletions(-) delete mode 100644 torchtitan/utils/test_utils.py diff --git a/torchtitan/distributed/utils.py b/torchtitan/distributed/utils.py index 67eb41280f..ce59df57b0 100644 --- a/torchtitan/distributed/utils.py +++ b/torchtitan/distributed/utils.py @@ -100,6 +100,8 @@ def set_determinism( if deterministic: logger.info("Deterministic algorithm enabled (expect perf degradation).") torch.use_deterministic_algorithms(True) + # Otherwise, HF register buffer for ROPE (inv_freq) and this will be by default be initialized to Nan + torch.utils.deterministic.fill_uninitialized_memory = False torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False # env var for deterministic CuBLAS diff --git a/torchtitan/train.py b/torchtitan/train.py index 6bb28d4a8d..96a77caa0b 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -12,7 +12,6 @@ import torch from torch.distributed.elastic.multiprocessing.errors import record -from torchtitan.utils.test_utils import debug_structure_param import torchtitan.protocols.train_spec as train_spec_module from torchtitan.components.checkpoint import CheckpointManager from torchtitan.components.dataloader import DataloaderExhaustedError @@ -173,8 +172,6 @@ def __init__(self, job_config: JobConfig): self.metrics_processor.num_flops_per_token, ) = model_args.get_nparams_and_flops(model, job_config.training.seq_len) - debug_structure_param(model) - logger.info( f"{color.blue}Model {job_config.model.name} {job_config.model.flavor} " f"{color.red}size: {model_param_count:,} total parameters{color.reset}" diff --git a/torchtitan/utils/test_utils.py b/torchtitan/utils/test_utils.py deleted file mode 100644 index efb8ac478d..0000000000 --- a/torchtitan/utils/test_utils.py +++ /dev/null @@ -1,52 +0,0 @@ -import torch -import functools -import torch.nn as nn -from torchtitan.tools.logging import logger -from transformers.utils import is_torch_deterministic -import lovely_tensors as lt; lt.monkey_patch() - -def debug_structure_param(model: nn.Module): - """Print a breakdown of model parameters by module structure.""" - logger.info("Model Structure Parameter Breakdown:") - - if is_torch_deterministic(): - # Otherwise, HF register buffer for ROPE (inv_freq) and this will be by default be initialized to Nan - torch.utils.deterministic.fill_uninitialized_memory = False - - def 
_format_module(module: nn.Module, prefix: str = ""): - for name, sub_module in module.named_children(): - sub_module_params = sum(p.numel() for p in sub_module.parameters()) - if sub_module_params > 0: - logger.info( - f"{prefix}({name}): {sub_module.__class__.__name__} - {sub_module_params:,} params" - ) - _format_module(sub_module, prefix + " ") - - total_params = sum(p.numel() for p in model.parameters()) - logger.info(f"{model.__class__.__name__} - {total_params:,} params") - _format_module(model, " ") - -def seeded_init_decorator_for_test(seed): - """ - Decorator that adds torch.manual_seed before every nn.init.trunc_normal_ call - and prints layer weights after initialization. - """ - def decorator(func): - @functools.wraps(func) - def wrapper(*args, **kwargs): - original_trunc_normal = nn.init.trunc_normal_ - - def seeded_trunc_normal(*trunc_args, **trunc_kwargs): - torch.manual_seed(seed) - tensor = trunc_args[0] # First argument is always the tensor - result = original_trunc_normal(*trunc_args, **trunc_kwargs) - return result - - try: - nn.init.trunc_normal_ = seeded_trunc_normal - return func(*args, **kwargs) - finally: - nn.init.trunc_normal_ = original_trunc_normal - - return wrapper - return decorator From 937c68d092f8229330d3b1e9c8c6b361b91830b6 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 20 Oct 2025 11:51:36 +0000 Subject: [PATCH 071/129] remove test_template for base_config instead --- .../configs/test_template.toml | 88 ------------------- .../test_hf_integration.py | 12 ++- torchtitan/train.py | 1 - 3 files changed, 5 insertions(+), 96 deletions(-) delete mode 100644 torchtitan/experiments/transformers_backend/configs/test_template.toml diff --git a/torchtitan/experiments/transformers_backend/configs/test_template.toml b/torchtitan/experiments/transformers_backend/configs/test_template.toml deleted file mode 100644 index 0964cf640e..0000000000 --- a/torchtitan/experiments/transformers_backend/configs/test_template.toml +++ /dev/null @@ -1,88 +0,0 @@ -# torchtitan Config.toml - -[job] -dump_folder = "./outputs" -description = "Llama 3 debug training" -print_config = true - -[profiling] -enable_profiling = true -save_traces_folder = "profile_trace" -profile_freq = 5 -enable_memory_snapshot = false -save_memory_snapshot_folder = "memory_snapshot" - -[metrics] -log_freq = 1 -disable_color_printing = false -enable_tensorboard = false -save_tb_folder = "tb" -enable_wandb = false - -[model] -name = "llama3" -flavor = "debugmodel" -# test folder with tokenizer.json, for debug purpose only -hf_assets_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" -# converters = ["float8"] - -[optimizer] -name = "AdamW" -lr = 8e-4 -eps = 1e-8 - -[lr_scheduler] -warmup_steps = 2 # lr scheduler warm up, normally 20% of the train steps -decay_ratio = 0.8 # lr scheduler decay ratio, 80% of the train steps -decay_type = "linear" -min_lr_factor = 0.0 - -[training] -global_batch_size = 4 -local_batch_size = 2 -seq_len = 2048 -max_norm = 1.0 # grad norm clipping -steps = 10 -dataset = "c4_test" # supported datasets: c4_test (2K), c4 (177M) -dataset_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test" -mixed_precision_param = "float32" # force float32 for comparison -mixed_precision_reduce = "float32" - -[parallelism] -data_parallel_replicate_degree = 1 -data_parallel_shard_degree = 1 -fsdp_reshard_after_forward = "default" # default / never / always -tensor_parallel_degree = 1 -enable_async_tensor_parallel = false 
-pipeline_parallel_degree = 1 -pipeline_parallel_schedule = "1F1B" -context_parallel_degree = 1 -expert_parallel_degree = 1 -expert_tensor_parallel_degree = 1 - -[checkpoint] -enable = false -folder = "checkpoint" -interval = 10 -last_save_model_only = false -export_dtype = "float32" -async_mode = "disabled" # ["disabled", "async", "async_with_pinned_mem"] - -[activation_checkpoint] -mode = "selective" # ["none", "selective", "full"] -selective_ac_option = '2' # 'int' = ac every positive int layer or 'op', ac based on ops policy - -[compile] -enable=false -components = ["model", "loss"] - -[quantize.linear.float8] -enable_fsdp_float8_all_gather = false -precompute_float8_dynamic_scale_for_fsdp = false -filter_fqns = ["output"] - -[validation] -enable = false -dataset = "c4_validation" -freq = 5 -steps = 10 diff --git a/torchtitan/experiments/transformers_backend/test_hf_integration.py b/torchtitan/experiments/transformers_backend/test_hf_integration.py index 4838133618..46b4b3e385 100644 --- a/torchtitan/experiments/transformers_backend/test_hf_integration.py +++ b/torchtitan/experiments/transformers_backend/test_hf_integration.py @@ -144,7 +144,7 @@ def create_configs(model_name: str, out_dir: str, flavor: str): |_ llama3 #torchtitan model """ - base_config = "configs/test_template.toml" + base_config = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/test_template.toml" with open(base_config, "r") as f: config = toml.load(f) @@ -223,13 +223,11 @@ def create_configs(model_name: str, out_dir: str, flavor: str): iter_config["parallelism"]["pipeline_parallel_degree"] = pp iter_config["parallelism"]["pipeline_parallel_schedule"] = "GPipe" iter_config["job"]["dump_folder"] = str(pc_dir) - - # if pc == "fsdp1_tp1_cp1_pp2" or pc == BASELINE: - # iter_config["training"]["global_batch_size"] = 1 - # iter_config["training"]["local_batch_size"] = 1 - if pc == BASELINE or pc == "fsdp2_tp1_cp1_pp2": - iter_config["training"]["local_batch_size"] = 2 + iter_config["training"]["global_batch_size"] = 4 + iter_config["training"]["local_batch_size"] = 2 + iter_config["training"]["mixed_precision_param"] = "float32" + iter_config["training"]["mixed_precision_reduce"] = "float32" config_path = pc_dir / "config.toml" with open(config_path, "w") as f: diff --git a/torchtitan/train.py b/torchtitan/train.py index 96a77caa0b..bc7c23daee 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -171,7 +171,6 @@ def __init__(self, job_config: JobConfig): model_param_count, self.metrics_processor.num_flops_per_token, ) = model_args.get_nparams_and_flops(model, job_config.training.seq_len) - logger.info( f"{color.blue}Model {job_config.model.name} {job_config.model.flavor} " f"{color.red}size: {model_param_count:,} total parameters{color.reset}" From 4f2b357909443e0147c07867ff9ac568c7134cf1 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 20 Oct 2025 12:12:55 +0000 Subject: [PATCH 072/129] separate args &model + dont extract loss metrics -1.0 when double PP rank in tests --- .../transformers_backend/__init__.py | 6 +- .../transformers_backend/model/args.py | 268 +++++++++++++++++ .../{hf_transformers_args.py => model.py} | 273 +----------------- .../test_hf_integration.py | 7 +- 4 files changed, 283 insertions(+), 271 deletions(-) create mode 100644 torchtitan/experiments/transformers_backend/model/args.py rename torchtitan/experiments/transformers_backend/model/{hf_transformers_args.py => model.py} (69%) diff --git a/torchtitan/experiments/transformers_backend/__init__.py 
b/torchtitan/experiments/transformers_backend/__init__.py index 0b50ce2027..77afb7d29b 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -16,14 +16,14 @@ from torchtitan.protocols.train_spec import register_train_spec, TrainSpec from .infra.parallelize_hf_transformers import parallelize_hf_transformers -from .model.hf_transformers_args import HFTransformerModelArgs, HFTransformerModel - +from .model.args import HFTransformerModelArgs +from .model.model import HFTransformerModel from torchtitan.models.moe import MoEArgs + __all__ = [ "HFTransformerModelArgs", "HFTransformerModel", - "hf_transformers_configs", ] @dataclass diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py new file mode 100644 index 0000000000..b1cde8e881 --- /dev/null +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -0,0 +1,268 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import importlib +from dataclasses import dataclass +import torch +from torch import nn +import math +from torch.nn import init +from torchtitan.config import JobConfig +from torchtitan.protocols import BaseModelArgs +from torchtitan.tools.logging import logger +from transformers import AutoConfig +from transformers.utils import is_torch_deterministic +from transformers.configuration_utils import PretrainedConfig +from transformers.modeling_utils import AttentionInterface, PreTrainedModel +from transformers.integrations.sdpa_attention import sdpa_attention_forward + +@dataclass +class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): + """ + Configuration class that bridges TorchTitan and HuggingFace Transformers naming conventions. + + Uses properties to provide TorchTitan-style access while maintaining HuggingFace compatibility. + Properties are created dynamically based on which arguments are provided. 
+ """ + + # Define all possible mappings organized by argument type + _TT_TO_HF_MAPPINGS = { + "base": { + # Core TorchTitan mappings (always available) + "dim": "hidden_size", + "n_layers": "num_hidden_layers", + "n_heads": "num_attention_heads", + "n_kv_heads": "num_key_value_heads", + "norm_eps": "rms_norm_eps", + "max_seq_len": "max_position_embeddings", + "eos_id": "eos_token_id", + }, + "deepseek_v3": { + # DeepSeekV3 specific mappings (only when deepseek_v3_args provided) + "inter_dim": "intermediate_size", + "n_dense_layers": "first_k_dense_replace", + }, + } + + def __init__( + self, + titan_args, + deepseek_v3_args=None, + # HuggingFace specific args + attn_implementation: str = "sdpa_torchtitan", + **kwargs, + ): + super().__init__(attn_implementation=attn_implementation, **kwargs) + assert titan_args is not None, "titan_args is required" + + active_mappings = {} + + active_mappings.update(self._TT_TO_HF_MAPPINGS["base"]) + + if deepseek_v3_args is not None: + active_mappings.update(self._TT_TO_HF_MAPPINGS["deepseek_v3"]) + + self._active_mappings = active_mappings + + self._create_dynamic_properties() + + # Set HF attributes from titan_args based on mappings + for titan_name, hf_name in self._active_mappings.items(): + if hasattr(titan_args, titan_name): + setattr(self, hf_name, getattr(titan_args, titan_name)) + + # Fill all TorchTitan-specific args (no HF equivalent) + self.multiple_of = titan_args.multiple_of + self.ffn_dim_multiplier = titan_args.ffn_dim_multiplier + self.depth_init = titan_args.depth_init + self.use_flex_attn = titan_args.use_flex_attn + self.attn_mask_type = titan_args.attn_mask_type + + # HuggingFace specific args + self.attn_implementation = attn_implementation + #NOTE:(3outeille):This will force create_causal_mask to return None + AttentionInterface._global_mapping[attn_implementation] = sdpa_attention_forward + + # Start with passed_args as just titan_args + self._passed_args = {**titan_args.__dict__, "attn_implementation": attn_implementation} + self._passed_args.update(kwargs) + + #NOTE(3outeille): Wait for transformers uniformization of MoE args + if deepseek_v3_args is not None: + # For DeepSeekV3, setting q_lora_rank to 0 in TorchTitan is equivalent to + # setting it to None in HuggingFace. 
+ q_lora_rank = deepseek_v3_args.q_lora_rank + if q_lora_rank == 0: + q_lora_rank = None + deepseek_v3_args.q_lora_rank = q_lora_rank + + self._passed_args.update(**deepseek_v3_args.__dict__) + + self.rope_interleave = deepseek_v3_args.rope_interleave + self.partial_rotary_factor = deepseek_v3_args.partial_rotary_factor + + if deepseek_v3_args.moe_args is not None: + moe_args = deepseek_v3_args.moe_args + self.num_experts_per_tok = moe_args.top_k + self.n_routed_experts = moe_args.num_experts + self.n_shared_experts = moe_args.num_shared_experts + self.moe_intermediate_size = deepseek_v3_args.moe_inter_dim + self._passed_args.update( + dict( + num_experts_per_tok=moe_args.top_k, + n_routed_experts=moe_args.num_experts, + n_shared_experts=moe_args.num_shared_experts, + moe_intermediate_size=deepseek_v3_args.moe_inter_dim, + ) + ) + + def _create_dynamic_properties(self): + """Create properties dynamically based on active mappings.""" + def _create_property(hf_name: str) -> property: + def getter(self): + return getattr(self, hf_name) + def setter(self, value): + setattr(self, hf_name, value) + return property(getter, setter) + + for titan_name, hf_name in self._active_mappings.items(): + # Create getter/setter for attribute that don't already exist + if not hasattr(self.__class__, titan_name): + setattr(self.__class__, titan_name, _create_property(hf_name)) + + def __repr__(self) -> str: + # HFTransformerModelArgs is a dataclass that also inherits from PretrainedConfig. + # PretrainedConfig has a __repr__ that serializes the object to JSON, but it + # doesn't work well with how HFTransformerModelArgs is initialized. + # This custom __repr__ provides a dataclass-like representation that correctly + # displays the arguments passed during initialization. 
+ args_lines = [ + f"{k}={getattr(self, k)!r}" + for k in sorted(self._passed_args.keys()) + if hasattr(self, k) + ] + args_str = "\n".join(args_lines) + return f"{self.__class__.__name__}(\n{args_str}\n)" + + def update_from_config(self, job_config: JobConfig): + # Load HF config (overwrites our HF attributes) + hf_model_config = AutoConfig.from_pretrained( + job_config.model.name, + attn_implementation=self.attn_implementation, + trust_remote_code=True + ) + + # Explicitly update attributes based on mappings + for titan_name, hf_name in self._active_mappings.items(): + if hasattr(hf_model_config, hf_name): + setattr(self, titan_name, getattr(hf_model_config, hf_name)) + + # Copy any other attributes that might not be in the mapping + for key, value in hf_model_config.to_dict().items(): + setattr(self, key, value) + + # Update our attributes with the passed args from flavors + for key, value in self._passed_args.items(): + if hasattr(self, key) and value is not None: + setattr(self, key, value) + + # MoE + if hasattr(self, "qk_nope_head_dim") and hasattr(self, "qk_rope_head_dim"): + self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim + + # Configure HF-specific settings to match TorchTitan settings + self.tie_word_embeddings = False + self.attention_bias = False + self.mlp_bias = False + self.use_cache = False + self.initializer_range = 1.0 # use as std for normal init in embedding + + if not hasattr(self, "inter_dim"): # Only for llama model + ffn_hidden_size = 4 * self.dim + ffn_hidden_size = int(2 * ffn_hidden_size / 3) + if self.ffn_dim_multiplier is not None: + ffn_hidden_size = int(self.ffn_dim_multiplier * ffn_hidden_size) + self.intermediate_size = self.multiple_of * ( + (ffn_hidden_size + self.multiple_of - 1) // self.multiple_of + ) + + self.head_dim = self.dim // self.num_attention_heads + + return self + + def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: + # Check if this is a MoE model by looking for MoE attributes + is_moe = hasattr(self, 'n_routed_experts') + + if is_moe: + # MoE parameter counting (adapted from DeepSeek V3 implementation) + nparams_embedding = 0 + nparams_moe_router = 0 + nparams_shared_experts = 0 + nparams_experts = 0 + nparams_dense = 0 + + for name, p in model.named_parameters(): + if "embedding" in name: + nparams_embedding += p.numel() + nparams_dense += p.numel() + elif "moe.shared_experts" in name: + nparams_shared_experts += p.numel() + elif "moe.router" in name: + nparams_moe_router += p.numel() + elif "moe.experts" in name: + nparams_experts += p.numel() + else: + nparams_dense += p.numel() + + nparams_sparse = nparams_moe_router + nparams_shared_experts + nparams_experts + nparams = nparams_dense + nparams_sparse + nparams_sparse_active = ( + nparams_moe_router + + nparams_shared_experts + + nparams_experts * self.num_experts_per_tok // self.n_routed_experts + ) + + logger.info( + f"Total parameter count: dense {nparams_dense:,}, " + f"sparse {nparams_sparse:,}, active {nparams_dense + nparams_sparse_active:,}" + ) + + l, h, q, t = ( + self.n_layers, + self.n_heads, + self.dim // self.n_heads, + seq_len, + ) + # Use active parameters for FLOPS calculation in MoE + num_flops_per_token = ( + 6 * (nparams_dense - nparams_embedding + nparams_sparse_active) + + 12 * l * h * q * t + ) + else: + # Dense model parameter counting (original implementation) + nparams = sum(p.numel() for p in model.parameters()) + nparams_embedding = sum( + sum(p.numel() for p in m.parameters()) + for m in model.children() + 
if isinstance(m, nn.Embedding) + ) + + l, h, q, t = ( + self.n_layers, + self.n_heads, + self.dim // self.n_heads, + seq_len, + ) + # Reasoning behind the factor of 12 for the self-attention part of the formula: + # 1. each self-attention has 2 matmul in the forward and 4 in the backward (6) + # 2. the flash attention does 1 more matmul recomputation in the backward + # but recomputation should not be counted in calculating MFU (+0) + # 3. each matmul performs 1 multiplication and 1 addition (*2) + # 4. we follow the convention and do not account for sparsity in causal attention + num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t + + return nparams, num_flops_per_token \ No newline at end of file diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/model.py similarity index 69% rename from torchtitan/experiments/transformers_backend/model/hf_transformers_args.py rename to torchtitan/experiments/transformers_backend/model/model.py index 883c282dc0..1e17247bff 100644 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ b/torchtitan/experiments/transformers_backend/model/model.py @@ -1,271 +1,12 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -import importlib -from dataclasses import dataclass -import torch -from torch import nn import math +import torch from torch.nn import init -from torchtitan.config import JobConfig -from torchtitan.protocols import BaseModelArgs -from torchtitan.tools.logging import logger -from transformers import AutoConfig -from transformers.utils import is_torch_deterministic +from transformers.modeling_utils import PreTrainedModel from transformers.configuration_utils import PretrainedConfig -from transformers.modeling_utils import AttentionInterface, PreTrainedModel -from transformers.integrations.sdpa_attention import sdpa_attention_forward - -@dataclass -class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): - """ - Configuration class that bridges TorchTitan and HuggingFace Transformers naming conventions. - - Uses properties to provide TorchTitan-style access while maintaining HuggingFace compatibility. - Properties are created dynamically based on which arguments are provided. 
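# A minimal, self-contained sketch of the "dynamic properties" idea described in
# the docstring above: class-level properties are generated from a name-mapping
# table so the TorchTitan spelling reads and writes the HuggingFace attribute.
# The factory function is what freezes each hf_name; a bare closure defined in
# the loop would late-bind and point every property at the last mapping entry.
# `Demo` and the two mappings here are illustrative only, not the real classes.

def _make_alias(hf_name: str) -> property:
    def getter(self):
        return getattr(self, hf_name)

    def setter(self, value):
        setattr(self, hf_name, value)

    return property(getter, setter)

class Demo:
    pass

for tt_name, hf_name in {"dim": "hidden_size", "n_layers": "num_hidden_layers"}.items():
    setattr(Demo, tt_name, _make_alias(hf_name))

d = Demo()
d.dim, d.n_layers = 256, 6
assert (d.hidden_size, d.num_hidden_layers) == (256, 6)
assert d.dim == 256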
- """ - - # Define all possible mappings organized by argument type - _TT_TO_HF_MAPPINGS = { - "base": { - # Core TorchTitan mappings (always available) - "dim": "hidden_size", - "n_layers": "num_hidden_layers", - "n_heads": "num_attention_heads", - "n_kv_heads": "num_key_value_heads", - "norm_eps": "rms_norm_eps", - "max_seq_len": "max_position_embeddings", - "eos_id": "eos_token_id", - }, - "deepseek_v3": { - # DeepSeekV3 specific mappings (only when deepseek_v3_args provided) - "inter_dim": "intermediate_size", - "n_dense_layers": "first_k_dense_replace", - }, - } - - def __init__( - self, - titan_args, - deepseek_v3_args=None, - # HuggingFace specific args - attn_implementation: str = "sdpa_torchtitan", - **kwargs, - ): - super().__init__(attn_implementation=attn_implementation, **kwargs) - assert titan_args is not None, "titan_args is required" - - active_mappings = {} - - active_mappings.update(self._TT_TO_HF_MAPPINGS["base"]) - - if deepseek_v3_args is not None: - active_mappings.update(self._TT_TO_HF_MAPPINGS["deepseek_v3"]) - - self._active_mappings = active_mappings - - self._create_dynamic_properties() - - # Set HF attributes from titan_args based on mappings - for titan_name, hf_name in self._active_mappings.items(): - if hasattr(titan_args, titan_name): - setattr(self, hf_name, getattr(titan_args, titan_name)) - - # Fill all TorchTitan-specific args (no HF equivalent) - self.multiple_of = titan_args.multiple_of - self.ffn_dim_multiplier = titan_args.ffn_dim_multiplier - self.depth_init = titan_args.depth_init - self.use_flex_attn = titan_args.use_flex_attn - self.attn_mask_type = titan_args.attn_mask_type - - # HuggingFace specific args - self.attn_implementation = attn_implementation - #NOTE:(3outeille):This will force create_causal_mask to return None - AttentionInterface._global_mapping[attn_implementation] = sdpa_attention_forward - - # Start with passed_args as just titan_args - self._passed_args = {**titan_args.__dict__, "attn_implementation": attn_implementation} - self._passed_args.update(kwargs) - - #NOTE(3outeille): Wait for transformers uniformization of MoE args - if deepseek_v3_args is not None: - # For DeepSeekV3, setting q_lora_rank to 0 in TorchTitan is equivalent to - # setting it to None in HuggingFace. 
- q_lora_rank = deepseek_v3_args.q_lora_rank - if q_lora_rank == 0: - q_lora_rank = None - deepseek_v3_args.q_lora_rank = q_lora_rank - - self._passed_args.update(**deepseek_v3_args.__dict__) - - self.rope_interleave = deepseek_v3_args.rope_interleave - self.partial_rotary_factor = deepseek_v3_args.partial_rotary_factor - - if deepseek_v3_args.moe_args is not None: - moe_args = deepseek_v3_args.moe_args - self.num_experts_per_tok = moe_args.top_k - self.n_routed_experts = moe_args.num_experts - self.n_shared_experts = moe_args.num_shared_experts - self.moe_intermediate_size = deepseek_v3_args.moe_inter_dim - self._passed_args.update( - dict( - num_experts_per_tok=moe_args.top_k, - n_routed_experts=moe_args.num_experts, - n_shared_experts=moe_args.num_shared_experts, - moe_intermediate_size=deepseek_v3_args.moe_inter_dim, - ) - ) - - def _create_dynamic_properties(self): - """Create properties dynamically based on active mappings.""" - def _create_property(hf_name: str) -> property: - def getter(self): - return getattr(self, hf_name) - def setter(self, value): - setattr(self, hf_name, value) - return property(getter, setter) - - for titan_name, hf_name in self._active_mappings.items(): - # Create getter/setter for attribute that don't already exist - if not hasattr(self.__class__, titan_name): - setattr(self.__class__, titan_name, _create_property(hf_name)) - - def __repr__(self) -> str: - # HFTransformerModelArgs is a dataclass that also inherits from PretrainedConfig. - # PretrainedConfig has a __repr__ that serializes the object to JSON, but it - # doesn't work well with how HFTransformerModelArgs is initialized. - # This custom __repr__ provides a dataclass-like representation that correctly - # displays the arguments passed during initialization. 
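# Small stand-alone version of the dataclass-style repr described above: only
# the arguments that were explicitly passed are rendered, one per line, sorted
# by name. `FakeConfig` and its fields are illustrative, not the real config.

class FakeConfig:
    def __init__(self, **kwargs):
        self._passed_args = dict(kwargs)
        for k, v in kwargs.items():
            setattr(self, k, v)

    def __repr__(self) -> str:
        lines = "\n".join(
            f"{k}={getattr(self, k)!r}" for k in sorted(self._passed_args)
        )
        return f"{type(self).__name__}(\n{lines}\n)"

print(FakeConfig(dim=256, n_layers=6))
# FakeConfig(
# dim=256
# n_layers=6
# )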
- args_lines = [ - f"{k}={getattr(self, k)!r}" - for k in sorted(self._passed_args.keys()) - if hasattr(self, k) - ] - args_str = "\n".join(args_lines) - return f"{self.__class__.__name__}(\n{args_str}\n)" - - def update_from_config(self, job_config: JobConfig): - # Load HF config (overwrites our HF attributes) - hf_model_config = AutoConfig.from_pretrained( - job_config.model.name, - attn_implementation=self.attn_implementation, - trust_remote_code=True - ) - - # Explicitly update attributes based on mappings - for titan_name, hf_name in self._active_mappings.items(): - if hasattr(hf_model_config, hf_name): - setattr(self, titan_name, getattr(hf_model_config, hf_name)) - - # Copy any other attributes that might not be in the mapping - for key, value in hf_model_config.to_dict().items(): - setattr(self, key, value) - - # Update our attributes with the passed args from flavors - for key, value in self._passed_args.items(): - if hasattr(self, key) and value is not None: - setattr(self, key, value) - - # MoE - if hasattr(self, "qk_nope_head_dim") and hasattr(self, "qk_rope_head_dim"): - self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim - - # Configure HF-specific settings to match TorchTitan settings - self.tie_word_embeddings = False - self.attention_bias = False - self.mlp_bias = False - self.use_cache = False - self.initializer_range = 1.0 # use as std for normal init in embedding - - if not hasattr(self, "inter_dim"): # Only for llama model - ffn_hidden_size = 4 * self.dim - ffn_hidden_size = int(2 * ffn_hidden_size / 3) - if self.ffn_dim_multiplier is not None: - ffn_hidden_size = int(self.ffn_dim_multiplier * ffn_hidden_size) - self.intermediate_size = self.multiple_of * ( - (ffn_hidden_size + self.multiple_of - 1) // self.multiple_of - ) - - self.head_dim = self.dim // self.num_attention_heads - - return self - - def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: - # Check if this is a MoE model by looking for MoE attributes - is_moe = hasattr(self, 'n_routed_experts') - - if is_moe: - # MoE parameter counting (adapted from DeepSeek V3 implementation) - nparams_embedding = 0 - nparams_moe_router = 0 - nparams_shared_experts = 0 - nparams_experts = 0 - nparams_dense = 0 - - for name, p in model.named_parameters(): - if "embedding" in name: - nparams_embedding += p.numel() - nparams_dense += p.numel() - elif "moe.shared_experts" in name: - nparams_shared_experts += p.numel() - elif "moe.router" in name: - nparams_moe_router += p.numel() - elif "moe.experts" in name: - nparams_experts += p.numel() - else: - nparams_dense += p.numel() - - nparams_sparse = nparams_moe_router + nparams_shared_experts + nparams_experts - nparams = nparams_dense + nparams_sparse - nparams_sparse_active = ( - nparams_moe_router - + nparams_shared_experts - + nparams_experts * self.num_experts_per_tok // self.n_routed_experts - ) - - logger.info( - f"Total parameter count: dense {nparams_dense:,}, " - f"sparse {nparams_sparse:,}, active {nparams_dense + nparams_sparse_active:,}" - ) - - l, h, q, t = ( - self.n_layers, - self.n_heads, - self.dim // self.n_heads, - seq_len, - ) - # Use active parameters for FLOPS calculation in MoE - num_flops_per_token = ( - 6 * (nparams_dense - nparams_embedding + nparams_sparse_active) - + 12 * l * h * q * t - ) - else: - # Dense model parameter counting (original implementation) - nparams = sum(p.numel() for p in model.parameters()) - nparams_embedding = sum( - sum(p.numel() for p in m.parameters()) - for m in model.children() - 
if isinstance(m, nn.Embedding) - ) - - l, h, q, t = ( - self.n_layers, - self.n_heads, - self.dim // self.n_heads, - seq_len, - ) - # Reasoning behind the factor of 12 for the self-attention part of the formula: - # 1. each self-attention has 2 matmul in the forward and 4 in the backward (6) - # 2. the flash attention does 1 more matmul recomputation in the backward - # but recomputation should not be counted in calculating MFU (+0) - # 3. each matmul performs 1 multiplication and 1 addition (*2) - # 4. we follow the convention and do not account for sparsity in causal attention - num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t - - return nparams, num_flops_per_token +import importlib +from torch import nn +from .args import HFTransformerModelArgs +from torchtitan.tools.logging import logger class HFTransformerModel(nn.Module): def __init__(self, model_args: HFTransformerModelArgs): @@ -779,4 +520,4 @@ def __setattr__(self, name, value): return # Otherwise, fall back to the default nn.Module behavior. - super().__setattr__(name, value) \ No newline at end of file + super().__setattr__(name, value) diff --git a/torchtitan/experiments/transformers_backend/test_hf_integration.py b/torchtitan/experiments/transformers_backend/test_hf_integration.py index 46b4b3e385..6a1f5c1852 100644 --- a/torchtitan/experiments/transformers_backend/test_hf_integration.py +++ b/torchtitan/experiments/transformers_backend/test_hf_integration.py @@ -421,13 +421,16 @@ def _extract_metrics(log_file: Path) -> TrainingMetrics: # Regex to capture all metrics from a log line, ignoring ANSI color codes pattern = re.compile( r"step:\s*(\d+)\s*" - r".*?loss:\s*([0-9]+\.?[0-9]*)\s*" + r".*?loss:\s*(-?[0-9]+\.?[0-9]*)\s*" r".*?grad_norm:\s*([0-9]+\.?[0-9]*)\s*" ) for match in pattern.finditer(content): + loss = float(match.group(2)) + if loss == -1.0: + continue metrics.steps.append(int(match.group(1))) - metrics.loss.append(float(match.group(2))) + metrics.loss.append(loss) metrics.grad_norm.append(float(match.group(3))) except Exception as e: From 154289d040a4624dd635d2e805aa39107376bd82 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 21 Oct 2025 08:37:29 +0000 Subject: [PATCH 073/129] use recent refactoring for flops computation for dense and moe model --- .../transformers_backend/model/args.py | 74 +------------------ 1 file changed, 4 insertions(+), 70 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index b1cde8e881..c49109aa0b 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -13,10 +13,10 @@ from torchtitan.config import JobConfig from torchtitan.protocols import BaseModelArgs from torchtitan.tools.logging import logger +from torchtitan.models.utils import get_dense_model_nparams_and_flops, get_moe_model_nparams_and_flops from transformers import AutoConfig -from transformers.utils import is_torch_deterministic from transformers.configuration_utils import PretrainedConfig -from transformers.modeling_utils import AttentionInterface, PreTrainedModel +from transformers.modeling_utils import AttentionInterface from transformers.integrations.sdpa_attention import sdpa_attention_forward @dataclass @@ -194,75 +194,9 @@ def update_from_config(self, job_config: JobConfig): return self def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: - # Check if this is a MoE model by looking 
for MoE attributes is_moe = hasattr(self, 'n_routed_experts') if is_moe: - # MoE parameter counting (adapted from DeepSeek V3 implementation) - nparams_embedding = 0 - nparams_moe_router = 0 - nparams_shared_experts = 0 - nparams_experts = 0 - nparams_dense = 0 - - for name, p in model.named_parameters(): - if "embedding" in name: - nparams_embedding += p.numel() - nparams_dense += p.numel() - elif "moe.shared_experts" in name: - nparams_shared_experts += p.numel() - elif "moe.router" in name: - nparams_moe_router += p.numel() - elif "moe.experts" in name: - nparams_experts += p.numel() - else: - nparams_dense += p.numel() - - nparams_sparse = nparams_moe_router + nparams_shared_experts + nparams_experts - nparams = nparams_dense + nparams_sparse - nparams_sparse_active = ( - nparams_moe_router - + nparams_shared_experts - + nparams_experts * self.num_experts_per_tok // self.n_routed_experts - ) - - logger.info( - f"Total parameter count: dense {nparams_dense:,}, " - f"sparse {nparams_sparse:,}, active {nparams_dense + nparams_sparse_active:,}" - ) - - l, h, q, t = ( - self.n_layers, - self.n_heads, - self.dim // self.n_heads, - seq_len, - ) - # Use active parameters for FLOPS calculation in MoE - num_flops_per_token = ( - 6 * (nparams_dense - nparams_embedding + nparams_sparse_active) - + 12 * l * h * q * t - ) + return get_moe_model_nparams_and_flops(self, model, seq_len) else: - # Dense model parameter counting (original implementation) - nparams = sum(p.numel() for p in model.parameters()) - nparams_embedding = sum( - sum(p.numel() for p in m.parameters()) - for m in model.children() - if isinstance(m, nn.Embedding) - ) - - l, h, q, t = ( - self.n_layers, - self.n_heads, - self.dim // self.n_heads, - seq_len, - ) - # Reasoning behind the factor of 12 for the self-attention part of the formula: - # 1. each self-attention has 2 matmul in the forward and 4 in the backward (6) - # 2. the flash attention does 1 more matmul recomputation in the backward - # but recomputation should not be counted in calculating MFU (+0) - # 3. each matmul performs 1 multiplication and 1 addition (*2) - # 4. 
we follow the convention and do not account for sparsity in causal attention - num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t - - return nparams, num_flops_per_token \ No newline at end of file + return get_dense_model_nparams_and_flops(self, model, seq_len) \ No newline at end of file From 1b2cfd792e63c9a91f3bffe9283760018001fb23 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 21 Oct 2025 13:35:43 +0000 Subject: [PATCH 074/129] fix tie_embedding --- .../infra/parallelize_hf_transformers.py | 4 -- .../transformers_backend/model/args.py | 1 - .../transformers_backend/model/model.py | 48 +++++++++++++------ 3 files changed, 34 insertions(+), 19 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py index 32e122ab75..3d729f3afb 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -195,7 +195,6 @@ def parallelize_hf_transformers( logger.warning("CP support for FlexAttention is still in progress.") if parallel_dims.tp_enabled: - model.set_tp_mesh(world_mesh["tp"]) enable_float8_linear = "float8" in job_config.model.converters float8_is_rowwise = job_config.quantize.linear.float8.recipe_name in ( "rowwise", @@ -297,9 +296,6 @@ def parallelize_hf_transformers( enable_compiled_autograd=job_config.parallelism.enable_compiled_autograd, ) - if parallel_dims.pp_enabled: - model.set_pp_mesh(world_mesh["pp"]) - return model diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index c49109aa0b..6bd805fff4 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -174,7 +174,6 @@ def update_from_config(self, job_config: JobConfig): self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim # Configure HF-specific settings to match TorchTitan settings - self.tie_word_embeddings = False self.attention_bias = False self.mlp_bias = False self.use_cache = False diff --git a/torchtitan/experiments/transformers_backend/model/model.py b/torchtitan/experiments/transformers_backend/model/model.py index 1e17247bff..0a8c000d0e 100644 --- a/torchtitan/experiments/transformers_backend/model/model.py +++ b/torchtitan/experiments/transformers_backend/model/model.py @@ -94,17 +94,9 @@ def __init__(self, model_args: HFTransformerModelArgs): layer.moe_enabled = False self.cp_mesh = None - self.tp_mesh = None - self.pp_mesh = None def set_cp_mesh(self, mesh): self.cp_mesh = mesh - - def set_tp_mesh(self, mesh): - self.tp_mesh = mesh - - def set_pp_mesh(self, mesh): - self.pp_mesh = mesh def _patch_hf_llama_like(self, decoder_layer_cls, attention_cls, mlp_cls=None): """ @@ -155,7 +147,6 @@ def _init_weights_patched(self, module): `self` is a PreTrainedModel instance. 
""" config = self.config - # Build tuple of classes to check for layer_idx-based init_std calculation layer_idx_classes = [attention_cls] if mlp_cls: @@ -234,8 +225,21 @@ def _init_weights_patched(self, module): module.bias.data.zero_() elif isinstance(module, nn.Embedding): - std = config.initializer_range - module.weight.data.normal_(mean=0.0, std=std) + # When tie_word_embeddings is True, use lm_head initialization + if hasattr(config, "tie_word_embeddings") and config.tie_word_embeddings: + final_out_std = config.hidden_size**-0.5 + cutoff_factor = 3 + nn.init.trunc_normal_( + module.weight, + mean=0.0, + std=final_out_std, + a=-cutoff_factor * final_out_std, + b=cutoff_factor * final_out_std, + ) + else: + std = config.initializer_range + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() @@ -372,8 +376,21 @@ def _init_weights_patched(self, module): module.bias.data.zero_() elif isinstance(module, nn.Embedding): - std = config.initializer_range - module.weight.data.normal_(mean=0.0, std=std) + # When tie_word_embeddings is True, use lm_head initialization + if hasattr(config, "tie_word_embeddings") and config.tie_word_embeddings: + final_out_std = config.hidden_size**-0.5 + cutoff_factor = 3 + nn.init.trunc_normal_( + module.weight, + mean=0.0, + std=final_out_std, + a=-cutoff_factor * final_out_std, + b=cutoff_factor * final_out_std, + ) + else: + std = config.initializer_range + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() @@ -495,7 +512,10 @@ def selective_init(module): self.model.apply(selective_init) - self.model.tie_weights() + #TODO(3outeille): For pipeline parallel, only tie weights if both input and output embeddings are on the same device + # Maybe better way of handling this? 
+ if not isinstance(self.tok_embeddings, nn.Identity) and not isinstance(self.output, nn.Identity): + self.model.tie_weights() def named_children(self): """ From 0f2c51e026f228e38eee6a2cd151d58124f393cd Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 21 Oct 2025 13:46:39 +0000 Subject: [PATCH 075/129] remove pad_token_id=None --- torchtitan/experiments/transformers_backend/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 77afb7d29b..d315c05271 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -78,7 +78,6 @@ class DeepSeekV3Args: n_heads=16, n_kv_heads=16, ), - pad_token_id=None, #TODO(3outeille): use os.environ to switch between models deepseek_v3_args=DeepSeekV3Args( partial_rotary_factor=4.0, From 4c8b4b7f3895867de1ce1e52739deb68b9b3eb8c Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 21 Oct 2025 14:39:51 +0000 Subject: [PATCH 076/129] make it clearer about args --- .../transformers_backend/__init__.py | 12 ++-- .../transformers_backend/model/args.py | 55 +++++++++---------- 2 files changed, 31 insertions(+), 36 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index d315c05271..110a376642 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -27,9 +27,8 @@ ] @dataclass -class TitanModelArgs: +class TitanDenseModelArgs: """Arguments for the base TorchTitan model.""" - dim: int = 4096 n_layers: int = 32 n_heads: int = 32 @@ -46,7 +45,7 @@ class TitanModelArgs: @dataclass -class DeepSeekV3Args: +class TitanMoeModelArgs: """Arguments specific to DeepSeekV3 models.""" moe_args: MoEArgs | None = None n_group: int | None = None @@ -72,14 +71,13 @@ class DeepSeekV3Args: flavors = { "debugmodel": HFTransformerModelArgs( - titan_args=TitanModelArgs( + titan_dense_args=TitanDenseModelArgs( dim=256, n_layers=6, n_heads=16, n_kv_heads=16, ), - #TODO(3outeille): use os.environ to switch between models - deepseek_v3_args=DeepSeekV3Args( + titan_moe_args=TitanMoeModelArgs( partial_rotary_factor=4.0, inter_dim=1024, moe_inter_dim=256, @@ -103,7 +101,7 @@ class DeepSeekV3Args: ) if os.environ.get("USE_MOE", "0") == "1" else None, ), "full": HFTransformerModelArgs( - titan_args=TitanModelArgs(), + titan_dense_args=TitanDenseModelArgs(), ), } diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index 6bd805fff4..e02a04e136 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -30,8 +30,8 @@ class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): # Define all possible mappings organized by argument type _TT_TO_HF_MAPPINGS = { - "base": { - # Core TorchTitan mappings (always available) + "dense": { + # TorchTitan dense model mappings (always available) "dim": "hidden_size", "n_layers": "num_hidden_layers", "n_heads": "num_attention_heads", @@ -40,8 +40,8 @@ class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): "max_seq_len": "max_position_embeddings", "eos_id": "eos_token_id", }, - "deepseek_v3": { - # DeepSeekV3 specific mappings (only when deepseek_v3_args provided) + "moe": { + # TorchTitan moe model specific mappings (only when titan_moe_args 
provided) "inter_dim": "intermediate_size", "n_dense_layers": "first_k_dense_replace", }, @@ -49,21 +49,21 @@ class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): def __init__( self, - titan_args, - deepseek_v3_args=None, + titan_dense_args, + titan_moe_args=None, # HuggingFace specific args attn_implementation: str = "sdpa_torchtitan", **kwargs, ): super().__init__(attn_implementation=attn_implementation, **kwargs) - assert titan_args is not None, "titan_args is required" + assert titan_dense_args is not None, "titan_dense_args is required" active_mappings = {} - active_mappings.update(self._TT_TO_HF_MAPPINGS["base"]) + active_mappings.update(self._TT_TO_HF_MAPPINGS["dense"]) - if deepseek_v3_args is not None: - active_mappings.update(self._TT_TO_HF_MAPPINGS["deepseek_v3"]) + if titan_moe_args is not None: + active_mappings.update(self._TT_TO_HF_MAPPINGS["moe"]) self._active_mappings = active_mappings @@ -71,15 +71,15 @@ def __init__( # Set HF attributes from titan_args based on mappings for titan_name, hf_name in self._active_mappings.items(): - if hasattr(titan_args, titan_name): - setattr(self, hf_name, getattr(titan_args, titan_name)) + if hasattr(titan_dense_args, titan_name): + setattr(self, hf_name, getattr(titan_dense_args, titan_name)) # Fill all TorchTitan-specific args (no HF equivalent) - self.multiple_of = titan_args.multiple_of - self.ffn_dim_multiplier = titan_args.ffn_dim_multiplier - self.depth_init = titan_args.depth_init - self.use_flex_attn = titan_args.use_flex_attn - self.attn_mask_type = titan_args.attn_mask_type + self.multiple_of = titan_dense_args.multiple_of + self.ffn_dim_multiplier = titan_dense_args.ffn_dim_multiplier + self.depth_init = titan_dense_args.depth_init + self.use_flex_attn = titan_dense_args.use_flex_attn + self.attn_mask_type = titan_dense_args.attn_mask_type # HuggingFace specific args self.attn_implementation = attn_implementation @@ -87,35 +87,32 @@ def __init__( AttentionInterface._global_mapping[attn_implementation] = sdpa_attention_forward # Start with passed_args as just titan_args - self._passed_args = {**titan_args.__dict__, "attn_implementation": attn_implementation} + self._passed_args = {**titan_dense_args.__dict__, "attn_implementation": attn_implementation} self._passed_args.update(kwargs) #NOTE(3outeille): Wait for transformers uniformization of MoE args - if deepseek_v3_args is not None: + if titan_moe_args is not None: # For DeepSeekV3, setting q_lora_rank to 0 in TorchTitan is equivalent to # setting it to None in HuggingFace. 
- q_lora_rank = deepseek_v3_args.q_lora_rank + q_lora_rank = titan_moe_args.q_lora_rank if q_lora_rank == 0: q_lora_rank = None - deepseek_v3_args.q_lora_rank = q_lora_rank + titan_moe_args.q_lora_rank = q_lora_rank - self._passed_args.update(**deepseek_v3_args.__dict__) + self._passed_args.update(**titan_moe_args.__dict__) - self.rope_interleave = deepseek_v3_args.rope_interleave - self.partial_rotary_factor = deepseek_v3_args.partial_rotary_factor - - if deepseek_v3_args.moe_args is not None: - moe_args = deepseek_v3_args.moe_args + if titan_moe_args.moe_args is not None: + moe_args = titan_moe_args.moe_args self.num_experts_per_tok = moe_args.top_k self.n_routed_experts = moe_args.num_experts self.n_shared_experts = moe_args.num_shared_experts - self.moe_intermediate_size = deepseek_v3_args.moe_inter_dim + self.moe_intermediate_size = titan_moe_args.moe_inter_dim self._passed_args.update( dict( num_experts_per_tok=moe_args.top_k, n_routed_experts=moe_args.num_experts, n_shared_experts=moe_args.num_shared_experts, - moe_intermediate_size=deepseek_v3_args.moe_inter_dim, + moe_intermediate_size=titan_moe_args.moe_inter_dim, ) ) From c61271e3a75d1962a73aa10b30d4c5e63839538f Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 21 Oct 2025 14:44:12 +0000 Subject: [PATCH 077/129] remove local testing scripts --- .../configs/template.slurm | 115 --- .../test_hf_integration.py | 775 ------------------ 2 files changed, 890 deletions(-) delete mode 100644 torchtitan/experiments/transformers_backend/configs/template.slurm delete mode 100644 torchtitan/experiments/transformers_backend/test_hf_integration.py diff --git a/torchtitan/experiments/transformers_backend/configs/template.slurm b/torchtitan/experiments/transformers_backend/configs/template.slurm deleted file mode 100644 index 493b569e95..0000000000 --- a/torchtitan/experiments/transformers_backend/configs/template.slurm +++ /dev/null @@ -1,115 +0,0 @@ -#!/bin/bash -#SBATCH --job-name={{ name }} -#SBATCH --output={{ root_path }}/slurm_%j.out -#SBATCH --error={{ root_path }}/slurm_%j.out -#SBATCH --nodes={{ nodes }} -#SBATCH --gres=gpu:{{ n_proc_per_node }} -#SBATCH --ntasks-per-node=1 -#SBATCH --qos={{ qos }} -#SBATCH --cpus-per-task=12 - -# Misc initializations. 
-echo "========================" -echo "START TIME: $(date)" -source /etc/profile.d/modules.sh -source /fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/env_torchtitan_official/bin/activate -echo python3 version = $(python3 --version) -echo "===========" - -# Slurm stuff -export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=$((1024 + RANDOM % 64511)) - -export TMPDIR=/scratch -export TORCH_HOME="/fsx/ferdinandmom/cache/torch" -export HF_HOME="/fsx/ferdinandmom/cache/huggingface" -export HF_DATASETS_CACHE="/fsx/ferdinandmom/cache/huggingface/datasets" -export TRANSFORMERS_CACHE="/fsx/ferdinandmom/cache/huggingface/transformers" -export CUBLAS_WORKSPACE_CONFIG=":4096:8" -export CUDA_DEVICE_MAX_CONNECTIONS="1" -export UV_CACHE_DIR="/fsx/ferdinandmom/.cache/uv" - -# EFA settings -export FI_PROVIDER=efa -export FI_EFA_FORK_SAFE=1 -export FI_EFA_ENABLE_SHM_TRANSFER=1 -export NCCL_PROTO=simple -export NCCL_SOCKET_IFNAME=enp - -module load cuda/12.4 - -echo "Running training job: {{ name }}" -echo "Config file: {{ config_path }}" - -# Function to update status based on squeue output -update_status() { - job_id=$1 - status_file=$2 - # For unknown reasons, it doenst update status for pending. It only works for running - while true; do - job_status=$(squeue --job $job_id --noheader --format=%T) - echo "Job status: $job_status" - if [ -z "$job_status" ]; then - # Job has finished or is not found - break - elif [ "$job_status" = "RUNNING" ]; then - printf "running" > $status_file - break - fi - sleep 10 - done -} - -# Update status to "pending" or "running" in the background -update_status $job_id {{ root_path }}/status.txt & - -# LOG_DIR="{{ root_path }}/logs" -# mkdir -p ${LOG_DIR} - -# CMD="torchrun \ -# --nproc_per_node {{ n_proc_per_node }} \ -# --nnodes {{ nodes }} \ -# --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ -# --rdzv_backend c10d \ -# --max_restarts 0 \ -# --log-dir ${LOG_DIR} \ -# --role rank \ -# --tee 3 \ -# -m torchtitan.train \ -# --checkpoint.enable \ -# {% if name == "seed_checkpoint" %} --checkpoint.create_seed_checkpoint {% else %} --checkpoint.initial_load_path {{ initial_load_path }} {% endif %} \ -# --training.seed 42 \ -# --training.deterministic \ -# --training.steps 1 \ -# --job.config_file {{ config_path }}" - - -CMD="torchrun \ - --nproc_per_node {{ n_proc_per_node }} \ - --nnodes {{ nodes }} \ - --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \ - --rdzv_backend c10d \ - --max_restarts 0 \ - --role rank \ - --local_ranks_filter {{ n_proc_per_node - 1 }} \ - --tee 3 \ - -m torchtitan.train \ - --checkpoint.enable \ - {% if name == "seed_checkpoint" %} --checkpoint.create_seed_checkpoint {% else %} --checkpoint.initial_load_path {{ initial_load_path }} {% endif %} \ - --training.seed 42 \ - --training.deterministic \ - --job.config_file {{ config_path }}" - -# Run the main command -echo "Running command: srun -u $CMD" -srun -u $CMD -exit_status=$? 
- - -# Update status based on the exit status of `srun` -if [ $exit_status -eq 0 ]; then - printf "completed" > {{ root_path }}/status.txt -else - printf "fail" > {{ root_path }}/status.txt -fi diff --git a/torchtitan/experiments/transformers_backend/test_hf_integration.py b/torchtitan/experiments/transformers_backend/test_hf_integration.py deleted file mode 100644 index 6a1f5c1852..0000000000 --- a/torchtitan/experiments/transformers_backend/test_hf_integration.py +++ /dev/null @@ -1,775 +0,0 @@ -import toml -from argparse import ArgumentParser -from pathlib import Path -import re -import os -import subprocess -from enum import Enum -from jinja2 import Template -from rich.console import Console -from rich.panel import Panel -from rich.table import Table -from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn - -# BASELINE = "fsdp2_tp1_cp1_pp1" -BASELINE = "fsdp1_tp1_cp1_pp1" - -console = Console() - -class LogLevel(Enum): - INFO = "INFO" - SUCCESS = "SUCCESS" - WARNING = "WARNING" - ERROR = "ERROR" - TEST_PASS = "TEST_PASS" - TEST_FAIL = "TEST_FAIL" - -def log_message(level: LogLevel, message: str, indent: int = 0, dim: bool = False) -> None: - """Log a message with appropriate color coding.""" - style_map = { - LogLevel.INFO: "blue", - LogLevel.SUCCESS: "green", - LogLevel.WARNING: "yellow", - LogLevel.ERROR: "bold red", - LogLevel.TEST_PASS: "green", - LogLevel.TEST_FAIL: "bold red", - } - - prefix_map = { - LogLevel.INFO: "[INFO]", - LogLevel.SUCCESS: "[SUCCESS]", - LogLevel.WARNING: "[WARNING]", - LogLevel.ERROR: "[ERROR]", - LogLevel.TEST_PASS: "✅ TEST PASS", - LogLevel.TEST_FAIL: "❌ TEST FAIL", - } - - style = style_map[level] - prefix = prefix_map[level] - if indent > 0: - indent_str = " " * (indent - 1) + "└─ " - else: - indent_str = "" - - output = f"{indent_str}[{style}]{prefix}[/] {message}" - - if dim: - console.print(f"[dim]{output}[/dim]") - else: - console.print(output) - - -def _create_slurm_script( - config: dict, - config_path: Path, - script_path: Path, - job_name: str, - initial_load_path: str = None, - repo_id: str = None, -): - with open(config_path, "r") as file: - config = toml.load(file) - - pp = config["parallelism"]["pipeline_parallel_degree"] - dp = config["parallelism"]["data_parallel_shard_degree"] - tp = config["parallelism"]["tensor_parallel_degree"] - cp = config["parallelism"]["context_parallel_degree"] - world_size = pp * dp * tp * cp - - nodes = max(1, world_size // 8) - n_proc_per_node = min(8, world_size // nodes) - - print(f"world_size: {world_size}, nodes: {nodes}, n_proc_per_node: {n_proc_per_node}") - - # Read the SLURM script template from the file - template_path = Path(__file__).parent / "configs/template.slurm" - with open(template_path, "r") as f: - slurm_script_template = f.read() - base_bench_template = Template(slurm_script_template) - - context_bench = { - "name": job_name, - "nodes": nodes, - "n_proc_per_node": n_proc_per_node, - "root_path": script_path.parent, - "config_path": config_path, - "initial_load_path": initial_load_path, - "repo_id": repo_id, - "qos": "high" if nodes > 1 else "normal", # Example logic for qos - } - - with open(script_path, "w") as file: - file.write(base_bench_template.render(context_bench)) - - print(f"Slurm script created at {script_path}") - - -def create_configs(model_name: str, out_dir: str, flavor: str): - """ - results/ - |_ meta-llama - |_ Llama-3.2-1B - |_ debugmodel/ - |_ seed_checkpoint/ - |_ config.toml - |_ seed.slurm - |_ step-0/ - |_ .... 
- |_ fsdp2_tp1_cp1_pp1/ - |_ config.toml - |_ nd_parallelism.slurm - |_ nd_parallelism.log - |_ fsdp2_tp2_cp1_pp1/ - |_ config.toml - |_ nd_parallelism.slurm - |_ nd_parallelism.log - |_ diff_baseline_vs_nd_parallelism.log - |_ fsdp2_tp1_cp1_pp2/ - |_ config.toml - |_ nd_parallelism.slurm - |_ nd_parallelism.log - |_ diff_baseline_vs_nd_parallelism.log - |_ fsdp2_tp1_cp2_pp1/ - |_ config.toml - |_ nd_parallelism.slurm - |_ nd_parallelism.log - |_ diff_baseline_vs_nd_parallelism.log - |_ fsdp2_tp1_cp2_pp2/ - |_ config.toml - |_ nd_parallelism.slurm - |_ nd_parallelism.log - |_ diff_baseline_vs_nd_parallelism.log` - |_ full/ - ... - |_ llama3 #torchtitan model - """ - - base_config = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/test_template.toml" - with open(base_config, "r") as f: - config = toml.load(f) - - config["model"]["name"] = model_name - config["model"]["flavor"] = flavor - - # parallelism_configs = [ - # BASELINE, # baseline - # "fsdp2_tp2_cp1_pp1", - # # "fsdp2_tp1_cp1_pp2", - # # "fsdp2_tp1_cp2_pp1", - # # "fsdp2_tp1_cp2_pp2", - # # "fsdp2_tp2_cp2_pp1", - # # "fsdp2_tp2_cp1_pp2", - # # "fsdp2_tp2_cp2_pp2", - # ] - - # parallelism_configs = [ - # BASELINE, # baseline - # # "fsdp1_tp2_cp1_pp1", - # # "fsdp1_tp1_cp1_pp2", - # # "fsdp1_tp1_cp2_pp1", - # # "fsdp1_tp1_cp2_pp2", - # # "fsdp1_tp2_cp2_pp1", - # # "fsdp1_tp2_cp1_pp2", - # # "fsdp1_tp2_cp2_pp2", - # ] - - parallelism_configs = [ - BASELINE, # baseline - "fsdp1_tp2_cp1_pp1", - ] - - out_path = Path(out_dir) / model_name / flavor - out_path.mkdir(parents=True, exist_ok=True) - - # Create seed checkpoint - seed_config = toml.loads(toml.dumps(config)) - seed_config["parallelism"]["data_parallel_shard_degree"] = 1 - seed_config["parallelism"]["tensor_parallel_degree"] = 1 - seed_config["parallelism"]["pipeline_parallel_degree"] = 1 - seed_config["parallelism"]["context_parallel_degree"] = 1 - seed_checkpoint_dir = out_path / "seed_checkpoint" - seed_checkpoint_dir.mkdir(exist_ok=True) - seed_config["job"]["dump_folder"] = str(seed_checkpoint_dir) - seed_config_path = seed_checkpoint_dir / "config.toml" - with open(seed_config_path, "w") as f: - toml.dump(seed_config, f) - print(f"Created {seed_config_path}") - _create_slurm_script( - seed_config, - seed_config_path, - seed_checkpoint_dir / "seed.slurm", - "seed_checkpoint", - repo_id=model_name, - ) - - # Create parallelism configs - for pc in parallelism_configs: - - iter_config = toml.loads(toml.dumps(config)) - - m = re.match(r"fsdp(\d+)_tp(\d+)_cp(\d+)_pp(\d+)", pc) - if not m: - print(f"Skipping invalid config string: {pc}") - continue - - fsdp, tp, cp, pp = map(int, m.groups()) - - pc_dir = out_path / pc - pc_dir.mkdir(exist_ok=True) - - iter_config["parallelism"]["data_parallel_shard_degree"] = fsdp - iter_config["parallelism"]["tensor_parallel_degree"] = tp - iter_config["parallelism"]["context_parallel_degree"] = cp - iter_config["parallelism"]["pipeline_parallel_degree"] = pp - iter_config["parallelism"]["pipeline_parallel_schedule"] = "GPipe" - iter_config["job"]["dump_folder"] = str(pc_dir) - - iter_config["training"]["global_batch_size"] = 4 - iter_config["training"]["local_batch_size"] = 2 - iter_config["training"]["mixed_precision_param"] = "float32" - iter_config["training"]["mixed_precision_reduce"] = "float32" - - config_path = pc_dir / "config.toml" - with open(config_path, "w") as f: - toml.dump(iter_config, f) - print(f"Created {config_path}") - _create_slurm_script( - iter_config, - config_path, - pc_dir / "nd_parallelism.slurm", - pc, - 
initial_load_path=str(seed_checkpoint_dir / "checkpoint/step-0"), - repo_id=model_name, - ) - -class Status(Enum): - # INIT -> PENDING -> [RUNNING | FAIL] -> COMPLETED - INIT = "init" # Job is created - PENDING = "pending" # Job is waiting for ressources - RUNNING = "running" # Job is running - FAIL = "fail" # Job failed - COMPLETED = "completed" # Job is completed - -class Job: - def __init__(self, root_path: str, qos: str, inp_dir: str = None) -> None: - self.root_path = root_path - self.name = os.path.basename(root_path) - - self.config = os.path.join(root_path, "config.toml") - seed_slurm = os.path.join(root_path, "seed.slurm") - if os.path.exists(seed_slurm): - self.slurm_script = seed_slurm - else: - self.slurm_script = os.path.join(root_path, "nd_parallelism.slurm") - - self.qos = qos - - # Check if the status.txt file exists - status_file_path = os.path.join(self.root_path, "status.txt") - if not os.path.exists(status_file_path): - # Create the status.txt file with INIT status - with open(status_file_path, "w") as f: - f.write(Status.INIT.value) - self.status = self.get_status() - - def get_status(self) -> Status: - """ - Read the status of the job from `status.txt` and return it - """ - is_existing = lambda value_to_check: any( - value.value == value_to_check for value in Status.__members__.values() - ) - - status_file_path = os.path.join(self.root_path, "status.txt") - with open(status_file_path, "r") as f: - status = f.read().strip() - if not is_existing(status): - raise ValueError(f"Invalid status: {status}") - return Status(status) - - def set_status(self, status: Status) -> Status: - """ - Update the status of the job in `status.txt` and return the new status - """ - status_file_path = os.path.join(self.root_path, "status.txt") - with open(status_file_path, "w") as f: - f.write(status.value) - return status - -class Scheduler: - def __init__(self, inp_dir: str, qos: str) -> None: - # Find all leaf directories, and the top-level directory if it contains a config. 
- jobs_directory_paths = [] - for root, dirs, files in os.walk(inp_dir): - is_job_dir = any(f.endswith(".toml") for f in files) - if is_job_dir: - if not dirs: # leaf node - jobs_directory_paths.append(os.path.abspath(root)) - # also capture baseline job in root - elif root == inp_dir: - jobs_directory_paths.append(os.path.abspath(root)) - - self.job_lists = [Job(job_path, qos, inp_dir) for job_path in jobs_directory_paths] - - def keep_only_jobs(self, status: Status): - return [job for job in self.job_lists if job.status == status] - - def filter_out_jobs(self, status: Status): - return [job for job in self.job_lists if job.status != status] - - -def submit_jobs(inp_dir, qos, only: str = None): - scheduler = Scheduler(inp_dir, qos) - - env_vars = os.environ.copy() - total_jobs = len(scheduler.job_lists) - - if only: - try: - status_to_filter = Status(only) - scheduler.job_lists = scheduler.keep_only_jobs(status_to_filter) - except ValueError: - print(f"Invalid status for --only: {only}") - return - - if only is not None: - filtered_jobs = len(scheduler.job_lists) - if filtered_jobs == 0: - print(f"No '{only}' jobs to resubmit") - return - print( - f"Only {filtered_jobs}/{total_jobs} jobs with status '{only}' will be resubmitted" - ) - - scheduler.job_lists = scheduler.filter_out_jobs(Status.COMPLETED) - - for job in scheduler.job_lists: - subprocess.run(["sbatch", job.slurm_script], env=env_vars) - job.set_status(Status.PENDING) - - -def check_status(inp_dir: str): - """ - Display a table showing the count of jobs in each status. - Reads status.txt from all job directories found in inp_dir. - """ - # Find all directories with status.txt files - jobs_directory_paths = [] - for root, dirs, files in os.walk(inp_dir): - if "status.txt" in files: - jobs_directory_paths.append(os.path.abspath(root)) - - if not jobs_directory_paths: - print(f"No jobs found in {inp_dir}") - return - - # Count jobs by status - status_counts = {status: 0 for status in Status} - for job_path in jobs_directory_paths: - job = Job(job_path, qos="N/A") - status_counts[job.status] += 1 - - total = len(jobs_directory_paths) - - # Print table - print("\nJob Status Summary") - print("=" * 30) - print(f"{'Status':<12} | {'Count':>5}") - print("-" * 30) - print(f"{'Init':<12} | {status_counts[Status.INIT]:>5}") - print(f"{'Pending':<12} | {status_counts[Status.PENDING]:>5}") - print(f"{'Running':<12} | {status_counts[Status.RUNNING]:>5}") - print(f"{'Fail':<12} | {status_counts[Status.FAIL]:>5}") - print(f"{'Completed':<12} | {status_counts[Status.COMPLETED]:>5}") - print("-" * 30) - print(f"{'Total':<12} | {total:>5}") - print("=" * 30) - - -def report(inp_dir: str, only: str = None): - """ - Generate diff reports between baseline (fsdp2_tp1_cp1_pp1) and all other parallelism configs. - Creates diff_baseline_vs_nd_parallelism.log in each non-baseline config directory. - Automatically discovers all model/flavor combinations under inp_dir. 
- """ - # Add imports - import torch - from dataclasses import dataclass, field - from typing import List - - @dataclass - class TrainingMetrics: - """Training metrics extracted from logs.""" - steps: List[int] = field(default_factory=list) - loss: List[float] = field(default_factory=list) - grad_norm: List[float] = field(default_factory=list) - - # Default tolerance values (matching compare_distributed_run.py) - DEFAULT_LOSS_ATOL = 5e-2 - DEFAULT_LOSS_RTOL = 1e-5 - DEFAULT_GRAD_NORM_ATOL = 7e-1 - DEFAULT_GRAD_NORM_RTOL = 1e-5 - - def _extract_metrics(log_file: Path) -> TrainingMetrics: - """Extract metrics from log file.""" - metrics = TrainingMetrics() - - try: - with open(log_file, 'r') as f: - content = f.read() - - # Regex to capture all metrics from a log line, ignoring ANSI color codes - pattern = re.compile( - r"step:\s*(\d+)\s*" - r".*?loss:\s*(-?[0-9]+\.?[0-9]*)\s*" - r".*?grad_norm:\s*([0-9]+\.?[0-9]*)\s*" - ) - - for match in pattern.finditer(content): - loss = float(match.group(2)) - if loss == -1.0: - continue - metrics.steps.append(int(match.group(1))) - metrics.loss.append(loss) - metrics.grad_norm.append(float(match.group(3))) - - except Exception as e: - log_message(LogLevel.WARNING, f"Could not extract metrics: {e}", indent=3, dim=True) - - return metrics - - def _compare_metrics(baseline_metrics: TrainingMetrics, test_metrics: TrainingMetrics, - config_name: str) -> tuple[bool, str]: - """Compare metrics between baseline and test configuration. - - Returns: - tuple[bool, str]: (passed, summary_message) - """ - if not baseline_metrics.loss or not test_metrics.loss: - return False, f"Unable to extract metrics" - - # Convert to tensors - baseline_loss = torch.tensor(baseline_metrics.loss) - test_loss = torch.tensor(test_metrics.loss) - baseline_grad_norm = torch.tensor(baseline_metrics.grad_norm) - test_grad_norm = torch.tensor(test_metrics.grad_norm) - - # Check if tensors are close - loss_pass = torch.allclose(baseline_loss, test_loss, atol=DEFAULT_LOSS_ATOL, rtol=DEFAULT_LOSS_RTOL) - grad_pass = torch.allclose(baseline_grad_norm, test_grad_norm, atol=DEFAULT_GRAD_NORM_ATOL, rtol=DEFAULT_GRAD_NORM_RTOL) - - # Calculate max absolute differences for logging - loss_max_diff = torch.max(torch.abs(baseline_loss - test_loss)).item() if baseline_loss.numel() > 0 and test_loss.numel() > 0 else 0.0 - grad_norm_diff = torch.max(torch.abs(baseline_grad_norm - test_grad_norm)).item() if baseline_grad_norm.numel() > 0 and test_grad_norm.numel() > 0 else 0.0 - - # Calculate min absolute differences for logging - loss_min_diff = torch.min(torch.abs(baseline_loss - test_loss)).item() if baseline_loss.numel() > 0 and test_loss.numel() > 0 else 0.0 - grad_norm_min_diff = torch.min(torch.abs(baseline_grad_norm - test_grad_norm)).item() if baseline_grad_norm.numel() > 0 and test_grad_norm.numel() > 0 else 0.0 - - summary = (f"Max loss diff: {loss_max_diff:.2e}, " - f"Min loss diff: {loss_min_diff:.2e}, " - f"Max grad norm diff: {grad_norm_diff:.2e}, " - f"Min grad norm diff: {grad_norm_min_diff:.2e}") - - return (loss_pass and grad_pass), summary - - def _filter_log(log_file: Path) -> Path: - """Filter log file to normalize volatile information (timestamps, PIDs, ports).""" - filtered_file = log_file.with_suffix(log_file.suffix + '.filtered') - - with open(log_file, 'r') as infile, open(filtered_file, 'w') as outfile: - for line in infile: - # Apply filtering patterns to remove volatile information - line = re.sub(r'([0-9]{4}-[0-9]{2}-[0-9]{2} )?[0-9]{2}:[0-9]{2}:[0-9]{2}(,[0-9]+)?', - 
'TIMESTAMP', line) - line = re.sub(r'torchrun.*--master_port[= ]([0-9]+)', - 'torchrun ... --master_port=XXXX', line) - line = re.sub(r'PID [0-9]+', 'PID XXXX', line) - line = re.sub(r'localhost:[0-9]+', 'localhost:XXXX', line) - outfile.write(line) - - return filtered_file - - def _generate_diff(baseline_log: Path, test_log: Path, diff_file: Path) -> tuple[bool, str]: - """Generate diff between baseline and test logs using git diff. - - Returns: - tuple[bool, str]: (success, diff_output or error_message) - """ - # Filter logs to remove timestamps and volatile information - baseline_filtered = _filter_log(baseline_log) - test_filtered = _filter_log(test_log) - - try: - # Generate colored diff using git diff - cmd = ["git", "diff", "--no-index", "--color=always", "--word-diff=color", - str(baseline_filtered), str(test_filtered)] - - result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) - - # git diff returns exit code 1 when files differ (which is expected), not an error - if result.returncode not in [0, 1]: - error_msg = f"git diff failed with code {result.returncode}\n{result.stderr}" - return False, error_msg - - # Write diff to file - with open(diff_file, 'w') as f: - f.write(result.stdout) - - return True, result.stdout - - finally: - # Clean up filtered files - if baseline_filtered.exists(): - baseline_filtered.unlink() - if test_filtered.exists(): - test_filtered.unlink() - - def _process_flavor_dir(flavor_dir: Path) -> tuple[int, int]: - """Process a single model/flavor directory. - - Returns: - tuple[int, int]: (passed_count, failed_count) - """ - # Find baseline directory - baseline_dir = flavor_dir / BASELINE - if not baseline_dir.exists(): - log_message(LogLevel.WARNING, f"No baseline directory found in {flavor_dir.relative_to(inp_path)}, skipping", indent=1) - return 0, 0 - - # Find baseline .out file - baseline_out_files = list(baseline_dir.glob("*.out")) - if not baseline_out_files: - log_message(LogLevel.WARNING, f"No .out file found in baseline {baseline_dir.relative_to(inp_path)}, skipping", indent=1) - return 0, 0 - baseline_out = baseline_out_files[0] - - # Extract baseline metrics - log_message(LogLevel.INFO, f"Extracting baseline metrics from {baseline_out.name}...", indent=1) - baseline_metrics = _extract_metrics(baseline_out) - if not baseline_metrics.loss or not baseline_metrics.grad_norm: - log_message(LogLevel.WARNING, "Could not extract baseline metrics, skipping comparisons", indent=1) - return 0, 0 - - # Find all parallelism config directories (excluding seed_checkpoint and baseline) - config_dirs = [] - for item in flavor_dir.iterdir(): - if item.is_dir() and item.name not in {BASELINE, "seed_checkpoint"}: - config_dirs.append(item) - - if not config_dirs: - log_message(LogLevel.INFO, f"No test configurations found in {flavor_dir.relative_to(inp_path)}", indent=1) - return 0, 0 - - console.print() - console.print( - Panel( - f"[cyan]Baseline:[/cyan] {baseline_out.relative_to(flavor_dir)}\n" - f"[cyan]Configurations to compare:[/cyan] {len(config_dirs)}", - title=f"[bold cyan]Processing {flavor_dir.relative_to(inp_path)}[/bold cyan]", - expand=False, - border_style="cyan", - padding=(0, 2), - ) - ) - - # Track results for summary - results = [] - - # Generate diffs for each config - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), - TimeElapsedColumn(), - console=console, - ) as progress: - task = 
progress.add_task("[cyan]Processing configurations...", total=len(config_dirs)) - - for i, config_dir in enumerate(sorted(config_dirs)): - if i > 0: - console.rule(style="dim") - - progress.update(task, description=f"[cyan]Testing [bold]{config_dir.name}[/bold]") - - # Find .out file in config directory - test_out_files = list(config_dir.glob("*.out")) - if not test_out_files: - log_message(LogLevel.WARNING, f"{config_dir.name}: No .out file found, skipping", indent=1) - results.append((config_dir.name, False, "No .out file found")) - progress.advance(task) - continue - - test_out = test_out_files[0] - diff_file = config_dir / "diff_baseline_vs_nd_parallelism.log" - - # Extract test metrics - test_metrics = _extract_metrics(test_out) - - # Compare metrics - if test_metrics.loss and test_metrics.grad_norm: - test_passed, metrics_summary = _compare_metrics(baseline_metrics, test_metrics, config_dir.name) - - if test_passed: - log_message(LogLevel.TEST_PASS, f"{config_dir.name} - {metrics_summary}", indent=1) - results.append((config_dir.name, True, metrics_summary)) - else: - log_message(LogLevel.TEST_FAIL, f"{config_dir.name} - {metrics_summary}", indent=1) - results.append((config_dir.name, False, metrics_summary)) - else: - log_message(LogLevel.TEST_FAIL, f"{config_dir.name} - Unable to extract metrics", indent=1) - results.append((config_dir.name, False, "Unable to extract metrics")) - - # Generate diff - try: - success, output = _generate_diff(baseline_out, test_out, diff_file) - - if success: - log_message(LogLevel.INFO, f"Diff between baseline vs HF nd-parallel saved to:", indent=5, dim=True) - console.print(f" [dim]{diff_file}[/dim]") - else: - log_message(LogLevel.WARNING, f"Failed to generate diff: {output}", indent=5, dim=True) - - except Exception as e: - log_message(LogLevel.WARNING, f"Failed to generate diff - {e}", indent=5, dim=True) - - progress.advance(task) - - console.print() - # Create summary table - summary_table = Table( - title=f"[bold]Summary for {flavor_dir.relative_to(inp_path)}[/bold]", - show_header=True, - header_style="bold magenta" - ) - summary_table.add_column("Configuration", style="cyan") - summary_table.add_column("Status", justify="center") - summary_table.add_column("Metrics", style="dim") - - for name, passed, summary in results: - status = "[bold green]✅ PASS[/bold green]" if passed else "[bold red]❌ FAIL[/bold red]" - # Truncate summary if too long - display_summary = summary if len(summary) < 60 else summary[:57] + "..." 
- summary_table.add_row(name, status, display_summary) - - console.print(summary_table) - console.print() - - passed_count = sum(1 for _, passed, _ in results if passed) - failed_count = len(results) - passed_count - - return passed_count, failed_count - - inp_path = Path(inp_dir) - - if not inp_path.exists(): - console.print(f"[bold red]Error:[/bold red] Directory not found: {inp_path}") - return - - console.print( - Panel( - "[bold cyan]HuggingFace Integration Test Report Generator[/bold cyan]", - expand=False, - border_style="blue", - padding=(1, 2), - ) - ) - console.print() - - # Find all directories that contain a baseline (fsdp2_tp1_cp1_pp1) subdirectory - flavor_dirs = [] - for root, dirs, files in os.walk(inp_path): - if BASELINE in dirs: - flavor_dirs.append(Path(root)) - - # Filter by --only if provided - if only: - original_count = len(flavor_dirs) - flavor_dirs = [ - d for d in flavor_dirs if only in str(d.relative_to(inp_path)) - ] - log_message( - LogLevel.INFO, - f"Filtered from {original_count} to {len(flavor_dirs)} director{'ies' if len(flavor_dirs) != 1 else 'y'} matching '[bold]{only}[/bold]'", - ) - - if not flavor_dirs: - log_message(LogLevel.ERROR, f"No directories with baseline configuration found under {inp_path}") - console.print("[yellow]Expected to find directories containing 'fsdp2_tp1_cp1' subdirectory[/yellow]") - return - - log_message(LogLevel.INFO, f"Found {len(flavor_dirs)} model/flavor combination(s) to process:") - for flavor_dir in flavor_dirs: - console.print(f" [cyan]•[/cyan] {flavor_dir.relative_to(inp_path)}") - - # Process each flavor directory - total_passed = 0 - total_failed = 0 - - for flavor_dir in flavor_dirs: - passed, failed = _process_flavor_dir(flavor_dir) - total_passed += passed - total_failed += failed - - # Final summary - console.print() - console.print( - Panel( - "[bold cyan]Overall Summary[/bold cyan]", - expand=False, - border_style="blue", - padding=(0, 2), - ) - ) - - overall_table = Table(show_header=True, header_style="bold magenta") - overall_table.add_column("Metric", style="cyan") - overall_table.add_column("Value", justify="right") - - total_tests = total_passed + total_failed - overall_table.add_row("Total Configurations Tested", str(total_tests)) - overall_table.add_row("[green]Passed[/green]", str(total_passed)) - overall_table.add_row("[red]Failed[/red]", str(total_failed)) - - console.print(overall_table) - console.print() - - if total_failed == 0 and total_tests > 0: - log_message(LogLevel.SUCCESS, "All tests passed! 
🎉") - elif total_tests > 0: - log_message(LogLevel.WARNING, f"{total_failed} configuration(s) had test failures") - - log_message(LogLevel.SUCCESS, "Diff generation complete!") - -if __name__ == "__main__": - parser = ArgumentParser() - subparsers = parser.add_subparsers(dest="action") - - create_configs_parser = subparsers.add_parser("create_configs") - create_configs_parser.add_argument("--model_name", type=str, required=True) - create_configs_parser.add_argument("--out_dir", type=str, required=True) - create_configs_parser.add_argument("--flavor", type=str, required=True) - - submit_jobs_parser = subparsers.add_parser("submit_jobs") - submit_jobs_parser.add_argument("--inp_dir", type=str, required=True) - submit_jobs_parser.add_argument("--qos", type=str, required=True, choices=["low", "normal", "high", "prod"]) - submit_jobs_parser.add_argument("--only", type=str, default=None, choices=[s.value for s in Status]) - - report_parser = subparsers.add_parser("report") - report_parser.add_argument("--inp_dir", type=str, required=True) - report_parser.add_argument("--only", type=str, default=None) - - check_status_parser = subparsers.add_parser("check_status") - check_status_parser.add_argument("--inp_dir", type=str, required=True) - - args = parser.parse_args() - - if args.action == "create_configs": - create_configs(args.model_name, args.out_dir, args.flavor) - elif args.action == "submit_jobs": - submit_jobs(args.inp_dir, args.qos, args.only) - elif args.action == "report": - report(args.inp_dir, args.only) - elif args.action == "check_status": - check_status(args.inp_dir) \ No newline at end of file From a84854568c55b78d95e8f3bf3808b24619fb23c5 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 21 Oct 2025 15:22:27 +0000 Subject: [PATCH 078/129] fix linting --- torchtitan/experiments/__init__.py | 8 +- .../transformers_backend/__init__.py | 19 +- .../configs/qwen3_fsdp2_tp2_pp2.toml | 5 +- .../infra/parallelize_hf_transformers.py | 89 ++++--- .../transformers_backend/infra/pipeline_hf.py | 25 +- .../transformers_backend/model/args.py | 58 +++-- .../transformers_backend/model/model.py | 231 +++++++++++++----- torchtitan/protocols/train_spec.py | 4 +- torchtitan/train.py | 6 +- 9 files changed, 284 insertions(+), 161 deletions(-) diff --git a/torchtitan/experiments/__init__.py b/torchtitan/experiments/__init__.py index 75d22e58e6..6c1465c14a 100644 --- a/torchtitan/experiments/__init__.py +++ b/torchtitan/experiments/__init__.py @@ -5,5 +5,11 @@ # LICENSE file in the root directory of this source tree. 
_supported_experiments = frozenset( - ["flux", "simple_fsdp.llama3", "simple_fsdp.deepseek_v3", "vlm", "transformers_backend"] + [ + "flux", + "simple_fsdp.llama3", + "simple_fsdp.deepseek_v3", + "vlm", + "transformers_backend", + ] ) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 110a376642..11bd36bc81 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -9,16 +9,16 @@ from torchtitan.components.loss import build_cross_entropy_loss from torchtitan.components.lr_scheduler import build_lr_schedulers from torchtitan.components.optimizer import build_optimizers -from torchtitan.datasets.hf_datasets import build_hf_dataloader from torchtitan.components.tokenizer import build_hf_tokenizer - -from .infra.pipeline_hf import pipeline_hf_transformers +from torchtitan.datasets.hf_datasets import build_hf_dataloader +from torchtitan.models.moe import MoEArgs from torchtitan.protocols.train_spec import register_train_spec, TrainSpec from .infra.parallelize_hf_transformers import parallelize_hf_transformers + +from .infra.pipeline_hf import pipeline_hf_transformers from .model.args import HFTransformerModelArgs from .model.model import HFTransformerModel -from torchtitan.models.moe import MoEArgs __all__ = [ @@ -26,9 +26,11 @@ "HFTransformerModel", ] + @dataclass class TitanDenseModelArgs: """Arguments for the base TorchTitan model.""" + dim: int = 4096 n_layers: int = 32 n_heads: int = 32 @@ -47,6 +49,7 @@ class TitanDenseModelArgs: @dataclass class TitanMoeModelArgs: """Arguments specific to DeepSeekV3 models.""" + moe_args: MoEArgs | None = None n_group: int | None = None topk_group: int | None = None @@ -97,8 +100,10 @@ class TitanMoeModelArgs: score_func="softmax", route_norm=True, score_before_experts=False, - ) - ) if os.environ.get("USE_MOE", "0") == "1" else None, + ), + ) + if os.environ.get("USE_MOE", "0") == "1" + else None, ), "full": HFTransformerModelArgs( titan_dense_args=TitanDenseModelArgs(), @@ -117,4 +122,4 @@ class TitanMoeModelArgs: build_loss_fn=build_cross_entropy_loss, ) -register_train_spec("hf_placeholder_name", hf_train_spec) \ No newline at end of file +register_train_spec("hf_placeholder_name", hf_train_spec) diff --git a/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml b/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml index 5f40ec41b3..4e216baa77 100644 --- a/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml +++ b/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml @@ -3,8 +3,7 @@ [job] dump_folder = "./outputs" description = "Qwen 3 debug training" -print_args = false -use_for_integration_test = false +print_config = true [profiling] enable_profiling = true @@ -77,7 +76,7 @@ selective_ac_option = '2' # 'int' = ac every positive int layer or 'op', ac bas enable=false components = ["model", "loss"] -[float8] +[quantize.linear.float8] enable_fsdp_float8_all_gather = false precompute_float8_dynamic_scale_for_fsdp = false filter_fqns = ["output"] diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py index 3d729f3afb..d1d8d4c480 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py +++ 
b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -25,16 +25,15 @@ SequenceParallel, ) from torchtitan.config import JobConfig, TORCH_DTYPE_MAP -from torchtitan.distributed import ParallelDims, NoParallel +from torchtitan.config.job_config import ActivationCheckpoint as ACConfig +from torchtitan.distributed import NoParallel, ParallelDims from torchtitan.distributed.expert_parallel import ( ExpertParallel, ExpertTensorParallel, ReordererSequenceParallel, - TensorParallel, ) from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp -from torchtitan.config.job_config import ActivationCheckpoint as ACConfig from torchtitan.tools.logging import logger # for selective op activation checkpointing @@ -50,6 +49,7 @@ torch.ops.aten.max.default, } + def _apply_ac_to_transformer_block( module: nn.Module, ac_config: ACConfig, *, base_fqn: Optional[str] = None ): @@ -137,6 +137,7 @@ def selective_checkpointing_context_fn(): else: return module + def apply_ac(model: nn.Module, ac_config: ACConfig): """Apply activation checkpointing to the model.""" for layer_id, transformer_block in model.layers.named_children(): @@ -147,6 +148,7 @@ def apply_ac(model: nn.Module, ac_config: ACConfig): logger.info(f"Applied {ac_config.mode} activation checkpointing to the model") + def apply_ddp( model: nn.Module, dp_mesh: DeviceMesh, @@ -189,9 +191,7 @@ def parallelize_hf_transformers( ({parallel_dims.tp}) and 2 * CP degree ({parallel_dims.cp}). """ - if ( - job_config.parallelism.context_parallel_degree > 1 - ): + if job_config.parallelism.context_parallel_degree > 1: logger.warning("CP support for FlexAttention is still in progress.") if parallel_dims.tp_enabled: @@ -310,11 +310,11 @@ def apply_non_moe_tp( # transformer block's inputs) # 2. Parallelize the root norm layer over the sequence dim # 3. 
Parallelize the final linear output layer - + # skipping nn.Identity modules (which are added by pipeline parallelism for unused modules) root_plan = {} - - if hasattr(model, 'tok_embeddings'): + + if hasattr(model, "tok_embeddings"): if isinstance(model.tok_embeddings, nn.Identity): root_plan["tok_embeddings"] = NoParallel() else: @@ -322,14 +322,14 @@ def apply_non_moe_tp( input_layouts=Replicate(), output_layouts=Shard(1), ) - - if hasattr(model, 'norm'): + + if hasattr(model, "norm"): if isinstance(model.norm, nn.Identity): root_plan["norm"] = NoParallel() else: root_plan["norm"] = SequenceParallel() - - if hasattr(model, 'output'): + + if hasattr(model, "output"): if isinstance(model.output, nn.Identity): root_plan["output"] = NoParallel() else: @@ -375,25 +375,33 @@ def apply_non_moe_tp( } if getattr(transformer_block.self_attn, "q_lora_rank", None) is None: - layer_plan.update({ - "self_attn.q_proj": colwise_parallel(), - "self_attn.k_proj": colwise_parallel(), - "self_attn.v_proj": colwise_parallel(), - }) + layer_plan.update( + { + "self_attn.q_proj": colwise_parallel(), + "self_attn.k_proj": colwise_parallel(), + "self_attn.v_proj": colwise_parallel(), + } + ) else: - layer_plan.update({ - "self_attn.q_a_proj": NoParallel(), - "self_attn.q_a_layernorm": NoParallel(), - "self_attn.q_b_proj": colwise_parallel(), - "self_attn.kv_a_proj_with_mqa": NoParallel(), - "self_attn.kv_a_layernorm": NoParallel(), - "self_attn.kv_b_proj": colwise_parallel(), - }) + layer_plan.update( + { + "self_attn.q_a_proj": NoParallel(), + "self_attn.q_a_layernorm": NoParallel(), + "self_attn.q_b_proj": colwise_parallel(), + "self_attn.kv_a_proj_with_mqa": NoParallel(), + "self_attn.kv_a_layernorm": NoParallel(), + "self_attn.kv_b_proj": colwise_parallel(), + } + ) # Handle different names for the output projection layer, e.g. o_proj vs dense - o_proj_name = "o_proj" if hasattr(transformer_block.self_attn, "o_proj") else "dense" - layer_plan[f"self_attn.{o_proj_name}"] = rowwise_parallel(output_layouts=Shard(1)) - + o_proj_name = ( + "o_proj" if hasattr(transformer_block.self_attn, "o_proj") else "dense" + ) + layer_plan[f"self_attn.{o_proj_name}"] = rowwise_parallel( + output_layouts=Shard(1) + ) + # For Qwen3 RMSNorm on Q and K # TODO(3outeille): we should probably shard(1) then replicate => then use SequenceParallel but for now I am fed up if hasattr(transformer_block.self_attn, "q_norm"): @@ -409,14 +417,20 @@ def apply_non_moe_tp( ), } # Handle different names for MLP layers, e.g. 
gate_proj vs fc1 - gate_proj_name = "gate_proj" if hasattr(transformer_block.mlp, "gate_proj") else "fc1" + gate_proj_name = ( + "gate_proj" if hasattr(transformer_block.mlp, "gate_proj") else "fc1" + ) mlp_plan[f"mlp.{gate_proj_name}"] = colwise_parallel() if hasattr(transformer_block.mlp, "up_proj"): mlp_plan["mlp.up_proj"] = colwise_parallel() - down_proj_name = "down_proj" if hasattr(transformer_block.mlp, "down_proj") else "fc2" - mlp_plan[f"mlp.{down_proj_name}"] = rowwise_parallel(output_layouts=Shard(1)) + down_proj_name = ( + "down_proj" if hasattr(transformer_block.mlp, "down_proj") else "fc2" + ) + mlp_plan[f"mlp.{down_proj_name}"] = rowwise_parallel( + output_layouts=Shard(1) + ) layer_plan.update(mlp_plan) # Some models like Phi-2 don't have post_attention_layernorm @@ -494,7 +508,11 @@ def apply_fsdp( # NOTE: When EP is enabled, In an MoE layer, we use the following FSDP wrapping # - the router and the shared experts are sharded together with the TransformerBlock # - the routed experts are sharded with the remaining dp_mod_ep_mesh - if hasattr(transformer_block, "moe_enabled") and transformer_block.moe_enabled and ep_degree > 1: + if ( + hasattr(transformer_block, "moe_enabled") + and transformer_block.moe_enabled + and ep_degree > 1 + ): fsdp_mod_ep_config = fsdp_config.copy() fsdp_mod_ep_config["mesh"] = dp_mod_ep_mesh moe_block = transformer_block.mlp @@ -506,10 +524,7 @@ def apply_fsdp( # shard_placement_fn on the outer TransformerBlock-level FSDP. _experts_shard_placement_fn = None assert dp_mod_ep_mesh is not None - if ( - dp_mod_ep_mesh.size() * ep_degree - > moe_block.experts.num_experts - ): + if dp_mod_ep_mesh.size() * ep_degree > moe_block.experts.num_experts: _experts_shard_placement_fn = lambda param: Shard(1) fully_shard( diff --git a/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py b/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py index cd599ac2a5..ee7b268f9d 100644 --- a/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py +++ b/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py @@ -8,30 +8,26 @@ import torch import torch.nn as nn +from torch.distributed.device_mesh import DeviceMesh +from torch.distributed.pipelining import PipelineStage from torch.distributed.pipelining.schedules import ( _PipelineSchedule, get_schedule_class, PipelineScheduleSingle, + ScheduleDualPipeV, + ScheduleZBVZeroBubble, ) from torchtitan.components.loss import LossFunction from torchtitan.config import JobConfig from torchtitan.distributed import ParallelDims -from torchtitan.distributed.pipeline_parallel import ( - build_pipeline_schedule, - pipeline_module_split -) -from torch.distributed.device_mesh import DeviceMesh -from torch.distributed.pipelining import PipelineStage +from torchtitan.distributed.pipeline_parallel import build_pipeline_schedule from torchtitan.protocols.train_spec import BaseModelArgs, ParallelizeFunction from torchtitan.tools.logging import logger -from torch.distributed.pipelining.schedules import ( - ScheduleDualPipeV, - ScheduleZBVZeroBubble, -) # NOTE(3outeille): the only modifications comes from replacing None to nn.Identity and adding rotary_emb per model_part + def generate_llm_fqn_per_model_part( num_stages: int, num_layers: int, @@ -57,11 +53,7 @@ def generate_llm_fqn_per_model_part( if num_stages == 1: # Single stage gets everything layer_names = [f"layers.{i}" for i in range(num_layers)] - return [ - ["tok_embeddings"] - + layer_names - + ["norm", "output", "rotary_emb"] - ] + return 
[["tok_embeddings"] + layer_names + ["norm", "output", "rotary_emb"]] # Calculate effective layers including weights num_effective_layers = num_layers + input_weight + output_weight @@ -285,6 +277,7 @@ def _get_stage_indices() -> tuple[int]: return stages, models + def pipeline_hf_transformers( model: nn.Module, parallel_dims: ParallelDims, @@ -397,4 +390,4 @@ def pipeline_hf_transformers( if stage.is_last: has_last_stage = True - return pp_schedule, model_parts, has_first_stage, has_last_stage \ No newline at end of file + return pp_schedule, model_parts, has_first_stage, has_last_stage diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index e02a04e136..7181cb570a 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -4,30 +4,30 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import importlib from dataclasses import dataclass -import torch + from torch import nn -import math -from torch.nn import init from torchtitan.config import JobConfig +from torchtitan.models.utils import ( + get_dense_model_nparams_and_flops, + get_moe_model_nparams_and_flops, +) from torchtitan.protocols import BaseModelArgs -from torchtitan.tools.logging import logger -from torchtitan.models.utils import get_dense_model_nparams_and_flops, get_moe_model_nparams_and_flops from transformers import AutoConfig from transformers.configuration_utils import PretrainedConfig -from transformers.modeling_utils import AttentionInterface from transformers.integrations.sdpa_attention import sdpa_attention_forward +from transformers.modeling_utils import AttentionInterface + @dataclass class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): """ Configuration class that bridges TorchTitan and HuggingFace Transformers naming conventions. - + Uses properties to provide TorchTitan-style access while maintaining HuggingFace compatibility. Properties are created dynamically based on which arguments are provided. 
""" - + # Define all possible mappings organized by argument type _TT_TO_HF_MAPPINGS = { "dense": { @@ -59,14 +59,14 @@ def __init__( assert titan_dense_args is not None, "titan_dense_args is required" active_mappings = {} - + active_mappings.update(self._TT_TO_HF_MAPPINGS["dense"]) - + if titan_moe_args is not None: active_mappings.update(self._TT_TO_HF_MAPPINGS["moe"]) - + self._active_mappings = active_mappings - + self._create_dynamic_properties() # Set HF attributes from titan_args based on mappings @@ -83,14 +83,17 @@ def __init__( # HuggingFace specific args self.attn_implementation = attn_implementation - #NOTE:(3outeille):This will force create_causal_mask to return None + # NOTE:(3outeille):This will force create_causal_mask to return None AttentionInterface._global_mapping[attn_implementation] = sdpa_attention_forward # Start with passed_args as just titan_args - self._passed_args = {**titan_dense_args.__dict__, "attn_implementation": attn_implementation} + self._passed_args = { + **titan_dense_args.__dict__, + "attn_implementation": attn_implementation, + } self._passed_args.update(kwargs) - #NOTE(3outeille): Wait for transformers uniformization of MoE args + # NOTE(3outeille): Wait for transformers uniformization of MoE args if titan_moe_args is not None: # For DeepSeekV3, setting q_lora_rank to 0 in TorchTitan is equivalent to # setting it to None in HuggingFace. @@ -118,13 +121,16 @@ def __init__( def _create_dynamic_properties(self): """Create properties dynamically based on active mappings.""" + def _create_property(hf_name: str) -> property: def getter(self): return getattr(self, hf_name) + def setter(self, value): setattr(self, hf_name, value) + return property(getter, setter) - + for titan_name, hf_name in self._active_mappings.items(): # Create getter/setter for attribute that don't already exist if not hasattr(self.__class__, titan_name): @@ -149,7 +155,7 @@ def update_from_config(self, job_config: JobConfig): hf_model_config = AutoConfig.from_pretrained( job_config.model.name, attn_implementation=self.attn_implementation, - trust_remote_code=True + trust_remote_code=True, ) # Explicitly update attributes based on mappings @@ -169,14 +175,14 @@ def update_from_config(self, job_config: JobConfig): # MoE if hasattr(self, "qk_nope_head_dim") and hasattr(self, "qk_rope_head_dim"): self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim - + # Configure HF-specific settings to match TorchTitan settings self.attention_bias = False self.mlp_bias = False self.use_cache = False self.initializer_range = 1.0 # use as std for normal init in embedding - - if not hasattr(self, "inter_dim"): # Only for llama model + + if not hasattr(self, "inter_dim"): # Only for llama model ffn_hidden_size = 4 * self.dim ffn_hidden_size = int(2 * ffn_hidden_size / 3) if self.ffn_dim_multiplier is not None: @@ -184,15 +190,15 @@ def update_from_config(self, job_config: JobConfig): self.intermediate_size = self.multiple_of * ( (ffn_hidden_size + self.multiple_of - 1) // self.multiple_of ) - + self.head_dim = self.dim // self.num_attention_heads - + return self def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: - is_moe = hasattr(self, 'n_routed_experts') - + is_moe = hasattr(self, "n_routed_experts") + if is_moe: return get_moe_model_nparams_and_flops(self, model, seq_len) else: - return get_dense_model_nparams_and_flops(self, model, seq_len) \ No newline at end of file + return get_dense_model_nparams_and_flops(self, model, seq_len) diff --git 
a/torchtitan/experiments/transformers_backend/model/model.py b/torchtitan/experiments/transformers_backend/model/model.py index 0a8c000d0e..fd7561611e 100644 --- a/torchtitan/experiments/transformers_backend/model/model.py +++ b/torchtitan/experiments/transformers_backend/model/model.py @@ -1,17 +1,26 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import importlib import math + import torch +from torch import nn from torch.nn import init -from transformers.modeling_utils import PreTrainedModel +from torchtitan.tools.logging import logger from transformers.configuration_utils import PretrainedConfig -import importlib -from torch import nn +from transformers.modeling_utils import PreTrainedModel + from .args import HFTransformerModelArgs -from torchtitan.tools.logging import logger + class HFTransformerModel(nn.Module): def __init__(self, model_args: HFTransformerModelArgs): super().__init__() - + # Try to import the model class dynamically from the transformers library if not found in globals model_class_name = model_args.architectures[0] model_cls = globals().get(model_class_name, None) @@ -23,8 +32,8 @@ def __init__(self, model_args: HFTransformerModelArgs): raise ImportError( f"Could not find model class '{model_class_name}' in globals or transformers. " f"Make sure the class is available. Original error: {e}" - ) - + ) from e + # Attempt to patch model weight initialization based on architecture type try: model_name_prefix = model_class_name.replace("ForCausalLM", "") @@ -32,28 +41,34 @@ def __init__(self, model_args: HFTransformerModelArgs): attention_cls = getattr(model_module, f"{model_name_prefix}Attention", None) mlp_cls = getattr(model_module, f"{model_name_prefix}MLP", None) - decoder_layer_cls = getattr(model_module, f"{model_name_prefix}DecoderLayer", None) + decoder_layer_cls = getattr( + model_module, f"{model_name_prefix}DecoderLayer", None + ) - is_moe = hasattr(model_args, "n_routed_experts") #TODO(3outeille): check if this is the most reliable to detect a moe model + is_moe = hasattr( + model_args, "n_routed_experts" + ) # TODO(3outeille): check if this is the most reliable to detect a moe model if is_moe: moe_cls = getattr(model_module, f"{model_name_prefix}MoE", None) required_classes = { "Attention": attention_cls, - "MLP": mlp_cls, + "MLP": mlp_cls, "DecoderLayer": decoder_layer_cls, - "MoE": moe_cls + "MoE": moe_cls, } - + if all(required_classes.values()): logger.info(f"Applying MoE-like patch for {model_name_prefix}") self._patch_hf_moe_like( decoder_layer_cls=decoder_layer_cls, attention_cls=attention_cls, mlp_cls=mlp_cls, - moe_cls=moe_cls + moe_cls=moe_cls, ) else: - missing = [name for name, cls in required_classes.items() if not cls] + missing = [ + name for name, cls in required_classes.items() if not cls + ] logger.warning( f"Could not find required classes ({', '.join(missing)}) for MoE patching of {model_name_prefix}. " "Skipping MoE-like patch." 
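# A minimal, self-contained illustration of the dynamic lookup performed above: resolve
# "<Prefix>ForCausalLM" from transformers, then fetch the sibling Attention/MLP/DecoderLayer
# classes from the same module so their initialization can be patched. The default
# architecture string below is only an example; whatever `model_args.architectures[0]`
# carries is handled the same way.
import importlib


def _resolve_hf_classes(architecture: str = "Qwen3ForCausalLM"):
    transformers_mod = importlib.import_module("transformers")
    model_cls = getattr(transformers_mod, architecture)
    prefix = architecture.replace("ForCausalLM", "")  # e.g. "Qwen3"
    model_module = importlib.import_module(model_cls.__module__)
    return (
        model_cls,
        getattr(model_module, f"{prefix}Attention", None),
        getattr(model_module, f"{prefix}MLP", None),
        getattr(model_module, f"{prefix}DecoderLayer", None),
    )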
@@ -61,18 +76,20 @@ def __init__(self, model_args: HFTransformerModelArgs): else: required_classes = { "Attention": attention_cls, - "DecoderLayer": decoder_layer_cls + "DecoderLayer": decoder_layer_cls, } - + if all(required_classes.values()): logger.info(f"Applying Llama-like patch for {model_name_prefix}") self._patch_hf_llama_like( decoder_layer_cls=decoder_layer_cls, attention_cls=attention_cls, - mlp_cls=mlp_cls # mlp_cls can be None + mlp_cls=mlp_cls, # mlp_cls can be None ) else: - missing = [name for name, cls in required_classes.items() if not cls] + missing = [ + name for name, cls in required_classes.items() if not cls + ] logger.warning( f"Could not find required classes ({', '.join(missing)}) for {model_name_prefix}. " "Skipping Llama-like patch." @@ -86,9 +103,12 @@ def __init__(self, model_args: HFTransformerModelArgs): self.model = model_cls(config=model_args) self.max_seq_len = model_args.max_seq_len - + for layer in self.model.model.layers: - if hasattr(model_args, "first_k_dense_replace") and layer.layer_idx >= model_args.first_k_dense_replace: + if ( + hasattr(model_args, "first_k_dense_replace") + and layer.layer_idx >= model_args.first_k_dense_replace + ): layer.moe_enabled = True else: layer.moe_enabled = False @@ -226,7 +246,10 @@ def _init_weights_patched(self, module): elif isinstance(module, nn.Embedding): # When tie_word_embeddings is True, use lm_head initialization - if hasattr(config, "tie_word_embeddings") and config.tie_word_embeddings: + if ( + hasattr(config, "tie_word_embeddings") + and config.tie_word_embeddings + ): final_out_std = config.hidden_size**-0.5 cutoff_factor = 3 nn.init.trunc_normal_( @@ -239,13 +262,14 @@ def _init_weights_patched(self, module): else: std = config.initializer_range module.weight.data.normal_(mean=0.0, std=std) - + if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() elif ( isinstance( - module, (nn.GroupNorm, nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d) + module, + (nn.GroupNorm, nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d), ) or "LayerNorm" in module.__class__.__name__ or "RMSNorm" in module.__class__.__name__ @@ -331,12 +355,14 @@ def _init_weights_patched(self, module): nn.init.trunc_normal_(module.q_a_proj.weight, mean=0.0, std=0.02) if hasattr(module, "q_b_proj"): nn.init.trunc_normal_(module.q_b_proj.weight, mean=0.0, std=0.02) - + if hasattr(module, "kv_a_proj_with_mqa"): - nn.init.trunc_normal_(module.kv_a_proj_with_mqa.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_( + module.kv_a_proj_with_mqa.weight, mean=0.0, std=0.02 + ) if hasattr(module, "kv_b_proj"): nn.init.trunc_normal_(module.kv_b_proj.weight, mean=0.0, std=0.02) - + if hasattr(module, "o_proj") and init_std is not None: nn.init.trunc_normal_(module.o_proj.weight, mean=0.0, std=init_std) @@ -345,22 +371,39 @@ def _init_weights_patched(self, module): # DeepseekV3 uses std=0.02 for up_proj, unlike Llama nn.init.trunc_normal_(module.up_proj.weight, mean=0.0, std=0.02) if init_std is not None: - nn.init.trunc_normal_(module.down_proj.weight, mean=0.0, std=init_std) + nn.init.trunc_normal_( + module.down_proj.weight, mean=0.0, std=init_std + ) elif isinstance(module, moe_cls): if hasattr(module, "gate") and init_std is not None: nn.init.trunc_normal_(module.gate.weight, mean=0.0, std=init_std) if hasattr(module, "experts"): for expert in module.experts: - nn.init.trunc_normal_(expert.gate_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_( + expert.gate_proj.weight, mean=0.0, std=0.02 + ) 
nn.init.trunc_normal_(expert.up_proj.weight, mean=0.0, std=0.02) if init_std is not None: - nn.init.trunc_normal_(expert.down_proj.weight, mean=0.0, std=init_std) - if hasattr(module, "shared_experts") and module.shared_experts is not None: - nn.init.trunc_normal_(module.shared_experts.gate_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(module.shared_experts.up_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_( + expert.down_proj.weight, mean=0.0, std=init_std + ) + if ( + hasattr(module, "shared_experts") + and module.shared_experts is not None + ): + nn.init.trunc_normal_( + module.shared_experts.gate_proj.weight, mean=0.0, std=0.02 + ) + nn.init.trunc_normal_( + module.shared_experts.up_proj.weight, mean=0.0, std=0.02 + ) if init_std is not None: - nn.init.trunc_normal_(module.shared_experts.down_proj.weight, mean=0.0, std=init_std) + nn.init.trunc_normal_( + module.shared_experts.down_proj.weight, + mean=0.0, + std=init_std, + ) elif module is getattr(self, "lm_head", None): final_out_std = config.hidden_size**-0.5 @@ -377,7 +420,10 @@ def _init_weights_patched(self, module): elif isinstance(module, nn.Embedding): # When tie_word_embeddings is True, use lm_head initialization - if hasattr(config, "tie_word_embeddings") and config.tie_word_embeddings: + if ( + hasattr(config, "tie_word_embeddings") + and config.tie_word_embeddings + ): final_out_std = config.hidden_size**-0.5 cutoff_factor = 3 nn.init.trunc_normal_( @@ -390,11 +436,14 @@ def _init_weights_patched(self, module): else: std = config.initializer_range module.weight.data.normal_(mean=0.0, std=std) - + if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() - elif "LayerNorm" in module.__class__.__name__ or "RMSNorm" in module.__class__.__name__: + elif ( + "LayerNorm" in module.__class__.__name__ + or "RMSNorm" in module.__class__.__name__ + ): if hasattr(module, "weight") and module.weight is not None: module.weight.data.fill_(1.0) if hasattr(module, "bias") and module.bias is not None: @@ -407,52 +456,80 @@ def _init_weights_patched(self, module): @property def tok_embeddings(self): """Returns the model's embed_tokens, handling different Hugging Face model structures.""" - if hasattr(self.model, "model") and hasattr(self.model.model, "embed_tokens"): # Llama-like + if hasattr(self.model, "model") and hasattr( + self.model.model, "embed_tokens" + ): # Llama-like return self.model.model.embed_tokens else: - raise AttributeError("Could not find embed_tokens in the model. Please check the model structure.") + raise AttributeError( + "Could not find embed_tokens in the model. Please check the model structure." + ) @tok_embeddings.setter def tok_embeddings(self, value): - if hasattr(self.model, "model") and hasattr(self.model.model, "embed_tokens"): # Llama-like - setattr(self.model.model, "embed_tokens", value) + if hasattr(self.model, "model") and hasattr( + self.model.model, "embed_tokens" + ): # Llama-like + self.model.model.embed_tokens = value else: - raise AttributeError("Could not find embed_tokens in the model. Please check the model structure.") + raise AttributeError( + "Could not find embed_tokens in the model. Please check the model structure." 
+ ) @property def layers(self): """Returns the model's layers, handling different Hugging Face model structures.""" - if hasattr(self.model, "model") and hasattr(self.model.model, "layers"): # Llama-like + if hasattr(self.model, "model") and hasattr( + self.model.model, "layers" + ): # Llama-like return self.model.model.layers else: # Add more cases here if needed for other model architectures - raise AttributeError("Could not find layers in the model. Please check the model structure.") + raise AttributeError( + "Could not find layers in the model. Please check the model structure." + ) @layers.setter def layers(self, value): - if hasattr(self.model, "model") and hasattr(self.model.model, "layers"): # Llama-like - setattr(self.model.model, "layers", value) + if hasattr(self.model, "model") and hasattr( + self.model.model, "layers" + ): # Llama-like + self.model.model.layers = value else: - raise AttributeError("Could not find layers in the model. Please check the model structure.") + raise AttributeError( + "Could not find layers in the model. Please check the model structure." + ) @property def norm(self): """Returns the model's norm, handling different Hugging Face model structures.""" - if hasattr(self.model, "model") and hasattr(self.model.model, "norm"): # Llama-like + if hasattr(self.model, "model") and hasattr( + self.model.model, "norm" + ): # Llama-like return self.model.model.norm - elif hasattr(self.model, "model") and hasattr(self.model.model, "final_layernorm"): # Phi-like + elif hasattr(self.model, "model") and hasattr( + self.model.model, "final_layernorm" + ): # Phi-like return self.model.model.final_layernorm else: - raise AttributeError("Could not find norm in the model. Please check the model structure.") + raise AttributeError( + "Could not find norm in the model. Please check the model structure." + ) @norm.setter def norm(self, value): - if hasattr(self.model, "model") and hasattr(self.model.model, "norm"): # Llama-like - setattr(self.model.model, "norm", value) - elif hasattr(self.model, "model") and hasattr(self.model.model, "final_layernorm"): # Phi-like - setattr(self.model.model, "final_layernorm", value) + if hasattr(self.model, "model") and hasattr( + self.model.model, "norm" + ): # Llama-like + self.model.model.norm = value + elif hasattr(self.model, "model") and hasattr( + self.model.model, "final_layernorm" + ): # Phi-like + self.model.model.final_layernorm = value else: - raise AttributeError("Could not find norm in the model. Please check the model structure.") + raise AttributeError( + "Could not find norm in the model. Please check the model structure." + ) @property def output(self): @@ -461,34 +538,52 @@ def output(self): return self.model.lm_head else: # Add more cases here if needed for other model architectures - raise AttributeError("Could not find output (lm_head) in the model. Please check the model structure.") + raise AttributeError( + "Could not find output (lm_head) in the model. Please check the model structure." + ) @output.setter def output(self, value): if hasattr(self.model, "lm_head"): # For models like LlamaForCausalLM - setattr(self.model, "lm_head", value) + self.model.lm_head = value else: - raise AttributeError("Could not find output (lm_head) in the model. Please check the model structure.") + raise AttributeError( + "Could not find output (lm_head) in the model. Please check the model structure." 
+ ) @property def rotary_emb(self): """Returns the model's rotary_emb, handling different Hugging Face model structures.""" - if hasattr(self.model, "model") and hasattr(self.model.model, "rotary_emb"): # Llama-like + if hasattr(self.model, "model") and hasattr( + self.model.model, "rotary_emb" + ): # Llama-like return self.model.model.rotary_emb else: - raise AttributeError("Could not find rotary_emb in the model. Please check the model structure.") + raise AttributeError( + "Could not find rotary_emb in the model. Please check the model structure." + ) @rotary_emb.setter def rotary_emb(self, value): - if hasattr(self.model, "model") and hasattr(self.model.model, "rotary_emb"): # Llama-like - setattr(self.model.model, "rotary_emb", value) + if hasattr(self.model, "model") and hasattr( + self.model.model, "rotary_emb" + ): # Llama-like + self.model.model.rotary_emb = value else: - raise AttributeError("Could not find rotary_emb in the model. Please check the model structure.") + raise AttributeError( + "Could not find rotary_emb in the model. Please check the model structure." + ) def forward(self, *args, **kwargs): local_seq_len = self.max_seq_len - local_seq_len //= self.cp_mesh.size() if self.cp_mesh is not None and self.cp_mesh.size() > 1 else 1 - kwargs["position_ids"] = torch.arange(local_seq_len, device=args[0].device).unsqueeze(0) + local_seq_len //= ( + self.cp_mesh.size() + if self.cp_mesh is not None and self.cp_mesh.size() > 1 + else 1 + ) + kwargs["position_ids"] = torch.arange( + local_seq_len, device=args[0].device + ).unsqueeze(0) output = self.model.model(*args, **kwargs) output = self.model.lm_head(output.last_hidden_state) return output @@ -512,11 +607,13 @@ def selective_init(module): self.model.apply(selective_init) - #TODO(3outeille): For pipeline parallel, only tie weights if both input and output embeddings are on the same device + # TODO(3outeille): For pipeline parallel, only tie weights if both input and output embeddings are on the same device # Maybe better way of handling this? - if not isinstance(self.tok_embeddings, nn.Identity) and not isinstance(self.output, nn.Identity): + if not isinstance(self.tok_embeddings, nn.Identity) and not isinstance( + self.output, nn.Identity + ): self.model.tie_weights() - + def named_children(self): """ Provides a flattened view of the model's main components, diff --git a/torchtitan/protocols/train_spec.py b/torchtitan/protocols/train_spec.py index f04d6ac269..81933604bd 100644 --- a/torchtitan/protocols/train_spec.py +++ b/torchtitan/protocols/train_spec.py @@ -4,8 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-from collections.abc import Callable import dataclasses +from collections.abc import Callable from dataclasses import dataclass from importlib import import_module from typing import Mapping, TypeAlias @@ -73,7 +73,7 @@ def register_train_spec(name: str, train_spec: TrainSpec) -> None: def get_train_spec(name: str) -> TrainSpec: # user-defined TrainSpec has higher priority global _extra_train_specs - if "/" in name: # HF model (dynamic loading) + if "/" in name: # HF model (dynamic loading) hf_spec = _extra_train_specs["hf_placeholder_name"] return dataclasses.replace(hf_spec, name=name) elif name in _extra_train_specs: diff --git a/torchtitan/train.py b/torchtitan/train.py index bc7c23daee..ed4c11298e 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -12,6 +12,8 @@ import torch from torch.distributed.elastic.multiprocessing.errors import record + +import torchtitan.experiments.transformers_backend # noqa: F401 # noqa: F401 import torchtitan.protocols.train_spec as train_spec_module from torchtitan.components.checkpoint import CheckpointManager from torchtitan.components.dataloader import DataloaderExhaustedError @@ -30,7 +32,7 @@ maybe_enable_memory_snapshot, maybe_enable_profiling, ) -import torchtitan.experiments.transformers_backend # noqa: F401 + class Trainer(torch.distributed.checkpoint.stateful.Stateful): # core configs @@ -432,7 +434,7 @@ def forward_backward_step( # apply context parallelism if cp is enabled # ensure CP handles the separate freqs_cis buffer for each pp stage cp_buffers = [inputs, labels] - cp_seq_dims = [1, 1] + cp_seq_dims = [1, 1] if hasattr(model_parts[0], "freqs_cis"): cp_buffers += [m.freqs_cis for m in model_parts] cp_seq_dims += [0 for _ in model_parts] From 9488a165e87bd9161603d847c344cd19ef3620af Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 21 Oct 2025 15:47:16 +0000 Subject: [PATCH 079/129] create CI jobs to guard --- .../integration_test_8gpu_huggingface.yaml | 55 ++++++++++++++ .../tests/integration_tests.py | 71 +++++++++++++++++++ 2 files changed, 126 insertions(+) create mode 100644 .github/workflows/integration_test_8gpu_huggingface.yaml create mode 100644 torchtitan/experiments/transformers_backend/tests/integration_tests.py diff --git a/.github/workflows/integration_test_8gpu_huggingface.yaml b/.github/workflows/integration_test_8gpu_huggingface.yaml new file mode 100644 index 0000000000..cde7959510 --- /dev/null +++ b/.github/workflows/integration_test_8gpu_huggingface.yaml @@ -0,0 +1,55 @@ +name: Transformers Backend 8 GPU Integration Tests + +on: + push: + branches: [ main ] + paths: + - 'torchtitan/experiments/transformers_backend/**' + pull_request: + paths: + - 'torchtitan/experiments/transformers_backend/**' + schedule: + # Runs every 12 hours + - cron: '0 */12 * * *' + +concurrency: + group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} + cancel-in-progress: true + +defaults: + run: + shell: bash -l -eo pipefail {0} + +jobs: + build-test: + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + with: + runner: linux.g5.48xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.6" + # This image is faster to clone than the default, but it lacks CC needed by triton + # (1m25s vs 2m37s). 
+ docker-image: torchtitan-ubuntu-20.04-clang12 + repository: pytorch/torchtitan + upload-artifact: outputs + script: | + set -eux + + # The generic Linux job chooses to use base env, not the one setup by the image + CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + conda activate "${CONDA_ENV}" + + # Log CUDA driver version for debugging. + DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true) + echo "CUDA driver version: ${DRIVER_VERSION}" + + pip config --user set global.progress_bar off + + python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126 + + USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 + + python -m pip install transformers==4.55.4 + + mkdir artifacts-to-be-uploaded + python -m torchtitan.experiments.transformers_backend.tests.integration_tests artifacts-to-be-uploaded --ngpu 8 diff --git a/torchtitan/experiments/transformers_backend/tests/integration_tests.py b/torchtitan/experiments/transformers_backend/tests/integration_tests.py new file mode 100644 index 0000000000..1f2a38d322 --- /dev/null +++ b/torchtitan/experiments/transformers_backend/tests/integration_tests.py @@ -0,0 +1,71 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import os + +from tests.integration_tests import OverrideDefinitions +from tests.integration_tests.run_tests import run_tests + + +def build_transformers_backend_test_list() -> list[OverrideDefinitions]: + """ + key is the config file name and value is a list of OverrideDefinitions + that is used to generate variations of integration tests based on the + same root config file. + """ + integration_tests_flavors = [ + OverrideDefinitions( + [ + [ + "--model.name meta-llama/Llama-3.2-1B", + "--training.dataset wikitext2-test", + "--parallelism.data_parallel_shard_degree 2", + "--parallelism.tensor_parallel_degree 2", + "--parallelism.pipeline_parallel_degree 2", + "--parallelism.pipeline_parallel_schedule 1F1B", + ], + ], + "Transformers Backend FSDP+TP+PP", + "transformers_backend_fsdp+tp+pp", + ngpu=8, + ), + ] + return integration_tests_flavors + + +_TEST_SUITES_FUNCTION = { + "transformers_backend": build_transformers_backend_test_list, +} + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("output_dir") + parser.add_argument( + "--config_path", + default="./tests/integration_tests/base_config.toml", + help="Base config path for integration tests. 
This is the config that will be used as a base for all tests.", + ) + parser.add_argument( + "--test_name", + default="all", + help="test to run, acceptable values: `test_name` in `build_test_list` (default: all)", + ) + parser.add_argument("--ngpu", default=8, type=int) + args = parser.parse_args() + + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + if os.listdir(args.output_dir): + raise RuntimeError("Please provide an empty output directory.") + + test_list = _TEST_SUITES_FUNCTION["transformers_backend"]()() + run_tests(args, test_list) + + +if __name__ == "__main__": + main() From e8a17577e30ec5b143509ea9f6454ad1539e7310 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 29 Oct 2025 13:40:08 +0000 Subject: [PATCH 080/129] update the way we register_train_spec --- torchtitan/config/job_config.py | 7 + .../transformers_backend/__init__.py | 29 +- .../configs/qwen3_fsdp2_tp2_pp2.toml | 5 +- .../transformers_backend/model/args.py | 2 +- .../model/hf_transformers_args.py | 782 ++++++++++++++++++ torchtitan/protocols/train_spec.py | 5 +- torchtitan/train.py | 1 - 7 files changed, 809 insertions(+), 22 deletions(-) create mode 100644 torchtitan/experiments/transformers_backend/model/hf_transformers_args.py diff --git a/torchtitan/config/job_config.py b/torchtitan/config/job_config.py index 7fe6802374..ee89d13627 100644 --- a/torchtitan/config/job_config.py +++ b/torchtitan/config/job_config.py @@ -131,6 +131,12 @@ class Model: """ +@dataclass +class HFTransformers: + model: str = "" + """HuggingFace model ID (e.g., 'Qwen/Qwen3-4B-Instruct-2507')""" + + @dataclass class Optimizer: name: str = "AdamW" @@ -897,6 +903,7 @@ class JobConfig: profiling: Profiling = field(default_factory=Profiling) metrics: Metrics = field(default_factory=Metrics) model: Model = field(default_factory=Model) + hf_transformers: HFTransformers = field(default_factory=HFTransformers) optimizer: Optimizer = field(default_factory=Optimizer) lr_scheduler: LRScheduler = field(default_factory=LRScheduler) training: Training = field(default_factory=Training) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 11bd36bc81..453cb338da 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -10,9 +10,9 @@ from torchtitan.components.lr_scheduler import build_lr_schedulers from torchtitan.components.optimizer import build_optimizers from torchtitan.components.tokenizer import build_hf_tokenizer -from torchtitan.datasets.hf_datasets import build_hf_dataloader +from torchtitan.hf_datasets.text_datasets import build_text_dataloader from torchtitan.models.moe import MoEArgs -from torchtitan.protocols.train_spec import register_train_spec, TrainSpec +from torchtitan.protocols.train_spec import TrainSpec from .infra.parallelize_hf_transformers import parallelize_hf_transformers @@ -110,16 +110,15 @@ class TitanMoeModelArgs: ), } -hf_train_spec = TrainSpec( - model_cls=HFTransformerModel, - model_args=flavors, - parallelize_fn=parallelize_hf_transformers, - pipelining_fn=pipeline_hf_transformers, - build_optimizers_fn=build_optimizers, - build_lr_schedulers_fn=build_lr_schedulers, - build_dataloader_fn=build_hf_dataloader, - build_tokenizer_fn=build_hf_tokenizer, - build_loss_fn=build_cross_entropy_loss, -) - -register_train_spec("hf_placeholder_name", hf_train_spec) +def get_train_spec() -> TrainSpec: + return TrainSpec( + model_cls=HFTransformerModel, + 
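# The section added below is consumed later in this patch by the transformers backend
# (model/args.py reads `job_config.hf_transformers.model` when calling
# AutoConfig.from_pretrained). A minimal TOML excerpt of how it is wired up, taken from
# the qwen3 config updated further down:
#
#   [model]
#   name = "transformers_backend"
#   flavor = "debugmodel"
#
#   [hf_transformers]
#   model = "Qwen/Qwen3-4B-Instruct-2507"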
model_args=flavors, + parallelize_fn=parallelize_hf_transformers, + pipelining_fn=pipeline_hf_transformers, + build_optimizers_fn=build_optimizers, + build_lr_schedulers_fn=build_lr_schedulers, + build_dataloader_fn=build_text_dataloader, + build_tokenizer_fn=build_hf_tokenizer, + build_loss_fn=build_cross_entropy_loss, + ) diff --git a/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml b/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml index 4e216baa77..2832304900 100644 --- a/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml +++ b/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml @@ -20,12 +20,15 @@ save_tb_folder = "tb" enable_wandb = false [model] -name = "Qwen/Qwen3-4B-Instruct-2507" +name = "transformers_backend" flavor = "debugmodel" # test folder with tokenizer.json, for debug purpose only hf_assets_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" # converters = ["float8"] +[hf_transformers] +model = "Qwen/Qwen3-4B-Instruct-2507" + [optimizer] name = "AdamW" lr = 8e-4 diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index 7181cb570a..bc150820ab 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -153,7 +153,7 @@ def __repr__(self) -> str: def update_from_config(self, job_config: JobConfig): # Load HF config (overwrites our HF attributes) hf_model_config = AutoConfig.from_pretrained( - job_config.model.name, + job_config.hf_transformers.model, attn_implementation=self.attn_implementation, trust_remote_code=True, ) diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py new file mode 100644 index 0000000000..5cda5b3b5d --- /dev/null +++ b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py @@ -0,0 +1,782 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import importlib +from dataclasses import dataclass +import torch +from torch import nn +import math +from torch.nn import init +from torchtitan.config import JobConfig +from torchtitan.protocols import BaseModelArgs +from torchtitan.tools.logging import logger +from transformers import AutoConfig +from transformers.utils import is_torch_deterministic +from transformers.configuration_utils import PretrainedConfig +from transformers.modeling_utils import AttentionInterface, PreTrainedModel +from transformers.integrations.sdpa_attention import sdpa_attention_forward + +@dataclass +class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): + """ + Configuration class that bridges TorchTitan and HuggingFace Transformers naming conventions. + + Uses properties to provide TorchTitan-style access while maintaining HuggingFace compatibility. + Properties are created dynamically based on which arguments are provided. 
+ """ + + # Define all possible mappings organized by argument type + _TT_TO_HF_MAPPINGS = { + "base": { + # Core TorchTitan mappings (always available) + "dim": "hidden_size", + "n_layers": "num_hidden_layers", + "n_heads": "num_attention_heads", + "n_kv_heads": "num_key_value_heads", + "norm_eps": "rms_norm_eps", + "max_seq_len": "max_position_embeddings", + "eos_id": "eos_token_id", + }, + "deepseek_v3": { + # DeepSeekV3 specific mappings (only when deepseek_v3_args provided) + "inter_dim": "intermediate_size", + "n_dense_layers": "first_k_dense_replace", + }, + } + + def __init__( + self, + titan_args, + deepseek_v3_args=None, + # HuggingFace specific args + attn_implementation: str = "sdpa_torchtitan", + **kwargs, + ): + super().__init__(attn_implementation=attn_implementation, **kwargs) + assert titan_args is not None, "titan_args is required" + + active_mappings = {} + + active_mappings.update(self._TT_TO_HF_MAPPINGS["base"]) + + if deepseek_v3_args is not None: + active_mappings.update(self._TT_TO_HF_MAPPINGS["deepseek_v3"]) + + self._active_mappings = active_mappings + + self._create_dynamic_properties() + + # Set HF attributes from titan_args based on mappings + for titan_name, hf_name in self._active_mappings.items(): + if hasattr(titan_args, titan_name): + setattr(self, hf_name, getattr(titan_args, titan_name)) + + # Fill all TorchTitan-specific args (no HF equivalent) + self.multiple_of = titan_args.multiple_of + self.ffn_dim_multiplier = titan_args.ffn_dim_multiplier + self.depth_init = titan_args.depth_init + self.use_flex_attn = titan_args.use_flex_attn + self.attn_mask_type = titan_args.attn_mask_type + + # HuggingFace specific args + self.attn_implementation = attn_implementation + #NOTE:(3outeille):This will force create_causal_mask to return None + AttentionInterface._global_mapping[attn_implementation] = sdpa_attention_forward + + # Start with passed_args as just titan_args + self._passed_args = {**titan_args.__dict__, "attn_implementation": attn_implementation} + self._passed_args.update(kwargs) + + #NOTE(3outeille): Wait for transformers uniformization of MoE args + if deepseek_v3_args is not None: + # For DeepSeekV3, setting q_lora_rank to 0 in TorchTitan is equivalent to + # setting it to None in HuggingFace. 
+ q_lora_rank = deepseek_v3_args.q_lora_rank + if q_lora_rank == 0: + q_lora_rank = None + deepseek_v3_args.q_lora_rank = q_lora_rank + + self._passed_args.update(**deepseek_v3_args.__dict__) + + self.rope_interleave = deepseek_v3_args.rope_interleave + self.partial_rotary_factor = deepseek_v3_args.partial_rotary_factor + + if deepseek_v3_args.moe_args is not None: + moe_args = deepseek_v3_args.moe_args + self.num_experts_per_tok = moe_args.top_k + self.n_routed_experts = moe_args.num_experts + self.n_shared_experts = moe_args.num_shared_experts + self.moe_intermediate_size = deepseek_v3_args.moe_inter_dim + self._passed_args.update( + dict( + num_experts_per_tok=moe_args.top_k, + n_routed_experts=moe_args.num_experts, + n_shared_experts=moe_args.num_shared_experts, + moe_intermediate_size=deepseek_v3_args.moe_inter_dim, + ) + ) + + def _create_dynamic_properties(self): + """Create properties dynamically based on active mappings.""" + def _create_property(hf_name: str) -> property: + def getter(self): + return getattr(self, hf_name) + def setter(self, value): + setattr(self, hf_name, value) + return property(getter, setter) + + for titan_name, hf_name in self._active_mappings.items(): + # Create getter/setter for attribute that don't already exist + if not hasattr(self.__class__, titan_name): + setattr(self.__class__, titan_name, _create_property(hf_name)) + + def __repr__(self) -> str: + # HFTransformerModelArgs is a dataclass that also inherits from PretrainedConfig. + # PretrainedConfig has a __repr__ that serializes the object to JSON, but it + # doesn't work well with how HFTransformerModelArgs is initialized. + # This custom __repr__ provides a dataclass-like representation that correctly + # displays the arguments passed during initialization. 
+ args_lines = [ + f"{k}={getattr(self, k)!r}" + for k in sorted(self._passed_args.keys()) + if hasattr(self, k) + ] + args_str = "\n".join(args_lines) + return f"{self.__class__.__name__}(\n{args_str}\n)" + + def update_from_config(self, job_config: JobConfig): + # Load HF config (overwrites our HF attributes) + hf_model_config = AutoConfig.from_pretrained( + job_config.hf_transformers.model, + attn_implementation=self.attn_implementation, + trust_remote_code=True + ) + + # Explicitly update attributes based on mappings + for titan_name, hf_name in self._active_mappings.items(): + if hasattr(hf_model_config, hf_name): + setattr(self, titan_name, getattr(hf_model_config, hf_name)) + + # Copy any other attributes that might not be in the mapping + for key, value in hf_model_config.to_dict().items(): + setattr(self, key, value) + + # Update our attributes with the passed args from flavors + for key, value in self._passed_args.items(): + if hasattr(self, key) and value is not None: + setattr(self, key, value) + + # MoE + if hasattr(self, "qk_nope_head_dim") and hasattr(self, "qk_rope_head_dim"): + self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim + + # Configure HF-specific settings to match TorchTitan settings + self.tie_word_embeddings = False + self.attention_bias = False + self.mlp_bias = False + self.use_cache = False + self.initializer_range = 1.0 # use as std for normal init in embedding + + if not hasattr(self, "inter_dim"): # Only for llama model + ffn_hidden_size = 4 * self.dim + ffn_hidden_size = int(2 * ffn_hidden_size / 3) + if self.ffn_dim_multiplier is not None: + ffn_hidden_size = int(self.ffn_dim_multiplier * ffn_hidden_size) + self.intermediate_size = self.multiple_of * ( + (ffn_hidden_size + self.multiple_of - 1) // self.multiple_of + ) + + self.head_dim = self.dim // self.num_attention_heads + + return self + + def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: + # Check if this is a MoE model by looking for MoE attributes + is_moe = hasattr(self, 'n_routed_experts') + + if is_moe: + # MoE parameter counting (adapted from DeepSeek V3 implementation) + nparams_embedding = 0 + nparams_moe_router = 0 + nparams_shared_experts = 0 + nparams_experts = 0 + nparams_dense = 0 + + for name, p in model.named_parameters(): + if "embedding" in name: + nparams_embedding += p.numel() + nparams_dense += p.numel() + elif "moe.shared_experts" in name: + nparams_shared_experts += p.numel() + elif "moe.router" in name: + nparams_moe_router += p.numel() + elif "moe.experts" in name: + nparams_experts += p.numel() + else: + nparams_dense += p.numel() + + nparams_sparse = nparams_moe_router + nparams_shared_experts + nparams_experts + nparams = nparams_dense + nparams_sparse + nparams_sparse_active = ( + nparams_moe_router + + nparams_shared_experts + + nparams_experts * self.num_experts_per_tok // self.n_routed_experts + ) + + logger.info( + f"Total parameter count: dense {nparams_dense:,}, " + f"sparse {nparams_sparse:,}, active {nparams_dense + nparams_sparse_active:,}" + ) + + l, h, q, t = ( + self.n_layers, + self.n_heads, + self.dim // self.n_heads, + seq_len, + ) + # Use active parameters for FLOPS calculation in MoE + num_flops_per_token = ( + 6 * (nparams_dense - nparams_embedding + nparams_sparse_active) + + 12 * l * h * q * t + ) + else: + # Dense model parameter counting (original implementation) + nparams = sum(p.numel() for p in model.parameters()) + nparams_embedding = sum( + sum(p.numel() for p in m.parameters()) + for m in 
model.children() + if isinstance(m, nn.Embedding) + ) + + l, h, q, t = ( + self.n_layers, + self.n_heads, + self.dim // self.n_heads, + seq_len, + ) + # Reasoning behind the factor of 12 for the self-attention part of the formula: + # 1. each self-attention has 2 matmul in the forward and 4 in the backward (6) + # 2. the flash attention does 1 more matmul recomputation in the backward + # but recomputation should not be counted in calculating MFU (+0) + # 3. each matmul performs 1 multiplication and 1 addition (*2) + # 4. we follow the convention and do not account for sparsity in causal attention + num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t + + return nparams, num_flops_per_token + +class HFTransformerModel(nn.Module): + def __init__(self, model_args: HFTransformerModelArgs): + super().__init__() + + # Try to import the model class dynamically from the transformers library if not found in globals + model_class_name = model_args.architectures[0] + model_cls = globals().get(model_class_name, None) + if model_cls is None: + try: + transformers_mod = importlib.import_module("transformers") + model_cls = getattr(transformers_mod, model_class_name) + except (ImportError, AttributeError) as e: + raise ImportError( + f"Could not find model class '{model_class_name}' in globals or transformers. " + f"Make sure the class is available. Original error: {e}" + ) + + # Attempt to patch model weight initialization based on architecture type + try: + model_name_prefix = model_class_name.replace("ForCausalLM", "") + model_module = importlib.import_module(model_cls.__module__) + + attention_cls = getattr(model_module, f"{model_name_prefix}Attention", None) + mlp_cls = getattr(model_module, f"{model_name_prefix}MLP", None) + decoder_layer_cls = getattr(model_module, f"{model_name_prefix}DecoderLayer", None) + + is_moe = hasattr(model_args, "n_routed_experts") #TODO(3outeille): check if this is the most reliable to detect a moe model + if is_moe: + moe_cls = getattr(model_module, f"{model_name_prefix}MoE", None) + required_classes = { + "Attention": attention_cls, + "MLP": mlp_cls, + "DecoderLayer": decoder_layer_cls, + "MoE": moe_cls + } + + if all(required_classes.values()): + logger.info(f"Applying MoE-like patch for {model_name_prefix}") + self._patch_hf_moe_like( + decoder_layer_cls=decoder_layer_cls, + attention_cls=attention_cls, + mlp_cls=mlp_cls, + moe_cls=moe_cls + ) + else: + missing = [name for name, cls in required_classes.items() if not cls] + logger.warning( + f"Could not find required classes ({', '.join(missing)}) for MoE patching of {model_name_prefix}. " + "Skipping MoE-like patch." + ) + else: + required_classes = { + "Attention": attention_cls, + "DecoderLayer": decoder_layer_cls + } + + if all(required_classes.values()): + logger.info(f"Applying Llama-like patch for {model_name_prefix}") + self._patch_hf_llama_like( + decoder_layer_cls=decoder_layer_cls, + attention_cls=attention_cls, + mlp_cls=mlp_cls # mlp_cls can be None + ) + else: + missing = [name for name, cls in required_classes.items() if not cls] + logger.warning( + f"Could not find required classes ({', '.join(missing)}) for {model_name_prefix}. " + "Skipping Llama-like patch." + ) + + except Exception as e: + logger.warning( + f"Failed to apply agnostic patch for {model_class_name} due to: {e}. " + "Weight initialization might not match TorchTitan." 
+ ) + + self.model = model_cls(config=model_args) + self.max_seq_len = model_args.max_seq_len + + for layer in self.model.model.layers: + if hasattr(model_args, "first_k_dense_replace") and layer.layer_idx >= model_args.first_k_dense_replace: + layer.moe_enabled = True + else: + layer.moe_enabled = False + + self.cp_mesh = None + self.tp_mesh = None + self.pp_mesh = None + + def set_cp_mesh(self, mesh): + self.cp_mesh = mesh + + def set_tp_mesh(self, mesh): + self.tp_mesh = mesh + + def set_pp_mesh(self, mesh): + self.pp_mesh = mesh + + def _patch_hf_llama_like(self, decoder_layer_cls, attention_cls, mlp_cls=None): + """ + This patch modifies a Hugging Face Llama-like model's weight initialization to match + the initialization scheme used in TorchTitan. This is crucial for ensuring + bit-for-bit reproducibility when converting checkpoints between the native + TorchTitan format and the Hugging Face format. + + The patch targets the following aspects of the model: + - `PreTrainedModel._initialize_weights`: Handles meta device initialization correctly. + - `PreTrainedModel._init_weights`: Implements TorchTitan's specific initialization + for attention, MLP, embedding, and layer norm layers. This includes depth-dependent + initialization for attention and MLP layers. + - `DecoderLayer.__init__`: Adds `layer_idx` to attention and MLP modules within + each decoder layer, which is required for the depth-dependent initialization. + """ + + _original_decoder_layer_init = decoder_layer_cls.__init__ + + def _decoder_layer_init_patched(self, config: PretrainedConfig, layer_idx: int): + _original_decoder_layer_init(self, config, layer_idx) + self.layer_idx = layer_idx + # Ensure both attention and mlp modules have layer_idx for depth-based init + if hasattr(self, "self_attn"): + self.self_attn.layer_idx = layer_idx + # some models might not have mlp in each layer + if hasattr(self, "mlp") and self.mlp is not None: + self.mlp.layer_idx = layer_idx + + def _initialize_weights_patched(self, module): + # NOTE(3outeille): monkey-patch PreTrainedModel to handle meta device initialization correctly + # The default _initialize_weights sets _is_hf_initialized = True even on a meta device, + # which prevents subsequent proper initialization. + if getattr(module, "_is_hf_initialized", False): + return + + for param in module.parameters(recurse=True): + if param.device.type == "meta": + return + + # If not on a meta device, call the original weight initialization + self._init_weights(module) + module._is_hf_initialized = True + + def _init_weights_patched(self, module): + """ + Patched version of _init_weights to match TorchTitan's initialization for Llama-like models. + `self` is a PreTrainedModel instance. 
+ """ + config = self.config + + # Build tuple of classes to check for layer_idx-based init_std calculation + layer_idx_classes = [attention_cls] + if mlp_cls: + layer_idx_classes.append(mlp_cls) + layer_idx_classes = tuple(layer_idx_classes) + + if isinstance(module, layer_idx_classes): + if not hasattr(module, "layer_idx"): + return + layer_idx = module.layer_idx + + if hasattr(config, "depth_init") and config.depth_init: + init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 + else: + init_std = 0.02 / (2 * config.num_hidden_layers) ** 0.5 + + if isinstance(module, attention_cls): + # Initialize weights and biases for q, k, v projections + for proj_name in ["q_proj", "k_proj", "v_proj"]: + proj = getattr(module, proj_name) + nn.init.trunc_normal_(proj.weight, mean=0.0, std=0.02) + if proj.bias is not None: + fan_in, _ = init._calculate_fan_in_and_fan_out(proj.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + init.uniform_(proj.bias, -bound, bound) + + # Handle different names for the output projection layer + o_proj = getattr(module, "o_proj", getattr(module, "dense", None)) + if o_proj is not None: + nn.init.trunc_normal_(o_proj.weight, mean=0.0, std=init_std) + if o_proj.bias is not None: + fan_in, _ = init._calculate_fan_in_and_fan_out(o_proj.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + init.uniform_(o_proj.bias, -bound, bound) + + elif mlp_cls and isinstance(module, mlp_cls): + # Handle different names for MLP layers + gate_proj = getattr(module, "gate_proj", getattr(module, "fc1", None)) + up_proj = getattr(module, "up_proj", None) + down_proj = getattr(module, "down_proj", getattr(module, "fc2", None)) + + # gate_proj (or fc1) should always use std=0.02 for numerical stability. + if gate_proj is not None: + nn.init.trunc_normal_(gate_proj.weight, mean=0.0, std=0.02) + if gate_proj.bias is not None: + fan_in, _ = init._calculate_fan_in_and_fan_out(gate_proj.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + init.uniform_(gate_proj.bias, -bound, bound) + # up_proj and down_proj (or fc2) use the depth-dependent init_std. 
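+                # [Editor's note, not part of the original patch] Illustrative numbers for the
+                # depth-dependent std computed above: with depth_init=True and layer_idx=3,
+                # init_std = 0.02 / (2 * 4) ** 0.5 ~ 0.0071; with depth_init=False and
+                # num_hidden_layers=6, init_std = 0.02 / 12 ** 0.5 ~ 0.0058.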
+ if up_proj is not None: + nn.init.trunc_normal_(up_proj.weight, mean=0.0, std=init_std) + if up_proj.bias is not None: + fan_in, _ = init._calculate_fan_in_and_fan_out(up_proj.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + init.uniform_(up_proj.bias, -bound, bound) + if down_proj is not None: + nn.init.trunc_normal_(down_proj.weight, mean=0.0, std=init_std) + if down_proj.bias is not None: + fan_in, _ = init._calculate_fan_in_and_fan_out(down_proj.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + init.uniform_(down_proj.bias, -bound, bound) + + elif module is getattr( + self, "lm_head", None + ): # TODO(3outeille): find a better way to detect lm_head + final_out_std = config.hidden_size**-0.5 + cutoff_factor = 3 + nn.init.trunc_normal_( + module.weight, + mean=0.0, + std=final_out_std, + a=-cutoff_factor * final_out_std, + b=cutoff_factor * final_out_std, + ) + if module.bias is not None: + module.bias.data.zero_() + + elif isinstance(module, nn.Embedding): + std = config.initializer_range + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + elif ( + isinstance( + module, (nn.GroupNorm, nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d) + ) + or "LayerNorm" in module.__class__.__name__ + or "RMSNorm" in module.__class__.__name__ + ): + # Norms can exist without weights (in which case they are None from torch primitives) + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(1.0) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.zero_() + + decoder_layer_cls.__init__ = _decoder_layer_init_patched + PreTrainedModel._init_weights = _init_weights_patched + PreTrainedModel._initialize_weights = _initialize_weights_patched + + def _patch_hf_moe_like(self, decoder_layer_cls, attention_cls, mlp_cls, moe_cls): + """ + This patch modifies a Hugging Face MoE (Mixture-of-Experts) model's weight + initialization to match the initialization scheme used in TorchTitan, + drawing from patterns in models like DeepseekV3. + + The patch targets: + - `PreTrainedModel._initialize_weights`: For correct meta device initialization. + - `PreTrainedModel._init_weights`: To implement TorchTitan's specific initialization + for attention, MLP, MoE, embedding, and layer norm layers. + - `DecoderLayer.__init__`: Adds `layer_idx` to attention, MLP, and MoE expert + modules, required for depth-dependent initialization. + """ + + _original_decoder_layer_init = decoder_layer_cls.__init__ + + def _decoder_layer_init_patched(self, config: PretrainedConfig, layer_idx: int): + _original_decoder_layer_init(self, config, layer_idx) + self.layer_idx = layer_idx + + if hasattr(self, "self_attn"): + self.self_attn.layer_idx = layer_idx + + if hasattr(self, "mlp"): + self.mlp.layer_idx = layer_idx + if hasattr(self.mlp, "experts"): + for expert in self.mlp.experts: + expert.layer_idx = layer_idx + if hasattr(self.mlp, "shared_experts"): + # Not all MoE models have shared experts + if self.mlp.shared_experts is not None: + self.mlp.shared_experts.layer_idx = layer_idx + + def _initialize_weights_patched(self, module): + if getattr(module, "_is_hf_initialized", False): + return + for param in module.parameters(recurse=True): + if param.device.type == "meta": + return + self._init_weights(module) + module._is_hf_initialized = True + + def _init_weights_patched(self, module): + """ + Patched version of _init_weights for MoE models. 
+ """ + config = self.config + init_std = None + + if isinstance(module, (attention_cls, mlp_cls, moe_cls)): + if hasattr(module, "layer_idx"): + layer_idx = module.layer_idx + if hasattr(config, "depth_init") and config.depth_init: + init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 + else: + # Fallback for models without depth_init + init_std = 0.02 / (2 * config.num_hidden_layers) ** 0.5 + + if isinstance(module, attention_cls): + # Handle different attention projection layer names by initializing if they exist + if hasattr(module, "q_proj"): + nn.init.trunc_normal_(module.q_proj.weight, mean=0.0, std=0.02) + if hasattr(module, "k_proj"): + nn.init.trunc_normal_(module.k_proj.weight, mean=0.0, std=0.02) + if hasattr(module, "v_proj"): + nn.init.trunc_normal_(module.v_proj.weight, mean=0.0, std=0.02) + + if hasattr(module, "q_a_proj"): + nn.init.trunc_normal_(module.q_a_proj.weight, mean=0.0, std=0.02) + if hasattr(module, "q_b_proj"): + nn.init.trunc_normal_(module.q_b_proj.weight, mean=0.0, std=0.02) + + if hasattr(module, "kv_a_proj_with_mqa"): + nn.init.trunc_normal_(module.kv_a_proj_with_mqa.weight, mean=0.0, std=0.02) + if hasattr(module, "kv_b_proj"): + nn.init.trunc_normal_(module.kv_b_proj.weight, mean=0.0, std=0.02) + + if hasattr(module, "o_proj") and init_std is not None: + nn.init.trunc_normal_(module.o_proj.weight, mean=0.0, std=init_std) + + elif isinstance(module, mlp_cls): + nn.init.trunc_normal_(module.gate_proj.weight, mean=0.0, std=0.02) + # DeepseekV3 uses std=0.02 for up_proj, unlike Llama + nn.init.trunc_normal_(module.up_proj.weight, mean=0.0, std=0.02) + if init_std is not None: + nn.init.trunc_normal_(module.down_proj.weight, mean=0.0, std=init_std) + + elif isinstance(module, moe_cls): + if hasattr(module, "gate") and init_std is not None: + nn.init.trunc_normal_(module.gate.weight, mean=0.0, std=init_std) + if hasattr(module, "experts"): + for expert in module.experts: + nn.init.trunc_normal_(expert.gate_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(expert.up_proj.weight, mean=0.0, std=0.02) + if init_std is not None: + nn.init.trunc_normal_(expert.down_proj.weight, mean=0.0, std=init_std) + if hasattr(module, "shared_experts") and module.shared_experts is not None: + nn.init.trunc_normal_(module.shared_experts.gate_proj.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(module.shared_experts.up_proj.weight, mean=0.0, std=0.02) + if init_std is not None: + nn.init.trunc_normal_(module.shared_experts.down_proj.weight, mean=0.0, std=init_std) + + elif module is getattr(self, "lm_head", None): + final_out_std = config.hidden_size**-0.5 + cutoff_factor = 3 + nn.init.trunc_normal_( + module.weight, + mean=0.0, + std=final_out_std, + a=-cutoff_factor * final_out_std, + b=cutoff_factor * final_out_std, + ) + if module.bias is not None: + module.bias.data.zero_() + + elif isinstance(module, nn.Embedding): + std = config.initializer_range + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + elif "LayerNorm" in module.__class__.__name__ or "RMSNorm" in module.__class__.__name__: + if hasattr(module, "weight") and module.weight is not None: + module.weight.data.fill_(1.0) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.zero_() + + decoder_layer_cls.__init__ = _decoder_layer_init_patched + PreTrainedModel._init_weights = _init_weights_patched + PreTrainedModel._initialize_weights = _initialize_weights_patched + + @property + def tok_embeddings(self): 
+ """Returns the model's embed_tokens, handling different Hugging Face model structures.""" + if hasattr(self.model, "model") and hasattr(self.model.model, "embed_tokens"): # Llama-like + return self.model.model.embed_tokens + else: + raise AttributeError("Could not find embed_tokens in the model. Please check the model structure.") + + @tok_embeddings.setter + def tok_embeddings(self, value): + if hasattr(self.model, "model") and hasattr(self.model.model, "embed_tokens"): # Llama-like + setattr(self.model.model, "embed_tokens", value) + else: + raise AttributeError("Could not find embed_tokens in the model. Please check the model structure.") + + @property + def layers(self): + """Returns the model's layers, handling different Hugging Face model structures.""" + if hasattr(self.model, "model") and hasattr(self.model.model, "layers"): # Llama-like + return self.model.model.layers + else: + # Add more cases here if needed for other model architectures + raise AttributeError("Could not find layers in the model. Please check the model structure.") + + @layers.setter + def layers(self, value): + if hasattr(self.model, "model") and hasattr(self.model.model, "layers"): # Llama-like + setattr(self.model.model, "layers", value) + else: + raise AttributeError("Could not find layers in the model. Please check the model structure.") + + @property + def norm(self): + """Returns the model's norm, handling different Hugging Face model structures.""" + if hasattr(self.model, "model") and hasattr(self.model.model, "norm"): # Llama-like + return self.model.model.norm + elif hasattr(self.model, "model") and hasattr(self.model.model, "final_layernorm"): # Phi-like + return self.model.model.final_layernorm + else: + raise AttributeError("Could not find norm in the model. Please check the model structure.") + + @norm.setter + def norm(self, value): + if hasattr(self.model, "model") and hasattr(self.model.model, "norm"): # Llama-like + setattr(self.model.model, "norm", value) + elif hasattr(self.model, "model") and hasattr(self.model.model, "final_layernorm"): # Phi-like + setattr(self.model.model, "final_layernorm", value) + else: + raise AttributeError("Could not find norm in the model. Please check the model structure.") + + @property + def output(self): + """Returns the model's output layer, handling different Hugging Face model structures.""" + if hasattr(self.model, "lm_head"): # For models like LlamaForCausalLM + return self.model.lm_head + else: + # Add more cases here if needed for other model architectures + raise AttributeError("Could not find output (lm_head) in the model. Please check the model structure.") + + @output.setter + def output(self, value): + if hasattr(self.model, "lm_head"): # For models like LlamaForCausalLM + setattr(self.model, "lm_head", value) + else: + raise AttributeError("Could not find output (lm_head) in the model. Please check the model structure.") + + @property + def rotary_emb(self): + """Returns the model's rotary_emb, handling different Hugging Face model structures.""" + if hasattr(self.model, "model") and hasattr(self.model.model, "rotary_emb"): # Llama-like + return self.model.model.rotary_emb + else: + raise AttributeError("Could not find rotary_emb in the model. Please check the model structure.") + + @rotary_emb.setter + def rotary_emb(self, value): + if hasattr(self.model, "model") and hasattr(self.model.model, "rotary_emb"): # Llama-like + setattr(self.model.model, "rotary_emb", value) + else: + raise AttributeError("Could not find rotary_emb in the model. 
Please check the model structure.") + + def forward(self, *args, **kwargs): + # local_seq_len = self.max_seq_len + # local_seq_len //= self.cp_mesh.size() if self.cp_mesh is not None and self.cp_mesh.size() > 1 else 1 + # kwargs["position_ids"] = torch.arange(local_seq_len, device=args[0].device).unsqueeze(0) + output = self.model.model(*args, **kwargs) + output = self.model.lm_head(output.last_hidden_state) + return output + + def init_weights(self, *args, **kwargs): + # This method replicates the behavior of the original PreTrainedModel.init_weights, + # but with a custom weight initialization function that skips nn.Identity modules (when PP is enabled) + + if self.model.config.pruned_heads: + logger.info("Pruning heads as per model configuration.") + self.model.prune_heads(self.model.config.pruned_heads) + + original_init_weights_fn = self.model._init_weights + + def selective_init(module): + # For pipeline parallel, we need to skip nn.Identity modules + if not isinstance(module, nn.Identity): + original_init_weights_fn(module) + else: + logger.info("Skipping nn.Identity module during weight initialization.") + + self.model.apply(selective_init) + + self.model.tie_weights() + + def named_children(self): + """ + Provides a flattened view of the model's main components, + making it compatible with TorchTitan's expectations. + """ + yield "tok_embeddings", self.tok_embeddings + yield "layers", self.layers + yield "norm", self.norm + yield "output", self.output + yield "rotary_emb", self.rotary_emb + + def __setattr__(self, name, value): + # If a property with a setter exists for this name, use it. + # This is to bypass the nn.Module.__setattr__ logic that + # directly registers modules and skips property setters. + cls = self.__class__ + if hasattr(cls, name): + prop = getattr(cls, name) + if isinstance(prop, property) and prop.fset is not None: + prop.fset(self, value) + return + + # Otherwise, fall back to the default nn.Module behavior. 
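+        # [Editor's note, not part of the original patch] Attributes with no property defined
+        # on the class (e.g. self.model, self.cp_mesh, self.max_seq_len set in __init__) take
+        # this default path, so self.model is still registered as a submodule by nn.Module.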
+ super().__setattr__(name, value) \ No newline at end of file diff --git a/torchtitan/protocols/train_spec.py b/torchtitan/protocols/train_spec.py index 81933604bd..c5bd62793b 100644 --- a/torchtitan/protocols/train_spec.py +++ b/torchtitan/protocols/train_spec.py @@ -73,10 +73,7 @@ def register_train_spec(name: str, train_spec: TrainSpec) -> None: def get_train_spec(name: str) -> TrainSpec: # user-defined TrainSpec has higher priority global _extra_train_specs - if "/" in name: # HF model (dynamic loading) - hf_spec = _extra_train_specs["hf_placeholder_name"] - return dataclasses.replace(hf_spec, name=name) - elif name in _extra_train_specs: + if name in _extra_train_specs: return _extra_train_specs[name] from torchtitan.experiments import _supported_experiments diff --git a/torchtitan/train.py b/torchtitan/train.py index 59813638fe..d4de8bc5d4 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -13,7 +13,6 @@ import torch from torch.distributed.elastic.multiprocessing.errors import record -import torchtitan.experiments.transformers_backend # noqa: F401 # noqa: F401 import torchtitan.protocols.train_spec as train_spec_module from torchtitan.components.checkpoint import CheckpointManager from torchtitan.components.dataloader import DataloaderExhaustedError From 141c377c75cd6b4c2f12ad9b335ffb53bff0b656 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 29 Oct 2025 13:44:38 +0000 Subject: [PATCH 081/129] relative path for qwen3_fsdp2_tp2_pp2.toml --- .../transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml b/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml index 2832304900..d1433bb7ed 100644 --- a/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml +++ b/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml @@ -23,7 +23,7 @@ enable_wandb = false name = "transformers_backend" flavor = "debugmodel" # test folder with tokenizer.json, for debug purpose only -hf_assets_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" +hf_assets_path = "./tests/assets/tokenizer" # converters = ["float8"] [hf_transformers] @@ -47,7 +47,7 @@ seq_len = 2048 max_norm = 1.0 # grad norm clipping steps = 10 dataset = "c4_test" # supported datasets: c4_test (2K), c4 (177M) -dataset_path = "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/c4_test" +dataset_path = "./tests/assets/c4_test" mixed_precision_param = "float32" # force float32 for comparison mixed_precision_reduce = "float32" From a67e971d7250747f656d8e9aa143cb4c51eaf713 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 29 Oct 2025 14:25:51 +0000 Subject: [PATCH 082/129] dont use os.environ, use debugmodel or debugmodel_moe --- .../transformers_backend/__init__.py | 13 +- .../transformers_backend/model/args.py | 34 +- .../model/hf_transformers_args.py | 782 ------------------ 3 files changed, 42 insertions(+), 787 deletions(-) delete mode 100644 torchtitan/experiments/transformers_backend/model/hf_transformers_args.py diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 453cb338da..1c44b9684c 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -3,7 +3,6 @@ # # This source code is licensed under the BSD-style license 
found in the # LICENSE file in the root directory of this source tree. -import os from dataclasses import dataclass from torchtitan.components.loss import build_cross_entropy_loss @@ -80,6 +79,14 @@ class TitanMoeModelArgs: n_heads=16, n_kv_heads=16, ), + ), + "debugmodel_moe": HFTransformerModelArgs( + titan_dense_args=TitanDenseModelArgs( + dim=256, + n_layers=6, + n_heads=16, + n_kv_heads=16, + ), titan_moe_args=TitanMoeModelArgs( partial_rotary_factor=4.0, inter_dim=1024, @@ -101,9 +108,7 @@ class TitanMoeModelArgs: route_norm=True, score_before_experts=False, ), - ) - if os.environ.get("USE_MOE", "0") == "1" - else None, + ), ), "full": HFTransformerModelArgs( titan_dense_args=TitanDenseModelArgs(), diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index bc150820ab..4837e9527a 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -103,19 +103,51 @@ def __init__( titan_moe_args.q_lora_rank = q_lora_rank self._passed_args.update(**titan_moe_args.__dict__) - + if titan_moe_args.moe_args is not None: moe_args = titan_moe_args.moe_args + + # Store moe_args for nparams/flops calculation + self.moe_args = moe_args self.num_experts_per_tok = moe_args.top_k self.n_routed_experts = moe_args.num_experts self.n_shared_experts = moe_args.num_shared_experts self.moe_intermediate_size = titan_moe_args.moe_inter_dim + + # Set MoE-specific attributes directly on config for model access + if hasattr(titan_moe_args, 'rope_interleave'): + self.rope_interleave = titan_moe_args.rope_interleave + if hasattr(titan_moe_args, 'partial_rotary_factor'): + self.partial_rotary_factor = titan_moe_args.partial_rotary_factor + if hasattr(titan_moe_args, 'n_group'): + self.n_group = titan_moe_args.n_group + if hasattr(titan_moe_args, 'topk_group'): + self.topk_group = titan_moe_args.topk_group + if hasattr(titan_moe_args, 'kv_lora_rank'): + self.kv_lora_rank = titan_moe_args.kv_lora_rank + if hasattr(titan_moe_args, 'q_lora_rank'): + self.q_lora_rank = q_lora_rank # Use the modified version (0 -> None) + if hasattr(titan_moe_args, 'qk_nope_head_dim'): + self.qk_nope_head_dim = titan_moe_args.qk_nope_head_dim + if hasattr(titan_moe_args, 'qk_rope_head_dim'): + self.qk_rope_head_dim = titan_moe_args.qk_rope_head_dim + if hasattr(titan_moe_args, 'v_head_dim'): + self.v_head_dim = titan_moe_args.v_head_dim + self._passed_args.update( dict( num_experts_per_tok=moe_args.top_k, n_routed_experts=moe_args.num_experts, n_shared_experts=moe_args.num_shared_experts, moe_intermediate_size=titan_moe_args.moe_inter_dim, + rope_interleave=titan_moe_args.rope_interleave, + partial_rotary_factor=titan_moe_args.partial_rotary_factor, + n_group=titan_moe_args.n_group, + topk_group=titan_moe_args.topk_group, + kv_lora_rank=titan_moe_args.kv_lora_rank, + qk_nope_head_dim=titan_moe_args.qk_nope_head_dim, + qk_rope_head_dim=titan_moe_args.qk_rope_head_dim, + v_head_dim=titan_moe_args.v_head_dim, ) ) diff --git a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py b/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py deleted file mode 100644 index 5cda5b3b5d..0000000000 --- a/torchtitan/experiments/transformers_backend/model/hf_transformers_args.py +++ /dev/null @@ -1,782 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. 
-# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -import importlib -from dataclasses import dataclass -import torch -from torch import nn -import math -from torch.nn import init -from torchtitan.config import JobConfig -from torchtitan.protocols import BaseModelArgs -from torchtitan.tools.logging import logger -from transformers import AutoConfig -from transformers.utils import is_torch_deterministic -from transformers.configuration_utils import PretrainedConfig -from transformers.modeling_utils import AttentionInterface, PreTrainedModel -from transformers.integrations.sdpa_attention import sdpa_attention_forward - -@dataclass -class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): - """ - Configuration class that bridges TorchTitan and HuggingFace Transformers naming conventions. - - Uses properties to provide TorchTitan-style access while maintaining HuggingFace compatibility. - Properties are created dynamically based on which arguments are provided. - """ - - # Define all possible mappings organized by argument type - _TT_TO_HF_MAPPINGS = { - "base": { - # Core TorchTitan mappings (always available) - "dim": "hidden_size", - "n_layers": "num_hidden_layers", - "n_heads": "num_attention_heads", - "n_kv_heads": "num_key_value_heads", - "norm_eps": "rms_norm_eps", - "max_seq_len": "max_position_embeddings", - "eos_id": "eos_token_id", - }, - "deepseek_v3": { - # DeepSeekV3 specific mappings (only when deepseek_v3_args provided) - "inter_dim": "intermediate_size", - "n_dense_layers": "first_k_dense_replace", - }, - } - - def __init__( - self, - titan_args, - deepseek_v3_args=None, - # HuggingFace specific args - attn_implementation: str = "sdpa_torchtitan", - **kwargs, - ): - super().__init__(attn_implementation=attn_implementation, **kwargs) - assert titan_args is not None, "titan_args is required" - - active_mappings = {} - - active_mappings.update(self._TT_TO_HF_MAPPINGS["base"]) - - if deepseek_v3_args is not None: - active_mappings.update(self._TT_TO_HF_MAPPINGS["deepseek_v3"]) - - self._active_mappings = active_mappings - - self._create_dynamic_properties() - - # Set HF attributes from titan_args based on mappings - for titan_name, hf_name in self._active_mappings.items(): - if hasattr(titan_args, titan_name): - setattr(self, hf_name, getattr(titan_args, titan_name)) - - # Fill all TorchTitan-specific args (no HF equivalent) - self.multiple_of = titan_args.multiple_of - self.ffn_dim_multiplier = titan_args.ffn_dim_multiplier - self.depth_init = titan_args.depth_init - self.use_flex_attn = titan_args.use_flex_attn - self.attn_mask_type = titan_args.attn_mask_type - - # HuggingFace specific args - self.attn_implementation = attn_implementation - #NOTE:(3outeille):This will force create_causal_mask to return None - AttentionInterface._global_mapping[attn_implementation] = sdpa_attention_forward - - # Start with passed_args as just titan_args - self._passed_args = {**titan_args.__dict__, "attn_implementation": attn_implementation} - self._passed_args.update(kwargs) - - #NOTE(3outeille): Wait for transformers uniformization of MoE args - if deepseek_v3_args is not None: - # For DeepSeekV3, setting q_lora_rank to 0 in TorchTitan is equivalent to - # setting it to None in HuggingFace. 
- q_lora_rank = deepseek_v3_args.q_lora_rank - if q_lora_rank == 0: - q_lora_rank = None - deepseek_v3_args.q_lora_rank = q_lora_rank - - self._passed_args.update(**deepseek_v3_args.__dict__) - - self.rope_interleave = deepseek_v3_args.rope_interleave - self.partial_rotary_factor = deepseek_v3_args.partial_rotary_factor - - if deepseek_v3_args.moe_args is not None: - moe_args = deepseek_v3_args.moe_args - self.num_experts_per_tok = moe_args.top_k - self.n_routed_experts = moe_args.num_experts - self.n_shared_experts = moe_args.num_shared_experts - self.moe_intermediate_size = deepseek_v3_args.moe_inter_dim - self._passed_args.update( - dict( - num_experts_per_tok=moe_args.top_k, - n_routed_experts=moe_args.num_experts, - n_shared_experts=moe_args.num_shared_experts, - moe_intermediate_size=deepseek_v3_args.moe_inter_dim, - ) - ) - - def _create_dynamic_properties(self): - """Create properties dynamically based on active mappings.""" - def _create_property(hf_name: str) -> property: - def getter(self): - return getattr(self, hf_name) - def setter(self, value): - setattr(self, hf_name, value) - return property(getter, setter) - - for titan_name, hf_name in self._active_mappings.items(): - # Create getter/setter for attribute that don't already exist - if not hasattr(self.__class__, titan_name): - setattr(self.__class__, titan_name, _create_property(hf_name)) - - def __repr__(self) -> str: - # HFTransformerModelArgs is a dataclass that also inherits from PretrainedConfig. - # PretrainedConfig has a __repr__ that serializes the object to JSON, but it - # doesn't work well with how HFTransformerModelArgs is initialized. - # This custom __repr__ provides a dataclass-like representation that correctly - # displays the arguments passed during initialization. 
- args_lines = [ - f"{k}={getattr(self, k)!r}" - for k in sorted(self._passed_args.keys()) - if hasattr(self, k) - ] - args_str = "\n".join(args_lines) - return f"{self.__class__.__name__}(\n{args_str}\n)" - - def update_from_config(self, job_config: JobConfig): - # Load HF config (overwrites our HF attributes) - hf_model_config = AutoConfig.from_pretrained( - job_config.hf_transformers.model, - attn_implementation=self.attn_implementation, - trust_remote_code=True - ) - - # Explicitly update attributes based on mappings - for titan_name, hf_name in self._active_mappings.items(): - if hasattr(hf_model_config, hf_name): - setattr(self, titan_name, getattr(hf_model_config, hf_name)) - - # Copy any other attributes that might not be in the mapping - for key, value in hf_model_config.to_dict().items(): - setattr(self, key, value) - - # Update our attributes with the passed args from flavors - for key, value in self._passed_args.items(): - if hasattr(self, key) and value is not None: - setattr(self, key, value) - - # MoE - if hasattr(self, "qk_nope_head_dim") and hasattr(self, "qk_rope_head_dim"): - self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim - - # Configure HF-specific settings to match TorchTitan settings - self.tie_word_embeddings = False - self.attention_bias = False - self.mlp_bias = False - self.use_cache = False - self.initializer_range = 1.0 # use as std for normal init in embedding - - if not hasattr(self, "inter_dim"): # Only for llama model - ffn_hidden_size = 4 * self.dim - ffn_hidden_size = int(2 * ffn_hidden_size / 3) - if self.ffn_dim_multiplier is not None: - ffn_hidden_size = int(self.ffn_dim_multiplier * ffn_hidden_size) - self.intermediate_size = self.multiple_of * ( - (ffn_hidden_size + self.multiple_of - 1) // self.multiple_of - ) - - self.head_dim = self.dim // self.num_attention_heads - - return self - - def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: - # Check if this is a MoE model by looking for MoE attributes - is_moe = hasattr(self, 'n_routed_experts') - - if is_moe: - # MoE parameter counting (adapted from DeepSeek V3 implementation) - nparams_embedding = 0 - nparams_moe_router = 0 - nparams_shared_experts = 0 - nparams_experts = 0 - nparams_dense = 0 - - for name, p in model.named_parameters(): - if "embedding" in name: - nparams_embedding += p.numel() - nparams_dense += p.numel() - elif "moe.shared_experts" in name: - nparams_shared_experts += p.numel() - elif "moe.router" in name: - nparams_moe_router += p.numel() - elif "moe.experts" in name: - nparams_experts += p.numel() - else: - nparams_dense += p.numel() - - nparams_sparse = nparams_moe_router + nparams_shared_experts + nparams_experts - nparams = nparams_dense + nparams_sparse - nparams_sparse_active = ( - nparams_moe_router - + nparams_shared_experts - + nparams_experts * self.num_experts_per_tok // self.n_routed_experts - ) - - logger.info( - f"Total parameter count: dense {nparams_dense:,}, " - f"sparse {nparams_sparse:,}, active {nparams_dense + nparams_sparse_active:,}" - ) - - l, h, q, t = ( - self.n_layers, - self.n_heads, - self.dim // self.n_heads, - seq_len, - ) - # Use active parameters for FLOPS calculation in MoE - num_flops_per_token = ( - 6 * (nparams_dense - nparams_embedding + nparams_sparse_active) - + 12 * l * h * q * t - ) - else: - # Dense model parameter counting (original implementation) - nparams = sum(p.numel() for p in model.parameters()) - nparams_embedding = sum( - sum(p.numel() for p in m.parameters()) - for m in 
model.children() - if isinstance(m, nn.Embedding) - ) - - l, h, q, t = ( - self.n_layers, - self.n_heads, - self.dim // self.n_heads, - seq_len, - ) - # Reasoning behind the factor of 12 for the self-attention part of the formula: - # 1. each self-attention has 2 matmul in the forward and 4 in the backward (6) - # 2. the flash attention does 1 more matmul recomputation in the backward - # but recomputation should not be counted in calculating MFU (+0) - # 3. each matmul performs 1 multiplication and 1 addition (*2) - # 4. we follow the convention and do not account for sparsity in causal attention - num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t - - return nparams, num_flops_per_token - -class HFTransformerModel(nn.Module): - def __init__(self, model_args: HFTransformerModelArgs): - super().__init__() - - # Try to import the model class dynamically from the transformers library if not found in globals - model_class_name = model_args.architectures[0] - model_cls = globals().get(model_class_name, None) - if model_cls is None: - try: - transformers_mod = importlib.import_module("transformers") - model_cls = getattr(transformers_mod, model_class_name) - except (ImportError, AttributeError) as e: - raise ImportError( - f"Could not find model class '{model_class_name}' in globals or transformers. " - f"Make sure the class is available. Original error: {e}" - ) - - # Attempt to patch model weight initialization based on architecture type - try: - model_name_prefix = model_class_name.replace("ForCausalLM", "") - model_module = importlib.import_module(model_cls.__module__) - - attention_cls = getattr(model_module, f"{model_name_prefix}Attention", None) - mlp_cls = getattr(model_module, f"{model_name_prefix}MLP", None) - decoder_layer_cls = getattr(model_module, f"{model_name_prefix}DecoderLayer", None) - - is_moe = hasattr(model_args, "n_routed_experts") #TODO(3outeille): check if this is the most reliable to detect a moe model - if is_moe: - moe_cls = getattr(model_module, f"{model_name_prefix}MoE", None) - required_classes = { - "Attention": attention_cls, - "MLP": mlp_cls, - "DecoderLayer": decoder_layer_cls, - "MoE": moe_cls - } - - if all(required_classes.values()): - logger.info(f"Applying MoE-like patch for {model_name_prefix}") - self._patch_hf_moe_like( - decoder_layer_cls=decoder_layer_cls, - attention_cls=attention_cls, - mlp_cls=mlp_cls, - moe_cls=moe_cls - ) - else: - missing = [name for name, cls in required_classes.items() if not cls] - logger.warning( - f"Could not find required classes ({', '.join(missing)}) for MoE patching of {model_name_prefix}. " - "Skipping MoE-like patch." - ) - else: - required_classes = { - "Attention": attention_cls, - "DecoderLayer": decoder_layer_cls - } - - if all(required_classes.values()): - logger.info(f"Applying Llama-like patch for {model_name_prefix}") - self._patch_hf_llama_like( - decoder_layer_cls=decoder_layer_cls, - attention_cls=attention_cls, - mlp_cls=mlp_cls # mlp_cls can be None - ) - else: - missing = [name for name, cls in required_classes.items() if not cls] - logger.warning( - f"Could not find required classes ({', '.join(missing)}) for {model_name_prefix}. " - "Skipping Llama-like patch." - ) - - except Exception as e: - logger.warning( - f"Failed to apply agnostic patch for {model_class_name} due to: {e}. " - "Weight initialization might not match TorchTitan." 
- ) - - self.model = model_cls(config=model_args) - self.max_seq_len = model_args.max_seq_len - - for layer in self.model.model.layers: - if hasattr(model_args, "first_k_dense_replace") and layer.layer_idx >= model_args.first_k_dense_replace: - layer.moe_enabled = True - else: - layer.moe_enabled = False - - self.cp_mesh = None - self.tp_mesh = None - self.pp_mesh = None - - def set_cp_mesh(self, mesh): - self.cp_mesh = mesh - - def set_tp_mesh(self, mesh): - self.tp_mesh = mesh - - def set_pp_mesh(self, mesh): - self.pp_mesh = mesh - - def _patch_hf_llama_like(self, decoder_layer_cls, attention_cls, mlp_cls=None): - """ - This patch modifies a Hugging Face Llama-like model's weight initialization to match - the initialization scheme used in TorchTitan. This is crucial for ensuring - bit-for-bit reproducibility when converting checkpoints between the native - TorchTitan format and the Hugging Face format. - - The patch targets the following aspects of the model: - - `PreTrainedModel._initialize_weights`: Handles meta device initialization correctly. - - `PreTrainedModel._init_weights`: Implements TorchTitan's specific initialization - for attention, MLP, embedding, and layer norm layers. This includes depth-dependent - initialization for attention and MLP layers. - - `DecoderLayer.__init__`: Adds `layer_idx` to attention and MLP modules within - each decoder layer, which is required for the depth-dependent initialization. - """ - - _original_decoder_layer_init = decoder_layer_cls.__init__ - - def _decoder_layer_init_patched(self, config: PretrainedConfig, layer_idx: int): - _original_decoder_layer_init(self, config, layer_idx) - self.layer_idx = layer_idx - # Ensure both attention and mlp modules have layer_idx for depth-based init - if hasattr(self, "self_attn"): - self.self_attn.layer_idx = layer_idx - # some models might not have mlp in each layer - if hasattr(self, "mlp") and self.mlp is not None: - self.mlp.layer_idx = layer_idx - - def _initialize_weights_patched(self, module): - # NOTE(3outeille): monkey-patch PreTrainedModel to handle meta device initialization correctly - # The default _initialize_weights sets _is_hf_initialized = True even on a meta device, - # which prevents subsequent proper initialization. - if getattr(module, "_is_hf_initialized", False): - return - - for param in module.parameters(recurse=True): - if param.device.type == "meta": - return - - # If not on a meta device, call the original weight initialization - self._init_weights(module) - module._is_hf_initialized = True - - def _init_weights_patched(self, module): - """ - Patched version of _init_weights to match TorchTitan's initialization for Llama-like models. - `self` is a PreTrainedModel instance. 
- """ - config = self.config - - # Build tuple of classes to check for layer_idx-based init_std calculation - layer_idx_classes = [attention_cls] - if mlp_cls: - layer_idx_classes.append(mlp_cls) - layer_idx_classes = tuple(layer_idx_classes) - - if isinstance(module, layer_idx_classes): - if not hasattr(module, "layer_idx"): - return - layer_idx = module.layer_idx - - if hasattr(config, "depth_init") and config.depth_init: - init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 - else: - init_std = 0.02 / (2 * config.num_hidden_layers) ** 0.5 - - if isinstance(module, attention_cls): - # Initialize weights and biases for q, k, v projections - for proj_name in ["q_proj", "k_proj", "v_proj"]: - proj = getattr(module, proj_name) - nn.init.trunc_normal_(proj.weight, mean=0.0, std=0.02) - if proj.bias is not None: - fan_in, _ = init._calculate_fan_in_and_fan_out(proj.weight) - bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 - init.uniform_(proj.bias, -bound, bound) - - # Handle different names for the output projection layer - o_proj = getattr(module, "o_proj", getattr(module, "dense", None)) - if o_proj is not None: - nn.init.trunc_normal_(o_proj.weight, mean=0.0, std=init_std) - if o_proj.bias is not None: - fan_in, _ = init._calculate_fan_in_and_fan_out(o_proj.weight) - bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 - init.uniform_(o_proj.bias, -bound, bound) - - elif mlp_cls and isinstance(module, mlp_cls): - # Handle different names for MLP layers - gate_proj = getattr(module, "gate_proj", getattr(module, "fc1", None)) - up_proj = getattr(module, "up_proj", None) - down_proj = getattr(module, "down_proj", getattr(module, "fc2", None)) - - # gate_proj (or fc1) should always use std=0.02 for numerical stability. - if gate_proj is not None: - nn.init.trunc_normal_(gate_proj.weight, mean=0.0, std=0.02) - if gate_proj.bias is not None: - fan_in, _ = init._calculate_fan_in_and_fan_out(gate_proj.weight) - bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 - init.uniform_(gate_proj.bias, -bound, bound) - # up_proj and down_proj (or fc2) use the depth-dependent init_std. 
- if up_proj is not None: - nn.init.trunc_normal_(up_proj.weight, mean=0.0, std=init_std) - if up_proj.bias is not None: - fan_in, _ = init._calculate_fan_in_and_fan_out(up_proj.weight) - bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 - init.uniform_(up_proj.bias, -bound, bound) - if down_proj is not None: - nn.init.trunc_normal_(down_proj.weight, mean=0.0, std=init_std) - if down_proj.bias is not None: - fan_in, _ = init._calculate_fan_in_and_fan_out(down_proj.weight) - bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 - init.uniform_(down_proj.bias, -bound, bound) - - elif module is getattr( - self, "lm_head", None - ): # TODO(3outeille): find a better way to detect lm_head - final_out_std = config.hidden_size**-0.5 - cutoff_factor = 3 - nn.init.trunc_normal_( - module.weight, - mean=0.0, - std=final_out_std, - a=-cutoff_factor * final_out_std, - b=cutoff_factor * final_out_std, - ) - if module.bias is not None: - module.bias.data.zero_() - - elif isinstance(module, nn.Embedding): - std = config.initializer_range - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - elif ( - isinstance( - module, (nn.GroupNorm, nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d) - ) - or "LayerNorm" in module.__class__.__name__ - or "RMSNorm" in module.__class__.__name__ - ): - # Norms can exist without weights (in which case they are None from torch primitives) - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(1.0) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.zero_() - - decoder_layer_cls.__init__ = _decoder_layer_init_patched - PreTrainedModel._init_weights = _init_weights_patched - PreTrainedModel._initialize_weights = _initialize_weights_patched - - def _patch_hf_moe_like(self, decoder_layer_cls, attention_cls, mlp_cls, moe_cls): - """ - This patch modifies a Hugging Face MoE (Mixture-of-Experts) model's weight - initialization to match the initialization scheme used in TorchTitan, - drawing from patterns in models like DeepseekV3. - - The patch targets: - - `PreTrainedModel._initialize_weights`: For correct meta device initialization. - - `PreTrainedModel._init_weights`: To implement TorchTitan's specific initialization - for attention, MLP, MoE, embedding, and layer norm layers. - - `DecoderLayer.__init__`: Adds `layer_idx` to attention, MLP, and MoE expert - modules, required for depth-dependent initialization. - """ - - _original_decoder_layer_init = decoder_layer_cls.__init__ - - def _decoder_layer_init_patched(self, config: PretrainedConfig, layer_idx: int): - _original_decoder_layer_init(self, config, layer_idx) - self.layer_idx = layer_idx - - if hasattr(self, "self_attn"): - self.self_attn.layer_idx = layer_idx - - if hasattr(self, "mlp"): - self.mlp.layer_idx = layer_idx - if hasattr(self.mlp, "experts"): - for expert in self.mlp.experts: - expert.layer_idx = layer_idx - if hasattr(self.mlp, "shared_experts"): - # Not all MoE models have shared experts - if self.mlp.shared_experts is not None: - self.mlp.shared_experts.layer_idx = layer_idx - - def _initialize_weights_patched(self, module): - if getattr(module, "_is_hf_initialized", False): - return - for param in module.parameters(recurse=True): - if param.device.type == "meta": - return - self._init_weights(module) - module._is_hf_initialized = True - - def _init_weights_patched(self, module): - """ - Patched version of _init_weights for MoE models. 
- """ - config = self.config - init_std = None - - if isinstance(module, (attention_cls, mlp_cls, moe_cls)): - if hasattr(module, "layer_idx"): - layer_idx = module.layer_idx - if hasattr(config, "depth_init") and config.depth_init: - init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 - else: - # Fallback for models without depth_init - init_std = 0.02 / (2 * config.num_hidden_layers) ** 0.5 - - if isinstance(module, attention_cls): - # Handle different attention projection layer names by initializing if they exist - if hasattr(module, "q_proj"): - nn.init.trunc_normal_(module.q_proj.weight, mean=0.0, std=0.02) - if hasattr(module, "k_proj"): - nn.init.trunc_normal_(module.k_proj.weight, mean=0.0, std=0.02) - if hasattr(module, "v_proj"): - nn.init.trunc_normal_(module.v_proj.weight, mean=0.0, std=0.02) - - if hasattr(module, "q_a_proj"): - nn.init.trunc_normal_(module.q_a_proj.weight, mean=0.0, std=0.02) - if hasattr(module, "q_b_proj"): - nn.init.trunc_normal_(module.q_b_proj.weight, mean=0.0, std=0.02) - - if hasattr(module, "kv_a_proj_with_mqa"): - nn.init.trunc_normal_(module.kv_a_proj_with_mqa.weight, mean=0.0, std=0.02) - if hasattr(module, "kv_b_proj"): - nn.init.trunc_normal_(module.kv_b_proj.weight, mean=0.0, std=0.02) - - if hasattr(module, "o_proj") and init_std is not None: - nn.init.trunc_normal_(module.o_proj.weight, mean=0.0, std=init_std) - - elif isinstance(module, mlp_cls): - nn.init.trunc_normal_(module.gate_proj.weight, mean=0.0, std=0.02) - # DeepseekV3 uses std=0.02 for up_proj, unlike Llama - nn.init.trunc_normal_(module.up_proj.weight, mean=0.0, std=0.02) - if init_std is not None: - nn.init.trunc_normal_(module.down_proj.weight, mean=0.0, std=init_std) - - elif isinstance(module, moe_cls): - if hasattr(module, "gate") and init_std is not None: - nn.init.trunc_normal_(module.gate.weight, mean=0.0, std=init_std) - if hasattr(module, "experts"): - for expert in module.experts: - nn.init.trunc_normal_(expert.gate_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(expert.up_proj.weight, mean=0.0, std=0.02) - if init_std is not None: - nn.init.trunc_normal_(expert.down_proj.weight, mean=0.0, std=init_std) - if hasattr(module, "shared_experts") and module.shared_experts is not None: - nn.init.trunc_normal_(module.shared_experts.gate_proj.weight, mean=0.0, std=0.02) - nn.init.trunc_normal_(module.shared_experts.up_proj.weight, mean=0.0, std=0.02) - if init_std is not None: - nn.init.trunc_normal_(module.shared_experts.down_proj.weight, mean=0.0, std=init_std) - - elif module is getattr(self, "lm_head", None): - final_out_std = config.hidden_size**-0.5 - cutoff_factor = 3 - nn.init.trunc_normal_( - module.weight, - mean=0.0, - std=final_out_std, - a=-cutoff_factor * final_out_std, - b=cutoff_factor * final_out_std, - ) - if module.bias is not None: - module.bias.data.zero_() - - elif isinstance(module, nn.Embedding): - std = config.initializer_range - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - elif "LayerNorm" in module.__class__.__name__ or "RMSNorm" in module.__class__.__name__: - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(1.0) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.zero_() - - decoder_layer_cls.__init__ = _decoder_layer_init_patched - PreTrainedModel._init_weights = _init_weights_patched - PreTrainedModel._initialize_weights = _initialize_weights_patched - - @property - def tok_embeddings(self): 
- """Returns the model's embed_tokens, handling different Hugging Face model structures.""" - if hasattr(self.model, "model") and hasattr(self.model.model, "embed_tokens"): # Llama-like - return self.model.model.embed_tokens - else: - raise AttributeError("Could not find embed_tokens in the model. Please check the model structure.") - - @tok_embeddings.setter - def tok_embeddings(self, value): - if hasattr(self.model, "model") and hasattr(self.model.model, "embed_tokens"): # Llama-like - setattr(self.model.model, "embed_tokens", value) - else: - raise AttributeError("Could not find embed_tokens in the model. Please check the model structure.") - - @property - def layers(self): - """Returns the model's layers, handling different Hugging Face model structures.""" - if hasattr(self.model, "model") and hasattr(self.model.model, "layers"): # Llama-like - return self.model.model.layers - else: - # Add more cases here if needed for other model architectures - raise AttributeError("Could not find layers in the model. Please check the model structure.") - - @layers.setter - def layers(self, value): - if hasattr(self.model, "model") and hasattr(self.model.model, "layers"): # Llama-like - setattr(self.model.model, "layers", value) - else: - raise AttributeError("Could not find layers in the model. Please check the model structure.") - - @property - def norm(self): - """Returns the model's norm, handling different Hugging Face model structures.""" - if hasattr(self.model, "model") and hasattr(self.model.model, "norm"): # Llama-like - return self.model.model.norm - elif hasattr(self.model, "model") and hasattr(self.model.model, "final_layernorm"): # Phi-like - return self.model.model.final_layernorm - else: - raise AttributeError("Could not find norm in the model. Please check the model structure.") - - @norm.setter - def norm(self, value): - if hasattr(self.model, "model") and hasattr(self.model.model, "norm"): # Llama-like - setattr(self.model.model, "norm", value) - elif hasattr(self.model, "model") and hasattr(self.model.model, "final_layernorm"): # Phi-like - setattr(self.model.model, "final_layernorm", value) - else: - raise AttributeError("Could not find norm in the model. Please check the model structure.") - - @property - def output(self): - """Returns the model's output layer, handling different Hugging Face model structures.""" - if hasattr(self.model, "lm_head"): # For models like LlamaForCausalLM - return self.model.lm_head - else: - # Add more cases here if needed for other model architectures - raise AttributeError("Could not find output (lm_head) in the model. Please check the model structure.") - - @output.setter - def output(self, value): - if hasattr(self.model, "lm_head"): # For models like LlamaForCausalLM - setattr(self.model, "lm_head", value) - else: - raise AttributeError("Could not find output (lm_head) in the model. Please check the model structure.") - - @property - def rotary_emb(self): - """Returns the model's rotary_emb, handling different Hugging Face model structures.""" - if hasattr(self.model, "model") and hasattr(self.model.model, "rotary_emb"): # Llama-like - return self.model.model.rotary_emb - else: - raise AttributeError("Could not find rotary_emb in the model. Please check the model structure.") - - @rotary_emb.setter - def rotary_emb(self, value): - if hasattr(self.model, "model") and hasattr(self.model.model, "rotary_emb"): # Llama-like - setattr(self.model.model, "rotary_emb", value) - else: - raise AttributeError("Could not find rotary_emb in the model. 
Please check the model structure.") - - def forward(self, *args, **kwargs): - # local_seq_len = self.max_seq_len - # local_seq_len //= self.cp_mesh.size() if self.cp_mesh is not None and self.cp_mesh.size() > 1 else 1 - # kwargs["position_ids"] = torch.arange(local_seq_len, device=args[0].device).unsqueeze(0) - output = self.model.model(*args, **kwargs) - output = self.model.lm_head(output.last_hidden_state) - return output - - def init_weights(self, *args, **kwargs): - # This method replicates the behavior of the original PreTrainedModel.init_weights, - # but with a custom weight initialization function that skips nn.Identity modules (when PP is enabled) - - if self.model.config.pruned_heads: - logger.info("Pruning heads as per model configuration.") - self.model.prune_heads(self.model.config.pruned_heads) - - original_init_weights_fn = self.model._init_weights - - def selective_init(module): - # For pipeline parallel, we need to skip nn.Identity modules - if not isinstance(module, nn.Identity): - original_init_weights_fn(module) - else: - logger.info("Skipping nn.Identity module during weight initialization.") - - self.model.apply(selective_init) - - self.model.tie_weights() - - def named_children(self): - """ - Provides a flattened view of the model's main components, - making it compatible with TorchTitan's expectations. - """ - yield "tok_embeddings", self.tok_embeddings - yield "layers", self.layers - yield "norm", self.norm - yield "output", self.output - yield "rotary_emb", self.rotary_emb - - def __setattr__(self, name, value): - # If a property with a setter exists for this name, use it. - # This is to bypass the nn.Module.__setattr__ logic that - # directly registers modules and skips property setters. - cls = self.__class__ - if hasattr(cls, name): - prop = getattr(cls, name) - if isinstance(prop, property) and prop.fset is not None: - prop.fset(self, value) - return - - # Otherwise, fall back to the default nn.Module behavior. 
- super().__setattr__(name, value) \ No newline at end of file From 060befe7ae48026d806e82c60d1c26f0b2f4382a Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 30 Oct 2025 10:20:29 +0000 Subject: [PATCH 083/129] refactor args to make it clearer --- .../transformers_backend/model/args.py | 191 +++++++++--------- 1 file changed, 99 insertions(+), 92 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index 4837e9527a..d281d68b3c 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -46,6 +46,28 @@ class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): "n_dense_layers": "first_k_dense_replace", }, } + + # Declarative list of TorchTitan-only attributes (no HF equivalent) + _TT_SPECIFIC_ATTRIBUTES = [ + "multiple_of", + "ffn_dim_multiplier", + "depth_init", + "use_flex_attn", + "attn_mask_type", + ] + + # MoE attributes that should be copied directly + _MOE_SHARED_ATTRIBUTES = [ + "rope_interleave", + "partial_rotary_factor", + "n_group", + "topk_group", + "kv_lora_rank", + "q_lora_rank", + "qk_nope_head_dim", + "qk_rope_head_dim", + "v_head_dim", + ] def __init__( self, @@ -58,101 +80,81 @@ def __init__( super().__init__(attn_implementation=attn_implementation, **kwargs) assert titan_dense_args is not None, "titan_dense_args is required" - active_mappings = {} - - active_mappings.update(self._TT_TO_HF_MAPPINGS["dense"]) + # Create getter/setter dynamically for TT <-> HF attribute mappings + self._create_getter_setter_dynamically(titan_moe_args is not None) + + self._titan_injected_model_args = {} + self._titan_injected_model_args.update(kwargs) + self._configure_hf_attention(attn_implementation) + self._initialize_dense_attributes(titan_dense_args) + if titan_moe_args is not None: - active_mappings.update(self._TT_TO_HF_MAPPINGS["moe"]) - - self._active_mappings = active_mappings - - self._create_dynamic_properties() - - # Set HF attributes from titan_args based on mappings - for titan_name, hf_name in self._active_mappings.items(): + self._initialize_moe_attributes(titan_moe_args) + + def _initialize_dense_attributes(self, titan_dense_args): + """Initialize all dense model attributes.""" + # Set mapped attributes (TorchTitan <-> HuggingFace) + for titan_name, hf_name in self._tt_to_hf_attribute_map.items(): if hasattr(titan_dense_args, titan_name): - setattr(self, hf_name, getattr(titan_dense_args, titan_name)) - - # Fill all TorchTitan-specific args (no HF equivalent) - self.multiple_of = titan_dense_args.multiple_of - self.ffn_dim_multiplier = titan_dense_args.ffn_dim_multiplier - self.depth_init = titan_dense_args.depth_init - self.use_flex_attn = titan_dense_args.use_flex_attn - self.attn_mask_type = titan_dense_args.attn_mask_type - - # HuggingFace specific args + value = getattr(titan_dense_args, titan_name) + setattr(self, hf_name, value) + + # Set TorchTitan-only attributes + for attr_name in self._TT_SPECIFIC_ATTRIBUTES: + if hasattr(titan_dense_args, attr_name): + setattr(self, attr_name, getattr(titan_dense_args, attr_name)) + + # Update passed_args + self._titan_injected_model_args.update(titan_dense_args.__dict__) + + def _initialize_moe_attributes(self, titan_moe_args): + """Initialize all MoE-specific attributes.""" + if titan_moe_args.moe_args is None: + self._titan_injected_model_args.update(titan_moe_args.__dict__) + return + + moe_args = titan_moe_args.moe_args + + # Convert q_lora_rank (0 -> 
None for HuggingFace compatibility) + self.q_lora_rank = None if titan_moe_args.q_lora_rank == 0 else titan_moe_args.q_lora_rank + + # Set core MoE attributes + self.moe_args = moe_args + self.num_experts_per_tok = moe_args.top_k + self.n_routed_experts = moe_args.num_experts + self.n_shared_experts = moe_args.num_shared_experts + self.moe_intermediate_size = titan_moe_args.moe_inter_dim + + # Set remaining architecture-specific MoE attributes + for attr in self._MOE_SHARED_ATTRIBUTES: + if attr == "q_lora_rank": + continue # Already set above + if hasattr(titan_moe_args, attr): + setattr(self, attr, getattr(titan_moe_args, attr)) + + # Track all MoE arguments + self._titan_injected_model_args.update(titan_moe_args.__dict__) + self._titan_injected_model_args.update({ + "num_experts_per_tok": moe_args.top_k, + "n_routed_experts": moe_args.num_experts, + "n_shared_experts": moe_args.num_shared_experts, + "moe_intermediate_size": titan_moe_args.moe_inter_dim, + "q_lora_rank": self.q_lora_rank, + }) + + def _configure_hf_attention(self, attn_implementation: str): + """Configure HuggingFace attention settings.""" + self._titan_injected_model_args["attn_implementation"] = attn_implementation self.attn_implementation = attn_implementation # NOTE:(3outeille):This will force create_causal_mask to return None AttentionInterface._global_mapping[attn_implementation] = sdpa_attention_forward - # Start with passed_args as just titan_args - self._passed_args = { - **titan_dense_args.__dict__, - "attn_implementation": attn_implementation, - } - self._passed_args.update(kwargs) - - # NOTE(3outeille): Wait for transformers uniformization of MoE args - if titan_moe_args is not None: - # For DeepSeekV3, setting q_lora_rank to 0 in TorchTitan is equivalent to - # setting it to None in HuggingFace. 
- q_lora_rank = titan_moe_args.q_lora_rank - if q_lora_rank == 0: - q_lora_rank = None - titan_moe_args.q_lora_rank = q_lora_rank - - self._passed_args.update(**titan_moe_args.__dict__) - - if titan_moe_args.moe_args is not None: - moe_args = titan_moe_args.moe_args - - # Store moe_args for nparams/flops calculation - self.moe_args = moe_args - self.num_experts_per_tok = moe_args.top_k - self.n_routed_experts = moe_args.num_experts - self.n_shared_experts = moe_args.num_shared_experts - self.moe_intermediate_size = titan_moe_args.moe_inter_dim - - # Set MoE-specific attributes directly on config for model access - if hasattr(titan_moe_args, 'rope_interleave'): - self.rope_interleave = titan_moe_args.rope_interleave - if hasattr(titan_moe_args, 'partial_rotary_factor'): - self.partial_rotary_factor = titan_moe_args.partial_rotary_factor - if hasattr(titan_moe_args, 'n_group'): - self.n_group = titan_moe_args.n_group - if hasattr(titan_moe_args, 'topk_group'): - self.topk_group = titan_moe_args.topk_group - if hasattr(titan_moe_args, 'kv_lora_rank'): - self.kv_lora_rank = titan_moe_args.kv_lora_rank - if hasattr(titan_moe_args, 'q_lora_rank'): - self.q_lora_rank = q_lora_rank # Use the modified version (0 -> None) - if hasattr(titan_moe_args, 'qk_nope_head_dim'): - self.qk_nope_head_dim = titan_moe_args.qk_nope_head_dim - if hasattr(titan_moe_args, 'qk_rope_head_dim'): - self.qk_rope_head_dim = titan_moe_args.qk_rope_head_dim - if hasattr(titan_moe_args, 'v_head_dim'): - self.v_head_dim = titan_moe_args.v_head_dim - - self._passed_args.update( - dict( - num_experts_per_tok=moe_args.top_k, - n_routed_experts=moe_args.num_experts, - n_shared_experts=moe_args.num_shared_experts, - moe_intermediate_size=titan_moe_args.moe_inter_dim, - rope_interleave=titan_moe_args.rope_interleave, - partial_rotary_factor=titan_moe_args.partial_rotary_factor, - n_group=titan_moe_args.n_group, - topk_group=titan_moe_args.topk_group, - kv_lora_rank=titan_moe_args.kv_lora_rank, - qk_nope_head_dim=titan_moe_args.qk_nope_head_dim, - qk_rope_head_dim=titan_moe_args.qk_rope_head_dim, - v_head_dim=titan_moe_args.v_head_dim, - ) - ) - - def _create_dynamic_properties(self): - """Create properties dynamically based on active mappings.""" + def _create_getter_setter_dynamically(self, has_moe: bool): + """ + Create properties dynamically based on tt and hf attribute mappings. + For example, creates a property 'dim' that reads/writes to 'hidden_size'. + """ def _create_property(hf_name: str) -> property: def getter(self): @@ -162,8 +164,13 @@ def setter(self, value): setattr(self, hf_name, value) return property(getter, setter) + + # Setup attribute mappings + self._tt_to_hf_attribute_map = dict(self._TT_TO_HF_MAPPINGS["dense"]) + if has_moe: + self._tt_to_hf_attribute_map.update(self._TT_TO_HF_MAPPINGS["moe"]) - for titan_name, hf_name in self._active_mappings.items(): + for titan_name, hf_name in self._tt_to_hf_attribute_map.items(): # Create getter/setter for attribute that don't already exist if not hasattr(self.__class__, titan_name): setattr(self.__class__, titan_name, _create_property(hf_name)) @@ -176,7 +183,7 @@ def __repr__(self) -> str: # displays the arguments passed during initialization. 
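The `_create_property` factory above is what lets the TorchTitan-style names (`dim`, `n_layers`, ...) keep working as aliases for the HuggingFace config fields they map to. A self-contained sketch of the same pattern, using toy class and attribute names rather than the real `HFTransformerModelArgs`:

```python
class AliasedConfig:
    """Toy config: legacy names alias the canonical attributes."""

    _alias_map = {"dim": "hidden_size", "n_layers": "num_hidden_layers"}

    def __init__(self, hidden_size: int, num_hidden_layers: int):
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers


def _make_alias(target_name: str) -> property:
    # The factory captures target_name per property, avoiding the classic
    # late-binding pitfall of closures defined directly inside a loop.
    def getter(self):
        return getattr(self, target_name)

    def setter(self, value):
        setattr(self, target_name, value)

    return property(getter, setter)


for alias, target in AliasedConfig._alias_map.items():
    if not hasattr(AliasedConfig, alias):
        setattr(AliasedConfig, alias, _make_alias(target))

cfg = AliasedConfig(hidden_size=256, num_hidden_layers=6)
assert cfg.dim == 256            # reads hidden_size
cfg.n_layers = 8                 # writes num_hidden_layers
assert cfg.num_hidden_layers == 8
```

Binding `target_name` through a factory function, rather than a closure defined inline in the loop, is what keeps each alias pointing at its own HF attribute.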
args_lines = [ f"{k}={getattr(self, k)!r}" - for k in sorted(self._passed_args.keys()) + for k in sorted(self._titan_injected_model_args.keys()) if hasattr(self, k) ] args_str = "\n".join(args_lines) @@ -191,7 +198,7 @@ def update_from_config(self, job_config: JobConfig): ) # Explicitly update attributes based on mappings - for titan_name, hf_name in self._active_mappings.items(): + for titan_name, hf_name in self._tt_to_hf_attribute_map.items(): if hasattr(hf_model_config, hf_name): setattr(self, titan_name, getattr(hf_model_config, hf_name)) @@ -200,7 +207,7 @@ def update_from_config(self, job_config: JobConfig): setattr(self, key, value) # Update our attributes with the passed args from flavors - for key, value in self._passed_args.items(): + for key, value in self._titan_injected_model_args.items(): if hasattr(self, key) and value is not None: setattr(self, key, value) From 3425b12bf797cf87f99da2482256962020d291f0 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Fri, 31 Oct 2025 09:37:54 +0000 Subject: [PATCH 084/129] add README --- .../transformers_backend/README.md | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 torchtitan/experiments/transformers_backend/README.md diff --git a/torchtitan/experiments/transformers_backend/README.md b/torchtitan/experiments/transformers_backend/README.md new file mode 100644 index 0000000000..650855f28d --- /dev/null +++ b/torchtitan/experiments/transformers_backend/README.md @@ -0,0 +1,51 @@ +# Huggingface Transformers backend + +## Quick start + +- Requirements `transformers==4.55.4` + +- Config: `torchtitan/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml` +```diff +... +[model] +- name = "llama3" ++ name = "transformers_backend" +flavor = "debugmodel" +hf_assets_path = "./tests/assets/tokenizer" + ++[hf_transformers] ++model = "Qwen/Qwen3-4B-Instruct-2507" +... +``` +- Train: `LOG_RANK=7 CONFIG_FILE= + +## Supported Features + +- The following models were tested: + - Dense (FSDP/CP/TP/PP) + - `meta-llama/Llama-3.2-1B` + - `microsoft/phi-2` + - `Qwen/Qwen2.5-7B` + - `mistralai/Mistral-7B-v0.1` + - `ByteDance-Seed/Seed-Coder-8B-Instruct` + - `Qwen/Qwen3-4B-Instruct-2507` + - `arcee-ai/AFM-4.5B` + - `ibm-granite/granite-3b-code-base-2k` + - `baidu/ERNIE-4.5-0.3B-Base-PT` + - `kyutai/helium-1-preview-2b` + - `allenai/OLMo-7B-hf` + - `mistralai/Ministral-8B-Instruct-2410` + - MoE (upcoming) + +## Known issues to address later + +- When using HF modeling, the test `FSDP=2 vs FSDP=2 + PP=2`, the `loss` and `grad_norm` not bitwise matching (but converging) while it is the case with Torchtitan modeling. 
This will be addressed in another PR but the culprit is probably `register_buffer` when loading `seed_checkpoint` +- the HF modeling has lower MFU than Torchtitan MFU + +## Further work + +- Missing `build_optimizers_with_moe_load_balancing` support for MoE +- Missing TP/PP/EP supports for MoE +- Load HF weights +- Add LORA support \ No newline at end of file From 7b0ee5d5d72cac310104936a55ab857277e0d2b7 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Fri, 31 Oct 2025 09:38:47 +0000 Subject: [PATCH 085/129] add requirements.txt --- torchtitan/experiments/transformers_backend/requirements.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 torchtitan/experiments/transformers_backend/requirements.txt diff --git a/torchtitan/experiments/transformers_backend/requirements.txt b/torchtitan/experiments/transformers_backend/requirements.txt new file mode 100644 index 0000000000..6b0cc637db --- /dev/null +++ b/torchtitan/experiments/transformers_backend/requirements.txt @@ -0,0 +1 @@ +transformers==4.55.4 From 3e2222c702bddaf02d0554c68374b984cd05d1d3 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Fri, 31 Oct 2025 09:51:24 +0000 Subject: [PATCH 086/129] fix linting --- .../transformers_backend/README.md | 6 +-- .../transformers_backend/__init__.py | 1 + .../transformers_backend/model/args.py | 46 ++++++++++--------- torchtitan/protocols/train_spec.py | 1 - 4 files changed, 29 insertions(+), 25 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/README.md b/torchtitan/experiments/transformers_backend/README.md index 650855f28d..ce4d7ff7c8 100644 --- a/torchtitan/experiments/transformers_backend/README.md +++ b/torchtitan/experiments/transformers_backend/README.md @@ -4,12 +4,12 @@ - Requirements `transformers==4.55.4` -- Config: `torchtitan/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml` +- Config: `torchtitan/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml` ```diff ... 
[model] - name = "llama3" -+ name = "transformers_backend" ++ name = "transformers_backend" flavor = "debugmodel" hf_assets_path = "./tests/assets/tokenizer" @@ -48,4 +48,4 @@ hf_assets_path = "./tests/assets/tokenizer" - Missing `build_optimizers_with_moe_load_balancing` support for MoE - Missing TP/PP/EP supports for MoE - Load HF weights -- Add LORA support \ No newline at end of file +- Add LORA support diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 1c44b9684c..50e8119b15 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -115,6 +115,7 @@ class TitanMoeModelArgs: ), } + def get_train_spec() -> TrainSpec: return TrainSpec( model_cls=HFTransformerModel, diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index d281d68b3c..b9b79bda04 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -46,7 +46,7 @@ class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): "n_dense_layers": "first_k_dense_replace", }, } - + # Declarative list of TorchTitan-only attributes (no HF equivalent) _TT_SPECIFIC_ATTRIBUTES = [ "multiple_of", @@ -55,7 +55,7 @@ class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): "use_flex_attn", "attn_mask_type", ] - + # MoE attributes that should be copied directly _MOE_SHARED_ATTRIBUTES = [ "rope_interleave", @@ -82,16 +82,16 @@ def __init__( # Create getter/setter dynamically for TT <-> HF attribute mappings self._create_getter_setter_dynamically(titan_moe_args is not None) - + self._titan_injected_model_args = {} self._titan_injected_model_args.update(kwargs) self._configure_hf_attention(attn_implementation) self._initialize_dense_attributes(titan_dense_args) - + if titan_moe_args is not None: self._initialize_moe_attributes(titan_moe_args) - + def _initialize_dense_attributes(self, titan_dense_args): """Initialize all dense model attributes.""" # Set mapped attributes (TorchTitan <-> HuggingFace) @@ -99,12 +99,12 @@ def _initialize_dense_attributes(self, titan_dense_args): if hasattr(titan_dense_args, titan_name): value = getattr(titan_dense_args, titan_name) setattr(self, hf_name, value) - + # Set TorchTitan-only attributes for attr_name in self._TT_SPECIFIC_ATTRIBUTES: if hasattr(titan_dense_args, attr_name): setattr(self, attr_name, getattr(titan_dense_args, attr_name)) - + # Update passed_args self._titan_injected_model_args.update(titan_dense_args.__dict__) @@ -113,35 +113,39 @@ def _initialize_moe_attributes(self, titan_moe_args): if titan_moe_args.moe_args is None: self._titan_injected_model_args.update(titan_moe_args.__dict__) return - + moe_args = titan_moe_args.moe_args - + # Convert q_lora_rank (0 -> None for HuggingFace compatibility) - self.q_lora_rank = None if titan_moe_args.q_lora_rank == 0 else titan_moe_args.q_lora_rank - + self.q_lora_rank = ( + None if titan_moe_args.q_lora_rank == 0 else titan_moe_args.q_lora_rank + ) + # Set core MoE attributes self.moe_args = moe_args self.num_experts_per_tok = moe_args.top_k self.n_routed_experts = moe_args.num_experts self.n_shared_experts = moe_args.num_shared_experts self.moe_intermediate_size = titan_moe_args.moe_inter_dim - + # Set remaining architecture-specific MoE attributes for attr in self._MOE_SHARED_ATTRIBUTES: if attr == "q_lora_rank": continue # Already set above 
if hasattr(titan_moe_args, attr): setattr(self, attr, getattr(titan_moe_args, attr)) - + # Track all MoE arguments self._titan_injected_model_args.update(titan_moe_args.__dict__) - self._titan_injected_model_args.update({ - "num_experts_per_tok": moe_args.top_k, - "n_routed_experts": moe_args.num_experts, - "n_shared_experts": moe_args.num_shared_experts, - "moe_intermediate_size": titan_moe_args.moe_inter_dim, - "q_lora_rank": self.q_lora_rank, - }) + self._titan_injected_model_args.update( + { + "num_experts_per_tok": moe_args.top_k, + "n_routed_experts": moe_args.num_experts, + "n_shared_experts": moe_args.num_shared_experts, + "moe_intermediate_size": titan_moe_args.moe_inter_dim, + "q_lora_rank": self.q_lora_rank, + } + ) def _configure_hf_attention(self, attn_implementation: str): """Configure HuggingFace attention settings.""" @@ -164,7 +168,7 @@ def setter(self, value): setattr(self, hf_name, value) return property(getter, setter) - + # Setup attribute mappings self._tt_to_hf_attribute_map = dict(self._TT_TO_HF_MAPPINGS["dense"]) if has_moe: diff --git a/torchtitan/protocols/train_spec.py b/torchtitan/protocols/train_spec.py index c5bd62793b..1f7899e965 100644 --- a/torchtitan/protocols/train_spec.py +++ b/torchtitan/protocols/train_spec.py @@ -4,7 +4,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import dataclasses from collections.abc import Callable from dataclasses import dataclass from importlib import import_module From 70c348d1409e1f3ed566270a16329baccbe33585 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Sat, 1 Nov 2025 12:48:14 +0000 Subject: [PATCH 087/129] fix bug related to training with different seq_len than max_seq_len --- torchtitan/experiments/transformers_backend/model/args.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index b9b79bda04..285a82c5a8 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -215,6 +215,9 @@ def update_from_config(self, job_config: JobConfig): if hasattr(self, key) and value is not None: setattr(self, key, value) + if hasattr(job_config.training, 'seq_len') and job_config.training.seq_len != self.max_seq_len: + self.max_seq_len = job_config.training.seq_len + # MoE if hasattr(self, "qk_nope_head_dim") and hasattr(self, "qk_rope_head_dim"): self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim From af0a1cb76ed494adf847d8915d4dc38cf52c5497 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Sat, 1 Nov 2025 16:45:52 +0000 Subject: [PATCH 088/129] decouple MoE logic to another PR --- .../transformers_backend/__init__.py | 58 ---- .../transformers_backend/model/args.py | 82 +----- .../transformers_backend/model/model.py | 247 ++---------------- 3 files changed, 25 insertions(+), 362 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 50e8119b15..c4343b8cb7 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -10,7 +10,6 @@ from torchtitan.components.optimizer import build_optimizers from torchtitan.components.tokenizer import build_hf_tokenizer from torchtitan.hf_datasets.text_datasets import build_text_dataloader -from torchtitan.models.moe import MoEArgs from 
torchtitan.protocols.train_spec import TrainSpec from .infra.parallelize_hf_transformers import parallelize_hf_transformers @@ -44,33 +43,6 @@ class TitanDenseModelArgs: use_flex_attn: bool = False attn_mask_type: str = "causal" - -@dataclass -class TitanMoeModelArgs: - """Arguments specific to DeepSeekV3 models.""" - - moe_args: MoEArgs | None = None - n_group: int | None = None - topk_group: int | None = None - inter_dim: int | None = None - moe_inter_dim: int | None = None - n_dense_layers: int | None = None - n_expert_groups: int | None = None - n_limited_groups: int | None = None - q_lora_rank: int | None = None - kv_lora_rank: int | None = None - qk_nope_head_dim: int | None = None - qk_rope_head_dim: int | None = None - v_head_dim: int | None = None - original_seq_len: int | None = None - rope_factor: float | None = None - beta_fast: int | None = None - beta_slow: int | None = None - mscale: float | None = None - partial_rotary_factor: float | None = None - rope_interleave: bool = True - - flavors = { "debugmodel": HFTransformerModelArgs( titan_dense_args=TitanDenseModelArgs( @@ -80,36 +52,6 @@ class TitanMoeModelArgs: n_kv_heads=16, ), ), - "debugmodel_moe": HFTransformerModelArgs( - titan_dense_args=TitanDenseModelArgs( - dim=256, - n_layers=6, - n_heads=16, - n_kv_heads=16, - ), - titan_moe_args=TitanMoeModelArgs( - partial_rotary_factor=4.0, - inter_dim=1024, - moe_inter_dim=256, - n_dense_layers=1, - n_group=2, - topk_group=1, - kv_lora_rank=512, - q_lora_rank=0, - qk_nope_head_dim=128, - qk_rope_head_dim=64, - v_head_dim=128, - mscale=0.70, - moe_args=MoEArgs( - num_experts=8, - num_shared_experts=2, - top_k=3, - score_func="softmax", - route_norm=True, - score_before_experts=False, - ), - ), - ), "full": HFTransformerModelArgs( titan_dense_args=TitanDenseModelArgs(), ), diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index 285a82c5a8..2e90eea854 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -8,10 +8,7 @@ from torch import nn from torchtitan.config import JobConfig -from torchtitan.models.utils import ( - get_dense_model_nparams_and_flops, - get_moe_model_nparams_and_flops, -) +from torchtitan.models.utils import get_dense_model_nparams_and_flops from torchtitan.protocols import BaseModelArgs from transformers import AutoConfig from transformers.configuration_utils import PretrainedConfig @@ -39,12 +36,7 @@ class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): "norm_eps": "rms_norm_eps", "max_seq_len": "max_position_embeddings", "eos_id": "eos_token_id", - }, - "moe": { - # TorchTitan moe model specific mappings (only when titan_moe_args provided) - "inter_dim": "intermediate_size", - "n_dense_layers": "first_k_dense_replace", - }, + } } # Declarative list of TorchTitan-only attributes (no HF equivalent) @@ -56,23 +48,9 @@ class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): "attn_mask_type", ] - # MoE attributes that should be copied directly - _MOE_SHARED_ATTRIBUTES = [ - "rope_interleave", - "partial_rotary_factor", - "n_group", - "topk_group", - "kv_lora_rank", - "q_lora_rank", - "qk_nope_head_dim", - "qk_rope_head_dim", - "v_head_dim", - ] - def __init__( self, titan_dense_args, - titan_moe_args=None, # HuggingFace specific args attn_implementation: str = "sdpa_torchtitan", **kwargs, @@ -81,7 +59,7 @@ def __init__( assert titan_dense_args is not None, "titan_dense_args is required" 
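With the MoE path removed, a flavor entry in the `flavors` dict shown earlier is just an `HFTransformerModelArgs` wrapping a `TitanDenseModelArgs`. As a hedged illustration, assuming the `flavors`, `HFTransformerModelArgs`, and `TitanDenseModelArgs` names from that `__init__.py` are in scope, registering one more debug-sized flavor could look like this (the sizes are made up for the example):

```python
# Hypothetical extra entry next to "debugmodel"; field values are illustrative only.
flavors["debugmodel_tiny"] = HFTransformerModelArgs(
    titan_dense_args=TitanDenseModelArgs(
        dim=128,
        n_layers=2,
        n_heads=8,
        n_kv_heads=8,
    ),
)
```

A training config would then select it with `flavor = "debugmodel_tiny"` under `[model]`, the same way the existing configs select `debugmodel`.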
# Create getter/setter dynamically for TT <-> HF attribute mappings - self._create_getter_setter_dynamically(titan_moe_args is not None) + self._create_getter_setter_dynamically(has_moe=False) self._titan_injected_model_args = {} self._titan_injected_model_args.update(kwargs) @@ -89,9 +67,6 @@ def __init__( self._initialize_dense_attributes(titan_dense_args) - if titan_moe_args is not None: - self._initialize_moe_attributes(titan_moe_args) - def _initialize_dense_attributes(self, titan_dense_args): """Initialize all dense model attributes.""" # Set mapped attributes (TorchTitan <-> HuggingFace) @@ -107,46 +82,6 @@ def _initialize_dense_attributes(self, titan_dense_args): # Update passed_args self._titan_injected_model_args.update(titan_dense_args.__dict__) - - def _initialize_moe_attributes(self, titan_moe_args): - """Initialize all MoE-specific attributes.""" - if titan_moe_args.moe_args is None: - self._titan_injected_model_args.update(titan_moe_args.__dict__) - return - - moe_args = titan_moe_args.moe_args - - # Convert q_lora_rank (0 -> None for HuggingFace compatibility) - self.q_lora_rank = ( - None if titan_moe_args.q_lora_rank == 0 else titan_moe_args.q_lora_rank - ) - - # Set core MoE attributes - self.moe_args = moe_args - self.num_experts_per_tok = moe_args.top_k - self.n_routed_experts = moe_args.num_experts - self.n_shared_experts = moe_args.num_shared_experts - self.moe_intermediate_size = titan_moe_args.moe_inter_dim - - # Set remaining architecture-specific MoE attributes - for attr in self._MOE_SHARED_ATTRIBUTES: - if attr == "q_lora_rank": - continue # Already set above - if hasattr(titan_moe_args, attr): - setattr(self, attr, getattr(titan_moe_args, attr)) - - # Track all MoE arguments - self._titan_injected_model_args.update(titan_moe_args.__dict__) - self._titan_injected_model_args.update( - { - "num_experts_per_tok": moe_args.top_k, - "n_routed_experts": moe_args.num_experts, - "n_shared_experts": moe_args.num_shared_experts, - "moe_intermediate_size": titan_moe_args.moe_inter_dim, - "q_lora_rank": self.q_lora_rank, - } - ) - def _configure_hf_attention(self, attn_implementation: str): """Configure HuggingFace attention settings.""" self._titan_injected_model_args["attn_implementation"] = attn_implementation @@ -217,10 +152,6 @@ def update_from_config(self, job_config: JobConfig): if hasattr(job_config.training, 'seq_len') and job_config.training.seq_len != self.max_seq_len: self.max_seq_len = job_config.training.seq_len - - # MoE - if hasattr(self, "qk_nope_head_dim") and hasattr(self, "qk_rope_head_dim"): - self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim # Configure HF-specific settings to match TorchTitan settings self.attention_bias = False @@ -242,9 +173,4 @@ def update_from_config(self, job_config: JobConfig): return self def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: - is_moe = hasattr(self, "n_routed_experts") - - if is_moe: - return get_moe_model_nparams_and_flops(self, model, seq_len) - else: - return get_dense_model_nparams_and_flops(self, model, seq_len) + return get_dense_model_nparams_and_flops(self, model, seq_len) diff --git a/torchtitan/experiments/transformers_backend/model/model.py b/torchtitan/experiments/transformers_backend/model/model.py index fd7561611e..8041e54f70 100644 --- a/torchtitan/experiments/transformers_backend/model/model.py +++ b/torchtitan/experiments/transformers_backend/model/model.py @@ -45,55 +45,26 @@ def __init__(self, model_args: HFTransformerModelArgs): model_module, 
f"{model_name_prefix}DecoderLayer", None ) - is_moe = hasattr( - model_args, "n_routed_experts" - ) # TODO(3outeille): check if this is the most reliable to detect a moe model - if is_moe: - moe_cls = getattr(model_module, f"{model_name_prefix}MoE", None) - required_classes = { - "Attention": attention_cls, - "MLP": mlp_cls, - "DecoderLayer": decoder_layer_cls, - "MoE": moe_cls, - } - - if all(required_classes.values()): - logger.info(f"Applying MoE-like patch for {model_name_prefix}") - self._patch_hf_moe_like( - decoder_layer_cls=decoder_layer_cls, - attention_cls=attention_cls, - mlp_cls=mlp_cls, - moe_cls=moe_cls, - ) - else: - missing = [ - name for name, cls in required_classes.items() if not cls - ] - logger.warning( - f"Could not find required classes ({', '.join(missing)}) for MoE patching of {model_name_prefix}. " - "Skipping MoE-like patch." - ) + required_classes = { + "Attention": attention_cls, + "DecoderLayer": decoder_layer_cls, + } + + if all(required_classes.values()): + logger.info(f"Applying Llama-like patch for {model_name_prefix}") + self._patch_hf_llama_like( + decoder_layer_cls=decoder_layer_cls, + attention_cls=attention_cls, + mlp_cls=mlp_cls, # mlp_cls can be None + ) else: - required_classes = { - "Attention": attention_cls, - "DecoderLayer": decoder_layer_cls, - } - - if all(required_classes.values()): - logger.info(f"Applying Llama-like patch for {model_name_prefix}") - self._patch_hf_llama_like( - decoder_layer_cls=decoder_layer_cls, - attention_cls=attention_cls, - mlp_cls=mlp_cls, # mlp_cls can be None - ) - else: - missing = [ - name for name, cls in required_classes.items() if not cls - ] - logger.warning( - f"Could not find required classes ({', '.join(missing)}) for {model_name_prefix}. " - "Skipping Llama-like patch." - ) + missing = [ + name for name, cls in required_classes.items() if not cls + ] + logger.warning( + f"Could not find required classes ({', '.join(missing)}) for {model_name_prefix}. " + "Skipping Llama-like patch." + ) except Exception as e: logger.warning( @@ -103,17 +74,10 @@ def __init__(self, model_args: HFTransformerModelArgs): self.model = model_cls(config=model_args) self.max_seq_len = model_args.max_seq_len + self.cp_mesh = None for layer in self.model.model.layers: - if ( - hasattr(model_args, "first_k_dense_replace") - and layer.layer_idx >= model_args.first_k_dense_replace - ): - layer.moe_enabled = True - else: - layer.moe_enabled = False - - self.cp_mesh = None + layer.moe_enabled = False def set_cp_mesh(self, mesh): self.cp_mesh = mesh @@ -284,175 +248,6 @@ def _init_weights_patched(self, module): PreTrainedModel._init_weights = _init_weights_patched PreTrainedModel._initialize_weights = _initialize_weights_patched - def _patch_hf_moe_like(self, decoder_layer_cls, attention_cls, mlp_cls, moe_cls): - """ - This patch modifies a Hugging Face MoE (Mixture-of-Experts) model's weight - initialization to match the initialization scheme used in TorchTitan, - drawing from patterns in models like DeepseekV3. - - The patch targets: - - `PreTrainedModel._initialize_weights`: For correct meta device initialization. - - `PreTrainedModel._init_weights`: To implement TorchTitan's specific initialization - for attention, MLP, MoE, embedding, and layer norm layers. - - `DecoderLayer.__init__`: Adds `layer_idx` to attention, MLP, and MoE expert - modules, required for depth-dependent initialization. 
- """ - - _original_decoder_layer_init = decoder_layer_cls.__init__ - - def _decoder_layer_init_patched(self, config: PretrainedConfig, layer_idx: int): - _original_decoder_layer_init(self, config, layer_idx) - self.layer_idx = layer_idx - - if hasattr(self, "self_attn"): - self.self_attn.layer_idx = layer_idx - - if hasattr(self, "mlp"): - self.mlp.layer_idx = layer_idx - if hasattr(self.mlp, "experts"): - for expert in self.mlp.experts: - expert.layer_idx = layer_idx - if hasattr(self.mlp, "shared_experts"): - # Not all MoE models have shared experts - if self.mlp.shared_experts is not None: - self.mlp.shared_experts.layer_idx = layer_idx - - def _initialize_weights_patched(self, module): - if getattr(module, "_is_hf_initialized", False): - return - for param in module.parameters(recurse=True): - if param.device.type == "meta": - return - self._init_weights(module) - module._is_hf_initialized = True - - def _init_weights_patched(self, module): - """ - Patched version of _init_weights for MoE models. - """ - config = self.config - init_std = None - - if isinstance(module, (attention_cls, mlp_cls, moe_cls)): - if hasattr(module, "layer_idx"): - layer_idx = module.layer_idx - if hasattr(config, "depth_init") and config.depth_init: - init_std = 0.02 / (2 * (layer_idx + 1)) ** 0.5 - else: - # Fallback for models without depth_init - init_std = 0.02 / (2 * config.num_hidden_layers) ** 0.5 - - if isinstance(module, attention_cls): - # Handle different attention projection layer names by initializing if they exist - if hasattr(module, "q_proj"): - nn.init.trunc_normal_(module.q_proj.weight, mean=0.0, std=0.02) - if hasattr(module, "k_proj"): - nn.init.trunc_normal_(module.k_proj.weight, mean=0.0, std=0.02) - if hasattr(module, "v_proj"): - nn.init.trunc_normal_(module.v_proj.weight, mean=0.0, std=0.02) - - if hasattr(module, "q_a_proj"): - nn.init.trunc_normal_(module.q_a_proj.weight, mean=0.0, std=0.02) - if hasattr(module, "q_b_proj"): - nn.init.trunc_normal_(module.q_b_proj.weight, mean=0.0, std=0.02) - - if hasattr(module, "kv_a_proj_with_mqa"): - nn.init.trunc_normal_( - module.kv_a_proj_with_mqa.weight, mean=0.0, std=0.02 - ) - if hasattr(module, "kv_b_proj"): - nn.init.trunc_normal_(module.kv_b_proj.weight, mean=0.0, std=0.02) - - if hasattr(module, "o_proj") and init_std is not None: - nn.init.trunc_normal_(module.o_proj.weight, mean=0.0, std=init_std) - - elif isinstance(module, mlp_cls): - nn.init.trunc_normal_(module.gate_proj.weight, mean=0.0, std=0.02) - # DeepseekV3 uses std=0.02 for up_proj, unlike Llama - nn.init.trunc_normal_(module.up_proj.weight, mean=0.0, std=0.02) - if init_std is not None: - nn.init.trunc_normal_( - module.down_proj.weight, mean=0.0, std=init_std - ) - - elif isinstance(module, moe_cls): - if hasattr(module, "gate") and init_std is not None: - nn.init.trunc_normal_(module.gate.weight, mean=0.0, std=init_std) - if hasattr(module, "experts"): - for expert in module.experts: - nn.init.trunc_normal_( - expert.gate_proj.weight, mean=0.0, std=0.02 - ) - nn.init.trunc_normal_(expert.up_proj.weight, mean=0.0, std=0.02) - if init_std is not None: - nn.init.trunc_normal_( - expert.down_proj.weight, mean=0.0, std=init_std - ) - if ( - hasattr(module, "shared_experts") - and module.shared_experts is not None - ): - nn.init.trunc_normal_( - module.shared_experts.gate_proj.weight, mean=0.0, std=0.02 - ) - nn.init.trunc_normal_( - module.shared_experts.up_proj.weight, mean=0.0, std=0.02 - ) - if init_std is not None: - nn.init.trunc_normal_( - 
module.shared_experts.down_proj.weight, - mean=0.0, - std=init_std, - ) - - elif module is getattr(self, "lm_head", None): - final_out_std = config.hidden_size**-0.5 - cutoff_factor = 3 - nn.init.trunc_normal_( - module.weight, - mean=0.0, - std=final_out_std, - a=-cutoff_factor * final_out_std, - b=cutoff_factor * final_out_std, - ) - if module.bias is not None: - module.bias.data.zero_() - - elif isinstance(module, nn.Embedding): - # When tie_word_embeddings is True, use lm_head initialization - if ( - hasattr(config, "tie_word_embeddings") - and config.tie_word_embeddings - ): - final_out_std = config.hidden_size**-0.5 - cutoff_factor = 3 - nn.init.trunc_normal_( - module.weight, - mean=0.0, - std=final_out_std, - a=-cutoff_factor * final_out_std, - b=cutoff_factor * final_out_std, - ) - else: - std = config.initializer_range - module.weight.data.normal_(mean=0.0, std=std) - - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - elif ( - "LayerNorm" in module.__class__.__name__ - or "RMSNorm" in module.__class__.__name__ - ): - if hasattr(module, "weight") and module.weight is not None: - module.weight.data.fill_(1.0) - if hasattr(module, "bias") and module.bias is not None: - module.bias.data.zero_() - - decoder_layer_cls.__init__ = _decoder_layer_init_patched - PreTrainedModel._init_weights = _init_weights_patched - PreTrainedModel._initialize_weights = _initialize_weights_patched - @property def tok_embeddings(self): """Returns the model's embed_tokens, handling different Hugging Face model structures.""" From 980a92b9997a61ded55163f7049a303b779bfd00 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 3 Nov 2025 12:07:36 +0000 Subject: [PATCH 089/129] update experiments README --- torchtitan/experiments/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/torchtitan/experiments/README.md b/torchtitan/experiments/README.md index ad1e3ee79c..5db88af3d8 100644 --- a/torchtitan/experiments/README.md +++ b/torchtitan/experiments/README.md @@ -31,3 +31,4 @@ We provide this `experiments/` folder to host experiments that add significant v | [moe_symm_mem_kernels](./moe_symm_mem_kernels/) | TBA | [@kwen2501](https://github.com/kwen2501) | | [gpt_oss](./gpt_oss/) | TBA | [@jianiw](https://github.com/jianiw) | | [compiler_toolkit](./compiler_tookit/) | TBA | [@SherlockNoMad](https://github.com/SherlockNoMad) [@yiming0416](https://github.com/yiming0416) | +| [transformers_backend](./transformers_backend/) | TBA | [@3outeille](https://github.com/3outeille) | From 06b6f24cb5b24bbb6ff6acb1828bbeee5cc606a4 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Mon, 3 Nov 2025 12:22:01 +0000 Subject: [PATCH 090/129] update README to confirm torch.compile support --- torchtitan/experiments/transformers_backend/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchtitan/experiments/transformers_backend/README.md b/torchtitan/experiments/transformers_backend/README.md index ce4d7ff7c8..be819e223a 100644 --- a/torchtitan/experiments/transformers_backend/README.md +++ b/torchtitan/experiments/transformers_backend/README.md @@ -23,7 +23,7 @@ hf_assets_path = "./tests/assets/tokenizer" ## Supported Features - The following models were tested: - - Dense (FSDP/CP/TP/PP) + - Dense (FSDP/CP/TP/PP/`torch.compile`) - `meta-llama/Llama-3.2-1B` - `microsoft/phi-2` - `Qwen/Qwen2.5-7B` From a70c4c4e36310fbd90cf5095ceee4f83cbb31742 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 4 Nov 2025 10:07:20 +0000 Subject: [PATCH 091/129] custom job_config --- 
torchtitan/config/job_config.py | 7 ------- torchtitan/experiments/transformers_backend/README.md | 3 ++- .../infra/parallelize_hf_transformers.py | 3 ++- .../transformers_backend/infra/pipeline_hf.py | 2 +- .../experiments/transformers_backend/job_config.py | 10 ++++++++++ .../experiments/transformers_backend/model/args.py | 2 +- 6 files changed, 16 insertions(+), 11 deletions(-) create mode 100644 torchtitan/experiments/transformers_backend/job_config.py diff --git a/torchtitan/config/job_config.py b/torchtitan/config/job_config.py index ee89d13627..7fe6802374 100644 --- a/torchtitan/config/job_config.py +++ b/torchtitan/config/job_config.py @@ -131,12 +131,6 @@ class Model: """ -@dataclass -class HFTransformers: - model: str = "" - """HuggingFace model ID (e.g., 'Qwen/Qwen3-4B-Instruct-2507')""" - - @dataclass class Optimizer: name: str = "AdamW" @@ -903,7 +897,6 @@ class JobConfig: profiling: Profiling = field(default_factory=Profiling) metrics: Metrics = field(default_factory=Metrics) model: Model = field(default_factory=Model) - hf_transformers: HFTransformers = field(default_factory=HFTransformers) optimizer: Optimizer = field(default_factory=Optimizer) lr_scheduler: LRScheduler = field(default_factory=LRScheduler) training: Training = field(default_factory=Training) diff --git a/torchtitan/experiments/transformers_backend/README.md b/torchtitan/experiments/transformers_backend/README.md index be819e223a..8fbd19f0e8 100644 --- a/torchtitan/experiments/transformers_backend/README.md +++ b/torchtitan/experiments/transformers_backend/README.md @@ -17,7 +17,8 @@ hf_assets_path = "./tests/assets/tokenizer" +model = "Qwen/Qwen3-4B-Instruct-2507" ... ``` -- Train: `LOG_RANK=7 CONFIG_FILE= ## Supported Features diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py index d1d8d4c480..27730a5914 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py @@ -24,7 +24,8 @@ RowwiseParallel, SequenceParallel, ) -from torchtitan.config import JobConfig, TORCH_DTYPE_MAP +from torchtitan.experiments.transformers_backend.job_config import JobConfig +from torchtitan.config import TORCH_DTYPE_MAP from torchtitan.config.job_config import ActivationCheckpoint as ACConfig from torchtitan.distributed import NoParallel, ParallelDims diff --git a/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py b/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py index ee7b268f9d..088cc05642 100644 --- a/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py +++ b/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py @@ -19,7 +19,7 @@ ) from torchtitan.components.loss import LossFunction -from torchtitan.config import JobConfig +from torchtitan.experiments.transformers_backend.job_config import JobConfig from torchtitan.distributed import ParallelDims from torchtitan.distributed.pipeline_parallel import build_pipeline_schedule from torchtitan.protocols.train_spec import BaseModelArgs, ParallelizeFunction diff --git a/torchtitan/experiments/transformers_backend/job_config.py b/torchtitan/experiments/transformers_backend/job_config.py new file mode 100644 index 0000000000..6344529d20 --- /dev/null +++ b/torchtitan/experiments/transformers_backend/job_config.py @@ -0,0 +1,10 @@ +from dataclasses import dataclass, field + +@dataclass 
+class HFTransformers: + model: str = "" + """HuggingFace model ID (e.g., 'Qwen/Qwen3-4B-Instruct-2507')""" + +@dataclass +class JobConfig: + hf_transformers: HFTransformers = field(default_factory=HFTransformers) \ No newline at end of file diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index 2e90eea854..668fa48aeb 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -7,7 +7,7 @@ from dataclasses import dataclass from torch import nn -from torchtitan.config import JobConfig +from torchtitan.experiments.transformers_backend.job_config import JobConfig from torchtitan.models.utils import get_dense_model_nparams_and_flops from torchtitan.protocols import BaseModelArgs from transformers import AutoConfig From 42884cda72e5ffab1f7e216ac6b789a93f353e36 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 4 Nov 2025 10:18:27 +0000 Subject: [PATCH 092/129] remove unecessary change in train_spec --- torchtitan/protocols/train_spec.py | 1 - 1 file changed, 1 deletion(-) diff --git a/torchtitan/protocols/train_spec.py b/torchtitan/protocols/train_spec.py index 1f7899e965..22bfa7df9b 100644 --- a/torchtitan/protocols/train_spec.py +++ b/torchtitan/protocols/train_spec.py @@ -51,7 +51,6 @@ class TrainSpec: build_dataloader_fn: DataLoaderBuilder build_tokenizer_fn: TokenizerBuilder | None build_loss_fn: LossFunctionBuilder - name: str | None = None build_validator_fn: ValidatorBuilder | None = None build_metrics_processor_fn: MetricsProcessorBuilder | None = None state_dict_adapter: type[BaseStateDictAdapter] | None = None From 4fa0874f4ea4ab79735f71760695be745bf1f247 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 4 Nov 2025 10:26:41 +0000 Subject: [PATCH 093/129] rename file to comply with torchtitan style --- torchtitan/experiments/transformers_backend/__init__.py | 4 ++-- .../infra/{parallelize_hf_transformers.py => parallelize.py} | 0 .../infra/{pipeline_hf.py => pipeline.py} | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename torchtitan/experiments/transformers_backend/infra/{parallelize_hf_transformers.py => parallelize.py} (100%) rename torchtitan/experiments/transformers_backend/infra/{pipeline_hf.py => pipeline.py} (100%) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index c4343b8cb7..b72b77760c 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -12,9 +12,9 @@ from torchtitan.hf_datasets.text_datasets import build_text_dataloader from torchtitan.protocols.train_spec import TrainSpec -from .infra.parallelize_hf_transformers import parallelize_hf_transformers +from .infra.parallelize import parallelize_hf_transformers -from .infra.pipeline_hf import pipeline_hf_transformers +from .infra.pipeline import pipeline_hf_transformers from .model.args import HFTransformerModelArgs from .model.model import HFTransformerModel diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py b/torchtitan/experiments/transformers_backend/infra/parallelize.py similarity index 100% rename from torchtitan/experiments/transformers_backend/infra/parallelize_hf_transformers.py rename to torchtitan/experiments/transformers_backend/infra/parallelize.py diff --git a/torchtitan/experiments/transformers_backend/infra/pipeline_hf.py 
b/torchtitan/experiments/transformers_backend/infra/pipeline.py similarity index 100% rename from torchtitan/experiments/transformers_backend/infra/pipeline_hf.py rename to torchtitan/experiments/transformers_backend/infra/pipeline.py From 8ffa7f4dc731b5c1e29c7c650540ede0b4cd456f Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 4 Nov 2025 10:37:53 +0000 Subject: [PATCH 094/129] reuse ac form torchtitan --- .../transformers_backend/infra/parallelize.py | 112 +----------------- 1 file changed, 1 insertion(+), 111 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize.py b/torchtitan/experiments/transformers_backend/infra/parallelize.py index 27730a5914..27ff2718be 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize.py @@ -37,117 +37,7 @@ from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp from torchtitan.tools.logging import logger -# for selective op activation checkpointing -_save_list = { - torch.ops.aten.mm.default, - torch.ops.aten._scaled_dot_product_efficient_attention.default, - torch.ops.aten._scaled_dot_product_flash_attention.default, - torch._higher_order_ops.flex_attention, - torch.ops._c10d_functional.reduce_scatter_tensor.default, - # for low precision training, it's useful to always save - # the result of max, since the absolute maximum is - # used to compute the scaling factor for quantization. - torch.ops.aten.max.default, -} - - -def _apply_ac_to_transformer_block( - module: nn.Module, ac_config: ACConfig, *, base_fqn: Optional[str] = None -): - valid_ac_modes = ("full", "selective") - if ac_config.mode not in valid_ac_modes: - raise ValueError( - f"Invalid AC mode: {ac_config.mode}. Valid modes: {valid_ac_modes}" - ) - - if ac_config.mode == "full": - return ptd_checkpoint_wrapper(module, preserve_rng_state=False) - - assert ac_config.mode == "selective", f"{ac_config.mode}" - use_op_sac = ac_config.selective_ac_option == "op" - use_layer_sac = ac_config.selective_ac_option.isdigit() - if not use_op_sac and not use_layer_sac: - raise ValueError( - f"Invalid selective AC option: {ac_config.selective_ac_option}. 
" - f"Valid options: 'op' or a positive int representing layer frequency" - ) - if use_op_sac: - from torch.utils.checkpoint import ( - CheckpointPolicy, - create_selective_checkpoint_contexts, - ) - - mm_recompute_shapes = set() - if len(ac_config.per_op_sac_force_recompute_mm_shapes_by_fqns) > 0: - for module_fqn, submod in module.named_modules(): - fqn = module_fqn - if base_fqn is not None: - fqn = f"{base_fqn}.{module_fqn}" - if not any( - filter_fqn in fqn - for filter_fqn in ac_config.per_op_sac_force_recompute_mm_shapes_by_fqns - ): - continue - if not isinstance(submod, nn.Linear): - raise ValueError( - "per_op_sac_force_recompute_mm_shapes_by_fqns expected to match " - f"a nn.Linear, but got: {submod}" - ) - out_f, in_f = submod.weight.shape - mm_recompute_shapes.add((in_f, out_f)) - logger.debug( - f"Selective op AC force recomputing mms with rhs shapes {mm_recompute_shapes}" - ) - - def _get_custom_policy(meta): - def _custom_policy(ctx, func, *args, **kwargs): - mode = "recompute" if ctx.is_recompute else "forward" - mm_count_key = f"{mode}_mm_count" - if func == torch.ops.aten.mm.default: - if args[1].shape in mm_recompute_shapes: - return CheckpointPolicy.PREFER_RECOMPUTE - meta[mm_count_key] += 1 - # Saves output of all compute ops, except every second mm - to_save = func in _save_list and not ( - func == torch.ops.aten.mm.default and meta[mm_count_key] % 2 == 0 - ) - return ( - CheckpointPolicy.MUST_SAVE - if to_save - else CheckpointPolicy.PREFER_RECOMPUTE - ) - - return _custom_policy - - def selective_checkpointing_context_fn(): - meta = defaultdict(int) - return create_selective_checkpoint_contexts(_get_custom_policy(meta)) - - return ptd_checkpoint_wrapper( - module, - context_fn=selective_checkpointing_context_fn, - preserve_rng_state=False, - ) - elif use_layer_sac: - # Checkpoint every `ac_freq` of the modules passed to this function - ac_freq = int(ac_config.selective_ac_option) - ptd_checkpoint_wrapper.__dict__.setdefault("_count", 0) - ptd_checkpoint_wrapper._count += 1 - if not ac_freq or ptd_checkpoint_wrapper._count % ac_freq == 0: - return ptd_checkpoint_wrapper(module, preserve_rng_state=False) - else: - return module - - -def apply_ac(model: nn.Module, ac_config: ACConfig): - """Apply activation checkpointing to the model.""" - for layer_id, transformer_block in model.layers.named_children(): - transformer_block = _apply_ac_to_transformer_block( - transformer_block, ac_config, base_fqn=f"layers.{layer_id}" - ) - model.layers.register_module(layer_id, transformer_block) - - logger.info(f"Applied {ac_config.mode} activation checkpointing to the model") +from torchtitan.distributed.activation_checkpoint import apply_ac def apply_ddp( From ff21c2be00de5c5a9134baca6dd11bf3df5b6322 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 4 Nov 2025 10:38:58 +0000 Subject: [PATCH 095/129] reuse ddp from torchtitan --- .../transformers_backend/infra/parallelize.py | 20 +------------------ 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize.py b/torchtitan/experiments/transformers_backend/infra/parallelize.py index 27ff2718be..5a8cf94791 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize.py @@ -38,25 +38,7 @@ from torchtitan.tools.logging import logger from torchtitan.distributed.activation_checkpoint import apply_ac - - -def apply_ddp( - model: nn.Module, - dp_mesh: DeviceMesh, - enable_compile: bool, - 
enable_compiled_autograd: bool, -): - if enable_compile: - if enable_compiled_autograd: - torch._dynamo.config.optimize_ddp = ( - "python_reducer_without_compiled_forward" - ) - else: - torch._dynamo.config.optimize_ddp = "ddp_optimizer" - - replicate(model, device_mesh=dp_mesh, bucket_cap_mb=100) - - logger.info("Applied DDP to the model") +from torchtitan.models.llama3.infra.parallelize import apply_ddp def parallelize_hf_transformers( From 0a43a8a96a69bd77bff758b0d29e19166ecc5080 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 4 Nov 2025 10:40:13 +0000 Subject: [PATCH 096/129] reuse compile from torchtitan llama3 --- .../transformers_backend/infra/parallelize.py | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize.py b/torchtitan/experiments/transformers_backend/infra/parallelize.py index 5a8cf94791..2aca64fdcf 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize.py @@ -39,6 +39,7 @@ from torchtitan.distributed.activation_checkpoint import apply_ac from torchtitan.models.llama3.infra.parallelize import apply_ddp +from torchtitan.models.llama3.infra.parallelize import apply_compile def parallelize_hf_transformers( @@ -564,20 +565,4 @@ def apply_moe_ep_tp( module=moe_block.experts, device_mesh=experts_mesh, parallelize_plan=experts_plan, - ) - - -def apply_compile(model: nn.Module): - """ - Apply torch.compile to each TransformerBlock, which makes compilation efficient due to - repeated structure. Alternatively one can compile the whole model (after applying DP). - """ - for layer_id, transformer_block in model.layers.named_children(): - # TODO: remove when torch.compile supports fullgraph=True for MoE - fullgraph = True - if transformer_block.moe_enabled: - fullgraph = False - transformer_block = torch.compile(transformer_block, fullgraph=fullgraph) - model.layers.register_module(layer_id, transformer_block) - - logger.info("Compiling each TransformerBlock with torch.compile") + ) \ No newline at end of file From 8026bc7898e8880a14e8dfa0392fa62b569d7633 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 4 Nov 2025 10:49:53 +0000 Subject: [PATCH 097/129] reuse compile from torchtitan --- .../transformers_backend/infra/parallelize.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize.py b/torchtitan/experiments/transformers_backend/infra/parallelize.py index 2aca64fdcf..4eac61b74e 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize.py @@ -4,15 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
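These last few patches replace the local copies of the activation-checkpointing, DDP, and compile helpers with the shared torchtitan implementations. A simplified sketch of how the parallelize entry point ends up calling them once this series of changes lands (tensor parallelism and the FSDP/HSDP branch are omitted for brevity; argument names follow the shared helpers as used in these diffs):

```python
from torchtitan.distributed.activation_checkpoint import apply_ac
from torchtitan.models.llama3.infra.parallelize import apply_compile, apply_ddp


def parallelize_sketch(model, world_mesh, parallel_dims, job_config):
    # AC wrapping first, then per-block compile, then data parallelism,
    # mirroring the order used in parallelize_hf_transformers.
    if job_config.activation_checkpoint.mode != "none":
        apply_ac(model, job_config.activation_checkpoint)

    model_compile_enabled = (
        job_config.compile.enable and "model" in job_config.compile.components
    )
    if model_compile_enabled:
        apply_compile(model, job_config.compile)

    # DDP path only; FSDP/HSDP handling is elided in this sketch.
    if parallel_dims.dp_replicate_enabled and not parallel_dims.fsdp_enabled:
        apply_ddp(model, world_mesh, enable_compile=model_compile_enabled)

    return model
```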
-from collections import defaultdict -from typing import Optional - import torch import torch.nn as nn -from torch.distributed._composable.replicate import replicate -from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( - checkpoint_wrapper as ptd_checkpoint_wrapper, -) from torch.distributed.device_mesh import DeviceMesh from torch.distributed.fsdp import CPUOffloadPolicy, fully_shard, MixedPrecisionPolicy from torch.distributed.tensor import Partial, Replicate, Shard @@ -26,7 +19,6 @@ ) from torchtitan.experiments.transformers_backend.job_config import JobConfig from torchtitan.config import TORCH_DTYPE_MAP -from torchtitan.config.job_config import ActivationCheckpoint as ACConfig from torchtitan.distributed import NoParallel, ParallelDims from torchtitan.distributed.expert_parallel import ( @@ -113,7 +105,7 @@ def parallelize_hf_transformers( if model_compile_enabled: # NOTE: needed for torch.compile to work with dynamic shapes in token-choice MoE torch._dynamo.config.capture_scalar_outputs = True - apply_compile(model) + apply_compile(model, job_config.compile) dp_mesh: DeviceMesh | None = None if parallel_dims.fsdp_enabled or parallel_dims.ep_enabled: From cd4042fa6cac6c1b31ff23fc1885c45f8261899e Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 4 Nov 2025 11:26:27 +0000 Subject: [PATCH 098/129] update parallelize with main --- .../transformers_backend/infra/parallelize.py | 49 +++---------------- 1 file changed, 6 insertions(+), 43 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize.py b/torchtitan/experiments/transformers_backend/infra/parallelize.py index 4eac61b74e..276cf94bcd 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize.py @@ -57,9 +57,6 @@ def parallelize_hf_transformers( ({parallel_dims.tp}) and 2 * CP degree ({parallel_dims.cp}). 
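The divisibility requirement stated just above is easy to validate up front. A tiny, hypothetical pre-flight check (not part of this module) that mirrors it:

```python
def check_seq_len(seq_len: int, tp_degree: int, cp_degree: int) -> None:
    # Mirrors the docstring: seq_len must divide evenly by TP and by 2 * CP.
    if tp_degree > 1 and seq_len % tp_degree != 0:
        raise ValueError(f"seq_len={seq_len} is not divisible by TP degree {tp_degree}")
    if cp_degree > 1 and seq_len % (2 * cp_degree) != 0:
        raise ValueError(f"seq_len={seq_len} is not divisible by 2 * CP degree ({2 * cp_degree})")


check_seq_len(2048, tp_degree=2, cp_degree=2)  # passes: 2048 % 2 == 0 and 2048 % 4 == 0
```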
""" - if job_config.parallelism.context_parallel_degree > 1: - logger.warning("CP support for FlexAttention is still in progress.") - if parallel_dims.tp_enabled: enable_float8_linear = "float8" in job_config.model.converters float8_is_rowwise = job_config.quantize.linear.float8.recipe_name in ( @@ -80,64 +77,32 @@ def parallelize_hf_transformers( ) maybe_enable_async_tp(job_config, world_mesh["tp"]) - if parallel_dims.tp_enabled or parallel_dims.ep_enabled: - apply_moe_ep_tp( - model, - tp_mesh=world_mesh["tp"] if parallel_dims.tp_enabled else None, - ep_mesh=world_mesh["ep"] if parallel_dims.ep_enabled else None, - ep_tp_mesh=( - world_mesh["ep", "tp"] - if parallel_dims.tp_enabled - and parallel_dims.ep_enabled - and parallel_dims.etp_enabled - else None - ), - etp_enabled=parallel_dims.etp_enabled, - ) + model_compile_enabled = ( + job_config.compile.enable and "model" in job_config.compile.components + ) if job_config.activation_checkpoint.mode != "none": apply_ac(model, job_config.activation_checkpoint) - model_compile_enabled = ( - job_config.compile.enable and "model" in job_config.compile.components - ) # turn on per-TransformerBlock compile after AC wrapping and before FSDP if model_compile_enabled: - # NOTE: needed for torch.compile to work with dynamic shapes in token-choice MoE - torch._dynamo.config.capture_scalar_outputs = True apply_compile(model, job_config.compile) - dp_mesh: DeviceMesh | None = None - if parallel_dims.fsdp_enabled or parallel_dims.ep_enabled: + if parallel_dims.fsdp_enabled: # apply FSDP or HSDP, potentially with Context Parallel if parallel_dims.dp_replicate_enabled: dp_mesh_dim_names = ("dp_replicate", "dp_shard_cp") else: dp_mesh_dim_names = ("dp_shard_cp",) - dp_mesh = world_mesh[tuple(dp_mesh_dim_names)] - - # the mesh dim names of which the MoE params are sharded on via FSDP/HSDP - dp_mod_ep_mesh_dim_names = [] - if parallel_dims.ep_enabled: - if parallel_dims.dp_replicate_enabled: - dp_mod_ep_mesh_dim_names.append("dp_replicate") - dp_mod_ep_mesh_dim_names.append("dp_shard_mod_ep") apply_fsdp( model, - dp_mesh, + world_mesh[tuple(dp_mesh_dim_names)], param_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_param], reduce_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_reduce], pp_enabled=parallel_dims.pp_enabled, cpu_offload=job_config.training.enable_cpu_offload, reshard_after_forward_policy=job_config.parallelism.fsdp_reshard_after_forward, - ep_degree=parallel_dims.ep, - dp_mod_ep_mesh=( - world_mesh[tuple(dp_mod_ep_mesh_dim_names)] - if parallel_dims.ep_enabled - else None - ), - gradient_divide_factor=parallel_dims.fsdp_gradient_divide_factor, ) if parallel_dims.dp_replicate_enabled: @@ -154,12 +119,10 @@ def parallelize_hf_transformers( elif parallel_dims.dp_replicate_enabled: if world_mesh.ndim > 1: raise RuntimeError("DDP has not supported > 1D parallelism") - dp_mesh = world_mesh apply_ddp( model, - dp_mesh, + world_mesh, enable_compile=model_compile_enabled, - enable_compiled_autograd=job_config.parallelism.enable_compiled_autograd, ) return model From 0700bdbe46ef83b31dac0d3315e9433dc87f2702 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 4 Nov 2025 12:16:22 +0000 Subject: [PATCH 099/129] remove moe ep tp for now --- .../transformers_backend/infra/parallelize.py | 84 ------------------- 1 file changed, 84 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize.py b/torchtitan/experiments/transformers_backend/infra/parallelize.py index 276cf94bcd..87099f883a 100644 --- 
a/torchtitan/experiments/transformers_backend/infra/parallelize.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize.py @@ -437,87 +437,3 @@ def apply_fsdp( ) elif model.tok_embeddings is not None: transformer_block.set_modules_to_backward_prefetch([model.tok_embeddings]) - - -def apply_moe_ep_tp( - model: nn.Module, - tp_mesh: DeviceMesh | None, - ep_mesh: DeviceMesh | None, - ep_tp_mesh: DeviceMesh | None, - etp_enabled: bool, -): - for transformer_block in model.layers: - if not transformer_block.moe_enabled: - continue - - moe_block = transformer_block.mlp - if tp_mesh is not None: - moe_layer_plan = { - # input / output sharding on the seqlen dim - # all-gather for input, reduce-scatter for output - "mlp": PrepareModuleInputOutput( - input_layouts=(Shard(1),), - desired_input_layouts=(Replicate(),), - use_local_input=True, - output_layouts=(Partial(),), - desired_output_layouts=(Shard(1),), - ), - # replicate computation for the router - "mlp.gate": NoParallel(), - } - if ep_mesh is not None and not etp_enabled: - # If TP is borrowed for EP, then split the tokens across TP ranks so that - # the reorderer, the all-to-all comms, and routed experts computation - # are effectively running Sequence Parallel (split along the folded bs*slen dim) - moe_layer_plan.update({"mlp.reorderer": ReordererSequenceParallel()}) - if moe_block.shared_experts is not None: - # input Replicate, output Partial - moe_layer_plan.update( - { - "mlp.shared_experts.gate_proj": ColwiseParallel(), - "mlp.shared_experts.up_proj": ColwiseParallel(), - "mlp.shared_experts.down_proj": RowwiseParallel( - output_layouts=Partial() - ), - } - ) - parallelize_module( - module=transformer_block, - device_mesh=tp_mesh, - parallelize_plan=moe_layer_plan, - ) - - if ep_mesh is None: # This is the TP-only case for experts - experts_mesh = tp_mesh - expert_tp_plan = {} - for i in range(len(moe_block.experts)): - expert_tp_plan.update( - { - f"{i}.gate_proj": ColwiseParallel(), - f"{i}.up_proj": ColwiseParallel(), - f"{i}.down_proj": RowwiseParallel(output_layouts=Partial()), - } - ) - parallelize_module( - module=moe_block.experts, - device_mesh=experts_mesh, - parallelize_plan=expert_tp_plan, - ) - else: # EP or ETP enabled - experts_mesh, experts_plan = None, None - if tp_mesh is None: - experts_mesh = ep_mesh - # input / output sharding on the batch / tokens dim - experts_plan = ExpertParallel() - elif etp_enabled: - experts_mesh = ep_tp_mesh - experts_plan = ExpertTensorParallel(tp_mesh=tp_mesh, ep_mesh=ep_mesh) - else: - experts_mesh = ep_mesh - experts_plan = ExpertParallel() - - parallelize_module( - module=moe_block.experts, - device_mesh=experts_mesh, - parallelize_plan=experts_plan, - ) \ No newline at end of file From 767f71d610d16e25bdd0f6498cb5a4b2c683bc8f Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 5 Nov 2025 12:11:11 +0000 Subject: [PATCH 100/129] fix SequenceParallel for q and k norm --- .../transformers_backend/infra/parallelize.py | 20 +++++-------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize.py b/torchtitan/experiments/transformers_backend/infra/parallelize.py index 87099f883a..db78f7ea24 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize.py @@ -8,12 +8,11 @@ import torch.nn as nn from torch.distributed.device_mesh import DeviceMesh from torch.distributed.fsdp import CPUOffloadPolicy, fully_shard, 
MixedPrecisionPolicy -from torch.distributed.tensor import Partial, Replicate, Shard +from torch.distributed.tensor import Replicate, Shard from torch.distributed.tensor.parallel import ( ColwiseParallel, parallelize_module, PrepareModuleInput, - PrepareModuleInputOutput, RowwiseParallel, SequenceParallel, ) @@ -21,11 +20,6 @@ from torchtitan.config import TORCH_DTYPE_MAP from torchtitan.distributed import NoParallel, ParallelDims -from torchtitan.distributed.expert_parallel import ( - ExpertParallel, - ExpertTensorParallel, - ReordererSequenceParallel, -) from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp from torchtitan.tools.logging import logger @@ -33,7 +27,6 @@ from torchtitan.models.llama3.infra.parallelize import apply_ddp from torchtitan.models.llama3.infra.parallelize import apply_compile - def parallelize_hf_transformers( model: nn.Module, parallel_dims: ParallelDims, @@ -230,13 +223,10 @@ def apply_non_moe_tp( layer_plan[f"self_attn.{o_proj_name}"] = rowwise_parallel( output_layouts=Shard(1) ) - - # For Qwen3 RMSNorm on Q and K - # TODO(3outeille): we should probably shard(1) then replicate => then use SequenceParallel but for now I am fed up - if hasattr(transformer_block.self_attn, "q_norm"): - layer_plan["self_attn.q_norm"] = NoParallel() - if hasattr(transformer_block.self_attn, "k_norm"): - layer_plan["self_attn.k_norm"] = NoParallel() + #For model that uses RMSNorm on Q and K (i.e. Qwen3) + if hasattr(transformer_block.self_attn, "q_norm") and hasattr(transformer_block.self_attn, "k_norm"): + layer_plan["self_attn.q_norm"] = SequenceParallel(sequence_dim=2, use_local_output=True) + layer_plan["self_attn.k_norm"] = SequenceParallel(sequence_dim=2, use_local_output=True) if not transformer_block.moe_enabled: mlp_plan = { From 7f71f885eb76322d7ea03955a6e619be0e23921c Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 5 Nov 2025 12:24:29 +0000 Subject: [PATCH 101/129] job_config.training will always have seq_len --- torchtitan/experiments/transformers_backend/model/args.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index 668fa48aeb..4c9ffcae72 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -150,8 +150,7 @@ def update_from_config(self, job_config: JobConfig): if hasattr(self, key) and value is not None: setattr(self, key, value) - if hasattr(job_config.training, 'seq_len') and job_config.training.seq_len != self.max_seq_len: - self.max_seq_len = job_config.training.seq_len + self.max_seq_len = job_config.training.seq_len # Configure HF-specific settings to match TorchTitan settings self.attention_bias = False From 7e63a82541b226fa275716522e72371c412610ba Mon Sep 17 00:00:00 2001 From: 3outeille Date: Fri, 7 Nov 2025 16:06:44 +0000 Subject: [PATCH 102/129] fix loading weights in PP by using Module Dict --- .../transformers_backend/model/model.py | 34 ++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/torchtitan/experiments/transformers_backend/model/model.py b/torchtitan/experiments/transformers_backend/model/model.py index 8041e54f70..e0d5628f1a 100644 --- a/torchtitan/experiments/transformers_backend/model/model.py +++ b/torchtitan/experiments/transformers_backend/model/model.py @@ -16,6 +16,30 @@ from .args import HFTransformerModelArgs +class SlicableModuleDict(nn.ModuleDict): + """ + A 
ModuleDict that supports slicing like ModuleList. + Keys are expected to be string representations of integers (e.g., "0", "1", "2"). + """ + + def __getitem__(self, key): + if isinstance(key, slice): + # Handle slicing: convert slice to list of keys + keys = sorted(self.keys(), key=lambda x: int(x) if x.isdigit() else float('inf')) + sliced_keys = keys[key] + # Return a new SlicableModuleDict with the sliced items + return SlicableModuleDict({k: self[k] for k in sliced_keys}) + return super().__getitem__(key) + + def __iter__(self): + # Iterate over values in sorted order by key (as integers) + keys = sorted(self.keys(), key=lambda x: int(x) if x.isdigit() else float('inf')) + for key in keys: + yield self[key] + + def __len__(self): + return len(self._modules) + class HFTransformerModel(nn.Module): def __init__(self, model_args: HFTransformerModelArgs): @@ -76,7 +100,15 @@ def __init__(self, model_args: HFTransformerModelArgs): self.max_seq_len = model_args.max_seq_len self.cp_mesh = None - for layer in self.model.model.layers: + # Convert ModuleList to ModuleDict to preserve original indices + # This ensures state dict keys match checkpoint keys + if isinstance(self.model.model.layers, nn.ModuleList): + self.model.model.layers = SlicableModuleDict({ + str(i): layer + for i, layer in enumerate(self.model.model.layers) + }) + + for layer in self.model.model.layers.values(): layer.moe_enabled = False def set_cp_mesh(self, mesh): From 04fb8eb9c1ab022e8cc9d75a0363c66689c71b89 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 13 Nov 2025 09:21:54 +0000 Subject: [PATCH 103/129] clean reference qwen config --- .../transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml b/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml index d1433bb7ed..13e3f4ddf0 100644 --- a/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml +++ b/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml @@ -6,7 +6,7 @@ description = "Qwen 3 debug training" print_config = true [profiling] -enable_profiling = true +enable_profiling = false save_traces_folder = "profile_trace" profile_freq = 5 enable_memory_snapshot = false @@ -41,7 +41,6 @@ decay_type = "linear" min_lr_factor = 0.0 [training] -global_batch_size = 4 local_batch_size = 2 seq_len = 2048 max_norm = 1.0 # grad norm clipping From 0d80f62c64f3cfe2dbe7d7441cd8cceee464bb67 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 13 Nov 2025 09:27:03 +0000 Subject: [PATCH 104/129] error out if no layer_idx --- torchtitan/experiments/transformers_backend/model/model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torchtitan/experiments/transformers_backend/model/model.py b/torchtitan/experiments/transformers_backend/model/model.py index e0d5628f1a..bb50fd466c 100644 --- a/torchtitan/experiments/transformers_backend/model/model.py +++ b/torchtitan/experiments/transformers_backend/model/model.py @@ -171,7 +171,8 @@ def _init_weights_patched(self, module): if isinstance(module, layer_idx_classes): if not hasattr(module, "layer_idx"): - return + raise ValueError(f"Module {module} does not have a layer_idx attribute") + layer_idx = module.layer_idx if hasattr(config, "depth_init") and config.depth_init: From 09f0c94790a5817eb9c2f5d40f5d11236f7c79b9 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 13 Nov 2025 10:06:01 +0000 Subject: [PATCH 
105/129] reuse pipeline from torchtitan --- torchtitan/distributed/pipeline_parallel.py | 17 +- .../transformers_backend/infra/pipeline.py | 270 +----------------- 2 files changed, 25 insertions(+), 262 deletions(-) diff --git a/torchtitan/distributed/pipeline_parallel.py b/torchtitan/distributed/pipeline_parallel.py index 06dba40d6f..0c0eb89dcc 100644 --- a/torchtitan/distributed/pipeline_parallel.py +++ b/torchtitan/distributed/pipeline_parallel.py @@ -228,6 +228,7 @@ def generate_llm_fqn_per_model_part( num_layers: int, input_weight: int = 1, output_weight: int = 1, + include_rotary_emb: bool = False, ) -> list[list[str]]: """ Programmatically generates module names model part, focused on LLMs models. @@ -237,6 +238,7 @@ def generate_llm_fqn_per_model_part( num_layers: Total number of transformer layers in the model input_weight: Weight for input modules (tok_embeddings) in layer calculation output_weight: Weight for output modules (norm + output) in layer calculation + include_rotary_emb: Whether to include rotary_emb in each model part Returns: List of lists containing module names for each model part @@ -251,7 +253,10 @@ def generate_llm_fqn_per_model_part( if num_stages == 1: # Single stage gets everything layer_names = [f"layers.{i}" for i in range(num_layers)] - return [["tok_embeddings"] + layer_names + ["norm", "output"]] + result = [["tok_embeddings"] + layer_names + ["norm", "output"]] + if include_rotary_emb: + result[0].append("rotary_emb") + return result # Calculate effective layers including weights num_effective_layers = num_layers + input_weight + output_weight @@ -329,6 +334,8 @@ def generate_llm_fqn_per_model_part( stage_modules.append(f"layers.{current_layer}") current_layer += 1 + if include_rotary_emb: + stage_modules.append("rotary_emb") module_names_per_stage.append(stage_modules) return module_names_per_stage @@ -340,6 +347,7 @@ def pipeline_module_split( pp_schedule: str, device: torch.device, module_names_per_stage: list[list[str]], + use_identity_for_missing_modules: bool = False, ) -> tuple[list[PipelineStage], list[nn.Module]]: """ This API creates pipeline stages based on specified module names for each stage. @@ -361,6 +369,8 @@ def pipeline_module_split( - "layers.0", "layers.1" for specific transformer layers - "norm" for the final normalization layer - "output" for the output projection layer + use_identity_for_missing_modules: If True, replace missing modules with nn.Identity(), + otherwise replace with None Returns: Tuple of (stages, models) where stages are PipelineStage objects and models are the @@ -417,8 +427,9 @@ def _build_stage_from_modules( setattr(model, module_name, nn.ModuleList()) # Handle simple module attributes (e.g., "linear", "norm") elif module_name not in modules_to_keep: - # Replace with None - setattr(model, module_name, None) + # Replace with Identity or None based on configuration + replacement = nn.Identity() if use_identity_for_missing_modules else None + setattr(model, module_name, replacement) stage = PipelineStage( model, diff --git a/torchtitan/experiments/transformers_backend/infra/pipeline.py b/torchtitan/experiments/transformers_backend/infra/pipeline.py index 088cc05642..bfb876e911 100644 --- a/torchtitan/experiments/transformers_backend/infra/pipeline.py +++ b/torchtitan/experiments/transformers_backend/infra/pipeline.py @@ -3,280 +3,27 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
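A minimal usage sketch of the extended helper, assuming a torchtitan checkout with this series applied; the exact per-stage lists depend on the chosen weights, so the comments are indicative rather than authoritative:

```python
from torchtitan.distributed.pipeline_parallel import generate_llm_fqn_per_model_part

# Split an 8-layer LLM over 4 virtual stages; with include_rotary_emb=True every
# stage also keeps "rotary_emb", which HF models hold on the top-level module.
stage_fqns = generate_llm_fqn_per_model_part(
    num_stages=4,
    num_layers=8,
    input_weight=1,
    output_weight=1,
    include_rotary_emb=True,
)
for stage_idx, fqns in enumerate(stage_fqns):
    print(stage_idx, fqns)
# Expected shape of the result: stage 0 starts with "tok_embeddings", the last
# stage contains "norm" and "output", and every stage list ends with "rotary_emb".
```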
-import copy import math import torch import torch.nn as nn -from torch.distributed.device_mesh import DeviceMesh -from torch.distributed.pipelining import PipelineStage from torch.distributed.pipelining.schedules import ( _PipelineSchedule, get_schedule_class, PipelineScheduleSingle, - ScheduleDualPipeV, - ScheduleZBVZeroBubble, ) from torchtitan.components.loss import LossFunction from torchtitan.experiments.transformers_backend.job_config import JobConfig from torchtitan.distributed import ParallelDims -from torchtitan.distributed.pipeline_parallel import build_pipeline_schedule +from torchtitan.distributed.pipeline_parallel import ( + build_pipeline_schedule, + generate_llm_fqn_per_model_part, + pipeline_module_split, +) from torchtitan.protocols.train_spec import BaseModelArgs, ParallelizeFunction from torchtitan.tools.logging import logger -# NOTE(3outeille): the only modifications comes from replacing None to nn.Identity and adding rotary_emb per model_part - - -def generate_llm_fqn_per_model_part( - num_stages: int, - num_layers: int, - input_weight: int = 1, - output_weight: int = 1, -) -> list[list[str]]: - """ - Programmatically generates module names model part, focused on LLMs models. - Args: - num_stages: Number of pipeline stages - num_layers: Total number of transformer layers in the model - input_weight: Weight for input modules (embed_tokens) in layer calculation - output_weight: Weight for output modules (norm + output) in layer calculation - Returns: - List of lists containing module names for each model part - Example: - generate_llm_fqn_per_model_part(2, 3, input_weight=2, output_weight=2) - treats embeddings as 2 layers and norm+output as 2 layers for distribution - """ - if num_stages < 1: - raise ValueError("Number of stages must be at least 1") - - if num_stages == 1: - # Single stage gets everything - layer_names = [f"layers.{i}" for i in range(num_layers)] - return [["tok_embeddings"] + layer_names + ["norm", "output", "rotary_emb"]] - - # Calculate effective layers including weights - num_effective_layers = num_layers + input_weight + output_weight - - if num_stages > num_effective_layers: - raise ValueError( - f"Number of stages ({num_stages}) cannot be greater than effective layers ({num_effective_layers})" - ) - - # Calculate layers per stage (distribute evenly) - layers_per_stage = num_effective_layers // num_stages - extra_layers = num_effective_layers % num_stages - - # Feasibility check: Ensure at least 1 layer in each PP stage - if layers_per_stage == 0: - raise ValueError( - f"Configuration would result in empty stages. " - f"With {num_stages} stages and {num_effective_layers} effective layers " - f"(num_layers={num_layers} + input_weight={input_weight} + output_weight={output_weight}), " - f"each stage would get {layers_per_stage} layers on average. " - f"Reduce num_stages or increase num_layers/weights." - ) - - # Balance check: Ensure weights don't exceed minimum layers per stage - if input_weight > layers_per_stage: - raise ValueError( - f"input_weight ({input_weight}) exceeds minimum layers per stage ({layers_per_stage})." - ) - if output_weight > layers_per_stage: - raise ValueError( - f"output_weight ({output_weight}) exceeds minimum layers per stage ({layers_per_stage})." 
- ) - - module_names_per_stage = [] - current_layer = 0 - - for stage_idx in range(num_stages): - stage_modules = [] - - # Calculate effective layers for this stage - effective_layers_for_stage = layers_per_stage - if stage_idx < extra_layers: - effective_layers_for_stage += 1 - - # First stage: handle input modules with weighting - if stage_idx == 0: - stage_modules.append("tok_embeddings") - # Account for input weight in layer distribution - remaining_layers_for_stage = effective_layers_for_stage - input_weight - - # Add transformer layers - for _ in range(remaining_layers_for_stage): - if current_layer < num_layers: - stage_modules.append(f"layers.{current_layer}") - current_layer += 1 - - # Last stage: handle output modules with weighting - elif stage_idx == num_stages - 1: - # Account for output weight in layer distribution - remaining_layers_for_stage = effective_layers_for_stage - output_weight - - # Add transformer layers - for _ in range(remaining_layers_for_stage): - if current_layer < num_layers: - stage_modules.append(f"layers.{current_layer}") - current_layer += 1 - - # Add output modules - stage_modules.extend(["norm", "output"]) - - # Middle stages: only transformer layers - else: - for _ in range(effective_layers_for_stage): - if current_layer < num_layers: - stage_modules.append(f"layers.{current_layer}") - current_layer += 1 - - stage_modules.append("rotary_emb") - module_names_per_stage.append(stage_modules) - - return module_names_per_stage - - -def pipeline_module_split( - whole_model: nn.Module, - pp_mesh: DeviceMesh, - pp_schedule: str, - device: torch.device, - module_names_per_stage: list[list[str]], -) -> tuple[list[PipelineStage], list[nn.Module]]: - """ - This API creates pipeline stages based on specified module names for each stage. - - Some model restrictions include: - - forward() method should tolerate deleted layers - - weight initialization methods should tolerate deleted layers - - Does not support nested moduledict and modulelist structures - - Args: - whole_model: The complete model to be split - pp_mesh: Pipeline parallel device mesh - pp_schedule: Name of pipeline parallelism schedule - device: Device - module_names_per_stage: List of lists, where each inner list contains the module names - that should be included in that stage. Module names should be - dot-separated paths. 
Examples: - - "tok_embeddings" for token embeddings - - "layers.0", "layers.1" for specific transformer layers - - "norm" for the final normalization layer - - "output" for the output projection layer - - Returns: - Tuple of (stages, models) where stages are PipelineStage objects and models are the - corresponding model chunks - - Example usage: - module_names_per_stage = [ - ["tok_embeddings", "layers.0"], # Stage 0: embeddings + first layer - ["layers.1", "layers.2"], # Stage 1: middle layers - ["norm", "output"] # Stage 2: final norm + output - ] - """ - pp_rank = pp_mesh.get_local_rank() - pp_degree = pp_mesh.size() - - def _build_stage_from_modules( - stage_idx: int, module_names: list[str], num_stages: int - ) -> tuple[PipelineStage, nn.Module]: - model = copy.deepcopy(whole_model) - - # Create a set of modules to keep for faster lookup - modules_to_keep = set(module_names) - for module_name, module_value in model.named_children(): - # Handle layer-like structures (e.g., "layers.0", "layers.1") - if isinstance(module_value, (nn.ModuleDict, nn.ModuleList)): - layers_to_keep = { - name.split(".", 1)[1] - for name in modules_to_keep - if name.startswith(f"{module_name}.") - } - if layers_to_keep: - # Keep only specified layers - if isinstance(module_value, nn.ModuleDict): - for layer_name in list(module_value.keys()): - if layer_name not in layers_to_keep: - del module_value[layer_name] - elif isinstance(module_value, nn.ModuleList): - indices_to_keep = { - int(idx) for idx in layers_to_keep if idx.isdigit() - } - new_layers = nn.ModuleList( - [ - layer - for i, layer in enumerate(module_value) - if i in indices_to_keep - ] - ) - setattr(model, module_name, new_layers) - else: - # No layers from this structure needed, set to empty structure - if isinstance(module_value, nn.ModuleDict): - setattr(model, module_name, nn.ModuleDict()) - elif isinstance(module_value, nn.ModuleList): - setattr(model, module_name, nn.ModuleList()) - # Handle simple module attributes (e.g., "linear", "norm") - elif module_name not in modules_to_keep: - # Replace with Identity - setattr(model, module_name, nn.Identity()) - - stage = PipelineStage( - model, - stage_idx, - num_stages, - device, - group=pp_mesh.get_group("pp"), - ) - return stage, model - - num_stages = len(module_names_per_stage) - stages = [] - models = [] - - schedule_class = get_schedule_class(pp_schedule) - style = ( - "v" if schedule_class in (ScheduleZBVZeroBubble, ScheduleDualPipeV) else "loop" - ) - - def _get_stage_indices() -> tuple[int]: - """ - Compute the stage ids for the stages that will run on this pp rank - for either a looped or V style schedule - """ - assert ( - num_stages % pp_degree == 0 - ), f"num_stages {num_stages} must be evenly divisible by pp_degree {pp_degree}" - stages_per_rank = num_stages // pp_degree - if style == "loop": - return tuple(pp_rank + s * pp_degree for s in range(stages_per_rank)) - elif style == "v": - assert ( - stages_per_rank == 2 - ), f"v schedules assume 2 stages per rank, got {stages_per_rank}" - stage_v_pairs = list( - zip(range(pp_degree), range(num_stages - 1, pp_degree - 1, -1)) - ) - return stage_v_pairs[pp_rank] - - for stage_idx in _get_stage_indices(): - module_names = module_names_per_stage[stage_idx] - stage, model_chunk = _build_stage_from_modules( - stage_idx, - module_names, - num_stages, - ) - logger.info( - f"PP rank {pp_rank} is building stage_idx {stage_idx} " - f"with modules {module_names}" - ) - stages.append(stage) - models.append(model_chunk) - - return stages, models - def 
pipeline_hf_transformers( model: nn.Module, @@ -355,7 +102,11 @@ def pipeline_hf_transformers( module_names_per_stage = job_config.parallelism.module_fqns_per_model_part if module_names_per_stage is None: module_names_per_stage = generate_llm_fqn_per_model_part( - num_virtual_stages, num_layers, input_weight, output_weight + num_virtual_stages, + num_layers, + input_weight, + output_weight, + include_rotary_emb=True, ) for i, stage_ms in enumerate(module_names_per_stage): logger.debug(f"Stage {i}: {stage_ms}") @@ -366,6 +117,7 @@ def pipeline_hf_transformers( job_config.parallelism.pipeline_parallel_schedule, device, module_names_per_stage, + use_identity_for_missing_modules=True, ) # For PP with looped schedules, each item in model_parts is one stage-model-chunk. From 78d26ff7429fa807e9d6d4a31ae17fba6a7f3285 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 13 Nov 2025 10:09:31 +0000 Subject: [PATCH 106/129] use c4 test for integration_tests --- .../experiments/transformers_backend/tests/integration_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchtitan/experiments/transformers_backend/tests/integration_tests.py b/torchtitan/experiments/transformers_backend/tests/integration_tests.py index 1f2a38d322..8bc8a63a31 100644 --- a/torchtitan/experiments/transformers_backend/tests/integration_tests.py +++ b/torchtitan/experiments/transformers_backend/tests/integration_tests.py @@ -22,7 +22,7 @@ def build_transformers_backend_test_list() -> list[OverrideDefinitions]: [ [ "--model.name meta-llama/Llama-3.2-1B", - "--training.dataset wikitext2-test", + "--training.dataset c4-test", "--parallelism.data_parallel_shard_degree 2", "--parallelism.tensor_parallel_degree 2", "--parallelism.pipeline_parallel_degree 2", From 524379546edbe48275eba94f4a33e56e3b01f449 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 13 Nov 2025 10:22:21 +0000 Subject: [PATCH 107/129] fix ci --- .ci/docker/common/install_conda.sh | 1 + .../docker/requirements-transformers-backend.txt | 0 .ci/docker/ubuntu/Dockerfile | 1 + .github/workflows/integration_test_8gpu_huggingface.yaml | 2 -- 4 files changed, 2 insertions(+), 2 deletions(-) rename torchtitan/experiments/transformers_backend/requirements.txt => .ci/docker/requirements-transformers-backend.txt (100%) diff --git a/.ci/docker/common/install_conda.sh b/.ci/docker/common/install_conda.sh index c2f316b04b..d3cb20e7a3 100755 --- a/.ci/docker/common/install_conda.sh +++ b/.ci/docker/common/install_conda.sh @@ -43,6 +43,7 @@ install_pip_dependencies() { pip_install -r /opt/conda/requirements.txt pip_install -r /opt/conda/requirements-flux.txt pip_install -r /opt/conda/requirements-vlm.txt + pip_install -r /opt/conda/requirements-transformers-backend.txt popd } diff --git a/torchtitan/experiments/transformers_backend/requirements.txt b/.ci/docker/requirements-transformers-backend.txt similarity index 100% rename from torchtitan/experiments/transformers_backend/requirements.txt rename to .ci/docker/requirements-transformers-backend.txt diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile index 8f3bb9789f..7c53d3f1a1 100644 --- a/.ci/docker/ubuntu/Dockerfile +++ b/.ci/docker/ubuntu/Dockerfile @@ -33,6 +33,7 @@ COPY requirements-dev.txt /opt/conda/ COPY requirements.txt /opt/conda/ COPY requirements-flux.txt /opt/conda/ COPY requirements-vlm.txt /opt/conda/ +COPY requirements-transformers-backend.txt /opt/conda/ COPY conda-env-ci.txt /opt/conda/ COPY ./common/install_conda.sh install_conda.sh COPY ./common/utils.sh utils.sh 
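To see why the HF backend asks `pipeline_module_split` for `nn.Identity()` placeholders rather than `None`, here is a small self-contained sketch; the module below is hypothetical and only mimics a middle pipeline stage whose embedding, norm, and output were stripped:

```python
import torch
import torch.nn as nn


class TinyStage(nn.Module):
    """Hypothetical stand-in for a middle pipeline stage of an HF model."""

    def __init__(self):
        super().__init__()
        self.tok_embeddings = nn.Identity()  # owned by the first stage only
        self.layers = nn.ModuleDict({"2": nn.Linear(16, 16)})  # this stage's block
        self.norm = nn.Identity()  # owned by the last stage only
        self.output = nn.Identity()  # owned by the last stage only

    def forward(self, x):
        # The unmodified forward path still runs because the missing modules
        # are no-ops instead of None.
        x = self.tok_embeddings(x)
        for layer in self.layers.values():
            x = layer(x)
        return self.output(self.norm(x))


print(TinyStage()(torch.randn(2, 16)).shape)  # torch.Size([2, 16])
```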
diff --git a/.github/workflows/integration_test_8gpu_huggingface.yaml b/.github/workflows/integration_test_8gpu_huggingface.yaml index cde7959510..aea5189d81 100644 --- a/.github/workflows/integration_test_8gpu_huggingface.yaml +++ b/.github/workflows/integration_test_8gpu_huggingface.yaml @@ -49,7 +49,5 @@ jobs: USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 - python -m pip install transformers==4.55.4 - mkdir artifacts-to-be-uploaded python -m torchtitan.experiments.transformers_backend.tests.integration_tests artifacts-to-be-uploaded --ngpu 8 From fe691b892825c249bf43b435e29adfd0b87e7310 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 13 Nov 2025 10:51:48 +0000 Subject: [PATCH 108/129] fix linting --- torchtitan/distributed/pipeline_parallel.py | 4 ++- .../transformers_backend/README.md | 2 +- .../transformers_backend/__init__.py | 3 +- .../transformers_backend/infra/parallelize.py | 22 +++++++++----- .../transformers_backend/infra/pipeline.py | 2 +- .../transformers_backend/job_config.py | 10 ++++++- .../transformers_backend/model/args.py | 2 ++ .../transformers_backend/model/model.py | 30 +++++++++++-------- 8 files changed, 49 insertions(+), 26 deletions(-) diff --git a/torchtitan/distributed/pipeline_parallel.py b/torchtitan/distributed/pipeline_parallel.py index 0c0eb89dcc..b954d32c19 100644 --- a/torchtitan/distributed/pipeline_parallel.py +++ b/torchtitan/distributed/pipeline_parallel.py @@ -428,7 +428,9 @@ def _build_stage_from_modules( # Handle simple module attributes (e.g., "linear", "norm") elif module_name not in modules_to_keep: # Replace with Identity or None based on configuration - replacement = nn.Identity() if use_identity_for_missing_modules else None + replacement = ( + nn.Identity() if use_identity_for_missing_modules else None + ) setattr(model, module_name, replacement) stage = PipelineStage( diff --git a/torchtitan/experiments/transformers_backend/README.md b/torchtitan/experiments/transformers_backend/README.md index 8fbd19f0e8..a5b4059c07 100644 --- a/torchtitan/experiments/transformers_backend/README.md +++ b/torchtitan/experiments/transformers_backend/README.md @@ -18,7 +18,7 @@ hf_assets_path = "./tests/assets/tokenizer" ... 
``` - Train: `LOG_RANK=7 CONFIG_FILE= ## Supported Features diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index b72b77760c..fd0cd9b689 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -43,11 +43,12 @@ class TitanDenseModelArgs: use_flex_attn: bool = False attn_mask_type: str = "causal" + flavors = { "debugmodel": HFTransformerModelArgs( titan_dense_args=TitanDenseModelArgs( dim=256, - n_layers=6, + n_layers=2, n_heads=16, n_kv_heads=16, ), diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize.py b/torchtitan/experiments/transformers_backend/infra/parallelize.py index db78f7ea24..b2ae3f02a1 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize.py @@ -16,16 +16,16 @@ RowwiseParallel, SequenceParallel, ) -from torchtitan.experiments.transformers_backend.job_config import JobConfig from torchtitan.config import TORCH_DTYPE_MAP from torchtitan.distributed import NoParallel, ParallelDims +from torchtitan.distributed.activation_checkpoint import apply_ac + from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp +from torchtitan.experiments.transformers_backend.job_config import JobConfig +from torchtitan.models.llama3.infra.parallelize import apply_compile, apply_ddp from torchtitan.tools.logging import logger -from torchtitan.distributed.activation_checkpoint import apply_ac -from torchtitan.models.llama3.infra.parallelize import apply_ddp -from torchtitan.models.llama3.infra.parallelize import apply_compile def parallelize_hf_transformers( model: nn.Module, @@ -223,10 +223,16 @@ def apply_non_moe_tp( layer_plan[f"self_attn.{o_proj_name}"] = rowwise_parallel( output_layouts=Shard(1) ) - #For model that uses RMSNorm on Q and K (i.e. Qwen3) - if hasattr(transformer_block.self_attn, "q_norm") and hasattr(transformer_block.self_attn, "k_norm"): - layer_plan["self_attn.q_norm"] = SequenceParallel(sequence_dim=2, use_local_output=True) - layer_plan["self_attn.k_norm"] = SequenceParallel(sequence_dim=2, use_local_output=True) + # For model that uses RMSNorm on Q and K (i.e. 
Qwen3) + if hasattr(transformer_block.self_attn, "q_norm") and hasattr( + transformer_block.self_attn, "k_norm" + ): + layer_plan["self_attn.q_norm"] = SequenceParallel( + sequence_dim=2, use_local_output=True + ) + layer_plan["self_attn.k_norm"] = SequenceParallel( + sequence_dim=2, use_local_output=True + ) if not transformer_block.moe_enabled: mlp_plan = { diff --git a/torchtitan/experiments/transformers_backend/infra/pipeline.py b/torchtitan/experiments/transformers_backend/infra/pipeline.py index bfb876e911..6a891bb271 100644 --- a/torchtitan/experiments/transformers_backend/infra/pipeline.py +++ b/torchtitan/experiments/transformers_backend/infra/pipeline.py @@ -14,13 +14,13 @@ ) from torchtitan.components.loss import LossFunction -from torchtitan.experiments.transformers_backend.job_config import JobConfig from torchtitan.distributed import ParallelDims from torchtitan.distributed.pipeline_parallel import ( build_pipeline_schedule, generate_llm_fqn_per_model_part, pipeline_module_split, ) +from torchtitan.experiments.transformers_backend.job_config import JobConfig from torchtitan.protocols.train_spec import BaseModelArgs, ParallelizeFunction from torchtitan.tools.logging import logger diff --git a/torchtitan/experiments/transformers_backend/job_config.py b/torchtitan/experiments/transformers_backend/job_config.py index 6344529d20..f3b1667798 100644 --- a/torchtitan/experiments/transformers_backend/job_config.py +++ b/torchtitan/experiments/transformers_backend/job_config.py @@ -1,10 +1,18 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + from dataclasses import dataclass, field + @dataclass class HFTransformers: model: str = "" """HuggingFace model ID (e.g., 'Qwen/Qwen3-4B-Instruct-2507')""" + @dataclass class JobConfig: - hf_transformers: HFTransformers = field(default_factory=HFTransformers) \ No newline at end of file + hf_transformers: HFTransformers = field(default_factory=HFTransformers) diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index 4c9ffcae72..5a22edd386 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -82,6 +82,7 @@ def _initialize_dense_attributes(self, titan_dense_args): # Update passed_args self._titan_injected_model_args.update(titan_dense_args.__dict__) + def _configure_hf_attention(self, attn_implementation: str): """Configure HuggingFace attention settings.""" self._titan_injected_model_args["attn_implementation"] = attn_implementation @@ -153,6 +154,7 @@ def update_from_config(self, job_config: JobConfig): self.max_seq_len = job_config.training.seq_len # Configure HF-specific settings to match TorchTitan settings + # TODO: false ? self.attention_bias = False self.mlp_bias = False self.use_cache = False diff --git a/torchtitan/experiments/transformers_backend/model/model.py b/torchtitan/experiments/transformers_backend/model/model.py index bb50fd466c..8c35ac4e94 100644 --- a/torchtitan/experiments/transformers_backend/model/model.py +++ b/torchtitan/experiments/transformers_backend/model/model.py @@ -16,27 +16,32 @@ from .args import HFTransformerModelArgs + class SlicableModuleDict(nn.ModuleDict): """ A ModuleDict that supports slicing like ModuleList. 
Keys are expected to be string representations of integers (e.g., "0", "1", "2"). """ - + def __getitem__(self, key): if isinstance(key, slice): # Handle slicing: convert slice to list of keys - keys = sorted(self.keys(), key=lambda x: int(x) if x.isdigit() else float('inf')) + keys = sorted( + self.keys(), key=lambda x: int(x) if x.isdigit() else float("inf") + ) sliced_keys = keys[key] # Return a new SlicableModuleDict with the sliced items return SlicableModuleDict({k: self[k] for k in sliced_keys}) return super().__getitem__(key) - + def __iter__(self): # Iterate over values in sorted order by key (as integers) - keys = sorted(self.keys(), key=lambda x: int(x) if x.isdigit() else float('inf')) + keys = sorted( + self.keys(), key=lambda x: int(x) if x.isdigit() else float("inf") + ) for key in keys: yield self[key] - + def __len__(self): return len(self._modules) @@ -82,9 +87,7 @@ def __init__(self, model_args: HFTransformerModelArgs): mlp_cls=mlp_cls, # mlp_cls can be None ) else: - missing = [ - name for name, cls in required_classes.items() if not cls - ] + missing = [name for name, cls in required_classes.items() if not cls] logger.warning( f"Could not find required classes ({', '.join(missing)}) for {model_name_prefix}. " "Skipping Llama-like patch." @@ -103,10 +106,9 @@ def __init__(self, model_args: HFTransformerModelArgs): # Convert ModuleList to ModuleDict to preserve original indices # This ensures state dict keys match checkpoint keys if isinstance(self.model.model.layers, nn.ModuleList): - self.model.model.layers = SlicableModuleDict({ - str(i): layer - for i, layer in enumerate(self.model.model.layers) - }) + self.model.model.layers = SlicableModuleDict( + {str(i): layer for i, layer in enumerate(self.model.model.layers)} + ) for layer in self.model.model.layers.values(): layer.moe_enabled = False @@ -171,7 +173,9 @@ def _init_weights_patched(self, module): if isinstance(module, layer_idx_classes): if not hasattr(module, "layer_idx"): - raise ValueError(f"Module {module} does not have a layer_idx attribute") + raise ValueError( + f"Module {module} does not have a layer_idx attribute" + ) layer_idx = module.layer_idx From 5d5ce2b8d8bab215e6acecf1225dc69668627083 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Fri, 14 Nov 2025 11:01:33 +0000 Subject: [PATCH 109/129] fix head dims in flops counting --- torchtitan/experiments/transformers_backend/model/args.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index 5a22edd386..db1696e7a5 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -7,7 +7,7 @@ from dataclasses import dataclass from torch import nn -from torchtitan.experiments.transformers_backend.job_config import JobConfig +from torchtitan.config.job_config import JobConfig from torchtitan.models.utils import get_dense_model_nparams_and_flops from torchtitan.protocols import BaseModelArgs from transformers import AutoConfig @@ -132,7 +132,7 @@ def __repr__(self) -> str: def update_from_config(self, job_config: JobConfig): # Load HF config (overwrites our HF attributes) hf_model_config = AutoConfig.from_pretrained( - job_config.hf_transformers.model, + job_config.model.name, attn_implementation=self.attn_implementation, trust_remote_code=True, ) @@ -174,4 +174,4 @@ def update_from_config(self, job_config: JobConfig): return self def 
get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: - return get_dense_model_nparams_and_flops(self, model, seq_len) + return get_dense_model_nparams_and_flops(self, model, head_dims=self.head_dim, seq_len=seq_len) From 6ace9f43c83fb85afb2335800861b7400915392e Mon Sep 17 00:00:00 2001 From: 3outeille Date: Fri, 14 Nov 2025 11:05:36 +0000 Subject: [PATCH 110/129] propose an alternative to passing name --- .../experiments/transformers_backend/README.md | 9 ++++----- .../configs/qwen3_fsdp2_tp2_pp2.toml | 5 +---- .../transformers_backend/infra/parallelize.py | 2 +- .../transformers_backend/infra/pipeline.py | 2 +- .../transformers_backend/job_config.py | 18 ------------------ torchtitan/protocols/train_spec.py | 4 ++++ 6 files changed, 11 insertions(+), 29 deletions(-) delete mode 100644 torchtitan/experiments/transformers_backend/job_config.py diff --git a/torchtitan/experiments/transformers_backend/README.md b/torchtitan/experiments/transformers_backend/README.md index a5b4059c07..4ecbbe8c6f 100644 --- a/torchtitan/experiments/transformers_backend/README.md +++ b/torchtitan/experiments/transformers_backend/README.md @@ -9,15 +9,14 @@ ... [model] - name = "llama3" -+ name = "transformers_backend" ++ name = "Qwen/Qwen3-4B-Instruct-2507" flavor = "debugmodel" hf_assets_path = "./tests/assets/tokenizer" - -+[hf_transformers] -+model = "Qwen/Qwen3-4B-Instruct-2507" ... ``` -- Train: `LOG_RANK=7 CONFIG_FILE=/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml ./run_train.sh --compile.enable` - Make sure you have created the tokenizers beforehand image diff --git a/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml b/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml index 13e3f4ddf0..b0e294ccbe 100644 --- a/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml +++ b/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml @@ -20,15 +20,12 @@ save_tb_folder = "tb" enable_wandb = false [model] -name = "transformers_backend" +name = "Qwen/Qwen3-4B-Instruct-2507" flavor = "debugmodel" # test folder with tokenizer.json, for debug purpose only hf_assets_path = "./tests/assets/tokenizer" # converters = ["float8"] -[hf_transformers] -model = "Qwen/Qwen3-4B-Instruct-2507" - [optimizer] name = "AdamW" lr = 8e-4 diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize.py b/torchtitan/experiments/transformers_backend/infra/parallelize.py index b2ae3f02a1..987fae6049 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize.py @@ -22,7 +22,7 @@ from torchtitan.distributed.activation_checkpoint import apply_ac from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp -from torchtitan.experiments.transformers_backend.job_config import JobConfig +from torchtitan.config.job_config import JobConfig from torchtitan.models.llama3.infra.parallelize import apply_compile, apply_ddp from torchtitan.tools.logging import logger diff --git a/torchtitan/experiments/transformers_backend/infra/pipeline.py b/torchtitan/experiments/transformers_backend/infra/pipeline.py index 6a891bb271..511297ad7c 100644 --- a/torchtitan/experiments/transformers_backend/infra/pipeline.py +++ b/torchtitan/experiments/transformers_backend/infra/pipeline.py @@ -20,7 +20,7 @@ generate_llm_fqn_per_model_part, pipeline_module_split, ) -from 
torchtitan.experiments.transformers_backend.job_config import JobConfig +from torchtitan.config.job_config import JobConfig from torchtitan.protocols.train_spec import BaseModelArgs, ParallelizeFunction from torchtitan.tools.logging import logger diff --git a/torchtitan/experiments/transformers_backend/job_config.py b/torchtitan/experiments/transformers_backend/job_config.py deleted file mode 100644 index f3b1667798..0000000000 --- a/torchtitan/experiments/transformers_backend/job_config.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -from dataclasses import dataclass, field - - -@dataclass -class HFTransformers: - model: str = "" - """HuggingFace model ID (e.g., 'Qwen/Qwen3-4B-Instruct-2507')""" - - -@dataclass -class JobConfig: - hf_transformers: HFTransformers = field(default_factory=HFTransformers) diff --git a/torchtitan/protocols/train_spec.py b/torchtitan/protocols/train_spec.py index 22bfa7df9b..3eed6ddd2f 100644 --- a/torchtitan/protocols/train_spec.py +++ b/torchtitan/protocols/train_spec.py @@ -77,6 +77,10 @@ def get_train_spec(name: str) -> TrainSpec: from torchtitan.experiments import _supported_experiments from torchtitan.models import _supported_models + if "/" in name: + module = import_module("torchtitan.experiments.transformers_backend") + return module.get_train_spec() + if name in _supported_models: module = import_module(f"torchtitan.models.{name}") return module.get_train_spec() From 97cd6fe0ee601789f719ec92e9be880506a06646 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Fri, 14 Nov 2025 11:22:08 +0000 Subject: [PATCH 111/129] fix linting --- .../experiments/transformers_backend/infra/parallelize.py | 2 +- torchtitan/experiments/transformers_backend/infra/pipeline.py | 2 +- torchtitan/experiments/transformers_backend/model/args.py | 4 +++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize.py b/torchtitan/experiments/transformers_backend/infra/parallelize.py index 987fae6049..a4b7e66ad8 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize.py @@ -17,12 +17,12 @@ SequenceParallel, ) from torchtitan.config import TORCH_DTYPE_MAP +from torchtitan.config.job_config import JobConfig from torchtitan.distributed import NoParallel, ParallelDims from torchtitan.distributed.activation_checkpoint import apply_ac from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp -from torchtitan.config.job_config import JobConfig from torchtitan.models.llama3.infra.parallelize import apply_compile, apply_ddp from torchtitan.tools.logging import logger diff --git a/torchtitan/experiments/transformers_backend/infra/pipeline.py b/torchtitan/experiments/transformers_backend/infra/pipeline.py index 511297ad7c..b813225fe6 100644 --- a/torchtitan/experiments/transformers_backend/infra/pipeline.py +++ b/torchtitan/experiments/transformers_backend/infra/pipeline.py @@ -14,13 +14,13 @@ ) from torchtitan.components.loss import LossFunction +from torchtitan.config.job_config import JobConfig from torchtitan.distributed import ParallelDims from torchtitan.distributed.pipeline_parallel import ( build_pipeline_schedule, generate_llm_fqn_per_model_part, pipeline_module_split, ) -from torchtitan.config.job_config import JobConfig 
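The dispatch rule introduced in `get_train_spec` can be restated in a few lines; this is an illustrative condensation only, and the helper name below is made up:

```python
def pick_train_spec_module(name: str) -> str:
    # A "/" marks a Hugging Face model ID, e.g. "Qwen/Qwen3-4B-Instruct-2507",
    # which routes to the transformers backend; anything else stays on the
    # built-in model lookup.
    if "/" in name:
        return "torchtitan.experiments.transformers_backend"
    return f"torchtitan.models.{name}"


assert pick_train_spec_module("Qwen/Qwen3-4B-Instruct-2507") == (
    "torchtitan.experiments.transformers_backend"
)
assert pick_train_spec_module("llama3") == "torchtitan.models.llama3"
# In torchtitan itself, the chosen module is then imported with importlib and
# its get_train_spec() is returned.
```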
from torchtitan.protocols.train_spec import BaseModelArgs, ParallelizeFunction from torchtitan.tools.logging import logger diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index db1696e7a5..9a6271980b 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -174,4 +174,6 @@ def update_from_config(self, job_config: JobConfig): return self def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]: - return get_dense_model_nparams_and_flops(self, model, head_dims=self.head_dim, seq_len=seq_len) + return get_dense_model_nparams_and_flops( + self, model, head_dims=self.head_dim, seq_len=seq_len + ) From 5f1695f0a013e573c1ddea8c6cafbc537769a9e7 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Fri, 14 Nov 2025 13:22:16 +0000 Subject: [PATCH 112/129] bump transformers version from 4.55.4 to 4.57.1 --- .ci/docker/requirements-transformers-backend.txt | 2 +- torchtitan/experiments/transformers_backend/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.ci/docker/requirements-transformers-backend.txt b/.ci/docker/requirements-transformers-backend.txt index 6b0cc637db..76e8886ed0 100644 --- a/.ci/docker/requirements-transformers-backend.txt +++ b/.ci/docker/requirements-transformers-backend.txt @@ -1 +1 @@ -transformers==4.55.4 +transformers==4.57.1 diff --git a/torchtitan/experiments/transformers_backend/README.md b/torchtitan/experiments/transformers_backend/README.md index 4ecbbe8c6f..3d1a2dcf0d 100644 --- a/torchtitan/experiments/transformers_backend/README.md +++ b/torchtitan/experiments/transformers_backend/README.md @@ -2,7 +2,7 @@ ## Quick start -- Requirements `transformers==4.55.4` +- Requirements `transformers==4.57.1` - Config: `torchtitan/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml` ```diff From 2d2b6122a249a69c71f9647ec9c034965204c8a1 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 18 Nov 2025 10:11:46 +0000 Subject: [PATCH 113/129] change qwen3 config name --- .../configs/{qwen3_fsdp2_tp2_pp2.toml => qwen3.toml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename torchtitan/experiments/transformers_backend/configs/{qwen3_fsdp2_tp2_pp2.toml => qwen3.toml} (100%) diff --git a/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml b/torchtitan/experiments/transformers_backend/configs/qwen3.toml similarity index 100% rename from torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml rename to torchtitan/experiments/transformers_backend/configs/qwen3.toml From a2ea2ef430d6d50742e3adb8bed1c9bde4a24cac Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 18 Nov 2025 10:37:30 +0000 Subject: [PATCH 114/129] reuse fsdp from llama3. Moe will be handle in another PR --- .../transformers_backend/infra/parallelize.py | 167 +----------------- 1 file changed, 2 insertions(+), 165 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize.py b/torchtitan/experiments/transformers_backend/infra/parallelize.py index a4b7e66ad8..163249e04b 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize.py @@ -4,10 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
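The local `apply_fsdp` dropped below is replaced by the llama3 implementation; as a rough reminder of the wrapping pattern it provides, here is a sketch in which the dtypes, the `dp_mesh`, and the `.layers` container are assumptions rather than values taken from this diff:

```python
import torch
import torch.nn as nn
from torch.distributed.fsdp import fully_shard, MixedPrecisionPolicy


def shard_model_sketch(model: nn.Module, dp_mesh) -> None:
    # Shard each transformer block first, then the root module, with a
    # bf16-compute / fp32-reduce mixed-precision policy.
    mp_policy = MixedPrecisionPolicy(
        param_dtype=torch.bfloat16, reduce_dtype=torch.float32
    )
    fsdp_config = {"mesh": dp_mesh, "mp_policy": mp_policy}
    for block in model.layers.values():
        fully_shard(block, **fsdp_config, reshard_after_forward=True)
    fully_shard(model, **fsdp_config)
```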
-import torch import torch.nn as nn from torch.distributed.device_mesh import DeviceMesh -from torch.distributed.fsdp import CPUOffloadPolicy, fully_shard, MixedPrecisionPolicy from torch.distributed.tensor import Replicate, Shard from torch.distributed.tensor.parallel import ( ColwiseParallel, @@ -23,7 +21,7 @@ from torchtitan.distributed.activation_checkpoint import apply_ac from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp -from torchtitan.models.llama3.infra.parallelize import apply_compile, apply_ddp +from torchtitan.models.llama3.infra.parallelize import apply_compile, apply_ddp, apply_fsdp from torchtitan.tools.logging import logger @@ -271,165 +269,4 @@ def apply_non_moe_tp( logger.info( f"Applied {'Float8 tensorwise ' if enable_float8_tensorwise_tp else ''}" "Tensor Parallelism to the model" - ) - - -def apply_fsdp( - model: nn.Module, - dp_mesh: DeviceMesh, - param_dtype: torch.dtype, - reduce_dtype: torch.dtype, - pp_enabled: bool, - cpu_offload: bool = False, - reshard_after_forward_policy: str = "default", - ep_degree: int = 1, - dp_mod_ep_mesh: DeviceMesh | None = None, - gradient_divide_factor: int | None = None, -): - """ - Apply data parallelism (via FSDP2) to the model. - - Args: - model (nn.Module): The model to apply data parallelism to. - dp_mesh (DeviceMesh): The device mesh to use for data parallelism. - param_dtype (torch.dtype): The data type to use for model parameters. - reduce_dtype (torch.dtype): The data type to use for reduction operations. - pp_enabled (bool): Whether pipeline parallelism is enabled. - cpu_offload (bool, optional): Whether to offload model parameters to CPU. Defaults to False. - reshard_after_forward_policy (str, optional): The policy to use for resharding after forward pass. Defaults to "default". - Other options: "never", "always". - - "default" applies default resharding behavior, implementing "smart defaults" for known optimal scenarios. - - "always" will enable `reshard_after_forward` for all forward passes. - - "never" will disable `reshard_after_forward` for all forward passes. - - """ - mp_policy = MixedPrecisionPolicy(param_dtype=param_dtype, reduce_dtype=reduce_dtype) - fsdp_config = {"mesh": dp_mesh, "mp_policy": mp_policy} - if cpu_offload: - fsdp_config["offload_policy"] = CPUOffloadPolicy() - - match reshard_after_forward_policy: - case "always": - reshard_after_forward = True - case "never": - reshard_after_forward = False - case "default": - # For PP, by default do not reshard after forward to avoid per-microbatch - # all-gathers, which can be expensive and non-overlapped - reshard_after_forward = not pp_enabled - case _: - raise ValueError( - f"Invalid reshard_after_forward_policy: {reshard_after_forward_policy}." - ) - - if model.tok_embeddings is not None: - fully_shard( - model.tok_embeddings, - **fsdp_config, - reshard_after_forward=reshard_after_forward, - ) - - for transformer_block in model.layers: - # NOTE: When EP is enabled, In an MoE layer, we use the following FSDP wrapping - # - the router and the shared experts are sharded together with the TransformerBlock - # - the routed experts are sharded with the remaining dp_mod_ep_mesh - if ( - hasattr(transformer_block, "moe_enabled") - and transformer_block.moe_enabled - and ep_degree > 1 - ): - fsdp_mod_ep_config = fsdp_config.copy() - fsdp_mod_ep_config["mesh"] = dp_mod_ep_mesh - moe_block = transformer_block.mlp - # NOTE: EP alreadys shards the routed experts on dim 0 (num_experts). 
- # When dp_mod_ep * ep > num_experts, FSDP default dim-0 sharding - # causes inefficiency, so we choose to do FSDP sharding on dim-1. - # Even when EP is not used, we may still want to shard the experts - # on non-0 dim. For now it may not be worth the complexity to support - # shard_placement_fn on the outer TransformerBlock-level FSDP. - _experts_shard_placement_fn = None - assert dp_mod_ep_mesh is not None - if dp_mod_ep_mesh.size() * ep_degree > moe_block.experts.num_experts: - _experts_shard_placement_fn = lambda param: Shard(1) - - fully_shard( - moe_block.experts, - **fsdp_mod_ep_config, - reshard_after_forward=reshard_after_forward, - shard_placement_fn=_experts_shard_placement_fn, - ) - - # NOTE: # Although the FSDP sharding of experts is done on a mesh of - # a different size than other parameters, the gradient division - # factor should be consistent with data. - moe_block.experts.set_gradient_divide_factor( - gradient_divide_factor, - ) - - fully_shard( - transformer_block, - **fsdp_config, - reshard_after_forward=reshard_after_forward, - ) - - # As an optimization, do not reshard_after_forward the last layers by default - # since FSDP would prefetch them immediately after the forward pass - if model.norm is not None and model.output is not None: - fully_shard( - [model.norm, model.output], - **fsdp_config, - reshard_after_forward=reshard_after_forward_policy == "always", - ) - - fully_shard(model, **fsdp_config) - - # NOTE: set up explicit prefetching when EP is enabled, as D2H syncs - # in EP could interfere with implicit prefetching in FSDP - if ep_degree == 1: - return - - # forward - transformer_blocks = list(model.layers.values()) - next_transformer_blocks = transformer_blocks[1:] + [None] - - if model.tok_embeddings is not None and model.layers is not None: - model.tok_embeddings.set_modules_to_forward_prefetch([transformer_blocks[0]]) - - for transformer_block, next_transformer_block in zip( - transformer_blocks, next_transformer_blocks - ): - if next_transformer_block is not None: - if next_transformer_block.moe_enabled: - transformer_block.set_modules_to_forward_prefetch( - [next_transformer_block, next_transformer_block.mlp.experts] - ) - else: - transformer_block.set_modules_to_forward_prefetch( - [next_transformer_block] - ) - elif model.norm is not None and model.output is not None: - transformer_block.set_modules_to_forward_prefetch( - [model.norm, model.output] - ) - - # backward - reversed_transformer_blocks = list(reversed(model.layers.values())) - prev_transformer_blocks = reversed_transformer_blocks[1:] + [None] - - if model.norm is not None and model.output is not None and model.layers is not None: - model.output.set_modules_to_backward_prefetch([reversed_transformer_blocks[0]]) - - for transformer_block, prev_transformer_block in zip( - reversed_transformer_blocks, prev_transformer_blocks - ): - if prev_transformer_block is not None: - if prev_transformer_block.moe_enabled: - transformer_block.set_modules_to_backward_prefetch( - [prev_transformer_block, prev_transformer_block.mlp.experts] - ) - else: - transformer_block.set_modules_to_backward_prefetch( - [prev_transformer_block] - ) - elif model.tok_embeddings is not None: - transformer_block.set_modules_to_backward_prefetch([model.tok_embeddings]) + ) \ No newline at end of file From 47fb2eab0e01cdebf79544caea04fba84aad6bc1 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 18 Nov 2025 10:41:16 +0000 Subject: [PATCH 115/129] clean logging --- .../experiments/transformers_backend/infra/parallelize.py | 
8 ++++++-- .../experiments/transformers_backend/infra/pipeline.py | 3 --- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize.py b/torchtitan/experiments/transformers_backend/infra/parallelize.py index 163249e04b..cb68826e87 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize.py @@ -21,7 +21,11 @@ from torchtitan.distributed.activation_checkpoint import apply_ac from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp -from torchtitan.models.llama3.infra.parallelize import apply_compile, apply_ddp, apply_fsdp +from torchtitan.models.llama3.infra.parallelize import ( + apply_compile, + apply_ddp, + apply_fsdp, +) from torchtitan.tools.logging import logger @@ -269,4 +273,4 @@ def apply_non_moe_tp( logger.info( f"Applied {'Float8 tensorwise ' if enable_float8_tensorwise_tp else ''}" "Tensor Parallelism to the model" - ) \ No newline at end of file + ) diff --git a/torchtitan/experiments/transformers_backend/infra/pipeline.py b/torchtitan/experiments/transformers_backend/infra/pipeline.py index b813225fe6..c8904f4352 100644 --- a/torchtitan/experiments/transformers_backend/infra/pipeline.py +++ b/torchtitan/experiments/transformers_backend/infra/pipeline.py @@ -22,7 +22,6 @@ pipeline_module_split, ) from torchtitan.protocols.train_spec import BaseModelArgs, ParallelizeFunction -from torchtitan.tools.logging import logger def pipeline_hf_transformers( @@ -108,8 +107,6 @@ def pipeline_hf_transformers( output_weight, include_rotary_emb=True, ) - for i, stage_ms in enumerate(module_names_per_stage): - logger.debug(f"Stage {i}: {stage_ms}") stages, model_parts = pipeline_module_split( model, From 20308d31fe0719463fb93b4464d509c8c7d79172 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 18 Nov 2025 10:46:49 +0000 Subject: [PATCH 116/129] move TitanDenseModelArgs to args --- .../transformers_backend/__init__.py | 20 +------------------ .../transformers_backend/model/args.py | 17 ++++++++++++++++ 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index fd0cd9b689..6d74050608 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -15,33 +15,15 @@ from .infra.parallelize import parallelize_hf_transformers from .infra.pipeline import pipeline_hf_transformers -from .model.args import HFTransformerModelArgs +from .model.args import HFTransformerModelArgs, TitanDenseModelArgs from .model.model import HFTransformerModel - __all__ = [ "HFTransformerModelArgs", "HFTransformerModel", ] -@dataclass -class TitanDenseModelArgs: - """Arguments for the base TorchTitan model.""" - - dim: int = 4096 - n_layers: int = 32 - n_heads: int = 32 - n_kv_heads: int | None = None - vocab_size: int | None = None - multiple_of: int = 256 - ffn_dim_multiplier: float | None = None - norm_eps: float = 1e-5 - rope_theta: float = 10000 - max_seq_len: int = 2048 - depth_init: bool = True - use_flex_attn: bool = False - attn_mask_type: str = "causal" flavors = { diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index 9a6271980b..69f4ebc9bc 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py 
@@ -15,6 +15,23 @@ from transformers.integrations.sdpa_attention import sdpa_attention_forward from transformers.modeling_utils import AttentionInterface +@dataclass +class TitanDenseModelArgs: + """Arguments for the base TorchTitan model.""" + + dim: int = 4096 + n_layers: int = 32 + n_heads: int = 32 + n_kv_heads: int | None = None + vocab_size: int | None = None + multiple_of: int = 256 + ffn_dim_multiplier: float | None = None + norm_eps: float = 1e-5 + rope_theta: float = 10000 + max_seq_len: int = 2048 + depth_init: bool = True + use_flex_attn: bool = False + attn_mask_type: str = "causal" @dataclass class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): From 019f2cc557110bd2158a5b1971d44cd2e8f92cc8 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 18 Nov 2025 12:58:00 +0000 Subject: [PATCH 117/129] clean --- torchtitan/experiments/transformers_backend/model/args.py | 1 - 1 file changed, 1 deletion(-) diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index 69f4ebc9bc..b3d9daa723 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -79,7 +79,6 @@ def __init__( self._create_getter_setter_dynamically(has_moe=False) self._titan_injected_model_args = {} - self._titan_injected_model_args.update(kwargs) self._configure_hf_attention(attn_implementation) self._initialize_dense_attributes(titan_dense_args) From fc93b4f4866ef805750a1e1760a310bf3e8ef171 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 18 Nov 2025 13:02:32 +0000 Subject: [PATCH 118/129] fix integration tests --- torchtitan/experiments/transformers_backend/__init__.py | 4 ---- torchtitan/experiments/transformers_backend/model/args.py | 2 ++ .../transformers_backend/tests/integration_tests.py | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index 6d74050608..aec28a0bdd 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -3,8 +3,6 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
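A small sketch of how the relocated dataclass is consumed, assuming torchtitan and transformers are installed; the values mirror the `debugmodel` flavor:

```python
from torchtitan.experiments.transformers_backend.model.args import TitanDenseModelArgs

# Same hyperparameters as the "debugmodel" flavor in transformers_backend/__init__.py;
# anything not passed keeps the dataclass defaults.
debug_args = TitanDenseModelArgs(dim=256, n_layers=2, n_heads=16, n_kv_heads=16)
print(debug_args.max_seq_len)  # 2048 by default; later overridden by training.seq_len
```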
-from dataclasses import dataclass - from torchtitan.components.loss import build_cross_entropy_loss from torchtitan.components.lr_scheduler import build_lr_schedulers from torchtitan.components.optimizer import build_optimizers @@ -24,8 +22,6 @@ ] - - flavors = { "debugmodel": HFTransformerModelArgs( titan_dense_args=TitanDenseModelArgs( diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index b3d9daa723..d261dcd5e4 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -15,6 +15,7 @@ from transformers.integrations.sdpa_attention import sdpa_attention_forward from transformers.modeling_utils import AttentionInterface + @dataclass class TitanDenseModelArgs: """Arguments for the base TorchTitan model.""" @@ -33,6 +34,7 @@ class TitanDenseModelArgs: use_flex_attn: bool = False attn_mask_type: str = "causal" + @dataclass class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs): """ diff --git a/torchtitan/experiments/transformers_backend/tests/integration_tests.py b/torchtitan/experiments/transformers_backend/tests/integration_tests.py index 8bc8a63a31..5629b45f5c 100644 --- a/torchtitan/experiments/transformers_backend/tests/integration_tests.py +++ b/torchtitan/experiments/transformers_backend/tests/integration_tests.py @@ -22,7 +22,6 @@ def build_transformers_backend_test_list() -> list[OverrideDefinitions]: [ [ "--model.name meta-llama/Llama-3.2-1B", - "--training.dataset c4-test", "--parallelism.data_parallel_shard_degree 2", "--parallelism.tensor_parallel_degree 2", "--parallelism.pipeline_parallel_degree 2", @@ -63,7 +62,7 @@ def main(): if os.listdir(args.output_dir): raise RuntimeError("Please provide an empty output directory.") - test_list = _TEST_SUITES_FUNCTION["transformers_backend"]()() + test_list = _TEST_SUITES_FUNCTION["transformers_backend"]() run_tests(args, test_list) From f9e8e11d23160ff16d59fada7664eee6fdc8bcf1 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 18 Nov 2025 13:08:48 +0000 Subject: [PATCH 119/129] rename integration test file --- ...gface.yaml => integration_test_8gpu_transformers_backend.yaml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/workflows/{integration_test_8gpu_huggingface.yaml => integration_test_8gpu_transformers_backend.yaml} (100%) diff --git a/.github/workflows/integration_test_8gpu_huggingface.yaml b/.github/workflows/integration_test_8gpu_transformers_backend.yaml similarity index 100% rename from .github/workflows/integration_test_8gpu_huggingface.yaml rename to .github/workflows/integration_test_8gpu_transformers_backend.yaml From 83b0437aeed5fa1d6a84dd6a3306f003e822a8c1 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 18 Nov 2025 13:15:54 +0000 Subject: [PATCH 120/129] update README --- torchtitan/experiments/README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/torchtitan/experiments/README.md b/torchtitan/experiments/README.md index 5a2c0b28e5..02372dbe16 100644 --- a/torchtitan/experiments/README.md +++ b/torchtitan/experiments/README.md @@ -30,6 +30,5 @@ We provide this `experiments/` folder to host experiments that add significant v | [torchcomms](./torchcomms/) | [![TorchComms 8 GPU Integration 
Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_torchcomms.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_torchcomms.yaml?query=branch%3Amain) | [@d4l3k](https://https://github.com/d4l3k) [@fduwjj](https://github.com/fduwjj) [@mori360 ](https://github.com/mori360) | | [moe_symm_mem_kernels](./moe_symm_mem_kernels/) | TBA | [@kwen2501](https://github.com/kwen2501) | | [gpt_oss](./gpt_oss/) | TBA | [@jianiw](https://github.com/jianiw) | -| [compiler_toolkit](./compiler_tookit/) | TBA | [@SherlockNoMad](https://github.com/SherlockNoMad) [@yiming0416](https://github.com/yiming0416) | -| [transformers_backend](./transformers_backend/) | TBA | [@3outeille](https://github.com/3outeille) | | [compiler_toolkit](./compiler_toolkit/) | [![Compiler Toolkit 8 GPU Integration Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_compiler_toolkit.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_compiler_toolkit.yaml?query=branch%3Amain) | [@SherlockNoMad](https://github.com/SherlockNoMad) [@yiming0416](https://github.com/yiming0416) | +| [transformers_backend](./transformers_backend/) | ![Transformers Backend 8 GPU Integration Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_transformers_backend.yaml/badge.svg?branch=main) | [@3outeille](https://github.com/3outeille) | \ No newline at end of file From fb978ddbf095249a8beb2fa083794df276fec747 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 18 Nov 2025 13:20:06 +0000 Subject: [PATCH 121/129] revert accidental changes linting --- torchtitan/experiments/README.md | 2 +- torchtitan/train.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/torchtitan/experiments/README.md b/torchtitan/experiments/README.md index 02372dbe16..9b25cdc7a6 100644 --- a/torchtitan/experiments/README.md +++ b/torchtitan/experiments/README.md @@ -31,4 +31,4 @@ We provide this `experiments/` folder to host experiments that add significant v | [moe_symm_mem_kernels](./moe_symm_mem_kernels/) | TBA | [@kwen2501](https://github.com/kwen2501) | | [gpt_oss](./gpt_oss/) | TBA | [@jianiw](https://github.com/jianiw) | | [compiler_toolkit](./compiler_toolkit/) | [![Compiler Toolkit 8 GPU Integration Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_compiler_toolkit.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_compiler_toolkit.yaml?query=branch%3Amain) | [@SherlockNoMad](https://github.com/SherlockNoMad) [@yiming0416](https://github.com/yiming0416) | -| [transformers_backend](./transformers_backend/) | ![Transformers Backend 8 GPU Integration Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_transformers_backend.yaml/badge.svg?branch=main) | [@3outeille](https://github.com/3outeille) | \ No newline at end of file +| [transformers_backend](./transformers_backend/) | ![Transformers Backend 8 GPU Integration Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_transformers_backend.yaml/badge.svg?branch=main) | [@3outeille](https://github.com/3outeille) | diff --git a/torchtitan/train.py b/torchtitan/train.py index a8dca7efd7..d157a3a307 100644 --- a/torchtitan/train.py +++ b/torchtitan/train.py @@ -163,6 +163,7 @@ def __init__(self, job_config: JobConfig): model_param_count, 
self.metrics_processor.num_flops_per_token, ) = model_args.get_nparams_and_flops(model, job_config.training.seq_len) + logger.info( f"{color.blue}Model {job_config.model.name} {job_config.model.flavor} " f"{color.red}size: {model_param_count:,} total parameters{color.reset}" @@ -245,6 +246,7 @@ def __init__(self, job_config: JobConfig): else: # apply PT-D Tensor Parallel, activation checkpointing, torch.compile, Data Parallel model = self.train_spec.parallelize_fn(model, parallel_dims, job_config) + model.to_empty(device=init_device) with torch.no_grad(): model.init_weights(buffer_device=buffer_device) From 71ff098cb3bb5786015238b0b8d1543cf60ba006 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 18 Nov 2025 13:27:21 +0000 Subject: [PATCH 122/129] typo in naming --- .../experiments/transformers_backend/model/model.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/model/model.py b/torchtitan/experiments/transformers_backend/model/model.py index 8c35ac4e94..3b589b4d43 100644 --- a/torchtitan/experiments/transformers_backend/model/model.py +++ b/torchtitan/experiments/transformers_backend/model/model.py @@ -17,7 +17,7 @@ from .args import HFTransformerModelArgs -class SlicableModuleDict(nn.ModuleDict): +class SliceableModuleDict(nn.ModuleDict): """ A ModuleDict that supports slicing like ModuleList. Keys are expected to be string representations of integers (e.g., "0", "1", "2"). @@ -30,8 +30,8 @@ def __getitem__(self, key): self.keys(), key=lambda x: int(x) if x.isdigit() else float("inf") ) sliced_keys = keys[key] - # Return a new SlicableModuleDict with the sliced items - return SlicableModuleDict({k: self[k] for k in sliced_keys}) + # Return a new SliceableModuleDict with the sliced items + return SliceableModuleDict({k: self[k] for k in sliced_keys}) return super().__getitem__(key) def __iter__(self): @@ -106,7 +106,7 @@ def __init__(self, model_args: HFTransformerModelArgs): # Convert ModuleList to ModuleDict to preserve original indices # This ensures state dict keys match checkpoint keys if isinstance(self.model.model.layers, nn.ModuleList): - self.model.model.layers = SlicableModuleDict( + self.model.model.layers = SliceableModuleDict( {str(i): layer for i, layer in enumerate(self.model.model.layers)} ) From 663a4157727949d76f6683a67a6dffec891a944e Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 18 Nov 2025 13:43:34 +0000 Subject: [PATCH 123/129] refactor --- torchtitan/distributed/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchtitan/distributed/utils.py b/torchtitan/distributed/utils.py index 790d84a5ed..60c05f1612 100644 --- a/torchtitan/distributed/utils.py +++ b/torchtitan/distributed/utils.py @@ -106,13 +106,13 @@ def set_determinism( if debug_config.deterministic: logger.info("Deterministic algorithm enabled (expect perf degradation).") torch.use_deterministic_algorithms(True) - # Otherwise, HF register buffer for ROPE (inv_freq) and this will be by default be initialized to Nan - torch.utils.deterministic.fill_uninitialized_memory = False torch.use_deterministic_algorithms( True, warn_only=debug_config.deterministic_warn_only ) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False + # Otherwise, Huggignface modeling register buffer for ROPE (inv_freq) and this will be by default be initialized to Nan + torch.utils.deterministic.fill_uninitialized_memory = False # env var for deterministic CuBLAS # 
https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" From 3dbe6fab79c503a440f82a535fd7d997a844ad49 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Tue, 18 Nov 2025 14:03:09 +0000 Subject: [PATCH 124/129] revert the way we select HF modeling in config --- .../transformers_backend/README.md | 11 +- .../transformers_backend/configs/qwen3.toml | 5 +- .../transformers_backend/infra/parallelize.py | 171 +++++++++++++++++- .../transformers_backend/infra/pipeline.py | 2 +- .../transformers_backend/job_config.py | 18 ++ .../transformers_backend/model/args.py | 2 +- torchtitan/protocols/train_spec.py | 4 - 7 files changed, 195 insertions(+), 18 deletions(-) create mode 100644 torchtitan/experiments/transformers_backend/job_config.py diff --git a/torchtitan/experiments/transformers_backend/README.md b/torchtitan/experiments/transformers_backend/README.md index 3d1a2dcf0d..805afb9ab9 100644 --- a/torchtitan/experiments/transformers_backend/README.md +++ b/torchtitan/experiments/transformers_backend/README.md @@ -4,19 +4,20 @@ - Requirements `transformers==4.57.1` -- Config: `torchtitan/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml` +- Config: `torchtitan/torchtitan/experiments/transformers_backend/configs/qwen3.toml` ```diff ... [model] - name = "llama3" -+ name = "Qwen/Qwen3-4B-Instruct-2507" ++ name = "transformers_backend" flavor = "debugmodel" hf_assets_path = "./tests/assets/tokenizer" + ++[hf_transformers] ++model = "Qwen/Qwen3-4B-Instruct-2507" ... ``` -**Note:** Any model name containing "/" is automatically recognized as a HuggingFace model ID and will use the `transformers_backend`. - -- Train: `LOG_RANK=7 CONFIG_FILE=/torchtitan/experiments/transformers_backend/configs/qwen3_fsdp2_tp2_pp2.toml ./run_train.sh --compile.enable` +- Train: `LOG_RANK=7 CONFIG_FILE=/torchtitan/experiments/transformers_backend/configs/qwen3.toml ./run_train.sh --job.custom_config_module=torchtitan.experiments.transformers_backend.job_config --compile.enable` - Make sure you have created the tokenizers beforehand image diff --git a/torchtitan/experiments/transformers_backend/configs/qwen3.toml b/torchtitan/experiments/transformers_backend/configs/qwen3.toml index b0e294ccbe..13e3f4ddf0 100644 --- a/torchtitan/experiments/transformers_backend/configs/qwen3.toml +++ b/torchtitan/experiments/transformers_backend/configs/qwen3.toml @@ -20,12 +20,15 @@ save_tb_folder = "tb" enable_wandb = false [model] -name = "Qwen/Qwen3-4B-Instruct-2507" +name = "transformers_backend" flavor = "debugmodel" # test folder with tokenizer.json, for debug purpose only hf_assets_path = "./tests/assets/tokenizer" # converters = ["float8"] +[hf_transformers] +model = "Qwen/Qwen3-4B-Instruct-2507" + [optimizer] name = "AdamW" lr = 8e-4 diff --git a/torchtitan/experiments/transformers_backend/infra/parallelize.py b/torchtitan/experiments/transformers_backend/infra/parallelize.py index cb68826e87..b2ae3f02a1 100644 --- a/torchtitan/experiments/transformers_backend/infra/parallelize.py +++ b/torchtitan/experiments/transformers_backend/infra/parallelize.py @@ -4,8 +4,10 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
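+# NOTE: the apply_fsdp added at the end of this file is a local copy of the
+# FSDP2 sharding path (fully_shard + MixedPrecisionPolicy, optional CPU offload,
+# a reshard_after_forward policy, and explicit forward/backward prefetch when
+# expert parallelism is enabled), kept here instead of importing it from
+# torchtitan.models.llama3.infra.parallelize.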
+import torch import torch.nn as nn from torch.distributed.device_mesh import DeviceMesh +from torch.distributed.fsdp import CPUOffloadPolicy, fully_shard, MixedPrecisionPolicy from torch.distributed.tensor import Replicate, Shard from torch.distributed.tensor.parallel import ( ColwiseParallel, @@ -15,17 +17,13 @@ SequenceParallel, ) from torchtitan.config import TORCH_DTYPE_MAP -from torchtitan.config.job_config import JobConfig from torchtitan.distributed import NoParallel, ParallelDims from torchtitan.distributed.activation_checkpoint import apply_ac from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp -from torchtitan.models.llama3.infra.parallelize import ( - apply_compile, - apply_ddp, - apply_fsdp, -) +from torchtitan.experiments.transformers_backend.job_config import JobConfig +from torchtitan.models.llama3.infra.parallelize import apply_compile, apply_ddp from torchtitan.tools.logging import logger @@ -274,3 +272,164 @@ def apply_non_moe_tp( f"Applied {'Float8 tensorwise ' if enable_float8_tensorwise_tp else ''}" "Tensor Parallelism to the model" ) + + +def apply_fsdp( + model: nn.Module, + dp_mesh: DeviceMesh, + param_dtype: torch.dtype, + reduce_dtype: torch.dtype, + pp_enabled: bool, + cpu_offload: bool = False, + reshard_after_forward_policy: str = "default", + ep_degree: int = 1, + dp_mod_ep_mesh: DeviceMesh | None = None, + gradient_divide_factor: int | None = None, +): + """ + Apply data parallelism (via FSDP2) to the model. + + Args: + model (nn.Module): The model to apply data parallelism to. + dp_mesh (DeviceMesh): The device mesh to use for data parallelism. + param_dtype (torch.dtype): The data type to use for model parameters. + reduce_dtype (torch.dtype): The data type to use for reduction operations. + pp_enabled (bool): Whether pipeline parallelism is enabled. + cpu_offload (bool, optional): Whether to offload model parameters to CPU. Defaults to False. + reshard_after_forward_policy (str, optional): The policy to use for resharding after forward pass. Defaults to "default". + Other options: "never", "always". + - "default" applies default resharding behavior, implementing "smart defaults" for known optimal scenarios. + - "always" will enable `reshard_after_forward` for all forward passes. + - "never" will disable `reshard_after_forward` for all forward passes. + + """ + mp_policy = MixedPrecisionPolicy(param_dtype=param_dtype, reduce_dtype=reduce_dtype) + fsdp_config = {"mesh": dp_mesh, "mp_policy": mp_policy} + if cpu_offload: + fsdp_config["offload_policy"] = CPUOffloadPolicy() + + match reshard_after_forward_policy: + case "always": + reshard_after_forward = True + case "never": + reshard_after_forward = False + case "default": + # For PP, by default do not reshard after forward to avoid per-microbatch + # all-gathers, which can be expensive and non-overlapped + reshard_after_forward = not pp_enabled + case _: + raise ValueError( + f"Invalid reshard_after_forward_policy: {reshard_after_forward_policy}." 
+ ) + + if model.tok_embeddings is not None: + fully_shard( + model.tok_embeddings, + **fsdp_config, + reshard_after_forward=reshard_after_forward, + ) + + for transformer_block in model.layers: + # NOTE: When EP is enabled, In an MoE layer, we use the following FSDP wrapping + # - the router and the shared experts are sharded together with the TransformerBlock + # - the routed experts are sharded with the remaining dp_mod_ep_mesh + if ( + hasattr(transformer_block, "moe_enabled") + and transformer_block.moe_enabled + and ep_degree > 1 + ): + fsdp_mod_ep_config = fsdp_config.copy() + fsdp_mod_ep_config["mesh"] = dp_mod_ep_mesh + moe_block = transformer_block.mlp + # NOTE: EP alreadys shards the routed experts on dim 0 (num_experts). + # When dp_mod_ep * ep > num_experts, FSDP default dim-0 sharding + # causes inefficiency, so we choose to do FSDP sharding on dim-1. + # Even when EP is not used, we may still want to shard the experts + # on non-0 dim. For now it may not be worth the complexity to support + # shard_placement_fn on the outer TransformerBlock-level FSDP. + _experts_shard_placement_fn = None + assert dp_mod_ep_mesh is not None + if dp_mod_ep_mesh.size() * ep_degree > moe_block.experts.num_experts: + _experts_shard_placement_fn = lambda param: Shard(1) + + fully_shard( + moe_block.experts, + **fsdp_mod_ep_config, + reshard_after_forward=reshard_after_forward, + shard_placement_fn=_experts_shard_placement_fn, + ) + + # NOTE: # Although the FSDP sharding of experts is done on a mesh of + # a different size than other parameters, the gradient division + # factor should be consistent with data. + moe_block.experts.set_gradient_divide_factor( + gradient_divide_factor, + ) + + fully_shard( + transformer_block, + **fsdp_config, + reshard_after_forward=reshard_after_forward, + ) + + # As an optimization, do not reshard_after_forward the last layers by default + # since FSDP would prefetch them immediately after the forward pass + if model.norm is not None and model.output is not None: + fully_shard( + [model.norm, model.output], + **fsdp_config, + reshard_after_forward=reshard_after_forward_policy == "always", + ) + + fully_shard(model, **fsdp_config) + + # NOTE: set up explicit prefetching when EP is enabled, as D2H syncs + # in EP could interfere with implicit prefetching in FSDP + if ep_degree == 1: + return + + # forward + transformer_blocks = list(model.layers.values()) + next_transformer_blocks = transformer_blocks[1:] + [None] + + if model.tok_embeddings is not None and model.layers is not None: + model.tok_embeddings.set_modules_to_forward_prefetch([transformer_blocks[0]]) + + for transformer_block, next_transformer_block in zip( + transformer_blocks, next_transformer_blocks + ): + if next_transformer_block is not None: + if next_transformer_block.moe_enabled: + transformer_block.set_modules_to_forward_prefetch( + [next_transformer_block, next_transformer_block.mlp.experts] + ) + else: + transformer_block.set_modules_to_forward_prefetch( + [next_transformer_block] + ) + elif model.norm is not None and model.output is not None: + transformer_block.set_modules_to_forward_prefetch( + [model.norm, model.output] + ) + + # backward + reversed_transformer_blocks = list(reversed(model.layers.values())) + prev_transformer_blocks = reversed_transformer_blocks[1:] + [None] + + if model.norm is not None and model.output is not None and model.layers is not None: + model.output.set_modules_to_backward_prefetch([reversed_transformer_blocks[0]]) + + for transformer_block, prev_transformer_block 
in zip( + reversed_transformer_blocks, prev_transformer_blocks + ): + if prev_transformer_block is not None: + if prev_transformer_block.moe_enabled: + transformer_block.set_modules_to_backward_prefetch( + [prev_transformer_block, prev_transformer_block.mlp.experts] + ) + else: + transformer_block.set_modules_to_backward_prefetch( + [prev_transformer_block] + ) + elif model.tok_embeddings is not None: + transformer_block.set_modules_to_backward_prefetch([model.tok_embeddings]) diff --git a/torchtitan/experiments/transformers_backend/infra/pipeline.py b/torchtitan/experiments/transformers_backend/infra/pipeline.py index c8904f4352..53aee86180 100644 --- a/torchtitan/experiments/transformers_backend/infra/pipeline.py +++ b/torchtitan/experiments/transformers_backend/infra/pipeline.py @@ -14,13 +14,13 @@ ) from torchtitan.components.loss import LossFunction -from torchtitan.config.job_config import JobConfig from torchtitan.distributed import ParallelDims from torchtitan.distributed.pipeline_parallel import ( build_pipeline_schedule, generate_llm_fqn_per_model_part, pipeline_module_split, ) +from torchtitan.experiments.transformers_backend.job_config import JobConfig from torchtitan.protocols.train_spec import BaseModelArgs, ParallelizeFunction diff --git a/torchtitan/experiments/transformers_backend/job_config.py b/torchtitan/experiments/transformers_backend/job_config.py new file mode 100644 index 0000000000..f3b1667798 --- /dev/null +++ b/torchtitan/experiments/transformers_backend/job_config.py @@ -0,0 +1,18 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from dataclasses import dataclass, field + + +@dataclass +class HFTransformers: + model: str = "" + """HuggingFace model ID (e.g., 'Qwen/Qwen3-4B-Instruct-2507')""" + + +@dataclass +class JobConfig: + hf_transformers: HFTransformers = field(default_factory=HFTransformers) diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index d261dcd5e4..4093f66194 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -150,7 +150,7 @@ def __repr__(self) -> str: def update_from_config(self, job_config: JobConfig): # Load HF config (overwrites our HF attributes) hf_model_config = AutoConfig.from_pretrained( - job_config.model.name, + job_config.hf_transformers.model, attn_implementation=self.attn_implementation, trust_remote_code=True, ) diff --git a/torchtitan/protocols/train_spec.py b/torchtitan/protocols/train_spec.py index 3eed6ddd2f..22bfa7df9b 100644 --- a/torchtitan/protocols/train_spec.py +++ b/torchtitan/protocols/train_spec.py @@ -77,10 +77,6 @@ def get_train_spec(name: str) -> TrainSpec: from torchtitan.experiments import _supported_experiments from torchtitan.models import _supported_models - if "/" in name: - module = import_module("torchtitan.experiments.transformers_backend") - return module.get_train_spec() - if name in _supported_models: module = import_module(f"torchtitan.models.{name}") return module.get_train_spec() From 9be95dac760a5006d7362de5d629683caddaeb75 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 19 Nov 2025 11:06:44 +0000 Subject: [PATCH 125/129] Revert "reuse pipeline from torchtitan" This reverts commit 09f0c94790a5817eb9c2f5d40f5d11236f7c79b9. 
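Context: the local pipeline helpers restored by this revert differ from torchtitan.distributed.pipeline_parallel in two ways, as noted in the file itself — pruned top-level modules are replaced with nn.Identity instead of None, and every model part keeps the shared rotary_emb module. The sketch below is a minimal illustration of why the nn.Identity placeholder matters for a HuggingFace-style forward that calls each submodule unconditionally; the TinyStage model, its names, and its sizes are made up for the example and are not code from this patch.

```python
import torch
import torch.nn as nn


class TinyStage(nn.Module):
    """Toy model whose pipeline stages are built by pruning top-level modules."""

    def __init__(self):
        super().__init__()
        self.tok_embeddings = nn.Embedding(100, 16)
        self.layers = nn.ModuleDict({"0": nn.Linear(16, 16), "1": nn.Linear(16, 16)})
        self.norm = nn.LayerNorm(16)
        self.output = nn.Linear(16, 100)

    def forward(self, inputs):
        # Every submodule is called unconditionally, mirroring the HF wrapper.
        h = self.tok_embeddings(inputs)
        for layer in self.layers.values():
            h = layer(h)
        return self.output(self.norm(h))


# Build "stage 1": it owns only layers.1, norm and output.
stage1 = TinyStage()
stage1.tok_embeddings = nn.Identity()  # placeholder instead of None
del stage1.layers["0"]

hidden = torch.randn(2, 4, 16)   # activations received from stage 0
out = stage1(hidden)             # nn.Identity passes them straight through
assert out.shape == (2, 4, 100)  # a None placeholder would raise TypeError here
```

A plain None placeholder would make the unconditional call on later stages fail, which is the behavior this revert avoids for the HF-backed model.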
--- torchtitan/distributed/pipeline_parallel.py | 19 +- .../transformers_backend/__init__.py | 2 +- .../transformers_backend/infra/pipeline.py | 142 ------- .../infra/pipeline_parallel.py | 390 ++++++++++++++++++ 4 files changed, 394 insertions(+), 159 deletions(-) delete mode 100644 torchtitan/experiments/transformers_backend/infra/pipeline.py create mode 100644 torchtitan/experiments/transformers_backend/infra/pipeline_parallel.py diff --git a/torchtitan/distributed/pipeline_parallel.py b/torchtitan/distributed/pipeline_parallel.py index b954d32c19..06dba40d6f 100644 --- a/torchtitan/distributed/pipeline_parallel.py +++ b/torchtitan/distributed/pipeline_parallel.py @@ -228,7 +228,6 @@ def generate_llm_fqn_per_model_part( num_layers: int, input_weight: int = 1, output_weight: int = 1, - include_rotary_emb: bool = False, ) -> list[list[str]]: """ Programmatically generates module names model part, focused on LLMs models. @@ -238,7 +237,6 @@ def generate_llm_fqn_per_model_part( num_layers: Total number of transformer layers in the model input_weight: Weight for input modules (tok_embeddings) in layer calculation output_weight: Weight for output modules (norm + output) in layer calculation - include_rotary_emb: Whether to include rotary_emb in each model part Returns: List of lists containing module names for each model part @@ -253,10 +251,7 @@ def generate_llm_fqn_per_model_part( if num_stages == 1: # Single stage gets everything layer_names = [f"layers.{i}" for i in range(num_layers)] - result = [["tok_embeddings"] + layer_names + ["norm", "output"]] - if include_rotary_emb: - result[0].append("rotary_emb") - return result + return [["tok_embeddings"] + layer_names + ["norm", "output"]] # Calculate effective layers including weights num_effective_layers = num_layers + input_weight + output_weight @@ -334,8 +329,6 @@ def generate_llm_fqn_per_model_part( stage_modules.append(f"layers.{current_layer}") current_layer += 1 - if include_rotary_emb: - stage_modules.append("rotary_emb") module_names_per_stage.append(stage_modules) return module_names_per_stage @@ -347,7 +340,6 @@ def pipeline_module_split( pp_schedule: str, device: torch.device, module_names_per_stage: list[list[str]], - use_identity_for_missing_modules: bool = False, ) -> tuple[list[PipelineStage], list[nn.Module]]: """ This API creates pipeline stages based on specified module names for each stage. 
@@ -369,8 +361,6 @@ def pipeline_module_split( - "layers.0", "layers.1" for specific transformer layers - "norm" for the final normalization layer - "output" for the output projection layer - use_identity_for_missing_modules: If True, replace missing modules with nn.Identity(), - otherwise replace with None Returns: Tuple of (stages, models) where stages are PipelineStage objects and models are the @@ -427,11 +417,8 @@ def _build_stage_from_modules( setattr(model, module_name, nn.ModuleList()) # Handle simple module attributes (e.g., "linear", "norm") elif module_name not in modules_to_keep: - # Replace with Identity or None based on configuration - replacement = ( - nn.Identity() if use_identity_for_missing_modules else None - ) - setattr(model, module_name, replacement) + # Replace with None + setattr(model, module_name, None) stage = PipelineStage( model, diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index aec28a0bdd..dc4322623b 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -12,7 +12,7 @@ from .infra.parallelize import parallelize_hf_transformers -from .infra.pipeline import pipeline_hf_transformers +from .infra.pipeline_parallel import pipeline_hf_transformers from .model.args import HFTransformerModelArgs, TitanDenseModelArgs from .model.model import HFTransformerModel diff --git a/torchtitan/experiments/transformers_backend/infra/pipeline.py b/torchtitan/experiments/transformers_backend/infra/pipeline.py deleted file mode 100644 index 53aee86180..0000000000 --- a/torchtitan/experiments/transformers_backend/infra/pipeline.py +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
-import math - -import torch -import torch.nn as nn -from torch.distributed.pipelining.schedules import ( - _PipelineSchedule, - get_schedule_class, - PipelineScheduleSingle, -) - -from torchtitan.components.loss import LossFunction -from torchtitan.distributed import ParallelDims -from torchtitan.distributed.pipeline_parallel import ( - build_pipeline_schedule, - generate_llm_fqn_per_model_part, - pipeline_module_split, -) -from torchtitan.experiments.transformers_backend.job_config import JobConfig -from torchtitan.protocols.train_spec import BaseModelArgs, ParallelizeFunction - - -def pipeline_hf_transformers( - model: nn.Module, - parallel_dims: ParallelDims, - job_config: JobConfig, - device: torch.device, - model_args: BaseModelArgs, - parallelize_fn: ParallelizeFunction, - loss_fn: LossFunction, -) -> tuple[_PipelineSchedule, list[nn.Module], bool, bool]: - pp_mesh = parallel_dims.world_mesh["pp"] - - # Determine the number of virtual stages based on schedule type - schedule_class = get_schedule_class( - job_config.parallelism.pipeline_parallel_schedule - ) - is_single_stage_schedule = issubclass(schedule_class, PipelineScheduleSingle) - layers_per_stage = job_config.parallelism.pipeline_parallel_layers_per_stage - if hasattr(model_args, "n_layers"): - num_layers = model_args.n_layers - else: - raise ValueError("Model does not have n_layers attribute.") - - # You can adjust these weights based on the computational cost of embeddings and output layers - # Higher weights mean these modules are treated as "heavier" in the distribution - input_weight = job_config.parallelism.pipeline_parallel_first_stage_less_layers - output_weight = job_config.parallelism.pipeline_parallel_last_stage_less_layers - - # Calculate number of virtual stages - if layers_per_stage is not None: - - # Calculate number of virtual stages needed (using ceiling division) - # This allows for unequal distribution where stages can differ by at most 1 layer - num_virtual_stages = math.ceil( - (num_layers + input_weight + output_weight) / layers_per_stage - ) - - # Validation: check stages per rank based on schedule type - model_config_info = f"Model has {num_layers} layers with pipeline_parallel_layers_per_stage={layers_per_stage}" - stage_distribution_info = ( - f"resulting in {num_virtual_stages=} across {parallel_dims.pp} PP ranks" - ) - - if num_virtual_stages % parallel_dims.pp != 0: - raise ValueError( - f"Number of virtual stages ({num_virtual_stages}) must be divisible by " - f"pipeline parallel size ({parallel_dims.pp}). " - f"{model_config_info}. " - f"Please adjust pipeline_parallel_layers_per_stage to a value that results in a number of stages " - f"divisible by {parallel_dims.pp}." - ) - - stages_per_rank = num_virtual_stages // parallel_dims.pp - - if is_single_stage_schedule and stages_per_rank != 1: - raise ValueError( - f"Single stage schedule requires exactly 1 stage per rank, but got {stages_per_rank} stages per rank. " - f"{model_config_info}, {stage_distribution_info}. " - f"Please increase pipeline_parallel_layers_per_stage to {num_layers // parallel_dims.pp} or higher " - f"to achieve 1 stage per rank." - ) - - if not is_single_stage_schedule and stages_per_rank < 2: - raise ValueError( - f"Multi-stage schedule requires at least 2 stages per rank, but got {stages_per_rank} stages per rank. " - f"{model_config_info}, {stage_distribution_info}. " - f"Please decrease pipeline_parallel_layers_per_stage to achieve at least 2 stages per rank." 
- ) - else: - # Fallback to default behavior when layers_per_stage is not provided - # For multi-stage schedules, default is 2 virtual stages per rank - # For single-stage schedules, default is 1 virtual stage per rank - stages_per_rank = 1 if is_single_stage_schedule else 2 - num_virtual_stages = parallel_dims.pp * stages_per_rank - - module_names_per_stage = job_config.parallelism.module_fqns_per_model_part - if module_names_per_stage is None: - module_names_per_stage = generate_llm_fqn_per_model_part( - num_virtual_stages, - num_layers, - input_weight, - output_weight, - include_rotary_emb=True, - ) - - stages, model_parts = pipeline_module_split( - model, - pp_mesh, - job_config.parallelism.pipeline_parallel_schedule, - device, - module_names_per_stage, - use_identity_for_missing_modules=True, - ) - - # For PP with looped schedules, each item in model_parts is one stage-model-chunk. - # We need to iterate through model_parts to apply SPMD parallelisms, compilation, - # optimizer, and checkpointing - for i, m in enumerate(model_parts): - # apply SPMD-style PT-D techniques - m = parallelize_fn(m, parallel_dims, job_config) - model_parts[i] = m - # NOTE: this is to update the model in the stage - # in case the model is modified e.g. by torch.compile - stages[i].submod = m - - pp_schedule = build_pipeline_schedule(job_config, stages, loss_fn) - - # This is used in the train loop to determine whether to pass in the input_ids and labels - has_first_stage = False - has_last_stage = False - for stage in stages: - if stage.is_first: - has_first_stage = True - if stage.is_last: - has_last_stage = True - - return pp_schedule, model_parts, has_first_stage, has_last_stage diff --git a/torchtitan/experiments/transformers_backend/infra/pipeline_parallel.py b/torchtitan/experiments/transformers_backend/infra/pipeline_parallel.py new file mode 100644 index 0000000000..8610b201dc --- /dev/null +++ b/torchtitan/experiments/transformers_backend/infra/pipeline_parallel.py @@ -0,0 +1,390 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +import copy +import math + +import torch +import torch.nn as nn +from torch.distributed.device_mesh import DeviceMesh +from torch.distributed.pipelining import PipelineStage +from torch.distributed.pipelining.schedules import ( + _PipelineSchedule, + get_schedule_class, + PipelineScheduleSingle, + ScheduleDualPipeV, + ScheduleZBVZeroBubble, +) + +from torchtitan.components.loss import LossFunction +from torchtitan.distributed import ParallelDims +from torchtitan.distributed.pipeline_parallel import build_pipeline_schedule +from torchtitan.protocols.train_spec import BaseModelArgs, ParallelizeFunction +from torchtitan.experiments.transformers_backend.job_config import JobConfig +from torchtitan.tools.logging import logger + +# NOTE(3outeille): the only modifications comes from replacing None to nn.Identity and adding rotary_emb per model_part + +def generate_llm_fqn_per_model_part( + num_stages: int, + num_layers: int, + input_weight: int = 1, + output_weight: int = 1, +) -> list[list[str]]: + """ + Programmatically generates module names model part, focused on LLMs models. 
+ Args: + num_stages: Number of pipeline stages + num_layers: Total number of transformer layers in the model + input_weight: Weight for input modules (embed_tokens) in layer calculation + output_weight: Weight for output modules (norm + output) in layer calculation + Returns: + List of lists containing module names for each model part + Example: + generate_llm_fqn_per_model_part(2, 3, input_weight=2, output_weight=2) + treats embeddings as 2 layers and norm+output as 2 layers for distribution + """ + if num_stages < 1: + raise ValueError("Number of stages must be at least 1") + + if num_stages == 1: + # Single stage gets everything + layer_names = [f"layers.{i}" for i in range(num_layers)] + return [["tok_embeddings"] + layer_names + ["norm", "output", "rotary_emb"]] + + # Calculate effective layers including weights + num_effective_layers = num_layers + input_weight + output_weight + + if num_stages > num_effective_layers: + raise ValueError( + f"Number of stages ({num_stages}) cannot be greater than effective layers ({num_effective_layers})" + ) + + # Calculate layers per stage (distribute evenly) + layers_per_stage = num_effective_layers // num_stages + extra_layers = num_effective_layers % num_stages + + # Feasibility check: Ensure at least 1 layer in each PP stage + if layers_per_stage == 0: + raise ValueError( + f"Configuration would result in empty stages. " + f"With {num_stages} stages and {num_effective_layers} effective layers " + f"(num_layers={num_layers} + input_weight={input_weight} + output_weight={output_weight}), " + f"each stage would get {layers_per_stage} layers on average. " + f"Reduce num_stages or increase num_layers/weights." + ) + + # Balance check: Ensure weights don't exceed minimum layers per stage + if input_weight > layers_per_stage: + raise ValueError( + f"input_weight ({input_weight}) exceeds minimum layers per stage ({layers_per_stage})." + ) + if output_weight > layers_per_stage: + raise ValueError( + f"output_weight ({output_weight}) exceeds minimum layers per stage ({layers_per_stage})." 
+ ) + + module_names_per_stage = [] + current_layer = 0 + + for stage_idx in range(num_stages): + stage_modules = [] + + # Calculate effective layers for this stage + effective_layers_for_stage = layers_per_stage + if stage_idx < extra_layers: + effective_layers_for_stage += 1 + + # First stage: handle input modules with weighting + if stage_idx == 0: + stage_modules.append("tok_embeddings") + # Account for input weight in layer distribution + remaining_layers_for_stage = effective_layers_for_stage - input_weight + + # Add transformer layers + for _ in range(remaining_layers_for_stage): + if current_layer < num_layers: + stage_modules.append(f"layers.{current_layer}") + current_layer += 1 + + # Last stage: handle output modules with weighting + elif stage_idx == num_stages - 1: + # Account for output weight in layer distribution + remaining_layers_for_stage = effective_layers_for_stage - output_weight + + # Add transformer layers + for _ in range(remaining_layers_for_stage): + if current_layer < num_layers: + stage_modules.append(f"layers.{current_layer}") + current_layer += 1 + + # Add output modules + stage_modules.extend(["norm", "output"]) + + # Middle stages: only transformer layers + else: + for _ in range(effective_layers_for_stage): + if current_layer < num_layers: + stage_modules.append(f"layers.{current_layer}") + current_layer += 1 + + stage_modules.append("rotary_emb") + module_names_per_stage.append(stage_modules) + + return module_names_per_stage + + +def pipeline_module_split( + whole_model: nn.Module, + pp_mesh: DeviceMesh, + pp_schedule: str, + device: torch.device, + module_names_per_stage: list[list[str]], +) -> tuple[list[PipelineStage], list[nn.Module]]: + """ + This API creates pipeline stages based on specified module names for each stage. + + Some model restrictions include: + - forward() method should tolerate deleted layers + - weight initialization methods should tolerate deleted layers + - Does not support nested moduledict and modulelist structures + + Args: + whole_model: The complete model to be split + pp_mesh: Pipeline parallel device mesh + pp_schedule: Name of pipeline parallelism schedule + device: Device + module_names_per_stage: List of lists, where each inner list contains the module names + that should be included in that stage. Module names should be + dot-separated paths. 
Examples: + - "tok_embeddings" for token embeddings + - "layers.0", "layers.1" for specific transformer layers + - "norm" for the final normalization layer + - "output" for the output projection layer + + Returns: + Tuple of (stages, models) where stages are PipelineStage objects and models are the + corresponding model chunks + + Example usage: + module_names_per_stage = [ + ["tok_embeddings", "layers.0"], # Stage 0: embeddings + first layer + ["layers.1", "layers.2"], # Stage 1: middle layers + ["norm", "output"] # Stage 2: final norm + output + ] + """ + pp_rank = pp_mesh.get_local_rank() + pp_degree = pp_mesh.size() + + def _build_stage_from_modules( + stage_idx: int, module_names: list[str], num_stages: int + ) -> tuple[PipelineStage, nn.Module]: + model = copy.deepcopy(whole_model) + + # Create a set of modules to keep for faster lookup + modules_to_keep = set(module_names) + for module_name, module_value in model.named_children(): + # Handle layer-like structures (e.g., "layers.0", "layers.1") + if isinstance(module_value, (nn.ModuleDict, nn.ModuleList)): + layers_to_keep = { + name.split(".", 1)[1] + for name in modules_to_keep + if name.startswith(f"{module_name}.") + } + if layers_to_keep: + # Keep only specified layers + if isinstance(module_value, nn.ModuleDict): + for layer_name in list(module_value.keys()): + if layer_name not in layers_to_keep: + del module_value[layer_name] + elif isinstance(module_value, nn.ModuleList): + indices_to_keep = { + int(idx) for idx in layers_to_keep if idx.isdigit() + } + new_layers = nn.ModuleList( + [ + layer + for i, layer in enumerate(module_value) + if i in indices_to_keep + ] + ) + setattr(model, module_name, new_layers) + else: + # No layers from this structure needed, set to empty structure + if isinstance(module_value, nn.ModuleDict): + setattr(model, module_name, nn.ModuleDict()) + elif isinstance(module_value, nn.ModuleList): + setattr(model, module_name, nn.ModuleList()) + # Handle simple module attributes (e.g., "linear", "norm") + elif module_name not in modules_to_keep: + # Replace with Identity + setattr(model, module_name, nn.Identity()) + + stage = PipelineStage( + model, + stage_idx, + num_stages, + device, + group=pp_mesh.get_group("pp"), + ) + return stage, model + + num_stages = len(module_names_per_stage) + stages = [] + models = [] + + schedule_class = get_schedule_class(pp_schedule) + style = ( + "v" if schedule_class in (ScheduleZBVZeroBubble, ScheduleDualPipeV) else "loop" + ) + + def _get_stage_indices() -> tuple[int]: + """ + Compute the stage ids for the stages that will run on this pp rank + for either a looped or V style schedule + """ + assert ( + num_stages % pp_degree == 0 + ), f"num_stages {num_stages} must be evenly divisible by pp_degree {pp_degree}" + stages_per_rank = num_stages // pp_degree + if style == "loop": + return tuple(pp_rank + s * pp_degree for s in range(stages_per_rank)) + elif style == "v": + assert ( + stages_per_rank == 2 + ), f"v schedules assume 2 stages per rank, got {stages_per_rank}" + stage_v_pairs = list( + zip(range(pp_degree), range(num_stages - 1, pp_degree - 1, -1)) + ) + return stage_v_pairs[pp_rank] + + for stage_idx in _get_stage_indices(): + module_names = module_names_per_stage[stage_idx] + stage, model_chunk = _build_stage_from_modules( + stage_idx, + module_names, + num_stages, + ) + logger.info( + f"PP rank {pp_rank} is building stage_idx {stage_idx} " + f"with modules {module_names}" + ) + stages.append(stage) + models.append(model_chunk) + + return stages, models + + 
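+# Illustrative example of the split that pipeline_hf_transformers (below)
+# requests from generate_llm_fqn_per_model_part: with 2 virtual stages,
+# 6 layers, and the default input/output weights,
+#   generate_llm_fqn_per_model_part(2, 6)
+#   -> [["tok_embeddings", "layers.0", "layers.1", "layers.2", "rotary_emb"],
+#       ["layers.3", "layers.4", "layers.5", "norm", "output", "rotary_emb"]]
+# Every model part carries "rotary_emb", and modules a stage does not own are
+# replaced with nn.Identity by pipeline_module_split above.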
+def pipeline_hf_transformers( + model: nn.Module, + parallel_dims: ParallelDims, + job_config: JobConfig, + device: torch.device, + model_args: BaseModelArgs, + parallelize_fn: ParallelizeFunction, + loss_fn: LossFunction, +) -> tuple[_PipelineSchedule, list[nn.Module], bool, bool]: + pp_mesh = parallel_dims.world_mesh["pp"] + + # Determine the number of virtual stages based on schedule type + schedule_class = get_schedule_class( + job_config.parallelism.pipeline_parallel_schedule + ) + is_single_stage_schedule = issubclass(schedule_class, PipelineScheduleSingle) + layers_per_stage = job_config.parallelism.pipeline_parallel_layers_per_stage + if hasattr(model_args, "n_layers"): + num_layers = model_args.n_layers + else: + raise ValueError("Model does not have n_layers attribute.") + + # You can adjust these weights based on the computational cost of embeddings and output layers + # Higher weights mean these modules are treated as "heavier" in the distribution + input_weight = job_config.parallelism.pipeline_parallel_first_stage_less_layers + output_weight = job_config.parallelism.pipeline_parallel_last_stage_less_layers + + # Calculate number of virtual stages + if layers_per_stage is not None: + + # Calculate number of virtual stages needed (using ceiling division) + # This allows for unequal distribution where stages can differ by at most 1 layer + num_virtual_stages = math.ceil( + (num_layers + input_weight + output_weight) / layers_per_stage + ) + + # Validation: check stages per rank based on schedule type + model_config_info = f"Model has {num_layers} layers with pipeline_parallel_layers_per_stage={layers_per_stage}" + stage_distribution_info = ( + f"resulting in {num_virtual_stages=} across {parallel_dims.pp} PP ranks" + ) + + if num_virtual_stages % parallel_dims.pp != 0: + raise ValueError( + f"Number of virtual stages ({num_virtual_stages}) must be divisible by " + f"pipeline parallel size ({parallel_dims.pp}). " + f"{model_config_info}. " + f"Please adjust pipeline_parallel_layers_per_stage to a value that results in a number of stages " + f"divisible by {parallel_dims.pp}." + ) + + stages_per_rank = num_virtual_stages // parallel_dims.pp + + if is_single_stage_schedule and stages_per_rank != 1: + raise ValueError( + f"Single stage schedule requires exactly 1 stage per rank, but got {stages_per_rank} stages per rank. " + f"{model_config_info}, {stage_distribution_info}. " + f"Please increase pipeline_parallel_layers_per_stage to {num_layers // parallel_dims.pp} or higher " + f"to achieve 1 stage per rank." + ) + + if not is_single_stage_schedule and stages_per_rank < 2: + raise ValueError( + f"Multi-stage schedule requires at least 2 stages per rank, but got {stages_per_rank} stages per rank. " + f"{model_config_info}, {stage_distribution_info}. " + f"Please decrease pipeline_parallel_layers_per_stage to achieve at least 2 stages per rank." 
+ ) + else: + # Fallback to default behavior when layers_per_stage is not provided + # For multi-stage schedules, default is 2 virtual stages per rank + # For single-stage schedules, default is 1 virtual stage per rank + stages_per_rank = 1 if is_single_stage_schedule else 2 + num_virtual_stages = parallel_dims.pp * stages_per_rank + + module_names_per_stage = job_config.parallelism.module_fqns_per_model_part + if module_names_per_stage is None: + module_names_per_stage = generate_llm_fqn_per_model_part( + num_virtual_stages, num_layers, input_weight, output_weight + ) + + stages, model_parts = pipeline_module_split( + model, + pp_mesh, + job_config.parallelism.pipeline_parallel_schedule, + device, + module_names_per_stage, + ) + + # For PP with looped schedules, each item in model_parts is one stage-model-chunk. + # We need to iterate through model_parts to apply SPMD parallelisms, compilation, + # optimizer, and checkpointing + for i, m in enumerate(model_parts): + # apply SPMD-style PT-D techniques + m = parallelize_fn(m, parallel_dims, job_config) + model_parts[i] = m + # NOTE: this is to update the model in the stage + # in case the model is modified e.g. by torch.compile + stages[i].submod = m + + pp_schedule = build_pipeline_schedule(job_config, stages, loss_fn) + + # This is used in the train loop to determine whether to pass in the input_ids and labels + has_first_stage = False + has_last_stage = False + for stage in stages: + if stage.is_first: + has_first_stage = True + if stage.is_last: + has_last_stage = True + + return pp_schedule, model_parts, has_first_stage, has_last_stage From c0c273c5e9071db19c7e735ffd360860cc2cff2e Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 19 Nov 2025 11:24:46 +0000 Subject: [PATCH 126/129] pass deterministic.fill_uninitialized_memory to HF model --- torchtitan/distributed/utils.py | 2 -- torchtitan/experiments/transformers_backend/model/args.py | 2 ++ torchtitan/experiments/transformers_backend/model/model.py | 4 ++++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/torchtitan/distributed/utils.py b/torchtitan/distributed/utils.py index 60c05f1612..b209ddfd68 100644 --- a/torchtitan/distributed/utils.py +++ b/torchtitan/distributed/utils.py @@ -111,8 +111,6 @@ def set_determinism( ) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False - # Otherwise, Huggignface modeling register buffer for ROPE (inv_freq) and this will be by default be initialized to Nan - torch.utils.deterministic.fill_uninitialized_memory = False # env var for deterministic CuBLAS # https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" diff --git a/torchtitan/experiments/transformers_backend/model/args.py b/torchtitan/experiments/transformers_backend/model/args.py index 4093f66194..25ab328f15 100644 --- a/torchtitan/experiments/transformers_backend/model/args.py +++ b/torchtitan/experiments/transformers_backend/model/args.py @@ -171,6 +171,8 @@ def update_from_config(self, job_config: JobConfig): self.max_seq_len = job_config.training.seq_len + self.deterministic = job_config.debug.deterministic + # Configure HF-specific settings to match TorchTitan settings # TODO: false ? 
self.attention_bias = False diff --git a/torchtitan/experiments/transformers_backend/model/model.py b/torchtitan/experiments/transformers_backend/model/model.py index 3b589b4d43..2b42a1abc6 100644 --- a/torchtitan/experiments/transformers_backend/model/model.py +++ b/torchtitan/experiments/transformers_backend/model/model.py @@ -50,6 +50,10 @@ class HFTransformerModel(nn.Module): def __init__(self, model_args: HFTransformerModelArgs): super().__init__() + #NOTE(3outeille): This prevents Hugging Face modeling from initializing ROPE (inv_freq) buffers to NaN. Usefull when loading from seed checkpoint. + if hasattr(model_args, 'deterministic') and model_args.deterministic: + torch.utils.deterministic.fill_uninitialized_memory = False + # Try to import the model class dynamically from the transformers library if not found in globals model_class_name = model_args.architectures[0] model_cls = globals().get(model_class_name, None) From 4c50a0005a93c606ad2cb1cb3157663c3458c4c9 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 19 Nov 2025 11:27:07 +0000 Subject: [PATCH 127/129] fix linting --- .../transformers_backend/infra/pipeline_parallel.py | 3 ++- torchtitan/experiments/transformers_backend/model/model.py | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/torchtitan/experiments/transformers_backend/infra/pipeline_parallel.py b/torchtitan/experiments/transformers_backend/infra/pipeline_parallel.py index 8610b201dc..04452c5ede 100644 --- a/torchtitan/experiments/transformers_backend/infra/pipeline_parallel.py +++ b/torchtitan/experiments/transformers_backend/infra/pipeline_parallel.py @@ -21,12 +21,13 @@ from torchtitan.components.loss import LossFunction from torchtitan.distributed import ParallelDims from torchtitan.distributed.pipeline_parallel import build_pipeline_schedule -from torchtitan.protocols.train_spec import BaseModelArgs, ParallelizeFunction from torchtitan.experiments.transformers_backend.job_config import JobConfig +from torchtitan.protocols.train_spec import BaseModelArgs, ParallelizeFunction from torchtitan.tools.logging import logger # NOTE(3outeille): the only modifications comes from replacing None to nn.Identity and adding rotary_emb per model_part + def generate_llm_fqn_per_model_part( num_stages: int, num_layers: int, diff --git a/torchtitan/experiments/transformers_backend/model/model.py b/torchtitan/experiments/transformers_backend/model/model.py index 2b42a1abc6..b88fffc54b 100644 --- a/torchtitan/experiments/transformers_backend/model/model.py +++ b/torchtitan/experiments/transformers_backend/model/model.py @@ -50,8 +50,9 @@ class HFTransformerModel(nn.Module): def __init__(self, model_args: HFTransformerModelArgs): super().__init__() - #NOTE(3outeille): This prevents Hugging Face modeling from initializing ROPE (inv_freq) buffers to NaN. Usefull when loading from seed checkpoint. - if hasattr(model_args, 'deterministic') and model_args.deterministic: + # NOTE(3outeille): This prevents Hugging Face modeling from initializing ROPE (inv_freq) buffers to NaN. + # Needed when loading from seed checkpoint. 
+ if hasattr(model_args, "deterministic") and model_args.deterministic: torch.utils.deterministic.fill_uninitialized_memory = False # Try to import the model class dynamically from the transformers library if not found in globals From 5b8d38c1c32f0e8cadad6c08ace83e87adad8e8b Mon Sep 17 00:00:00 2001 From: 3outeille Date: Wed, 19 Nov 2025 11:47:27 +0000 Subject: [PATCH 128/129] fix integration tests --- .../transformers_backend/tests/integration_tests.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torchtitan/experiments/transformers_backend/tests/integration_tests.py b/torchtitan/experiments/transformers_backend/tests/integration_tests.py index 5629b45f5c..f8a5c4e7e3 100644 --- a/torchtitan/experiments/transformers_backend/tests/integration_tests.py +++ b/torchtitan/experiments/transformers_backend/tests/integration_tests.py @@ -21,7 +21,8 @@ def build_transformers_backend_test_list() -> list[OverrideDefinitions]: OverrideDefinitions( [ [ - "--model.name meta-llama/Llama-3.2-1B", + "--job.custom_config_module=torchtitan.experiments.transformers_backend.job_config", + "--hf_transformers.model Qwen/Qwen2.5-7B", "--parallelism.data_parallel_shard_degree 2", "--parallelism.tensor_parallel_degree 2", "--parallelism.pipeline_parallel_degree 2", From 57bb8dd872b9253f5441f80f8d125a52c2a43074 Mon Sep 17 00:00:00 2001 From: 3outeille Date: Thu, 20 Nov 2025 17:38:29 +0000 Subject: [PATCH 129/129] fix minor stuff --- torchtitan/experiments/README.md | 2 +- .../transformers_backend/__init__.py | 2 +- .../configs/{qwen3.toml => debug_model.toml} | 8 +- .../transformers_backend/configs/full.toml | 87 +++++++++++++++++++ .../{pipeline_parallel.py => pipeline.py} | 0 .../tests/integration_tests.py | 1 + 6 files changed, 93 insertions(+), 7 deletions(-) rename torchtitan/experiments/transformers_backend/configs/{qwen3.toml => debug_model.toml} (91%) create mode 100644 torchtitan/experiments/transformers_backend/configs/full.toml rename torchtitan/experiments/transformers_backend/infra/{pipeline_parallel.py => pipeline.py} (100%) diff --git a/torchtitan/experiments/README.md b/torchtitan/experiments/README.md index 9b25cdc7a6..08dc692bf9 100644 --- a/torchtitan/experiments/README.md +++ b/torchtitan/experiments/README.md @@ -31,4 +31,4 @@ We provide this `experiments/` folder to host experiments that add significant v | [moe_symm_mem_kernels](./moe_symm_mem_kernels/) | TBA | [@kwen2501](https://github.com/kwen2501) | | [gpt_oss](./gpt_oss/) | TBA | [@jianiw](https://github.com/jianiw) | | [compiler_toolkit](./compiler_toolkit/) | [![Compiler Toolkit 8 GPU Integration Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_compiler_toolkit.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_compiler_toolkit.yaml?query=branch%3Amain) | [@SherlockNoMad](https://github.com/SherlockNoMad) [@yiming0416](https://github.com/yiming0416) | -| [transformers_backend](./transformers_backend/) | ![Transformers Backend 8 GPU Integration Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_transformers_backend.yaml/badge.svg?branch=main) | [@3outeille](https://github.com/3outeille) | +| [transformers_backend](./transformers_backend/) | [![Transformers backend 8 GPU Integration 
Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_transformers_backend.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_transformers_backend.yaml?query=branch%3Amain) | [@3outeille](https://github.com/3outeille) | diff --git a/torchtitan/experiments/transformers_backend/__init__.py b/torchtitan/experiments/transformers_backend/__init__.py index dc4322623b..aec28a0bdd 100644 --- a/torchtitan/experiments/transformers_backend/__init__.py +++ b/torchtitan/experiments/transformers_backend/__init__.py @@ -12,7 +12,7 @@ from .infra.parallelize import parallelize_hf_transformers -from .infra.pipeline_parallel import pipeline_hf_transformers +from .infra.pipeline import pipeline_hf_transformers from .model.args import HFTransformerModelArgs, TitanDenseModelArgs from .model.model import HFTransformerModel diff --git a/torchtitan/experiments/transformers_backend/configs/qwen3.toml b/torchtitan/experiments/transformers_backend/configs/debug_model.toml similarity index 91% rename from torchtitan/experiments/transformers_backend/configs/qwen3.toml rename to torchtitan/experiments/transformers_backend/configs/debug_model.toml index 13e3f4ddf0..7b3de04b87 100644 --- a/torchtitan/experiments/transformers_backend/configs/qwen3.toml +++ b/torchtitan/experiments/transformers_backend/configs/debug_model.toml @@ -47,16 +47,14 @@ max_norm = 1.0 # grad norm clipping steps = 10 dataset = "c4_test" # supported datasets: c4_test (2K), c4 (177M) dataset_path = "./tests/assets/c4_test" -mixed_precision_param = "float32" # force float32 for comparison -mixed_precision_reduce = "float32" [parallelism] data_parallel_replicate_degree = 1 -data_parallel_shard_degree = 2 +data_parallel_shard_degree = -1 fsdp_reshard_after_forward = "default" # default / never / always -tensor_parallel_degree = 2 +tensor_parallel_degree = 1 enable_async_tensor_parallel = false -pipeline_parallel_degree = 2 +pipeline_parallel_degree = 1 pipeline_parallel_schedule = "1F1B" context_parallel_degree = 1 expert_parallel_degree = 1 diff --git a/torchtitan/experiments/transformers_backend/configs/full.toml b/torchtitan/experiments/transformers_backend/configs/full.toml new file mode 100644 index 0000000000..45eaa785de --- /dev/null +++ b/torchtitan/experiments/transformers_backend/configs/full.toml @@ -0,0 +1,87 @@ +# torchtitan Config.toml + +[job] +dump_folder = "./outputs" +description = "Qwen 3 full training" +print_config = true + +[profiling] +enable_profiling = false +save_traces_folder = "profile_trace" +profile_freq = 5 +enable_memory_snapshot = false +save_memory_snapshot_folder = "memory_snapshot" + +[metrics] +log_freq = 1 +disable_color_printing = false +enable_tensorboard = false +save_tb_folder = "tb" +enable_wandb = false + +[model] +name = "transformers_backend" +flavor = "full" +# test folder with tokenizer.json, for debug purpose only +hf_assets_path = "./tests/assets/tokenizer" +# converters = ["float8"] + +[hf_transformers] +model = "Qwen/Qwen3-4B-Instruct-2507" + +[optimizer] +name = "AdamW" +lr = 8e-4 +eps = 1e-8 + +[lr_scheduler] +warmup_steps = 2 # lr scheduler warm up, normally 20% of the train steps +decay_ratio = 0.8 # lr scheduler decay ratio, 80% of the train steps +decay_type = "linear" +min_lr_factor = 0.0 + +[training] +local_batch_size = 2 +seq_len = 2048 +max_norm = 1.0 # grad norm clipping +steps = 10 +dataset = "c4" # supported datasets: c4_test (2K), c4 (177M) + +[parallelism] +data_parallel_replicate_degree = 1 
+data_parallel_shard_degree = -1 +fsdp_reshard_after_forward = "default" # default / never / always +tensor_parallel_degree = 1 +enable_async_tensor_parallel = false +pipeline_parallel_degree = 1 +pipeline_parallel_schedule = "1F1B" +context_parallel_degree = 1 +expert_parallel_degree = 1 +expert_tensor_parallel_degree = 1 + +[checkpoint] +enable = false +folder = "checkpoint" +interval = 10 +last_save_model_only = false +export_dtype = "float32" +async_mode = "disabled" # ["disabled", "async", "async_with_pinned_mem"] + +[activation_checkpoint] +mode = "selective" # ["none", "selective", "full"] +selective_ac_option = '2' # 'int' = ac every positive int layer or 'op', ac based on ops policy + +[compile] +enable=false +components = ["model", "loss"] + +[quantize.linear.float8] +enable_fsdp_float8_all_gather = false +precompute_float8_dynamic_scale_for_fsdp = false +filter_fqns = ["output"] + +[validation] +enable = false +dataset = "c4_validation" +freq = 5 +steps = 10 diff --git a/torchtitan/experiments/transformers_backend/infra/pipeline_parallel.py b/torchtitan/experiments/transformers_backend/infra/pipeline.py similarity index 100% rename from torchtitan/experiments/transformers_backend/infra/pipeline_parallel.py rename to torchtitan/experiments/transformers_backend/infra/pipeline.py diff --git a/torchtitan/experiments/transformers_backend/tests/integration_tests.py b/torchtitan/experiments/transformers_backend/tests/integration_tests.py index f8a5c4e7e3..35d09d6a94 100644 --- a/torchtitan/experiments/transformers_backend/tests/integration_tests.py +++ b/torchtitan/experiments/transformers_backend/tests/integration_tests.py @@ -21,6 +21,7 @@ def build_transformers_backend_test_list() -> list[OverrideDefinitions]: OverrideDefinitions( [ [ + "--model.name transformers_backend", "--job.custom_config_module=torchtitan.experiments.transformers_backend.job_config", "--hf_transformers.model Qwen/Qwen2.5-7B", "--parallelism.data_parallel_shard_degree 2",
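To close the series, here is a minimal sketch of how the `[hf_transformers]` table introduced in PATCH 124 reaches the trainer: the extra `HFTransformers`/`JobConfig` dataclasses are registered through `--job.custom_config_module=torchtitan.experiments.transformers_backend.job_config`, and the table's `model` field selects the HuggingFace checkpoint. The sketch only mirrors those dataclasses with the standard library (Python 3.11+ `tomllib`); it does not go through torchtitan's actual config manager, and the config path is illustrative.

```python
import tomllib
from dataclasses import dataclass, field


@dataclass
class HFTransformers:
    model: str = ""  # HuggingFace model ID, e.g. "Qwen/Qwen3-4B-Instruct-2507"


@dataclass
class JobConfig:
    hf_transformers: HFTransformers = field(default_factory=HFTransformers)


# Illustrative path; in the repo the file lives under
# torchtitan/experiments/transformers_backend/configs/debug_model.toml
with open("torchtitan/experiments/transformers_backend/configs/debug_model.toml", "rb") as f:
    raw = tomllib.load(f)

cfg = JobConfig(hf_transformers=HFTransformers(**raw.get("hf_transformers", {})))
print(cfg.hf_transformers.model)  # -> "Qwen/Qwen3-4B-Instruct-2507"
```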