From 7b98a3645f8f217867441a5de3721c5fe98e24d6 Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Tue, 23 Sep 2025 13:38:18 -0700 Subject: [PATCH] Allow configuring the flight recorder trace file prefix Add a `save_traces_file_prefix` option to the `Comm` config (default "rank_") and use it when building the flight recorder dump path instead of the hard-coded "rank_" prefix. Also switch the trace file environment variable from TORCH_NCCL_DEBUG_INFO_TEMP_FILE to TORCH_FR_DUMP_TEMP_FILE. --- torchtitan/config/job_config.py | 3 +++ torchtitan/distributed/utils.py | 5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/torchtitan/config/job_config.py b/torchtitan/config/job_config.py index fc8423c7a..0eb06a0d4 100644 --- a/torchtitan/config/job_config.py +++ b/torchtitan/config/job_config.py @@ -645,6 +645,9 @@ class Comm: save_traces_folder: str = "comm_traces" """Flight recorder trace files location""" + save_traces_file_prefix: str = "rank_" + """Flight recorder trace files prefix""" + @dataclass class MemoryEstimation: diff --git a/torchtitan/distributed/utils.py b/torchtitan/distributed/utils.py index 72700fb1a..159b6229d 100644 --- a/torchtitan/distributed/utils.py +++ b/torchtitan/distributed/utils.py @@ -258,7 +258,7 @@ def _get_distributed_backend(enable_cpu_backend): return backend TRACE_BUFFER_SIZE = "TORCH_FR_BUFFER_SIZE" - TRACE_FILE = "TORCH_NCCL_DEBUG_INFO_TEMP_FILE" + TRACE_FILE = "TORCH_FR_DUMP_TEMP_FILE" DUMP_ON_TIMEOUT = "TORCH_NCCL_DUMP_ON_TIMEOUT" ASYNC_ERROR_HANDLING = "TORCH_NCCL_ASYNC_ERROR_HANDLING" SKIP_CLEANUP = "3" @@ -275,8 +275,9 @@ def _get_distributed_backend(enable_cpu_backend): # dump on timeout by default if trace buffer is enabled _warn_overwrite_env(DUMP_ON_TIMEOUT, "1") dump_dir = os.path.join(base_folder, comm_config.save_traces_folder) + prefix = comm_config.save_traces_file_prefix os.makedirs(dump_dir, exist_ok=True) - _warn_overwrite_env(TRACE_FILE, f"{dump_dir}/rank_") + _warn_overwrite_env(TRACE_FILE, f"{dump_dir}/{prefix}") torch.distributed.init_process_group( backend=_get_distributed_backend(enable_cpu_backend),