From 0c3f615beec22adf625e5ca90244539a03827e40 Mon Sep 17 00:00:00 2001
From: Deepak Seshadri
Date: Thu, 17 Jul 2025 17:48:08 -0700
Subject: [PATCH] Name threads in caffe2/torch/distributed/checkpoint AsyncCheckpointExecutor (#158612)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158612

Test Plan:
Built `training_platform.worker:c44710f0f91f8b4ed58f517864a7ba8b`, and ran [`f765449601-TrainingApplication_S3BTR`](https://www.internalfb.com/mlhub/pipelines/runs/mast/f765449601-TrainingApplication_S3BTR?version=0&tab=summary&env=PRODUCTION).

[SBDive profile](https://www.internalfb.com/intern/sbdive/?id=f765449601-TrainingApplication_S3BTR-84eceeca-8b03-410b-9e39-3840ec9cf185) of the job shows threads named `AsyncCheckpointExecutor`

All of rank0's Python threads
{F1980370763}

All `AsyncCheckpointExecutor` across ranks
{F1980371593}

Rollback Plan:

Reviewed By: d4l3k

Differential Revision: D78493333
---
 torch/distributed/checkpoint/_async_thread_executor.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/torch/distributed/checkpoint/_async_thread_executor.py b/torch/distributed/checkpoint/_async_thread_executor.py
index 1038c177529d..3fad17b2dea9 100644
--- a/torch/distributed/checkpoint/_async_thread_executor.py
+++ b/torch/distributed/checkpoint/_async_thread_executor.py
@@ -37,7 +37,9 @@ def save_wrapper(
 
 class _ThreadBasedAsyncCheckpointExecutor(_AsyncCheckpointExecutor):
     def __init__(self) -> None:
-        self._executor = ThreadPoolExecutor(max_workers=1)
+        self._executor = ThreadPoolExecutor(
+            max_workers=1, thread_name_prefix="AsyncCheckpointExecutor"
+        )
 
     def execute_save(
         self,