From 52667b3307783839152a190324e0089e3239e8da Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Wed, 24 Sep 2025 23:42:40 -0700 Subject: [PATCH 1/2] [CI] Disable Async TP CI Async TP-related CI tests started failing on Sep 22, 2025. However, even after rolling back the nightly PyTorch to 0919, the tests still failed. ``` python -m pip install --force-reinstall torch==2.10.0.dev20250917+cu126 --index-url https://download.pytorch.org/whl/nightly/cu126 ``` This is not an async TP issue but a symmetric memory issue. This simple line alone can cause issues on the CI machine/docker. ``` symm_mem = get_symm_mem_workspace(torch.distributed.group.WORLD.group_name, min_size=1024*1024*64) ``` --- tests/integration_tests/__init__.py | 1 + tests/integration_tests/h100.py | 2 ++ tests/integration_tests/run_tests.py | 3 +++ 3 files changed, 6 insertions(+) diff --git a/tests/integration_tests/__init__.py b/tests/integration_tests/__init__.py index ae7c32cdeb..a9f9d2afeb 100644 --- a/tests/integration_tests/__init__.py +++ b/tests/integration_tests/__init__.py @@ -22,6 +22,7 @@ class OverrideDefinitions: test_descr: str = "default" test_name: str = "default" ngpu: int = 4 + disabled: bool = False def __repr__(self): return self.test_descr diff --git a/tests/integration_tests/h100.py b/tests/integration_tests/h100.py index ae1fb5b597..c31dc9af47 100755 --- a/tests/integration_tests/h100.py +++ b/tests/integration_tests/h100.py @@ -29,6 +29,7 @@ def build_h100_tests_list() -> list[OverrideDefinitions]: ], "2D async TP compile", "2d_asynctp_compile", + disabled=True, ), OverrideDefinitions( [ @@ -57,6 +58,7 @@ def build_h100_tests_list() -> list[OverrideDefinitions]: "FSDP+async TP+PP+torch.compile+Float8", "fsdp+tp+cp+compile+float8", ngpu=8, + disabled=True, ), OverrideDefinitions( [ diff --git a/tests/integration_tests/run_tests.py b/tests/integration_tests/run_tests.py index dff179e4a5..a64c69eb61 100644 --- a/tests/integration_tests/run_tests.py +++ b/tests/integration_tests/run_tests.py @@ -84,6 
+84,9 @@ def run_tests(args, test_list: list[OverrideDefinitions]): if args.test_name != "all" and test_flavor.test_name != args.test_name: continue + if test_flavor.disabled: + continue + # Check if we have enough GPUs if args.ngpu < test_flavor.ngpu: logger.info( From eccef5dd0f58e378e9c2a4c3396c5f47314ec731 Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Thu, 25 Sep 2025 10:38:12 -0700 Subject: [PATCH 2/2] Allow an easy way to disable tests. --- tests/integration_tests/features.py | 44 ++++++++++--------- tests/integration_tests/h100.py | 2 + .../simple_fsdp/tests/integration_tests.py | 25 ++++++----- 3 files changed, 38 insertions(+), 33 deletions(-) diff --git a/tests/integration_tests/features.py b/tests/integration_tests/features.py index a0aa1903a6..31c15017d1 100755 --- a/tests/integration_tests/features.py +++ b/tests/integration_tests/features.py @@ -65,17 +65,18 @@ def build_features_test_list() -> list[OverrideDefinitions]: "2d_compile", ), # TODO: re-enable this test once the async TP CI issue is fixed - # OverrideDefinitions( - # [ - # [ - # "--compile.enable", - # "--parallelism.tensor_parallel_degree 2", - # "--parallelism.enable_async_tensor_parallel", - # ], - # ], - # "2D async TP compile", - # "2d_asynctp_compile", - # ), + OverrideDefinitions( + [ + [ + "--compile.enable", + "--parallelism.tensor_parallel_degree 2", + "--parallelism.enable_async_tensor_parallel", + ], + ], + "2D async TP compile", + "2d_asynctp_compile", + disabled=True, + ), OverrideDefinitions( [ [ @@ -432,16 +433,17 @@ def build_features_test_list() -> list[OverrideDefinitions]: "cpu_offload+opt_in_bwd+TP+DP+CP", ngpu=8, ), - # OverrideDefinitions( - # [ - # [ - # "--memory_estimation.enable", - # ] - # ], - # "FSDP2 Memory Tracking and Estimation", - # "fsdp2_memory_estimation", - # ngpu=2, - # ), + OverrideDefinitions( + [ + [ + "--memory_estimation.enable", + ] + ], + "FSDP2 Memory Tracking and Estimation", + "fsdp2_memory_estimation", + ngpu=2, + disabled=True, + 
), OverrideDefinitions( [ [ diff --git a/tests/integration_tests/h100.py b/tests/integration_tests/h100.py index c31dc9af47..53af87a2ea 100755 --- a/tests/integration_tests/h100.py +++ b/tests/integration_tests/h100.py @@ -19,6 +19,7 @@ def build_h100_tests_list() -> list[OverrideDefinitions]: same root config file. """ integration_tests_flavors = [ + # TODO: re-enable this test once the async TP issue is fixed OverrideDefinitions( [ [ @@ -42,6 +43,7 @@ def build_h100_tests_list() -> list[OverrideDefinitions]: "Float8 test", "float8", ), + # TODO: re-enable this test once the async TP issue is fixed OverrideDefinitions( [ [ diff --git a/torchtitan/experiments/simple_fsdp/tests/integration_tests.py b/torchtitan/experiments/simple_fsdp/tests/integration_tests.py index c680f84a73..33ccbd8903 100755 --- a/torchtitan/experiments/simple_fsdp/tests/integration_tests.py +++ b/torchtitan/experiments/simple_fsdp/tests/integration_tests.py @@ -63,18 +63,19 @@ def build_simple_fsdp_test_list() -> list[OverrideDefinitions]: "2d", ), # TODO: re-enable this test once the async TP issue is fixed - # OverrideDefinitions( - # [ - # [ - # "--model.name simple_fsdp", - # "--compile.enable", - # "--parallelism.tensor_parallel_degree 2", - # "--parallelism.enable_async_tensor_parallel", - # ], - # ], - # "2D async TP", - # "2d_asynctp", - # ), + OverrideDefinitions( + [ + [ + "--model.name simple_fsdp", + "--compile.enable", + "--parallelism.tensor_parallel_degree 2", + "--parallelism.enable_async_tensor_parallel", + ], + ], + "2D async TP", + "2d_asynctp", + disabled=True, + ), OverrideDefinitions( [ [