Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions tests/unit_tests/test_set_determinism.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,27 @@ def test_seed_uniqueness_3d_mesh(self, mock_get_rank, mock_get_world_size):
f"Expected {mesh_sizes[0] * mesh_sizes[1]} unique seeds for (dp_shard, dp_replicate) combinations",
)

@patch("torch.distributed.distributed_c10d.get_world_size")
@patch("torch.distributed.distributed_c10d.get_rank")
def test_set_determinism_single_gpu(self, mock_get_rank, mock_get_world_size):
    """set_determinism should run cleanly on a single GPU, where the
    device mesh has no dimensions (mesh_dim_names is None)."""
    # Pretend we are rank 0 in a world of size 1.
    mock_get_world_size.return_value = 1
    mock_get_rank.return_value = 0

    # An "empty" mesh: no named dims and no coordinate for this rank.
    empty_mesh = MagicMock()
    empty_mesh.mesh_dim_names = None
    empty_mesh.get_coordinate.return_value = None

    # Must not raise even though "pp" is requested as a distinct-seed
    # dimension but is absent from the (empty) mesh.
    set_determinism(
        world_mesh=empty_mesh,
        device=self.device,
        debug_config=DebugConfig(seed=42, deterministic=False),
        distinct_seed_mesh_dims=["pp"],
    )


if __name__ == "__main__":
unittest.main()
4 changes: 3 additions & 1 deletion torchtitan/distributed/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,9 @@ def set_determinism(
# and choose a unique seed for each rank on the PP mesh.
# We support multiple distinct dimensions by adding each distinct dimension's local rank to the seed.
distinct_dims_in_mesh = [
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for catching this! I have a n00b question, will world_mesh.mesh_dim_names be empty or empty list: https://github.com/pytorch/torchtitan/blob/main/torchtitan/distributed/parallel_dims.py#L159, if we init_device_mesh with mesh = init_device_mesh(device_type, dims=[], mesh_dim_names=[])

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`world_mesh.mesh_dim_names` is empty — specifically, it is `None` (type `NoneType`)

dim for dim in distinct_seed_mesh_dims if dim in world_mesh.mesh_dim_names
dim
for dim in distinct_seed_mesh_dims
if world_mesh.mesh_dim_names and dim in world_mesh.mesh_dim_names
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@fegin It seems if NGPU=1, world_mesh is not None but mesh_dim_names is None, due to this code https://github.com/pytorch/torchtitan/blob/main/torchtitan/distributed/parallel_dims.py#L154-L156.

Does this sound right to you? I somehow feel we should have default mesh_dim_names, but I can't find a perfect option for it.

I'm OK with this change to unblock.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can land this PR to unblock. My new DeviceMesh PR should address this problem. I will also ensure that the newly added unittest pass in my PR.

]

if c10d.get_world_size() > 1 and distinct_dims_in_mesh:
Expand Down
Loading