[state_dict] Add cpu_only and ranks_only support for _gather_state_dict #112836

Closed. Wants to merge 10 commits.
25 changes: 24 additions & 1 deletion test/distributed/checkpoint/test_state_dict_utils.py
@@ -18,7 +18,7 @@
 class TestStateDictUtils(DTensorTestBase):
     @property
     def world_size(self):
-        return 2
+        return min(4, torch.cuda.device_count())
 
     @with_comms
     @skip_if_lt_x_gpu(2)
@@ -35,6 +35,29 @@ def test_gather_state_dict_dtensor(self):
             dist_tensor.to_local(), gather_dim=0, group=(device_mesh, 0)
         )
         self.assertEqual(expected_gathered_dtensor, gathered_state_dict["dtensor"])
+        self.assertEqual(gathered_state_dict["dtensor"].is_cuda, True)
+
+    @with_comms
+    @skip_if_lt_x_gpu(4)
+    def test_cpu_and_ranks_only(self):
+        device_mesh = self.build_device_mesh()
+        shard_spec = [Shard(0)]
+        torch.random.manual_seed(dist.get_rank())
+        local_tensor = torch.randn(3, 3, 3)
+        dist_tensor = DTensor.from_local(local_tensor, device_mesh, shard_spec)
+        state_dict = {"dtensor": dist_tensor}
+
+        gathered_state_dict = _gather_state_dict(
+            state_dict, cpu_offload=True, ranks_only=(0, 2)
+        )
+        expected_gathered_dtensor = funcol.all_gather_tensor(
+            dist_tensor.to_local(), gather_dim=0, group=(device_mesh, 0)
+        )
+        if dist.get_rank() in (0, 2):
+            self.assertEqual(expected_gathered_dtensor, gathered_state_dict["dtensor"])
+            self.assertEqual(gathered_state_dict["dtensor"].is_cuda, False)
+        else:
+            self.assertEqual(gathered_state_dict, {})
 
 
 if __name__ == "__main__":
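For readers skimming the diff, a hedged single-process sketch of what the new test expects, assuming the full 4-rank configuration; torch.cat only mimics the all-gather and is not how _gather_state_dict is implemented:

# Single-process sketch of the shapes and ranks_only filtering exercised by
# test_cpu_and_ranks_only, assuming 4 ranks each holding a (3, 3, 3) shard.
import torch

world_size = 4
ranks_only = (0, 2)

# Gathering the per-rank (3, 3, 3) shards along dim 0 yields (12, 3, 3).
local_shards = [torch.randn(3, 3, 3) for _ in range(world_size)]
gathered = torch.cat(local_shards, dim=0)  # stand-in for the real all-gather
assert gathered.shape == (12, 3, 3)

# Only ranks listed in ranks_only keep the gathered state_dict; the rest get {}.
per_rank_result = {
    rank: ({"dtensor": gathered} if rank in ranks_only else {})
    for rank in range(world_size)
}
assert per_rank_result[0]["dtensor"].shape == (12, 3, 3)
assert per_rank_result[1] == {} and per_rank_result[3] == {}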
35 changes: 30 additions & 5 deletions torch/distributed/checkpoint/_state_dict_utils.py
@@ -1,5 +1,5 @@
 import math
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Optional, Tuple
 
 import torch
 import torch.distributed as dist
@@ -49,13 +49,34 @@ def _all_gather_sharded_tensor(
 
 def _gather_state_dict(
     state_dict: Dict[str, Any],
+    *,
     pg: Optional[dist.ProcessGroup] = None,
     device: Optional[torch.device] = None,
+    cpu_offload: bool = False,
+    ranks_only: Tuple[int, ...] = tuple(),
 ) -> Dict[str, Any]:
     """
-    Given a state_dict, this API gathers all the ShardedTensors or DTensors in the state_dict.
+    Given a state_dict, this API gathers all the ShardedTensors or DTensors in
+    the state_dict.
 
+
+    Args:
+        state_dict (Dict[str, Any]): the target sharded state_dict.
+        pg (Optional[dist.ProcessGroup]): the process group that is used to
+            gather ShardedTensor.
Review comment (Contributor): nit: gather DTensor or ShardedTensor.

+        device: (Optional[torch.device]): the device that is used to
+            perform allgather for ShardedTensor.

Review comment (Contributor): nit: gather DTensor or ShardedTensor.

Reply (PR author): ShardedTensor only. DTensor uses DeviceMesh to gather the tensors.

+        cpu_offload (bool): whether to offload the tensors to CPU memory. The
+            default value is False.
+        ranks_only: (Tuple[int, ...]): if this tuple is empty, all ranks will
+            have the same state_dicts. Otherwise, only ranks that are in
+            ``ranks_only`` have the same state_dicts; other ranks will get
+            empty state_dicts.
 
     Returns:
         The gathered state dictionary.
     """
     new_state_dict = {}
+    cpu_device = torch.device("cpu")
     for key, value in state_dict.items():
         if isinstance(value, ShardedTensor):
             # ShardedTensor does not seem to record the original device type.
@@ -65,7 +86,7 @@ def _gather_state_dict(
             local_shard_device = (
                 value.local_shards()[0].tensor.device
                 if value.local_shards()
-                else torch.device("cpu")
+                else cpu_device
             )
             if output_tensor.device != local_shard_device:
                 value = output_tensor.to(local_shard_device)
@@ -86,7 +107,11 @@
             )
             value = value.to_local()
         elif isinstance(value, dict):
-            value = _gather_state_dict(value, pg, device)
+            value = _gather_state_dict(value, pg=pg, device=device)
+
+        if isinstance(value, torch.Tensor) and cpu_offload:
+            value = value.to(cpu_device)
 
-        new_state_dict[key] = value
+        if not cpu_offload or len(ranks_only) == 0 or dist.get_rank(pg) in ranks_only:

Review comment (wz337, Contributor, Nov 9, 2023): Maybe the condition is a bit complicated and I can't think thru lol, but is dist.get_rank(pg) in ranks_only alone not sufficient?

Reply (PR author): ranks_only can be empty (the default value). But it seems that we should not restrict this function to be cpu_offload only.

Reply (wz337): I see. Makes sense!

+            new_state_dict[key] = value
     return new_state_dict
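A hedged usage sketch of the extended API, based on the docstring and the test above rather than taken from the PR itself. It assumes a host with at least two GPUs, a torchrun launch, and the module path shown in this diff (torch/distributed/checkpoint/_state_dict_utils.py), which may move in later releases.

# example_gather.py -- run with: torchrun --nproc_per_node=2 example_gather.py
import torch
import torch.distributed as dist
from torch.distributed._tensor import DeviceMesh, DTensor, Shard
from torch.distributed.checkpoint._state_dict_utils import _gather_state_dict

dist.init_process_group("nccl")
rank = dist.get_rank()
torch.cuda.set_device(rank)

# Build a DTensor sharded along dim 0 across all ranks.
mesh = DeviceMesh("cuda", list(range(dist.get_world_size())))
local = torch.randn(4, 8, device="cuda")
state_dict = {
    "weight": DTensor.from_local(local, mesh, [Shard(0)]),
    "step": torch.tensor(10, device="cuda"),  # plain tensors are handled too
}

# New keyword-only arguments from this PR:
#   cpu_offload=True moves every gathered tensor to CPU memory;
#   ranks_only=(0,) keeps the gathered state_dict on rank 0 only,
#   while all other ranks receive an empty dict.
gathered = _gather_state_dict(state_dict, cpu_offload=True, ranks_only=(0,))

if rank == 0:
    print(gathered["weight"].shape, gathered["weight"].device)  # (8, 8) on cpu with 2 ranks
else:
    print(gathered)  # {}

dist.destroy_process_group()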