[FSDP] Fix for optim state dict #102901

Closed · wants to merge 3 commits · Changes from 1 commit
6 changes: 5 additions & 1 deletion torch/distributed/fsdp/_optim_utils.py
@@ -1349,6 +1349,7 @@
state = (
{} if param_key is None else optim_state_dict["state"][param_key]
)
print(f"RV: calling _Gather_orig_param_state")
Contributor:
#nit: Remove print?

Contributor:
ditto on the others before landing :)

unflat_state = [
_gather_orig_param_state(
fsdp_param_info,
@@ -1497,7 +1498,9 @@
object_list: List[StateInfo] = [
processed_state for _ in range(fsdp_state.world_size)
]
dist.all_gather_object(object_list, processed_state)
assert fsdp_state.world_size == fsdp_state.process_group.size()

Check failure on line 1501 in torch/distributed/fsdp/_optim_utils.py (GitHub Actions / lintrunner / linux-job):
MYPY [union-attr]: Item "None" of "Optional[ProcessGroup]" has no attribute "size"
Contributor:
nit: Add an assert message?
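A possible follow-up sketch (not part of this PR's diff) that would address both this nit and the mypy [union-attr] failure above: bind the Optional[ProcessGroup] to a local, assert on it with a message, and reuse the narrowed value.

# Sketch only: narrowing fsdp_state.process_group lets mypy accept .size(),
# and the assert carries a message as suggested in the review.
pg = fsdp_state.process_group
assert pg is not None, "FSDP state is missing its process group"
assert fsdp_state.world_size == pg.size(), (
    f"world_size ({fsdp_state.world_size}) does not match the "
    f"process group size ({pg.size()})"
)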

print(f"RV: {fsdp_state.process_group.size()} vs {torch.distributed.distributed_c10d._get_default_group().size()}")

Check failure on line 1502 in torch/distributed/fsdp/_optim_utils.py (GitHub Actions / lintrunner / linux-job):
MYPY [union-attr]: Item "None" of "Optional[ProcessGroup]" has no attribute "size"
dist.all_gather_object(object_list, processed_state, group=fsdp_state.process_group)
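For context, a minimal standalone sketch of why the output list must be sized by the same group that is passed to all_gather_object (the call fills one slot per rank of that group). This assumes torch.distributed has already been initialized via init_process_group; gather_from_group is a hypothetical helper, not part of this PR.

import torch.distributed as dist

def gather_from_group(obj, group=None):
    # Hypothetical helper: size the output list by the target group's
    # world size, then gather. out[i] ends up holding the object
    # contributed by rank i of `group` (the default group when None).
    world_size = dist.get_world_size(group=group)
    out = [None for _ in range(world_size)]
    dist.all_gather_object(out, obj, group=group)
    return out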

# Convert the gathered, pre-processed state of each rank to the original one.
gathered_state: Dict[str, Any] = {}
@@ -1592,6 +1595,7 @@
):
return optim_state

print(f"RV: gathering state")
gathered_state = _all_gather_optim_state(fsdp_state, optim_state)

# Unflatten state values.