nn.Module: use swap_tensors for Tensor subclasses (#122755)

This fixes a bug when casting a module that has DTensor parameters. The old behavior swapped only the `.data` field of the Tensor subclass, which is incorrect for tensor subclasses that may have multiple child tensors. This change uses the `swap_tensors` method to swap all of the tensors, not just the `.data` field.

Test plan:
```
pytest test/distributed/_tensor/test_api.py -k 'test_distribute_module_casting'
python test/distributed/fsdp/test_wrap.py -k test_auto_wrap_smoke_test_cuda_init_mode1_cpu_offload0_use_device_id_True
```

Pull Request resolved: #122755
Approved by: https://github.com/wanchaol, https://github.com/mikaylagawarecki
pytorch · Mar 28, 2024 · e6ee832 · e6ee832
1 parent 3e7fd45
commit e6ee832
Show file tree

Hide file tree

Showing 2 changed files with 45 additions and 2 deletions.
diff --git a/test/distributed/_tensor/test_api.py b/test/distributed/_tensor/test_api.py
@@ -250,6 +250,44 @@ def output_fn(outputs, device_mesh):
         self.assertIsInstance(local_out, torch.Tensor)
         self.assertNotIsInstance(local_out, DTensor)
 
+    @with_comms
+    def test_distribute_module_casting(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+
+        # check DTensor casting
+        dt = DTensor.from_local(torch.rand(10), device_mesh, [Replicate()])
+        dt = dt.to(torch.bfloat16)
+        self.assertEqual(dt.dtype, torch.bfloat16)
+        self.assertEqual(dt._local_tensor.dtype, torch.bfloat16)
+
+        # check distribute_tensor casting
+        dt = distribute_tensor(torch.rand(10), device_mesh, [Replicate()])
+        dt = dt.to(torch.bfloat16)
+        self.assertEqual(dt.dtype, torch.bfloat16)
+        self.assertEqual(dt._local_tensor.dtype, torch.bfloat16)
+
+        # check distribute_module casting
+        model = MyModel(10, 10, device=self.device_type)
+        replica_model = distribute_module(
+            model,
+            device_mesh,
+        )
+        replica_model = replica_model.to(torch.bfloat16)
+        self.assertEqual(replica_model.seq[0].weight.dtype, torch.bfloat16)
+        self.assertEqual(
+            replica_model.seq[0].weight._local_tensor.dtype, torch.bfloat16
+        )
+
+        # check autocast
+        dt = distribute_tensor(torch.rand(10), device_mesh, [Replicate()])
+        replica_model = distribute_module(
+            model,
+            device_mesh,
+        )
+        with torch.autocast(device_type=self.device_type, dtype=torch.bfloat16):
+            output = replica_model(dt)
+        self.assertEqual(output.dtype, torch.bfloat16)
+
     @with_comms
     def test_distribute_module_meta(self):
         # If  the model is too big, the user may first the create entire model on the meta device and then initialize

diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py
@@ -13,6 +13,7 @@
 from typing import Union, Tuple, Any, Callable, Iterator, Set, Optional, overload, TypeVar, Mapping, Dict, List
 from typing_extensions import Self
 from ...utils.hooks import RemovableHandle
+from torch.utils._python_dispatch import is_traceable_wrapper_subclass
 
 __all__ = ['register_module_forward_pre_hook', 'register_module_forward_hook',
            'register_module_full_backward_pre_hook', 'register_module_backward_hook',
@@ -802,8 +803,12 @@ def compute_should_use_set_data(tensor, tensor_applied):
             with torch.no_grad():
                 param_applied = fn(param)
             p_should_use_set_data = compute_should_use_set_data(param, param_applied)
+
+            # subclasses may have multiple child tensors so we need to use swap_tensors
+            p_should_use_swap_tensors = should_use_swap_tensors or is_traceable_wrapper_subclass(param_applied)
+
             param_grad = param.grad
-            if should_use_swap_tensors:
+            if p_should_use_swap_tensors:
                 try:
                     if param_grad is not None:
                         # Accessing param.grad makes its at::Tensor's use_count 2, which will prevent swapping.
@@ -829,7 +834,7 @@ def compute_should_use_set_data(tensor, tensor_applied):
                 with torch.no_grad():
                     grad_applied = fn(param_grad)
                 g_should_use_set_data = compute_should_use_set_data(param_grad, grad_applied)
-                if should_use_swap_tensors:
+                if p_should_use_swap_tensors:
                     grad_applied.requires_grad_(param_grad.requires_grad)
                     try:
                         torch.utils.swap_tensors(param_grad, grad_applied)