nn.Module: use swap_tensors for Tensor subclasses (#122755) #123106

Merged
merged 1 commit into from Apr 2, 2024
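
For context, a minimal standalone sketch (not part of this PR) of why wrapper subclasses need torch.utils.swap_tensors rather than .data assignment inside Module._apply: a cast such as .to(torch.bfloat16) on a subclass like DTensor produces a new instance whose state lives in child tensors (e.g. _local_tensor), so only exchanging the whole Python object keeps the parameter consistent.

import torch

# Plain tensor parameter: rebinding .data is enough, the outer object survives.
p = torch.nn.Parameter(torch.rand(10))
p.data = p.data.to(torch.bfloat16)
assert p.dtype == torch.bfloat16

# swap_tensors exchanges the complete state of two tensors in place
# (storage, dtype, Python class, subclass attributes), which is what a
# wrapper subclass needs since its data is held by child tensors.
a = torch.rand(3)                         # float32
b = torch.zeros(3, dtype=torch.bfloat16)  # bfloat16
torch.utils.swap_tensors(a, b)
assert a.dtype == torch.bfloat16 and b.dtype == torch.float32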
38 changes: 38 additions & 0 deletions test/distributed/_tensor/test_api.py
@@ -250,6 +250,44 @@ def output_fn(outputs, device_mesh):
self.assertIsInstance(local_out, torch.Tensor)
self.assertNotIsInstance(local_out, DTensor)

@with_comms
def test_distribute_module_casting(self):
device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))

# check DTensor casting
dt = DTensor.from_local(torch.rand(10), device_mesh, [Replicate()])
dt = dt.to(torch.bfloat16)
self.assertEqual(dt.dtype, torch.bfloat16)
self.assertEqual(dt._local_tensor.dtype, torch.bfloat16)

# check distribute_tensor casting
dt = distribute_tensor(torch.rand(10), device_mesh, [Replicate()])
dt = dt.to(torch.bfloat16)
self.assertEqual(dt.dtype, torch.bfloat16)
self.assertEqual(dt._local_tensor.dtype, torch.bfloat16)

# check distribute_module casting
model = MyModel(10, 10, device=self.device_type)
replica_model = distribute_module(
model,
device_mesh,
)
replica_model = replica_model.to(torch.bfloat16)
self.assertEqual(replica_model.seq[0].weight.dtype, torch.bfloat16)
self.assertEqual(
replica_model.seq[0].weight._local_tensor.dtype, torch.bfloat16
)

# check autocast
dt = distribute_tensor(torch.rand(10), device_mesh, [Replicate()])
replica_model = distribute_module(
model,
device_mesh,
)
with torch.autocast(device_type=self.device_type, dtype=torch.bfloat16):
output = replica_model(dt)
self.assertEqual(output.dtype, torch.bfloat16)

@with_comms
def test_distribute_module_meta(self):
# If the model is too big, the user may first create the entire model on the meta device and then initialize
9 changes: 7 additions & 2 deletions torch/nn/modules/module.py
@@ -13,6 +13,7 @@
from typing import Union, Tuple, Any, Callable, Iterator, Set, Optional, overload, TypeVar, Mapping, Dict, List
from typing_extensions import Self
from ...utils.hooks import RemovableHandle
from torch.utils._python_dispatch import is_traceable_wrapper_subclass

__all__ = ['register_module_forward_pre_hook', 'register_module_forward_hook',
'register_module_full_backward_pre_hook', 'register_module_backward_hook',
@@ -802,8 +803,12 @@ def compute_should_use_set_data(tensor, tensor_applied):
with torch.no_grad():
param_applied = fn(param)
p_should_use_set_data = compute_should_use_set_data(param, param_applied)

# subclasses may have multiple child tensors so we need to use swap_tensors
p_should_use_swap_tensors = should_use_swap_tensors or is_traceable_wrapper_subclass(param_applied)

param_grad = param.grad
if should_use_swap_tensors:
if p_should_use_swap_tensors:
try:
if param_grad is not None:
# Accessing param.grad makes its at::Tensor's use_count 2, which will prevent swapping.
@@ -829,7 +834,7 @@ def compute_should_use_set_data(tensor, tensor_applied):
with torch.no_grad():
grad_applied = fn(param_grad)
g_should_use_set_data = compute_should_use_set_data(param_grad, grad_applied)
if should_use_swap_tensors:
if p_should_use_swap_tensors:
grad_applied.requires_grad_(param_grad.requires_grad)
try:
torch.utils.swap_tensors(param_grad, grad_applied)
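
Usage note (my reading of the change above, with hedged names): the swap decision is now made per parameter, so a traceable wrapper subclass is swapped even when the global swap_module_params_on_conversion flag from torch.__future__ is left at its default, while plain tensors keep the existing behaviour. A rough restatement, assuming should_use_swap_tensors comes from that flag:

import torch
from torch.utils._python_dispatch import is_traceable_wrapper_subclass

def wants_swap(param_applied: torch.Tensor) -> bool:
    # Mirrors the p_should_use_swap_tensors logic in the diff: the global
    # flag forces swapping for everything, and wrapper subclasses are
    # swapped regardless of the flag. (Helper name is mine, not PyTorch's.)
    flag = torch.__future__.get_swap_module_params_on_conversion()
    return flag or is_traceable_wrapper_subclass(param_applied)

print(wants_swap(torch.rand(2)))  # False under default settings -> .data path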