
Commit bce274d

pianpwketaf authored and committed
[DebugMode] store stringify args by default (#166347)
DebugMode currently stores dispatch call args & kwargs, which includes all intermediate tensors and more. This quickly OOMed on GPU when trying to debug some torchtitan / llama 8b models. This change defaults to storing the stringified version instead, and adds a flag `DebugMode(store_original_args=True)` for users who want to store the original args as-is (and for BC).

Pull Request resolved: #166347
Approved by: https://github.com/yushangdi
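Below is a minimal usage sketch of the new default versus the opt-in flag, based on the tests in this commit. The import path (torch.utils._debug_mode), the plain-tensor matmul, and the printed checks are illustrative assumptions, not part of this PR.

    # Sketch only: illustrates the default (stringified) storage vs. store_original_args=True.
    import torch
    from torch.utils._debug_mode import DebugMode  # assumed import path for this module

    x = torch.randn(8, 8)
    y = torch.randn(8, 8)

    # Default: recorded calls keep only stringified args/kwargs, so the log does not
    # hold references to intermediate tensors.
    with DebugMode() as debug_mode:
        torch.mm(y, x)
    call = debug_mode.operators[0]
    print(call.args_str)            # stringified args, e.g. something like "t: f32[8, 8], t: f32[8, 8]"
    print(hasattr(call, "args"))    # False: original args were dropped after stringifying

    # Opt-in (and BC): keep the original args objects on each recorded call.
    with DebugMode(store_original_args=True) as debug_mode:
        torch.mm(y, x)
    print(debug_mode.operators[0].args[0] is y)   # True: original tensor kept as-is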
1 parent 55af9f5 commit bce274d

File tree

2 files changed: +88 -16 lines changed


test/distributed/tensor/debug/test_debug_mode.py
Lines changed: 8 additions & 0 deletions

@@ -64,6 +64,10 @@ def test_debug_mode_mm(self):
         self.assertTrue(isinstance(debug_mode.operators[2], _RedistributeCall))
         self.assertEqual(next(iter(debug_mode.operators[1])), torch.ops.aten.mm.default)
 
+        # check stringification
+        self.assertTrue(hasattr(debug_mode.operators[0], "args_str"))
+        self.assertFalse(hasattr(debug_mode.operators[0], "args"))
+
     def test_debug_string_inside_context(self):
         mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
 
@@ -267,6 +271,7 @@ def test_tensor_attributes(self):
             record_torchfunction=True,
             record_faketensor=True,
             record_tensor_attributes=["a1", "a2"],
+            store_original_args=True,
         ) as debug_mode:
             torch.matmul(y, x)
 
@@ -279,6 +284,9 @@ def test_tensor_attributes(self):
 aten::_unsafe_view(t: f32[64, 8], [8, 8, 8])""",
         )
 
+        self.assertTrue(hasattr(debug_mode.operators[0], "args"))
+        self.assertEqual(id(debug_mode.operators[0].args[0]), id(y))
+
     @parametrize("has_inner_mode", [True, False])
     @parametrize("has_outer_mode", [True, False])
     def test_nested_debug_mode(self, has_inner_mode, has_outer_mode):

torch/utils/_debug_mode.py
Lines changed: 80 additions & 16 deletions

@@ -87,6 +87,14 @@ class _DebugCall:
     def __init__(self, call_depth: int):
         self.call_depth = call_depth
 
+    def stringify_args(self, attributes: list[str]) -> None:
+        """
+        To reduce memory consumption, this method stringifies args/kwargs, stores the result, and deletes original args/kwargs.
+        """
+        raise NotImplementedError(
+            "Subclasses must implement stringify_args(), even if no-op"
+        )
+
     def render(self, attributes: list[str]) -> str:
         raise NotImplementedError("Subclasses must implement string render()")
 
@@ -103,15 +111,35 @@ def __init__(self, op, args: tuple, kwargs: dict, call_depth: int):
         self.args = args
         self.kwargs = kwargs
 
-    def render(self, attributes: list[str]) -> str:
-        args_str = ", ".join(_arg_to_str(arg, attributes) for arg in self.args)
+        self.args_str: Optional[str] = None
+        self.kwargs_str: Optional[str] = None
 
+    def stringify_args(self, attributes: list[str]) -> None:
+        self.args_str = ", ".join(_arg_to_str(arg, attributes) for arg in self.args)
         if self.kwargs:
-            kwargs_str = ", " + ", ".join(
+            self.kwargs_str = ", " + ", ".join(
                 f"{k}={_arg_to_str(v, attributes)}" for k, v in self.kwargs.items()
             )
         else:
-            kwargs_str = ""
+            self.kwargs_str = ""
+        del self.args
+        del self.kwargs
+
+    def render(self, attributes: list[str]) -> str:
+        if self.args_str is not None:
+            args_str = self.args_str
+        else:
+            args_str = ", ".join(_arg_to_str(arg, attributes) for arg in self.args)
+
+        if self.kwargs_str is not None:
+            kwargs_str = self.kwargs_str
+        else:
+            if self.kwargs:
+                kwargs_str = ", " + ", ".join(
+                    f"{k}={_arg_to_str(v, attributes)}" for k, v in self.kwargs.items()
+                )
+            else:
+                kwargs_str = ""
 
         if isinstance(self.op, torch._ops.OpOverload):
             op_name = self.op.__qualname__
@@ -124,7 +152,10 @@ def render(self, attributes: list[str]) -> str:
 
     def __iter__(self):
         # for BC; tuple(self) returns (op, args, kwargs, call_depth)
-        yield from [self.op, self.args, self.kwargs, self.call_depth]
+        if self.args_str is not None:
+            yield from [self.op, self.args_str, self.kwargs_str, self.call_depth]
+        else:
+            yield from [self.op, self.args, self.kwargs, self.call_depth]
 
 
 class _RedistributeCall(_DebugCall):
@@ -139,8 +170,18 @@ def __init__(
         self.dst_placement = dst_placement
         self.transform_info_str = transform_info_str
 
+        self.arg_str: Optional[str] = None
+
+    def stringify_args(self, attributes: list[str]) -> None:
+        self.arg_str = f"{_arg_to_str(self.arg, attributes)}"
+        del self.arg
+
     def render(self, attributes: list[str]) -> str:
-        arg_str = f"{_arg_to_str(self.arg, attributes)}"
+        if self.arg_str is not None:
+            arg_str = self.arg_str
+        else:
+            arg_str = f"{_arg_to_str(self.arg, attributes)}"
+
         if self.transform_info_str is not None:  # prioritize over src/dst placements
             placement_str = f"trace: {self.transform_info_str}"
         else:
@@ -151,11 +192,16 @@ def render(self, attributes: list[str]) -> str:
 
     def __iter__(self):
         # for BC; tuple(self) returns (op, placement info, kwargs, call_depth)
+        if self.arg_str is not None:
+            arg = self.arg_str
+        else:
+            arg = self.arg
+
         yield REDISTRIBUTE_FUNC
         if self.transform_info_str:
-            yield [self.arg, self.transform_info_str]
+            yield [arg, self.transform_info_str]
         else:
-            yield [self.arg, self.src_placement, self.dst_placement]
+            yield [arg, self.src_placement, self.dst_placement]
         yield {}
         yield self.call_depth
 
@@ -167,6 +213,9 @@ def __init__(self, module_name: str, call_depth: int):
         super().__init__(call_depth)
         self.module_name = module_name
 
+    def stringify_args(self, attributes: list[str]) -> None:
+        pass  # nothing to stringify
+
     def render(self, attributes: list[str]) -> str:
         return f"[nn.Mod] {self.module_name}"
 
@@ -188,22 +237,34 @@ def __init__(
         record_realtensor=True,
         record_tensor_attributes=None,
         record_nn_module=False,
+        store_original_args=False,
     ):
         super().__init__()
         import torch.distributed.tensor  # noqa: F401
 
         self.supports_higher_order_operators = True
+
+        # Pushes DebugMode onto the torchfunction stack, and records __torch_function__ calls as well.
+        # WARNING: currently incompatible with torch.compile due to dynamo guard failures.
         self.record_torchfunction = record_torchfunction
+        # Records __torch_dispatch__ calls on FakeTensors.
         self.record_faketensor = record_faketensor
+        # Records __torch_dispatch__ calls on real tensors.
         self.record_realtensor = record_realtensor
+        # Optional list[str] of tensor attributes, to be annotated in the string dump.
         self.record_tensor_attributes = record_tensor_attributes or []
-
+        # Uses ModTracker to record nn.Module entrances, as _NNModuleCall entries.
+        # This flag currently has no effect on torch.compiled-regions.
         self.record_nn_module = record_nn_module
 
         self.module_tracker: Optional[ModTracker] = None
         if self.record_nn_module:
             self.module_tracker_setup()
 
+        # If True, stores call args/kwargs in logs, without immediately stringifying.
+        # Defaults to False for memory concerns.
+        self.store_original_args = store_original_args
+
         self.operators = []
         self.call_depth = 0
 
@@ -214,11 +275,16 @@ def __init__(
     def ignore_compile_internals(cls):
         return True
 
+    def _record_call(self, call):
+        if not self.store_original_args:
+            call.stringify_args(self.record_tensor_attributes)
+        self.operators.append(call)
+
     def __torch_function__(self, func, types, args=(), kwargs=None):
        if kwargs is None:
            kwargs = {}
 
-        self.operators.append(_OpCall(func, args, kwargs, self.call_depth))
+        self._record_call(_OpCall(func, args, kwargs, self.call_depth))
 
         try:
             self.call_depth += 1
@@ -232,19 +298,17 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
 
         # Record the operation with its call depth
         if torch.distributed.tensor.DTensor in types:
-            self.operators.append(_OpCall(func, args, kwargs, self.call_depth))
+            self._record_call(_OpCall(func, args, kwargs, self.call_depth))
             return NotImplemented
         elif FakeTensor in types or isinstance(
             _get_current_dispatch_mode(), FakeTensorMode
         ):
             if self.record_faketensor:
                 if func != torch.ops.prim.device.default:
-                    self.operators.append(
-                        _OpCall(func, args, kwargs, self.call_depth + 1)
-                    )
+                    self._record_call(_OpCall(func, args, kwargs, self.call_depth + 1))
         elif len(types) == 0:
             if self.record_realtensor:
-                self.operators.append(_OpCall(func, args, kwargs, self.call_depth + 1))
+                self._record_call(_OpCall(func, args, kwargs, self.call_depth + 1))
 
         result = func(*args, **kwargs)
 
@@ -296,7 +360,7 @@ def record_redistribute_calls(
         transform_info_str: Optional[str] = None,
     ):
         try:
-            self.operators.append(
+            self._record_call(
                 _RedistributeCall(
                     arg,
                     src_placement=src_placement,
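For consumers that unpack recorded calls via tuple(...), the __iter__ changes above keep the (op, args, kwargs, call_depth) shape but substitute the cached strings when the originals have been dropped. A small sketch of that backward-compatible behavior; the debug_mode value is assumed to come from a DebugMode context as in the tests above:

    # With default (stringified) storage, `args`/`kwargs` below are the cached
    # args_str/kwargs_str strings; with store_original_args=True they are the
    # original tuple/dict objects, matching the pre-change behavior.
    op, args, kwargs, call_depth = tuple(debug_mode.operators[0])
    print(op, args, kwargs, call_depth)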
