[Profiler] Include more uncategorized events in memory profile (#101200)
Summary: This PR adds handling for allocations and frees that cannot be proven to belong to Tensors (and which are therefore never assigned an ID). These events are still important for judging overall memory utilization.

Test Plan: CI and unit tests.

Differential Revision: D45458885

Pulled By: aaronenyeshi

Pull Request resolved: #101200
Approved by: https://github.com/anupambhatnagar, https://github.com/davidberard98
aaronenyeshi authored and pytorchmergebot committed Jun 8, 2023
1 parent 675f259 commit 2a4fa25
Showing 2 changed files with 89 additions and 11 deletions.
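A minimal sketch of how the change surfaces, modeled on the new unit test below. Note that `_memory_profile()` and its `timeline` property are private interfaces, so this is illustrative rather than a stable API:

```python
import torch
from torch.profiler import profile

with profile() as prof:
    # `y` is allocated and freed inside the profiled region but never used
    # by an op, so the profiler cannot prove the storage belongs to a Tensor
    # and assigns it no ID.
    y = torch.empty((64,))
    del y

# Each timeline entry is (time_ns, Action, (key, version), size_in_bytes).
# After this patch, the events for `y` appear with a plain `Key` instead of
# being dropped from the timeline.
for _, action, _, size in prof._memory_profile().timeline:
    print(action, size)
```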
48 changes: 48 additions & 0 deletions test/profiler/test_memory_profiler.py
@@ -1552,6 +1552,54 @@ def id_for_testing(key):
             destroy GRADIENT 17(v0) 2 kB
             destroy GRADIENT 13(v0) 1024 kB""")
 
+    def test_memory_timeline_no_id(self) -> None:
+        # On CPU the default behavior is to simply forward to malloc. That
+        # means that when we free `x` the allocator doesn't actually know how
+        # many bytes are in the allocation, and thus there's no point to
+        # calling `c10::reportMemoryUsageToProfiler`. So in order to test that
+        # memory profiler processes this case correctly we need to use CUDA
+        # where we do always keep a record.
+        x = torch.ones((1024,), device="cuda" if torch.cuda.is_available() else "cpu")
+
+        with profile() as prof:
+            # We never see `x` used so we don't know the storage is for a
+            # Tensor, but we do still see the free event.
+            del x
+
+            # For empty we see the allocation and free, but not any use.
+            # So this also cannot be identified as a Tensor.
+            y = torch.empty((64,))
+            del y
+
+            z = torch.empty((256,))
+            z.view_as(z)  # Show `z` to the profiler
+            del z
+
+        memory_profile = prof._memory_profile()
+
+        expected = [
+            # x
+            (_memory_profiler.Action.PREEXISTING, 4096),
+            (_memory_profiler.Action.DESTROY, 4096),
+            #
+            # y
+            (_memory_profiler.Action.CREATE, 256),
+            (_memory_profiler.Action.DESTROY, 256),
+            #
+            # z
+            (_memory_profiler.Action.CREATE, 1024),
+            (_memory_profiler.Action.DESTROY, 1024),
+        ]
+
+        # See above.
+        if not torch.cuda.is_available():
+            expected = expected[2:]
+
+        self.assertEqual(
+            [(action, size) for _, action, _, size in memory_profile.timeline],
+            expected,
+        )
+
 
 if __name__ == "__main__":
     run_tests()
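The expected sizes follow from the default float32 dtype at 4 bytes per element: 1024 × 4 = 4096 bytes for `x`, 64 × 4 = 256 bytes for `y`, and 256 × 4 = 1024 bytes for `z`. `x` appears as `PREEXISTING` rather than `CREATE` because it was allocated before profiling started, so only its free event falls inside the profiled region.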
52 changes: 41 additions & 11 deletions torch/profiler/_memory_profiler.py
@@ -30,6 +30,9 @@
 from torch._utils import _element_size
 from torch.profiler import _utils
 
+from typing_extensions import Literal
+
+KeyAndID = Tuple["Key", int]
 TensorAndID = Tuple["TensorKey", int]
 
 log = logging.getLogger(__name__)
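`Literal` is imported for the `live_unknown` map added to `timeline` below: a dict whose values are always `True`, used only for membership tracking (`in` and `pop`) of storages that were allocated but never tied to a Tensor.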
@@ -63,6 +66,11 @@ class Action(enum.Enum):
     DESTROY = enum.auto()
 
 
+@dataclasses.dataclass(eq=True, unsafe_hash=False, frozen=True)
+class Key:
+    device: torch.device
+
+
 @dataclasses.dataclass
 class _Storage:
     """Bundle storage pointer and id.
@@ -85,7 +93,7 @@ def __hash__(self) -> int:
 
 
 @dataclasses.dataclass(eq=True, unsafe_hash=True, frozen=True)
-class TensorKey:
+class TensorKey(Key):
     """Hashable identifier for a storage which has been asigned an ID.
 
     A detailed description of Tensor IDs and why they are needed is given in
@@ -97,7 +105,6 @@ class TensorKey:
 
     id: int
     storage: _Storage
-    device: torch.device
 
     def __repr__(self) -> str:
         return f"id={self.id}: {repr(self.storage):<24} ({self.device})"
@@ -117,7 +124,7 @@ def _make(
             and storage_ptr is not None
             and allocation_id is not None
         ):
-            return TensorKey(tensor_id, _Storage(storage_ptr, allocation_id), device)
+            return TensorKey(device, tensor_id, _Storage(storage_ptr, allocation_id))
        return None

    @classmethod
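Moving `device` into the new `Key` base class is why the `TensorKey` constructor's argument order changes above: dataclass inheritance places base-class fields before subclass fields in the generated `__init__`. A standalone sketch of the pattern (the names here are illustrative, not from the patch):

```python
import dataclasses

@dataclasses.dataclass(frozen=True)
class Base:
    device: str

@dataclasses.dataclass(frozen=True)
class Derived(Base):
    ident: int

# Base-class fields come first in the generated __init__, so the call is
# Derived(<base fields>, <subclass fields>).
d = Derived("cpu", 42)
assert (d.device, d.ident) == ("cpu", 42)
```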
@@ -633,7 +640,9 @@ def setdefault_by_version(
     ) -> None:
         self._values[key.id].by_version.setdefault((key, version), category)
 
-    def get(self, key: TensorKey, version: int) -> Optional[Category]:
+    def get(self, key: Key, version: int) -> Optional[Category]:
+        if isinstance(key, Key) and not isinstance(key, TensorKey):
+            return None
         element = self._values[key.id]
         return (
             element.by_id
@@ -658,15 +667,34 @@ def __init__(self, result: _ProfilerResult) -> None:
         self._set_autograd_detail()
 
     @property
-    def timeline(self) -> Tuple[Tuple[int, Action, TensorAndID, int], ...]:
+    def timeline(self) -> Tuple[Tuple[int, Action, KeyAndID, int], ...]:
+        output: List[Tuple[int, Action, KeyAndID, int]] = []
         allocation_times: Dict[Tuple[TensorKey, bool], int] = {}
+        live_unknown: Dict[Tuple[int, torch.device], Literal[True]] = {}
         for event in self._op_tree.dfs():
             if event.typed[0] == _EventType.Allocation:
                 alloc_fields = event.typed[1]
-                key = TensorKey.from_allocation(alloc_fields)
-                if key is not None:
-                    is_allocation = alloc_fields.alloc_size > 0
-                    allocation_times[(key, is_allocation)] = event.start_time_ns
+                alloc_size = alloc_fields.alloc_size
+                is_allocation = alloc_size > 0
+                t = event.start_time_ns
+
+                tkey = TensorKey.from_allocation(alloc_fields)
+                if tkey is not None:
+                    allocation_times[(tkey, is_allocation)] = t
+
+                else:
+                    key = Key(alloc_fields.device)
+                    ptr_and_device = (alloc_fields.ptr, key.device)
+                    if is_allocation:
+                        if ptr_and_device in live_unknown:
+                            output.append((t, Action.INCREMENT_VERSION, (key, 0), alloc_size))
+                        else:
+                            live_unknown[ptr_and_device] = True
+                            output.append((t, Action.CREATE, (key, 0), alloc_size))
+                    else:
+                        output.append((t, Action.DESTROY, (key, 0), -alloc_size))
+                        if not live_unknown.pop(ptr_and_device, False):
+                            output.append((-1, Action.PREEXISTING, (key, 0), -alloc_size))
 
         snapshot = self._category_snapshot()
         last_version = dict(sorted(snapshot.keys()))
@@ -693,12 +721,14 @@ def timeline(self) -> Tuple[Tuple[int, Action, TensorAndID, int], ...]:
                 t = allocation_times[(key, False)]
                 events.append((t, Action.DESTROY, (key, last_version[key])))
 
-        events.sort(key=lambda x: (x[0], x[1].value))
-        return tuple(
+        output.extend(
             (time, action, (key, version), self._size_map[key])
             for time, action, (key, version) in events
         )
 
+        output.sort(key=lambda x: (x[0], x[1].value))
+        return tuple(output)
 
     def _is_gradient(self, *args, **kwargs) -> bool:
         return self._categories.get(*args, **kwargs) == Category.GRADIENT
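The bookkeeping for unidentified storages can be read in isolation. A simplified sketch of the same matching logic, assuming synthetic `(time, ptr, signed_size)` event tuples rather than the profiler's real event types:

```python
from enum import Enum, auto
from typing import Dict, List, Tuple

class Action(Enum):
    PREEXISTING = auto()
    CREATE = auto()
    INCREMENT_VERSION = auto()
    DESTROY = auto()

def classify(raw_events: List[Tuple[int, int, int]]) -> List[Tuple[int, Action, int]]:
    """Positive sizes are allocations; negative sizes are frees."""
    out: List[Tuple[int, Action, int]] = []
    live: Dict[int, bool] = {}  # ptr -> True while an unidentified block is live
    for t, ptr, size in raw_events:
        if size > 0:
            # An allocation at an already-live pointer is treated as a new version.
            action = Action.INCREMENT_VERSION if ptr in live else Action.CREATE
            live[ptr] = True
            out.append((t, action, size))
        else:
            out.append((t, Action.DESTROY, -size))
            # A free with no matching allocation means the block predates
            # profiling; report it at time -1 so it sorts to the front.
            if not live.pop(ptr, False):
                out.append((-1, Action.PREEXISTING, -size))
    out.sort(key=lambda x: (x[0], x[1].value))
    return out

# A bare free, like `x` in the new test: PREEXISTING (t=-1), then DESTROY (t=10).
print(classify([(10, 0xA0, -4096)]))
```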
