[profiler] Support top-level memory events

Summary: Mark memory events that did not happen within an operator context explicitly in the profiler output. This PR also adds an API to track memory events outside of or partially overlapping with the profiler scope. Test Plan: python test/test_profiler.py -k test_memory_profiler ghstack-source-id: ef780c6b19287f0574dd9fee35ae54f9f0bf83b3 Pull Request resolved: #51421
pytorch · Jan 31, 2021 · 6e8da53 · 6e8da53
1 parent e26fccc
commit 6e8da53
Show file tree

Hide file tree

Showing 8 changed files with 197 additions and 112 deletions.
diff --git a/c10/core/Allocator.cpp b/c10/core/Allocator.cpp
@@ -1,6 +1,7 @@
 #include <c10/core/Allocator.h>
 
 #include <c10/util/ThreadLocalDebugInfo.h>
+#include <atomic>
 
 namespace c10 {
 
@@ -34,7 +35,16 @@ at::Allocator* GetAllocator(const at::DeviceType& t) {
   return alloc;
 }
 
+namespace {
+std::atomic<bool> global_memory_reporting_ {false};
+}
+void enableGlobalMemoryReporting(bool enable) {
+  global_memory_reporting_ = enable;  // honor the argument so callers can switch reporting back off
+}
 bool memoryProfilingEnabled() {
+  if (global_memory_reporting_) {
+    return true;
+  }
   auto* reporter_ptr = static_cast<MemoryReportingInfoBase*>(
       ThreadLocalDebugInfo::get(DebugInfoKind::PROFILER_STATE));
   return reporter_ptr && reporter_ptr->memoryProfilingEnabled();

diff --git a/c10/core/Allocator.h b/c10/core/Allocator.h
@@ -232,6 +232,7 @@ struct C10_API MemoryReportingInfoBase : public c10::DebugInfoBase {
   virtual bool memoryProfilingEnabled() const = 0;
 };
 
+C10_API void enableGlobalMemoryReporting(bool);
 C10_API bool memoryProfilingEnabled();
 C10_API void reportMemoryUsageToProfiler(void* ptr, int64_t alloc_size, Device device);
 

diff --git a/c10/core/CPUAllocator.cpp b/c10/core/CPUAllocator.cpp
@@ -292,6 +292,10 @@ void ProfiledCPUMemoryReporter::Delete(void* ptr) {
       allocated = allocated_;
       nbytes = it->second;
       size_table_.erase(it);
+    } else {
+      LOG(WARNING) << "Memory block of unknown size was allocated before the profiling started, "
+                   << "use 'enable_global_memory_reporting' to track tensor sizes outside of "
+                   << "the profiling scope";
     }
   }
   if (nbytes == 0) {

diff --git a/test/test_autograd.py b/test/test_autograd.py
@@ -3258,115 +3258,6 @@ def test_profiler_aggregation_lstm(self):
             with tempfile.NamedTemporaryFile() as trace_file:
                 prof.export_chrome_trace(trace_file.name)
 
-    def test_memory_profiler(self):
-        def run_profiler(tensor_creation_fn, metric):
-            # collecting allocs / deallocs
-            with profile(profile_memory=True, record_shapes=True, use_kineto=kineto_available()) as prof:
-                x = None
-                with record_function("test_user_scope_alloc"):
-                    x = tensor_creation_fn()
-                with record_function("test_user_scope_dealloc"):
-                    del x
-            stats = prof.key_averages(group_by_input_shape=True)
-            print(stats.table(sort_by=metric))
-            return stats
-
-        def check_metrics(stats, metric, allocs=None, deallocs=None):
-            stat_metrics = {}
-            for stat in stats:
-                stat_metrics[stat.key] = getattr(stat, metric)
-            if allocs is not None:
-                for alloc_fn in allocs:
-                    self.assertTrue(alloc_fn in stat_metrics)
-                    self.assertTrue(stat_metrics[alloc_fn] > 0)
-            if deallocs is not None:
-                for dealloc_fn in deallocs:
-                    self.assertTrue(dealloc_fn in stat_metrics)
-                    self.assertTrue(stat_metrics[dealloc_fn] < 0)
-
-        def create_cpu_tensor():
-            return torch.rand(10, 10)
-
-        def create_cuda_tensor():
-            return torch.rand(10, 10).cuda()
-
-        def create_mkldnn_tensor():
-            return torch.rand(10, 10, dtype=torch.float32).to_mkldnn()
-
-        print("Running CPU test")
-        stats = run_profiler(create_cpu_tensor, "cpu_memory_usage")
-        check_metrics(
-            stats,
-            "cpu_memory_usage",
-            allocs=[
-                "aten::empty",
-                "aten::rand",
-                "test_user_scope_alloc",
-            ],
-            deallocs=[
-                "test_user_scope_dealloc",
-            ]
-        )
-
-        if torch.cuda.is_available():
-            create_cuda_tensor()
-            print("Running CUDA test")
-            stats = run_profiler(create_cuda_tensor, "cuda_memory_usage")
-            check_metrics(
-                stats,
-                "cuda_memory_usage",
-                allocs=[
-                    "test_user_scope_alloc",
-                    "aten::to",
-                    "aten::empty_strided",
-                ],
-                deallocs=[
-                    "test_user_scope_dealloc",
-                ]
-            )
-            check_metrics(
-                stats,
-                "cpu_memory_usage",
-                allocs=[
-                    "aten::rand",
-                    "aten::empty",
-                ]
-            )
-
-        if torch._C.has_mkldnn:
-            create_mkldnn_tensor()
-            print("Running MKLDNN test")
-            stats = run_profiler(create_mkldnn_tensor, "cpu_memory_usage")
-            check_metrics(
-                stats,
-                "cpu_memory_usage",
-                allocs=[
-                    "test_user_scope_alloc",
-                    "aten::rand",
-                    "aten::empty",
-                    "aten::to_mkldnn",
-                ],
-                deallocs=[
-                    "test_user_scope_dealloc",
-                ]
-            )
-
-        # check partial overlap of tensor allocation with memory profiler
-        x = torch.rand(10, 10)
-        with profile(profile_memory=True, record_shapes=True, use_kineto=kineto_available()) as prof:
-            del x
-            x = torch.rand(10, 10)
-        del x
-        stats = prof.key_averages(group_by_input_shape=True)
-        check_metrics(
-            stats,
-            "cpu_memory_usage",
-            allocs=[
-                "aten::rand",
-                "aten::empty",
-            ]
-        )
-
     def test_record_function(self):
         x = torch.randn(10, 10)
 

diff --git a/test/test_profiler.py b/test/test_profiler.py
@@ -11,7 +11,9 @@
 from torch.testing._internal.common_utils import (
     TestCase, run_tests, TEST_WITH_ASAN, IS_WINDOWS, TemporaryFileName)
 from torch.autograd.profiler import profile as _profile
-from torch.profiler import profile, kineto_available, DeviceType, ProfilerActivity
+from torch.profiler import (
+    kineto_available, profile, record_function, DeviceType, ProfilerActivity
+)
 
 try:
     import psutil
@@ -162,6 +164,140 @@ def test_kineto_multigpu(self):
         self.assertTrue(found_gemm_1)
         self.assertTrue(found_cuda)
 
+    def test_memory_profiler(self):
+        def run_profiler(tensor_creation_fn, metric):
+            # collecting allocs / deallocs
+            with _profile(profile_memory=True, record_shapes=True, use_kineto=kineto_available()) as prof:
+                x = None
+                with record_function("test_user_scope_alloc"):
+                    x = tensor_creation_fn()
+                with record_function("test_user_scope_dealloc"):
+                    del x
+            stats = prof.key_averages(group_by_input_shape=True)
+            print(stats.table(sort_by=metric))
+            return stats
+
+        def check_metrics(stats, metric, allocs=None, deallocs=None):
+            stat_metrics = {}
+            for stat in stats:
+                stat_metrics[stat.key] = getattr(stat, metric)
+            if allocs is not None:
+                for alloc_fn in allocs:
+                    self.assertTrue(alloc_fn in stat_metrics)
+                    self.assertTrue(stat_metrics[alloc_fn] > 0)
+            if deallocs is not None:
+                for dealloc_fn in deallocs:
+                    self.assertTrue(dealloc_fn in stat_metrics)
+                    self.assertTrue(stat_metrics[dealloc_fn] < 0)
+
+        def create_cpu_tensor():
+            return torch.rand(10, 10)
+
+        def create_cuda_tensor():
+            return torch.rand(10, 10).cuda()
+
+        def create_mkldnn_tensor():
+            return torch.rand(10, 10, dtype=torch.float32).to_mkldnn()
+
+        print("Running CPU test")
+        stats = run_profiler(create_cpu_tensor, "cpu_memory_usage")
+        check_metrics(
+            stats,
+            "cpu_memory_usage",
+            allocs=[
+                "aten::empty",
+                "aten::rand",
+                "test_user_scope_alloc",
+            ],
+            deallocs=[
+                "test_user_scope_dealloc",
+            ]
+        )
+
+        if torch.cuda.is_available():
+            create_cuda_tensor()
+            print("Running CUDA test")
+            stats = run_profiler(create_cuda_tensor, "cuda_memory_usage")
+            check_metrics(
+                stats,
+                "cuda_memory_usage",
+                allocs=[
+                    "test_user_scope_alloc",
+                    "aten::to",
+                    "aten::empty_strided",
+                ],
+                deallocs=[
+                    "test_user_scope_dealloc",
+                ]
+            )
+            check_metrics(
+                stats,
+                "cpu_memory_usage",
+                allocs=[
+                    "aten::rand",
+                    "aten::empty",
+                ]
+            )
+
+        if torch._C.has_mkldnn:
+            create_mkldnn_tensor()
+            print("Running MKLDNN test")
+            stats = run_profiler(create_mkldnn_tensor, "cpu_memory_usage")
+            check_metrics(
+                stats,
+                "cpu_memory_usage",
+                allocs=[
+                    "test_user_scope_alloc",
+                    "aten::rand",
+                    "aten::empty",
+                    "aten::to_mkldnn",
+                ],
+                deallocs=[
+                    "test_user_scope_dealloc",
+                ]
+            )
+
+        if kineto_available():
+            torch.enable_global_memory_reporting(True)
+            # check top-level memory events and
+            # partial overlap of tensor lifetime and profiler
+            x = torch.rand(10, 10)
+            y = None
+            if torch.cuda.is_available():
+                y = torch.rand(10, 10).cuda()
+            with profile(
+                    # mem events are CPU events
+                    activities=[ProfilerActivity.CPU],
+                    profile_memory=True) as prof:
+                del x
+                if torch.cuda.is_available():
+                    del y
+                gc.collect()
+                x = torch.rand(10, 10)
+            del x
+            stats = prof.key_averages(group_by_input_shape=True)
+            print(stats.table(sort_by="cpu_memory_usage", row_limit=-1))
+            check_metrics(
+                stats,
+                "cpu_memory_usage",
+                allocs=[
+                    "aten::rand",
+                    "aten::empty"
+                ],
+                deallocs=[
+                    "[memory]"
+                ]
+            )
+            if torch.cuda.is_available():
+                check_metrics(
+                    stats,
+                    "cuda_memory_usage",
+                    deallocs=[
+                        "[memory]"
+                    ]
+                )
+            torch.enable_global_memory_reporting(False)
+
     def test_high_level_trace(self):
         """Checks that python side high level events are recorded.
         """

diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py
@@ -214,6 +214,8 @@ def export_chrome_trace(self, path):
             # this technique is proven to give a 4x speedup.
             f.write("[")
             for evt in self:
+                if evt.trace_name is None:
+                    continue
                 f.write(
                     '{"name": "%s", '
                     '"ph": "X", '
@@ -850,7 +852,7 @@ def __init__(
         self.id: int = id
         self.node_id: int = node_id
         self.name: str = name
-        self.trace_name: str = trace_name if trace_name is not None else self.name
+        self.trace_name: str = trace_name
         self.time_range: Interval = Interval(start_us, end_us)
         self.thread: int = thread
         self.fwd_thread: Optional[int] = fwd_thread
@@ -1123,12 +1125,14 @@ def parse_kineto_results(result):
     # save memory allocation records
     start_record = None
     mem_records = []
+    covered_mem_records = []
     for record in itertools.chain(*result.legacy_events()):
         if record.kind() == 'mark' and record.name() == '__start_profile':
             assert start_record is None
             start_record = record
         if record.kind() == 'memory_alloc':
             mem_records.append(record)
+            covered_mem_records.append(False)
     assert start_record is not None, "Invalid profiler output, __start_profile is missing"
 
     # Create and return FunctionEvent list
@@ -1145,11 +1149,14 @@ def parse_kineto_results(result):
         cuda_memory_usage = 0
         if kineto_event.device_type() == DeviceType.CPU:
             # find the corresponding memory allocation events
-            for mem_record in mem_records:
+            for mem_record_idx in range(len(mem_records)):
+                mem_record = mem_records[mem_record_idx]
                 if (mem_record.start_us() >= kineto_event.start_us() and
                         mem_record.start_us() <= abs_end_us):
                     cpu_memory_usage += mem_record.cpu_memory_usage()
                     cuda_memory_usage += mem_record.cuda_memory_usage()
+                    covered_mem_records[mem_record_idx] = True
+
         is_async = kineto_event.start_thread_id() != kineto_event.end_thread_id()
         fe = FunctionEvent(
             id=kineto_event.correlation_id(),
@@ -1188,6 +1195,30 @@ def parse_kineto_results(result):
                     k_evt.start_us(),
                     k_evt.start_us() + k_evt.duration_us())
 
+    # output top-level memory events
+    for mem_record_idx in range(len(mem_records)):
+        if not covered_mem_records[mem_record_idx]:
+            mem_record = mem_records[mem_record_idx]
+            fe = FunctionEvent(
+                id=mem_record.handle(),
+                name="[memory]",
+                trace_name=None, # not outputting in the trace
+                thread=mem_record.thread_id(),
+                start_us=mem_record.start_us(),
+                end_us=mem_record.start_us(), # no duration
+                fwd_thread=mem_record.fwd_thread_id(),
+                input_shapes=[],
+                stack=[],
+                scope=mem_record.scope(),
+                cpu_memory_usage=mem_record.cpu_memory_usage(),
+                cuda_memory_usage=mem_record.cuda_memory_usage(),
+                is_async=False,
+                sequence_nr=-1,
+                device_type=DeviceType.CPU,
+                device_index=0,
+            )
+            function_events.append(fe)
+
     function_events.sort(key=lambda evt: [evt.time_range.start, -evt.time_range.end])
     return function_events