From a4d4124e6d87eb03b09bd304cd4af5c81e7db2f9 Mon Sep 17 00:00:00 2001
From: ilia-cher
Date: Fri, 16 Oct 2020 07:56:54 -0700
Subject: [PATCH 01/59] Use libkineto in profiler

Summary:
Adding ability to use Kineto (CUPTI) to profile CUDA kernels

Test Plan:
python test/test_profiler.py

[ghstack-poisoned]
---
 test/test_profiler.py            |  20 ++++
 torch/autograd/__init__.py       |   6 +-
 torch/autograd/profiler.py       |  65 ++++++++++---
 torch/csrc/autograd/init.cpp     |  14 ++-
 torch/csrc/autograd/profiler.cpp | 151 +++++++++++++++++++++++++++----
 torch/csrc/autograd/profiler.h   |  40 +++++++-
 6 files changed, 256 insertions(+), 40 deletions(-)

diff --git a/test/test_profiler.py b/test/test_profiler.py
index f1feff1d0af3..44973546429e 100644
--- a/test/test_profiler.py
+++ b/test/test_profiler.py
@@ -99,6 +99,26 @@ def forward(self, x):

         torch._C._set_graph_executor_optimize(prev_opt)

+    @unittest.skipIf(not torch.autograd.kineto_available(), "Kineto is required")
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA is required")
+    def test_kineto(self):
+        x = torch.randn(10, 10).cuda()
+        y = torch.randn(10, 10).cuda()
+        with profile(use_cuda=True, use_kineto=True) as p:
+            z = torch.mm(x, y)
+            z = z + y
+            z = z.cpu()
+        print(p.key_averages().table(
+            sort_by="self_cuda_time_total", row_limit=-1))
+        found_gemm = False
+        found_memcpy = False
+        for e in p.function_events:
+            if "gemm" in e.name:
+                found_gemm = True
+            if "Memcpy" in e.name or "memcpy" in e.name:
+                found_memcpy = True
+        self.assertTrue(found_gemm)
+        self.assertTrue(found_memcpy)

 if __name__ == '__main__':
     run_tests()
diff --git a/torch/autograd/__init__.py b/torch/autograd/__init__.py
index 4e44536d931c..cec103ea4c8c 100644
--- a/torch/autograd/__init__.py
+++ b/torch/autograd/__init__.py
@@ -242,6 +242,6 @@ def variable(*args, **kwargs):
     raise RuntimeError("autograd initialization failed")

 # Import all native method/classes
-from torch._C._autograd import (ProfilerState, ProfilerConfig, ProfilerEvent,
-                                _enable_profiler, _disable_profiler, _profiler_enabled,
-                                _enable_record_function, _set_empty_test_observer)
+from torch._C._autograd import (ProfilerActivity, ProfilerState, ProfilerConfig, ProfilerEvent,
+                                _prepare_profiler, _enable_profiler, _disable_profiler, _profiler_enabled,
+                                _enable_record_function, _set_empty_test_observer, kineto_available)
diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py
index eba7368cb03e..c4d23f9efeb4 100644
--- a/torch/autograd/profiler.py
+++ b/torch/autograd/profiler.py
@@ -364,16 +364,47 @@ def __init__(
             use_cuda=False,
             record_shapes=False,
             profile_memory=False,
-            with_stack=False):
+            with_stack=False,
+            use_kineto=False):
         self.enabled = enabled
-        self.use_cuda = use_cuda
-        self.function_events = None
         if not self.enabled:
             return
+        self.use_cuda = use_cuda
+        self.function_events = None
         self.entered = False
         self.record_shapes = record_shapes
         self.profile_memory = profile_memory
         self.with_stack = with_stack
+        self.use_kineto = use_kineto
+
+        self.profiler_kind = None
+        self.kineto_activities = []
+        if self.use_kineto:
+            if self.use_cuda:
+                self.profiler_kind = torch.autograd.ProfilerState.KINETO
+                self.kineto_activities = [
+                    torch.autograd.ProfilerActivity.CPU,
+                    # uses CUPTI
+                    torch.autograd.ProfilerActivity.CUDA_RUNTIME,
+                    torch.autograd.ProfilerActivity.CUDA]
+            else:
+                # initially we're not using Kineto for the CPU-only case
+                self.profiler_kind = torch.autograd.ProfilerState.CPU
+        elif self.use_cuda:
+            # legacy CUDA mode
+            self.profiler_kind = torch.autograd.ProfilerState.CUDA
+        else:
self.profiler_kind = torch.autograd.ProfilerState.CPU + self.kineto_activities = set(self.kineto_activities) + + if self.profiler_kind == torch.autograd.ProfilerState.KINETO: + assert torch.autograd.kineto_available() + + self.config = torch.autograd.ProfilerConfig( + self.profiler_kind, + self.record_shapes, + self.profile_memory, + self.with_stack) def __enter__(self): if not self.enabled: @@ -381,15 +412,8 @@ def __enter__(self): if self.entered: raise RuntimeError("autograd profiler traces are not reentrant") self.entered = True - profiler_kind = torch.autograd.ProfilerState.CUDA if self.use_cuda \ - else torch.autograd.ProfilerState.CPU - - config = torch.autograd.ProfilerConfig( - profiler_kind, - self.record_shapes, - self.profile_memory, - self.with_stack) - torch.autograd._enable_profiler(config) + torch.autograd._prepare_profiler(self.config, self.kineto_activities) + torch.autograd._enable_profiler(self.config) return self def __exit__(self, exc_type, exc_val, exc_tb): @@ -732,7 +756,7 @@ class FunctionEvent(FormattedTimesMixin): def __init__( self, id, node_id, name, thread, cpu_start, cpu_end, fwd_thread=None, input_shapes=None, stack=None, scope=0, cpu_memory_usage=0, cuda_memory_usage=0, is_async=False, - is_remote=True, sequence_nr=-1): + is_remote=True, sequence_nr=-1, device_id=-1): self.id: int = id self.node_id: int = node_id self.name: str = name @@ -751,6 +775,7 @@ def __init__( self.is_async: bool = is_async self.is_remote: bool = is_remote self.sequence_nr: int = sequence_nr + self.device_id: int = device_id def append_kernel(self, name, device, start, end): self.kernels.append(Kernel(name, device, Interval(start, end))) @@ -802,15 +827,21 @@ def self_cpu_time_total(self): @property def cuda_time_total(self): + if self.device_id >= 0: + return self.cpu_interval.elapsed_us() return sum(kinfo.interval.elapsed_us() for kinfo in self.kernels) @property def self_cuda_time_total(self): + if self.device_id >= 0: + return self.cuda_time_total - sum([child.cuda_time_total for child in self.cpu_children]) return sum(kinfo.interval.elapsed_us() for kinfo in self.kernels) - \ sum([child.cuda_time_total for child in self.cpu_children]) @property def cpu_time_total(self): + if self.device_id >= 0: + return 0 return self.cpu_interval.elapsed_us() @property @@ -1045,6 +1076,7 @@ def adjusted_time(cuda_record, cuda_records_map): is_async=is_async, is_remote=is_remote_event, sequence_nr=start.sequence_nr(), + device_id=start.device_id(), ) # note: async events have only cpu total time if not is_async and start.has_cuda(): @@ -1180,7 +1212,9 @@ def build_table( has_input_shapes = any( [(event.input_shapes is not None and len(event.input_shapes) > 0) for event in events]) + MAX_NAME_COLUMN_WIDTH = 55 name_column_width = max([len(evt.key) for evt in events]) + 4 + name_column_width = min(name_column_width, MAX_NAME_COLUMN_WIDTH) DEFAULT_COLUMN_WIDTH = 12 @@ -1288,8 +1322,11 @@ def append(s): continue else: event_limit += 1 + name = evt.key + if len(name) >= MAX_NAME_COLUMN_WIDTH-3: + name = name[:(MAX_NAME_COLUMN_WIDTH-3)] + "..." row_values = [ - evt.key, # Name + name, # Self CPU total, 0 for async events. 
            % format_time_share(evt.self_cpu_time_total, self_cpu_time_total),
diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp
index 045a732a2016..698931911878 100644
--- a/torch/csrc/autograd/init.cpp
+++ b/torch/csrc/autograd/init.cpp
@@ -39,7 +39,13 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) {
     .value("Disabled", ProfilerState::Disabled)
     .value("CPU", ProfilerState::CPU)
     .value("CUDA", ProfilerState::CUDA)
-    .value("NVTX", ProfilerState::NVTX);
+    .value("NVTX", ProfilerState::NVTX)
+    .value("KINETO", ProfilerState::KINETO);
+
+  py::enum_<ActivityType>(m, "ProfilerActivity")
+    .value("CPU", ActivityType::CPU)
+    .value("CUDA_RUNTIME", ActivityType::CUDA_RUNTIME)
+    .value("CUDA", ActivityType::CUDA);

   py::class_<ProfilerConfig>(m, "ProfilerConfig")
       .def(py::init<ProfilerState, bool, bool, bool>());
@@ -61,11 +67,15 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) {
       .def("is_remote", &Event::isRemote)
       .def("sequence_nr", &Event::sequenceNr)
       .def("stack", &Event::stack)
-      .def("scope", &Event::scope);
+      .def("scope", &Event::scope)
+      .def("device_id", &Event::device);

   py::class_<ProfilerDisableOptions>(m, "_ProfilerDisableOptions")
       .def(py::init<bool, bool>());

+  m.def("kineto_available", kinetoAvailable);
+
+  m.def("_prepare_profiler", prepareProfiler);
   m.def("_enable_profiler", enableProfiler);
   m.def(
       "_disable_profiler",
diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp
index 5cbb7606e579..0b6203e695fe 100644
--- a/torch/csrc/autograd/profiler.cpp
+++ b/torch/csrc/autograd/profiler.cpp
@@ -21,6 +21,10 @@

 #include

+#ifdef USE_KINETO
+#include "libkineto.h"
+#endif
+
 namespace torch { namespace autograd { namespace profiler {

 namespace {
@@ -48,23 +52,23 @@ enum ProfilerIValueIdx {
   NUM_PROFILER_CFG_IVALUE_IDX // must be last in list
 };

-  const std::unordered_set<std::string> disable_cuda_profiling = {
-    "aten::view",
-    "aten::t",
-    "aten::transpose",
-    "aten::stride",
-    "aten::empty",
-    "aten::empty_like",
-    "aten::empty_strided",
-    "aten::as_strided",
-    "aten::expand",
-    "aten::resize_",
-    "aten::squeeze",
-    "aten::unsqueeze",
-    "aten::slice",
-    "aten::_unsafe_view",
-    "aten::size"
-  };
+const std::unordered_set<std::string> disable_cuda_profiling = {
+  "aten::view",
+  "aten::t",
+  "aten::transpose",
+  "aten::stride",
+  "aten::empty",
+  "aten::empty_like",
+  "aten::empty_strided",
+  "aten::as_strided",
+  "aten::expand",
+  "aten::resize_",
+  "aten::squeeze",
+  "aten::unsqueeze",
+  "aten::slice",
+  "aten::_unsafe_view",
+  "aten::size"
+};

 CUDAStubs default_stubs;
 constexpr CUDAStubs* default_stubs_addr = &default_stubs;
@@ -169,6 +173,14 @@ struct FileLineFunc {
   std::string funcname;
 };

+static std::atomic<uint64_t> corr_id_ {};
+size_t next_correlation_id() {
+  return corr_id_++;
+}
+size_t peek_correlation_id() {
+  return corr_id_;
+}
+
 // Profiler state
 struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase {
   explicit ProfilerThreadLocalState(const ProfilerConfig& config)
@@ -193,6 +205,12 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase {
           std::make_move_iterator(remoteProfiledEvents_->begin()),
           std::make_move_iterator(remoteProfiledEvents_->end()));
     }
+    if (kinetoEvents_) {
+      result.insert(
+          result.end(),
+          std::make_move_iterator(kinetoEvents_->begin()),
+          std::make_move_iterator(kinetoEvents_->end()));
+    }
     return result;
   }
@@ -224,6 +242,11 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase {
     }
   }

+  void setKinetoEvents(std::vector<std::vector<Event>>&& kinetoEvents) {
+    std::lock_guard<std::mutex> guard(state_mutex_);
+    kinetoEvents_ = std::move(kinetoEvents);
+  }
+
   void pushRange(
       const at::RecordFunction& fn,
       const bool record_cuda,
@@ -247,6 +270,7 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase {
     evt.setSequenceNr(fn.seqNr());
     evt.setFwdThreadId(fn.forwardThreadId());
     evt.setScope((uint8_t)fn.scope());
+    evt.setCorrelationId(peek_correlation_id());
 #ifndef C10_MOBILE
     // backward nodes source range corresponds to the forward node
     // TODO: consider using C++ stack trace
@@ -409,6 +433,7 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase {
   ProfilerConfig config_ = ProfilerConfig(ProfilerState::Disabled);
   at::CallbackHandle handle_ = 0;
   c10::optional<std::vector<std::vector<Event>>> remoteProfiledEvents_;
+  c10::optional<std::vector<std::vector<Event>>> kinetoEvents_;
 };

 ProfilerThreadLocalState* getProfilerTLSState() {
@@ -451,6 +476,11 @@ void pushProfilingCallbacks() {
           } else {
             state_ptr->pushRange(fn, record_cuda, msg);
           }
+#ifdef USE_KINETO
+          if (state_ptr->config().state == ProfilerState::KINETO) {
+            libkineto::api().pushCorrelationId(next_correlation_id());
+          }
+#endif
         },
         [](const at::RecordFunction& fn) {
           auto state_ptr = getProfilerTLSState();
@@ -463,6 +493,11 @@ void pushProfilingCallbacks() {
             record_cuda = false;
           }
           state_ptr->popRange(fn, record_cuda);
+#ifdef USE_KINETO
+          if (state_ptr->config().state == ProfilerState::KINETO) {
+            libkineto::api().popCorrelationId();
+          }
+#endif
         })
     .needsInputs(state_ptr->config().report_input_shapes)
     .needsIds(true));
@@ -519,10 +554,48 @@ bool profilerEnabled() {
   return state_ptr && state_ptr->config().state != ProfilerState::Disabled;
 }

+bool kinetoAvailable() {
+#ifdef USE_KINETO
+  return true;
+#else
+  return false;
+#endif
+}
+
+void prepareProfiler(
+    const ProfilerConfig& new_config,
+    const std::set<ActivityType>& activities) {
+#ifdef USE_KINETO
+  if (new_config.state == ProfilerState::KINETO) {
+    std::set<libkineto::ActivityType> k_activities;
+    if (activities.count(ActivityType::CPU)) {
+      k_activities.insert(libkineto::ActivityType::EXTERNAL_CORRELATION);
+    }
+    if (activities.count(ActivityType::CUDA_RUNTIME)) {
+      k_activities.insert(libkineto::ActivityType::CUDA_RUNTIME);
+    }
+    if (activities.count(ActivityType::CUDA)) {
+      k_activities.insert(libkineto::ActivityType::GPU_MEMCPY);
+      k_activities.insert(libkineto::ActivityType::GPU_MEMSET);
+      k_activities.insert(libkineto::ActivityType::CONCURRENT_KERNEL);
+    }
+
+    if (!libkineto::api().hasProfilerRegistered()) {
+      libkineto::api().registerProfiler(
+          std::make_unique(false));
+    }
+    libkineto::api().initProfilerIfRegistered();
+    libkineto::api().prepareTrace(k_activities);
+  }
+#endif
+}
+
 void enableProfiler(const ProfilerConfig& new_config) {
   TORCH_CHECK(new_config.state != ProfilerState::NVTX || cuda_stubs->enabled(),
     "Can't use NVTX profiler - PyTorch was compiled without CUDA");

+  TORCH_CHECK(new_config.state != ProfilerState::KINETO || kinetoAvailable());
+
   auto state_ptr = getProfilerTLSState();
   TORCH_CHECK(!state_ptr, "Profiler is already enabled on this thread");
   auto state = std::make_shared<ProfilerThreadLocalState>(new_config);
@@ -530,6 +603,12 @@ void enableProfiler(const ProfilerConfig& new_config) {

   pushProfilingCallbacks();

+#ifdef USE_KINETO
+  if (new_config.state == ProfilerState::KINETO) {
+    libkineto::api().startTrace();
+  }
+#endif
+
   if (new_config.state == ProfilerState::CUDA) {
     // event recording appears to have some startup overhead, so we need to
     // to generate some dummy events first before recording synchronization events
@@ -569,6 +648,44 @@ thread_event_lists disableProfiler(c10::optional<ProfilerDisableOptions> profilerDisableOptions) {
     at::removeCallback(state_ptr->callbackHandle());
   }

+#ifdef USE_KINETO
+  if (state_ptr->config().state == ProfilerState::KINETO) {
+    auto k_events = libkineto::api().stopTrace();
+    std::unordered_map<uint64_t, std::unordered_map<uint64_t, std::vector<Event>>> events;
+    for (auto& k_evt : k_events) {
+      auto& evt_list = events[k_evt.deviceId][k_evt.threadId];
+      Event push_evt(
+          EventKind::PushRange,
+          at::StringView(k_evt.name),
+          k_evt.threadId,
+          false,
+          k_evt.correlationId);
+      push_evt.setDevice(k_evt.deviceId);
+      push_evt.setCpuUS(k_evt.startUs);
+      push_evt.setCorrelationId(k_evt.correlationId);
+      evt_list.emplace_back(std::move(push_evt));
+
+      Event pop_evt(
+          EventKind::PopRange,
+          at::StringView(k_evt.name),
+          k_evt.threadId,
+          false,
+          k_evt.correlationId);
+      pop_evt.setDevice(k_evt.deviceId);
+      pop_evt.setCpuUS(k_evt.endUs);
+      pop_evt.setCorrelationId(k_evt.correlationId);
+      evt_list.emplace_back(std::move(pop_evt));
+    }
+    std::vector<std::vector<Event>> events_list;
+    for (const auto& it : events) {
+      for (const auto& it2 : it.second) {
+        events_list.emplace_back(it2.second);
+      }
+    }
+    state_ptr->setKinetoEvents(std::move(events_list));
+  }
+#endif
+
   if (!consolidate || state_ptr->config().state == ProfilerState::NVTX) {
     return thread_event_lists();
   }
diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h
index 9cfe9ea1fd6e..3bc6022b20fa 100644
--- a/torch/csrc/autograd/profiler.h
+++ b/torch/csrc/autograd/profiler.h
@@ -104,10 +104,19 @@ struct TORCH_API ProfilerDisableOptions {
 };

 enum class C10_API_ENUM ProfilerState {
-  Disabled,
-  CPU, // CPU-only profiling
-  CUDA, // CPU + CUDA events
-  NVTX, // only emit NVTX markers
+  Disabled = 0,
+  CPU, // CPU-only profiling
+  CUDA, // CPU + CUDA events
+  NVTX, // only emit NVTX markers
+  KINETO, // use libkineto
+  NUM_PROFILER_STATES, // must be the last one
+};
+
+enum class C10_API_ENUM ActivityType {
+  CPU = 0,
+  CUDA_RUNTIME, // CUDA host events
+  CUDA, // CUDA kernels
+  NUM_KINETO_ACTIVITIES, // must be the last one
 };

 struct TORCH_API ProfilerConfig {
@@ -238,6 +247,10 @@ struct TORCH_API Event final {
     return cpu_ns_ / (1000.0);
   }

+  void setCpuUS(double cpu_us) {
+    cpu_ns_ = (int64_t)(cpu_us * 1000);
+  }
+
   double cudaElapsedUs(const Event& e) const;

   bool hasCuda() const {
@@ -248,6 +261,10 @@ struct TORCH_API Event final {
     return device_;
   }

+  void setDevice(int device) {
+    device_ = device;
+  }
+
   void updateMemoryStats(int64_t alloc_size, c10::Device device) {
     if (device.type() == c10::DeviceType::CUDA ||
         device.type() == c10::DeviceType::HIP) {
@@ -303,6 +320,14 @@ struct TORCH_API Event final {
     return sequence_nr_;
   }

+  void setCorrelationId(uint64_t correlation_id) {
+    correlation_id_ = correlation_id;
+  }
+
+  uint64_t correlationId() const {
+    return correlation_id_;
+  }
+
   const std::vector<std::string>& stack() const {
     return stack_;
   }
@@ -347,6 +372,8 @@ struct TORCH_API Event final {

   std::vector<std::string> stack_;
   uint8_t scope_;
+
+  uint64_t correlation_id_;
 };

 // a linked-list of fixed sized vectors, to avoid
 // a std::vector resize from taking a large amount of time inside
 // a profiling event
@@ -403,6 +430,11 @@ TORCH_API ProfilerConfig getProfilerConfig();
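The conversion in `disableProfiler` above turns each flat Kineto activity into a PushRange/PopRange pair that shares a correlation id with the CPU-side op that launched it. As a schematic model of how such pairs fold back into per-op intervals (Python, not code from this patch; the tuples are assumed stand-ins for the C++ `Event` accessors):

```python
# Schematic sketch only: match each "push" event with its "pop" by
# correlation id. Tuple layout (kind, name, correlation_id, t_us) is an
# illustrative assumption, not the real Event interface.
from collections import defaultdict

def pair_ranges(events):
    open_push = {}
    intervals = defaultdict(list)  # name -> [(start_us, end_us, corr_id)]
    for kind, name, corr_id, t_us in events:
        if kind == "push":
            open_push[corr_id] = t_us
        elif kind == "pop":
            intervals[name].append((open_push.pop(corr_id), t_us, corr_id))
    return intervals

assert pair_ranges([("push", "sgemm", 7, 100.0),
                    ("pop", "sgemm", 7, 112.0)])["sgemm"] == [(100.0, 112.0, 7)]
```

 // Writes profiled events to a stream.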
TORCH_API void writeProfilerEventsToStream(std::ostream& out, const std::vector& events); +TORCH_API bool kinetoAvailable(); +TORCH_API void prepareProfiler( + const ProfilerConfig& new_config, + const std::set& activities); + // Usage: // { // RecordProfile guard("filename.trace"); From 662431b7b3804e111b7980018f641a1f7bac72f3 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 03:43:37 -0800 Subject: [PATCH 02/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/autograd/__init__.py | 2 + torch/autograd/profiler.py | 20 ++++-- torch/csrc/autograd/init.cpp | 5 +- torch/csrc/autograd/profiler.cpp | 115 +++++++++++++++++++------------ torch/csrc/autograd/profiler.h | 31 +++++---- 5 files changed, 107 insertions(+), 66 deletions(-) diff --git a/torch/autograd/__init__.py b/torch/autograd/__init__.py index cec103ea4c8c..e2ccf47ce923 100644 --- a/torch/autograd/__init__.py +++ b/torch/autograd/__init__.py @@ -243,5 +243,7 @@ def variable(*args, **kwargs): # Import all native method/classes from torch._C._autograd import (ProfilerActivity, ProfilerState, ProfilerConfig, ProfilerEvent, + ProfilerResult, KinetoEvent, + _enable_profiler_legacy, _disable_profiler_legacy, _prepare_profiler, _enable_profiler, _disable_profiler, _profiler_enabled, _enable_record_function, _set_empty_test_observer, kineto_available) diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index 49742bb1e099..483802c7c9ec 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -325,6 +325,9 @@ class profile(object): with_stack (bool, optional): record source information (file and line number) for the ops + 
use_kineto (bool, default False): experimental support for Kineto profiler + skip_cpu (default False) - whether to skip profiling of CPU events + .. warning: Enabling memory profiling or source attribution incurs additional profiler overhead @@ -365,7 +368,8 @@ def __init__( record_shapes=False, profile_memory=False, with_stack=False, - use_kineto=False): + use_kineto=False, + skip_cpu=False): self.enabled = enabled if not self.enabled: return @@ -376,16 +380,22 @@ def __init__( self.profile_memory = profile_memory self.with_stack = with_stack self.use_kineto = use_kineto + self.skip_cpu = skip_cpu + if self.skip_cpu: + assert self.use_kineto, "skip_cpu is used with use_kineto=True" self.profiler_kind = None self.kineto_activities = [] if self.use_kineto: self.profiler_kind = torch.autograd.ProfilerState.KINETO - self.kineto_activities = [torch.autograd.ProfilerActivity.CPU] + if not self.skip_cpu: + self.kineto_activities = [torch.autograd.ProfilerActivity.CPU] + else: + self.kineto_activities = [] if self.use_cuda: self.kineto_activities += [ # uses CUPTI - torch.autograd.ProfilerActivity.CUDA_RUNTIME, + # torch.autograd.ProfilerActivity.CUDA_RUNTIME, torch.autograd.ProfilerActivity.CUDA] elif self.use_cuda: # legacy CUDA mode @@ -412,7 +422,9 @@ def __enter__(self): if self.entered: raise RuntimeError("autograd profiler traces are not reentrant") self.entered = True - torch.autograd._prepare_profiler(self.config, self.kineto_activities) + if self.use_kineto: + torch.autograd._prepare_profiler(self.config, self.kineto_activities) + torch.autograd._enable_profiler(self.config) return self diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index 291ec75c79d4..c1b12ebd478b 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -44,7 +44,7 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { py::enum_(m, "ProfilerActivity") .value("CPU", ActivityType::CPU) - .value("CUDA_RUNTIME", ActivityType::CUDA_RUNTIME) + //.value("CUDA_RUNTIME", ActivityType::CUDA_RUNTIME) .value("CUDA", ActivityType::CUDA); py::class_(m, "ProfilerConfig") @@ -67,7 +67,8 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { .def("is_remote", &LegacyEvent::isRemote) .def("sequence_nr", &LegacyEvent::sequenceNr) .def("stack", &LegacyEvent::stack) - .def("scope", &LegacyEvent::scope); + .def("scope", &LegacyEvent::scope) + .def("correlation_id", &LegacyEvent::correlationId); py::class_(m, "ProfilerResult") .def("kind", &LegacyEvent::kindStr) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index ef279830fdb5..c9fcf3efabb3 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -205,12 +205,6 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { std::make_move_iterator(remoteProfiledEvents_->begin()), std::make_move_iterator(remoteProfiledEvents_->end())); } - if (kinetoEvents_) { - result.insert( - result.end(), - std::make_move_iterator(kinetoEvents_->begin()), - std::make_move_iterator(kinetoEvents_->end())); - } return result; } @@ -227,6 +221,7 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { at::RecordFunction::currentThreadId(), include_cuda && config_.state == ProfilerState::CUDA); evt.setNodeId(at::RecordFunction::getDefaultNodeId()); + evt.setCorrelationId(peek_correlation_id()); getEventList().record(std::move(evt)); } } @@ -242,11 +237,6 @@ struct ProfilerThreadLocalState : public 
c10::MemoryReportingInfoBase { } } - void setKinetoEvents(std::vector>&& kinetoEvents) { - std::lock_guard guard(state_mutex_); - kinetoEvents_ = std::move(kinetoEvents); - } - void pushRange( const at::RecordFunction& fn, const bool record_cuda, @@ -270,7 +260,6 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { evt.setSequenceNr(fn.seqNr()); evt.setFwdThreadId(fn.forwardThreadId()); evt.setScope((uint8_t)fn.scope()); - evt.setCorrelationId(peek_correlation_id()); #ifndef C10_MOBILE // backward nodes source range corresponds to the forward node // TODO: consider using C++ stack trace @@ -328,6 +317,7 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { thread_id, config_.state == ProfilerState::CUDA); evt.updateMemoryStats(alloc_size, device); + evt.setCorrelationId(peek_correlation_id()); getEventList(thread_id).record(std::move(evt)); } } @@ -433,7 +423,6 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { ProfilerConfig config_ = ProfilerConfig(ProfilerState::Disabled); at::CallbackHandle handle_ = 0; c10::optional>> remoteProfiledEvents_; - c10::optional>> kinetoEvents_; }; ProfilerThreadLocalState* getProfilerTLSState() { @@ -450,6 +439,12 @@ void pushProfilingCallbacks() { if (!state_ptr || state_ptr->config().state == ProfilerState::Disabled) { return; } +#ifdef USE_KINETO + if (state_ptr->config().state == ProfilerState::KINETO) { + libkineto::api().pushCorrelationId(next_correlation_id()); + return; + } +#endif bool record_cuda = state_ptr->config().state == ProfilerState::CUDA; if (record_cuda && disable_cuda_profiling.find(fn.name().str()) != disable_cuda_profiling.end()) { @@ -476,28 +471,25 @@ void pushProfilingCallbacks() { } else { state_ptr->pushRange(fn, record_cuda, msg); } -#ifdef USE_KINETO - if (state_ptr->config().state == ProfilerState::KINETO) { - libkineto::api().pushCorrelationId(next_correlation_id()); - } -#endif }, [](const at::RecordFunction& fn) { auto state_ptr = getProfilerTLSState(); if (!state_ptr || state_ptr->config().state == ProfilerState::Disabled) { return; } +#ifdef USE_KINETO + if (state_ptr->config().state == ProfilerState::KINETO) { + // push new cpu trace event + libkineto::api().popCorrelationId(); + return; + } +#endif bool record_cuda = state_ptr->config().state == ProfilerState::CUDA; if (record_cuda && disable_cuda_profiling.find(fn.name().str()) != disable_cuda_profiling.end()) { record_cuda = false; } state_ptr->popRange(fn, record_cuda); -#ifdef USE_KINETO - if (state_ptr->config().state == ProfilerState::KINETO) { - libkineto::api().popCorrelationId(); - } -#endif }) .needsInputs(state_ptr->config().report_input_shapes) .needsIds(true)); @@ -563,17 +555,18 @@ bool kinetoAvailable() { } void prepareProfiler( - const ProfilerConfig& new_config, + const ProfilerConfig& config, const std::set& activities) { #ifdef USE_KINETO - if (new_config.state == ProfilerState::KINETO) { + if (config.state == ProfilerState::KINETO) { std::set k_activities; if (activities.count(ActivityType::CPU)) { k_activities.insert(libkineto::ActivityType::EXTERNAL_CORRELATION); - } - if (activities.count(ActivityType::CUDA_RUNTIME)) { k_activities.insert(libkineto::ActivityType::CUDA_RUNTIME); } + //if (activities.count(ActivityType::CUDA_RUNTIME)) { + // k_activities.insert(libkineto::ActivityType::CUDA_RUNTIME); + //} if (activities.count(ActivityType::CUDA)) { k_activities.insert(libkineto::ActivityType::GPU_MEMCPY); k_activities.insert(libkineto::ActivityType::GPU_MEMSET); @@ -586,15 +579,18 
@@ void prepareProfiler( } libkineto::api().initProfilerIfRegistered(); libkineto::api().prepareTrace(k_activities); + + return; } #endif + TORCH_CHECK(false, "Supported only in Kineto profiler"); } -void enableProfiler(const ProfilerConfig& new_config) { +void enableProfilerLegacy(const ProfilerConfig& new_config) { TORCH_CHECK(new_config.state != ProfilerState::NVTX || cuda_stubs->enabled(), "Can't use NVTX profiler - PyTorch was compiled without CUDA"); - TORCH_CHECK(new_config.state != ProfilerState::KINETO || kinetoAvailable()); + TORCH_CHECK(new_config.state != ProfilerState::KINETO); auto state_ptr = getProfilerTLSState(); TORCH_CHECK(!state_ptr, "Profiler is already enabled on this thread"); @@ -603,12 +599,6 @@ void enableProfiler(const ProfilerConfig& new_config) { pushProfilingCallbacks(); -#ifdef USE_KINETO - if (new_config.state == ProfilerState::KINETO) { - libkineto::api().startTrace(); - } -#endif - if (new_config.state == ProfilerState::CUDA) { // event recording appears to have some startup overhead, so we need to // to generate some dummy events first before recording synchronization events @@ -629,7 +619,7 @@ void enableProfiler(const ProfilerConfig& new_config) { state->mark("__start_profile", false); } -thread_event_lists disableProfiler(c10::optional profilerDisableOptions) { +thread_event_lists disableProfilerLegacy(c10::optional profilerDisableOptions) { auto cleanupTLSState = profilerDisableOptions ? profilerDisableOptions->cleanupTLSState : true; auto consolidate = profilerDisableOptions ? profilerDisableOptions->consolidate : true; // all the DebugInfoBase objects are scope based and supposed to use DebugInfoGuard @@ -648,6 +638,46 @@ thread_event_lists disableProfiler(c10::optional profile at::removeCallback(state_ptr->callbackHandle()); } + if (!consolidate || state_ptr->config().state == ProfilerState::NVTX) { + return thread_event_lists(); + } + + state_ptr->mark("__stop_profile"); + // Note that this will erase the underlying events. 
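With the entry points now split into legacy and Kineto variants, the Python side picks a path as in the profiler.py hunk earlier in this commit. A condensed sketch of that dispatch follows; the `_ProfilerScope` wrapper name is hypothetical, while the `torch.autograd._*` bindings are the ones this patch registers:

```python
import torch

class _ProfilerScope:
    """Condensed model of torch.autograd.profiler.profile's enter/exit."""
    def __init__(self, config, kineto_activities):
        self.config = config
        self.kineto_activities = kineto_activities

    def __enter__(self):
        if self.kineto_activities:
            # Kineto path: prepare the trace first so that one-time CUPTI
            # initialization cost stays outside the measured window.
            torch.autograd._prepare_profiler(self.config, self.kineto_activities)
            torch.autograd._enable_profiler(self.config)
        else:
            torch.autograd._enable_profiler_legacy(self.config)
        return self

    def __exit__(self, *exc):
        if self.kineto_activities:
            self.result = torch.autograd._disable_profiler()
        else:
            self.records = torch.autograd._disable_profiler_legacy()
        return False
```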
+ return state_ptr->consolidate(); +} + +void enableProfiler(const ProfilerConfig& new_config) { + TORCH_CHECK(new_config.state == ProfilerState::KINETO && kinetoAvailable()); + + auto state_ptr = getProfilerTLSState(); + TORCH_CHECK(!state_ptr, "Profiler is already enabled on this thread"); + auto state = std::make_shared(new_config); + c10::ThreadLocalDebugInfo::_push(c10::DebugInfoKind::PROFILER_STATE, state); + + pushProfilingCallbacks(); + +#ifdef USE_KINETO + if (new_config.state == ProfilerState::KINETO) { + libkineto::api().startTrace(); + } +#endif + + state->mark("__start_profile", false); +} + +ProfilerResult disableProfiler() { + // all the DebugInfoBase objects are scope based and supposed to use DebugInfoGuard + auto state = c10::ThreadLocalDebugInfo::_pop(c10::DebugInfoKind::PROFILER_STATE); + + auto state_ptr = static_cast(state.get()); + TORCH_CHECK(state_ptr && state_ptr->config().state != ProfilerState::Disabled, + "Can't disable profiler when it's not running"); + + if (cleanupTLSState) { + at::removeCallback(state_ptr->callbackHandle()); + } + #ifdef USE_KINETO if (state_ptr->config().state == ProfilerState::KINETO) { auto k_events = libkineto::api().stopTrace(); @@ -660,8 +690,8 @@ thread_event_lists disableProfiler(c10::optional profile k_evt.threadId, false, k_evt.correlationId); - push_evt.setDevice(k_evt.deviceId); - push_evt.setCpuUS(k_evt.startUs); + push_evt.setDeviceId(k_evt.deviceId); + push_evt.setCpuUs(k_evt.startUs); push_evt.setCorrelationId(k_evt.correlationId); evt_list.emplace_back(std::move(push_evt)); @@ -671,8 +701,8 @@ thread_event_lists disableProfiler(c10::optional profile k_evt.threadId, false, k_evt.correlationId); - pop_evt.setDevice(k_evt.deviceId); - pop_evt.setCpuUS(k_evt.endUs); + pop_evt.setDeviceId(k_evt.deviceId); + pop_evt.setCpuUs(k_evt.endUs); pop_evt.setCorrelationId(k_evt.correlationId); evt_list.emplace_back(std::move(pop_evt)); } @@ -682,14 +712,9 @@ thread_event_lists disableProfiler(c10::optional profile events_list.emplace_back(it2.second); } } - state_ptr->setKinetoEvents(std::move(events_list)); } #endif - if (!consolidate || state_ptr->config().state == ProfilerState::NVTX) { - return thread_event_lists(); - } - state_ptr->mark("__stop_profile"); // Note that this will erase the underlying events. 
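The Kineto branch of `disableProfiler()` above flattens the stopped trace into one event list per (device, thread), with two events per activity. An illustrative Python model of that grouping (not patch code; the dict keys mirror the libkineto activity fields referenced above):

```python
# Illustrative model of the per-(deviceId, threadId) grouping performed in
# disableProfiler(): each activity contributes a push and a pop event.
from collections import defaultdict

def group_activities(activities):
    per_thread = defaultdict(list)  # (device_id, thread_id) -> events
    for a in activities:
        key = (a["deviceId"], a["threadId"])
        per_thread[key].append(("push", a["name"], a["startUs"]))
        per_thread[key].append(("pop", a["name"], a["endUs"]))
    return list(per_thread.values())

assert group_activities(
    [{"deviceId": 0, "threadId": 1, "name": "sgemm", "startUs": 10, "endUs": 22}]
) == [[("push", "sgemm", 10), ("pop", "sgemm", 22)]]
```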
return state_ptr->consolidate(); diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index 45058d7ef977..64d3d289e443 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -114,7 +114,7 @@ enum class C10_API_ENUM ProfilerState { enum class C10_API_ENUM ActivityType { CPU = 0, - CUDA_RUNTIME, // CUDA host events + // CUDA_RUNTIME, // CUDA host events CUDA, // CUDA kernels NUM_KINETO_ACTIVITIES, // must be the last one }; @@ -158,10 +158,21 @@ struct TORCH_API Event { return kind_; } + std::string kindStr() const { + switch (kind_) { + case EventKind::Mark: return "mark"; + case EventKind::PushRange: return "push"; + case EventKind::PopRange: return "pop"; + case EventKind::MemoryAlloc: return "memory_alloc"; + } + throw std::runtime_error("unknown event kind"); + } + protected: EventKind kind_; } +// To be deprecated, once we switch to Kineto profiling struct TORCH_API LegacyEvent : public Event { LegacyEvent( EventKind kind, @@ -223,15 +234,6 @@ struct TORCH_API LegacyEvent : public Event { static LegacyEvent fromIValue(const at::IValue& eventIValue); void record(bool record_cuda); - std::string kindStr() const { - switch (kind_) { - case EventKind::Mark: return "mark"; - case EventKind::PushRange: return "push"; - case EventKind::PopRange: return "pop"; - case EventKind::MemoryAlloc: return "memory_alloc"; - } - throw std::runtime_error("unknown EventKind"); - } const char* name() const { return name_.str(); @@ -373,10 +375,6 @@ struct TORCH_API LegacyEvent : public Event { uint64_t correlation_id_; }; -struct TORCH_API KinetoEvent : public Event { - -}; - // a linked-list of fixed sized vectors, to avoid // a std::vector resize from taking a large amount of time inside // a profiling event @@ -433,9 +431,12 @@ TORCH_API ProfilerConfig getProfilerConfig(); // Writes profiled events to a stream. TORCH_API void writeProfilerEventsToStream(std::ostream& out, const std::vector& events); +struct TORCH_API KinetoEvent : public Event { + +}; struct TORCH_API ProfilerResult { - thread_event_lists legacy_events_; // mem alloc, start/stop + thread_event_lists legacy_events_; // tensor mem alloc, start/stop std::vector> events_; }; From ea956aa6cecd817136fa84c6ec9d59999a6563d7 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 11:23:29 -0800 Subject: [PATCH 03/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- aten/src/ATen/record_function.cpp | 2 ++ torch/autograd/profiler.py | 7 +++++-- torch/csrc/autograd/profiler.cpp | 30 ++++++++++++++++++------------ torch/csrc/autograd/profiler.h | 4 +++- 4 files changed, 28 insertions(+), 15 deletions(-) diff --git a/aten/src/ATen/record_function.cpp b/aten/src/ATen/record_function.cpp index 41f31968688d..d4d2c99d9c37 100644 --- a/aten/src/ATen/record_function.cpp +++ b/aten/src/ATen/record_function.cpp @@ -11,11 +11,13 @@ namespace { // Used to generate unique callback handles CallbackHandle next_unique_callback_handle() { static std::atomic unique_cb_id {0}; + // starts with 1 return CallbackHandle(++unique_cb_id); } RecordFunctionHandle next_unique_record_function_handle() { static std::atomic unique_rf_id {0}; + // starts with 1 return RecordFunctionHandle(++unique_rf_id); } diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index 483802c7c9ec..0a4567b32eed 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -382,7 +382,8 @@ def __init__( self.use_kineto = use_kineto self.skip_cpu = skip_cpu if self.skip_cpu: - assert self.use_kineto, "skip_cpu is used with use_kineto=True" + assert self.use_kineto, \ + "skip_cpu is supported only with Kineto (use_kineto=True)" self.profiler_kind = None self.kineto_activities = [] @@ -397,6 +398,8 @@ def __init__( # uses CUPTI # torch.autograd.ProfilerActivity.CUDA_RUNTIME, torch.autograd.ProfilerActivity.CUDA] + assert len(self.kineto_activities) > 0, \ + "No activities specified for Kineto profiler" elif self.use_cuda: # legacy CUDA mode self.profiler_kind = torch.autograd.ProfilerState.CUDA @@ -425,7 +428,7 @@ def __enter__(self): if self.use_kineto: torch.autograd._prepare_profiler(self.config, self.kineto_activities) - torch.autograd._enable_profiler(self.config) + torch.autograd._enable_profiler(self.config, self.kineto_activities) return self def __exit__(self, exc_type, exc_val, exc_tb): diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index c9fcf3efabb3..9c15dd4279f4 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -175,9 +175,9 @@ struct FileLineFunc { thread_local size_t corr_id_ = 0; size_t next_correlation_id() { - return corr_id_++; + return ++corr_id_; } -size_t peek_correlation_id() { +size_t cur_correlation_id() { return corr_id_; } @@ -221,7 +221,7 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { 
at::RecordFunction::currentThreadId(), include_cuda && config_.state == ProfilerState::CUDA); evt.setNodeId(at::RecordFunction::getDefaultNodeId()); - evt.setCorrelationId(peek_correlation_id()); + evt.setCorrelationId(cur_correlation_id()); getEventList().record(std::move(evt)); } } @@ -317,7 +317,7 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { thread_id, config_.state == ProfilerState::CUDA); evt.updateMemoryStats(alloc_size, device); - evt.setCorrelationId(peek_correlation_id()); + evt.setCorrelationId(cur_correlation_id()); getEventList(thread_id).record(std::move(evt)); } } @@ -647,20 +647,26 @@ thread_event_lists disableProfilerLegacy(c10::optional p return state_ptr->consolidate(); } -void enableProfiler(const ProfilerConfig& new_config) { - TORCH_CHECK(new_config.state == ProfilerState::KINETO && kinetoAvailable()); +void enableProfiler( + const ProfilerConfig& config, + const std::set& activities) { + TORCH_CHECK(config.state == ProfilerState::KINETO && kinetoAvailable()); + TORCH_CHECK(!activities.empty(), "No activities specified for Kineto profiler"); auto state_ptr = getProfilerTLSState(); TORCH_CHECK(!state_ptr, "Profiler is already enabled on this thread"); - auto state = std::make_shared(new_config); + auto state = std::make_shared(config); c10::ThreadLocalDebugInfo::_push(c10::DebugInfoKind::PROFILER_STATE, state); - pushProfilingCallbacks(); + if (activities.count(ActivityType::CPU)) { + pushProfilingCallbacks(); + } #ifdef USE_KINETO - if (new_config.state == ProfilerState::KINETO) { + while (!libkineto::api().traceActive()) { // sync? libkineto::api().startTrace(); } + //TORCH_CHECK(libkineto::api().traceActive()); #endif state->mark("__start_profile", false); @@ -671,10 +677,10 @@ ProfilerResult disableProfiler() { auto state = c10::ThreadLocalDebugInfo::_pop(c10::DebugInfoKind::PROFILER_STATE); auto state_ptr = static_cast(state.get()); - TORCH_CHECK(state_ptr && state_ptr->config().state != ProfilerState::Disabled, - "Can't disable profiler when it's not running"); + TORCH_CHECK(state_ptr && state_ptr->config().state == ProfilerState::KINETO, + "Can't disable Kineto profiler when it's not running"); - if (cleanupTLSState) { + if (state_ptr->callbackHandle() > 0) { at::removeCallback(state_ptr->callbackHandle()); } diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index 64d3d289e443..d337841ad2ce 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -440,7 +440,9 @@ struct TORCH_API ProfilerResult { std::vector> events_; }; -TORCH_API void enableProfiler(const ProfilerConfig&); +TORCH_API void enableProfiler( + const ProfilerConfig& config, + const std::set& activities); TORCH_API ProfilerResult disableProfiler(); TORCH_API bool kinetoAvailable(); From 7dfdbc9a7d1e4364350ec7786f60d61d27242429 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 11:44:00 -0800 Subject: [PATCH 04/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls 
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- aten/src/ATen/record_function.cpp | 10 ++++------ torch/autograd/profiler.py | 13 +++++++------ 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/aten/src/ATen/record_function.cpp b/aten/src/ATen/record_function.cpp index d4d2c99d9c37..f48705796f7a 100644 --- a/aten/src/ATen/record_function.cpp +++ b/aten/src/ATen/record_function.cpp @@ -10,15 +10,13 @@ namespace { // Used to generate unique callback handles CallbackHandle next_unique_callback_handle() { - static std::atomic unique_cb_id {0}; - // starts with 1 - return CallbackHandle(++unique_cb_id); + static std::atomic unique_cb_id {1}; + return CallbackHandle(unique_cb_id++); } RecordFunctionHandle next_unique_record_function_handle() { - static std::atomic unique_rf_id {0}; - // starts with 1 - return RecordFunctionHandle(++unique_rf_id); + static std::atomic unique_rf_id {1}; + return RecordFunctionHandle(unique_rf_id++); } thread_local RecordFunctionTLS rf_tls_; diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index 0a4567b32eed..fdbf059c5a95 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -326,7 +326,8 @@ class profile(object): with_stack (bool, optional): record source information (file and line number) for the ops use_kineto (bool, default False): experimental support for Kineto profiler - skip_cpu (default False) - whether to skip profiling of CPU events + + use_cpu (default True) - whether to profile CPU events .. 
warning: Enabling memory profiling or source attribution incurs additional profiler @@ -369,7 +370,7 @@ def __init__( profile_memory=False, with_stack=False, use_kineto=False, - skip_cpu=False): + use_cpu=True): self.enabled = enabled if not self.enabled: return @@ -380,16 +381,16 @@ def __init__( self.profile_memory = profile_memory self.with_stack = with_stack self.use_kineto = use_kineto - self.skip_cpu = skip_cpu - if self.skip_cpu: + self.use_cpu = use_cpu + if not self.use_cpu: assert self.use_kineto, \ - "skip_cpu is supported only with Kineto (use_kineto=True)" + "Device-only events supported only with Kineto (use_kineto=True)" self.profiler_kind = None self.kineto_activities = [] if self.use_kineto: self.profiler_kind = torch.autograd.ProfilerState.KINETO - if not self.skip_cpu: + if self.use_cpu: self.kineto_activities = [torch.autograd.ProfilerActivity.CPU] else: self.kineto_activities = [] From 67257785deea6b010db19a496e81fd9bbd282aa5 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 12:39:53 -0800 Subject: [PATCH 05/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/autograd/profiler.py | 19 +++-- torch/csrc/autograd/init.cpp | 17 +++- torch/csrc/autograd/profiler.h | 146 +++++++++++++++++++++++++++------ 3 files changed, 148 insertions(+), 34 deletions(-) diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index fdbf059c5a95..ac8574b00ead 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -428,18 +428,23 @@ def __enter__(self): self.entered = True if self.use_kineto: torch.autograd._prepare_profiler(self.config, self.kineto_activities) - - torch.autograd._enable_profiler(self.config, self.kineto_activities) + torch.autograd._enable_profiler(self.config, self.kineto_activities) + else: + torch.autograd._enable_profiler_legacy(self.config) return self def __exit__(self, exc_type, exc_val, exc_tb): if not self.enabled: return - records = torch.autograd._disable_profiler() - self.function_events = EventList( - parse_event_records(records), - use_cuda=self.use_cuda, - profile_memory=self.profile_memory) + if self.use_kineto: + result = torch.autograd._disable_profiler() + self.function_events = parse_profiler_result(result) + else: + records = torch.autograd._disable_profiler_legacy() + self.function_events = EventList( + parse_event_records(records), + use_cuda=self.use_cuda, + profile_memory=self.profile_memory) if self.with_stack: self.function_events.set_backward_stacktraces() return False diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index c1b12ebd478b..4eb781b08885 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -70,9 +70,22 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { .def("scope", &LegacyEvent::scope) .def("correlation_id", &LegacyEvent::correlationId); + py::class_(m, "KinetoEvent") + .def("name", &KinetoEvent::name) + .def("thread_id", &KinetoEvent::threadId) + .def("device_index", &KinetoEvent::deviceIndex) + .def("start_us", &KinetoEvent::startUs) + .def("duration", &KinetoEvent::duration) + .def("correlation_id", &KinetoEvent::correlationId) + .def("fwd_thread_id", &KinetoEvent::fwdThreadId) + .def("shapes", &KinetoEvent::shapes) + .def("sequence_nr", &KinetoEvent::sequenceNr) + .def("stack", &KinetoEvent::stack) + .def("scope", &KinetoEvent::scope); + py::class_(m, "ProfilerResult") - .def("kind", &LegacyEvent::kindStr) - .def("scope", &LegacyEvent::scope); + .def("events", &ProfilerResult::events) + .def("legacy_events", &ProfilerResult::legacy_events); 
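Given the bindings registered above, trace post-processing can stay in Python. Below is a hedged sketch of a consumer: the method names come from the `.def(...)` lines in this hunk, but the nested list-of-lists shape of `events()` follows the C++ `ProfilerResult` layout and is otherwise an assumption.

```python
def flatten_kineto_events(result):
    # result: the ProfilerResult returned by torch.autograd._disable_profiler.
    rows = []
    for thread_events in result.events():  # assumed: one list per thread
        for ev in thread_events:
            rows.append({
                "name": ev.name(),
                "thread": ev.thread_id(),
                "start_us": ev.start_us(),
                "duration_us": ev.duration(),
                "correlation": ev.correlation_id(),
            })
    return sorted(rows, key=lambda r: r["start_us"])
```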
m.def("kineto_available", kinetoAvailable); m.def("_enable_profiler", enableProfiler); diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index d337841ad2ce..ebda8ad0f4dd 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -23,6 +23,10 @@ struct CUevent_st; typedef std::shared_ptr CUDAEventStub; +namespace libkineto { +class TraceActivity; +} + namespace torch { namespace autograd { struct Node; @@ -148,32 +152,10 @@ enum class C10_API_ENUM EventKind : uint16_t { PushRange, PopRange, MemoryAlloc, - // - Kineto, }; -struct TORCH_API Event { - explicit Event(EventKind kind) : kind_(kind) {} - EventKind kind() const { - return kind_; - } - - std::string kindStr() const { - switch (kind_) { - case EventKind::Mark: return "mark"; - case EventKind::PushRange: return "push"; - case EventKind::PopRange: return "pop"; - case EventKind::MemoryAlloc: return "memory_alloc"; - } - throw std::runtime_error("unknown event kind"); - } - - protected: - EventKind kind_; -} - // To be deprecated, once we switch to Kineto profiling -struct TORCH_API LegacyEvent : public Event { +struct TORCH_API LegacyEvent { LegacyEvent( EventKind kind, at::StringView name, @@ -352,7 +334,18 @@ struct TORCH_API LegacyEvent : public Event { scope_ = scope; } + std::string kindStr() const { + switch (kind_) { + case EventKind::Mark: return "mark"; + case EventKind::PushRange: return "push"; + case EventKind::PopRange: return "pop"; + case EventKind::MemoryAlloc: return "memory_alloc"; + } + throw std::runtime_error("unknown event kind"); + } + private: + EventKind kind_; // signed to allow for negative intervals, initialized for safety. int64_t cpu_ns_ = 0; at::StringView name_; @@ -431,14 +424,117 @@ TORCH_API ProfilerConfig getProfilerConfig(); // Writes profiled events to a stream. 
TORCH_API void writeProfilerEventsToStream(std::ostream& out, const std::vector<LegacyEvent*>& events);

+enum class C10_API_ENUM KinetoDeviceType : uint16_t {
+  CPU = 0,
+  CUDA,
+  NUM_KINETO_DEVICE_TYPES, // must be the last one
+};
+
+struct TORCH_API KinetoEvent {
+  KinetoEvent(libkineto::TraceActivity* activity) : activity_(activity) {}
+
+  std::string name() const;
+  uint64_t deviceIndex() const;
+  uint64_t startUs() const;
+  uint64_t durationUs() const;
+  uint64_t correlationId() const;
+
+  int64_t threadId() const {
+    return thread_id_;
+  }
+
+  KinetoDeviceType deviceType() const {
+    return device_type_;
+  }
+
+  int64_t fwdThreadId() const {
+    return fwd_thread_id_;
+  }
+
+  const std::vector<std::vector<int64_t>>& shapes() const {
+    return shapes_;
+  }
+
+  int64_t sequenceNr() const {
+    return sequence_nr_;
+  }
+
+  const std::vector<std::string>& stack() const {
+    return stack_;
+  }
+
+  uint8_t scope() const {
+    return scope_;
+  }
+
+  KinetoEvent& threadId(int64_t thread_id) {
+    thread_id_ = thread_id;
+    return *this;
+  }
+
+  KinetoEvent& deviceType(KinetoDeviceType device_type) {
+    device_type_ = device_type;
+    return *this;
+  }
+
+  KinetoEvent& fwdThreadId(int64_t fwd_thread_id) {
+    fwd_thread_id_ = fwd_thread_id;
+    return *this;
+  }
+
+  KinetoEvent& shapes(const std::vector<std::vector<int64_t>>& shapes) {
+    shapes_ = shapes;
+    return *this;
+  }
+
+  KinetoEvent& sequenceNr(int64_t sequence_nr) {
+    sequence_nr_ = sequence_nr;
+    return *this;
+  }
+
+  KinetoEvent& stack(const std::vector<std::string>& st) {
+    stack_ = st;
+    return *this;
+  }
+
+  KinetoEvent& scope(uint8_t scope_id) {
+    scope_ = scope_id;
+    return *this;
+  }
+
+ private:
+  //std::string name_;
+  //uint64_t device_index_;
+  //uint64_t start_us_;
+  //uint64_t duration_;
+  //uint64_t correlation_id_;
+
+  libkineto::TraceActivity* activity_ = nullptr;
+  int64_t thread_id_ = -1;
+  KinetoDeviceType device_type_ = KinetoDeviceType::CPU;
+  int64_t fwd_thread_id_ = -1;
+  std::vector<std::vector<int64_t>> shapes_;
+  int64_t sequence_nr_ = -1;
+  std::vector<std::string> stack_;
+  uint8_t scope_ = 0;
+};

 struct TORCH_API ProfilerResult {
-  thread_event_lists legacy_events_; // tensor mem alloc, start/stop
+  ProfilerResult(
+      const std::vector<std::vector<KinetoEvent>>& events,
+      const thread_event_lists& legacy_events)
+    : events_(events), legacy_events_(legacy_events) {}
+
+  const std::vector<std::vector<KinetoEvent>>& events() const {
+    return events_;
+  }
+
+  const thread_event_lists& legacy_events() const {
+    return legacy_events_;
+  }
+
+ private:
   std::vector<std::vector<KinetoEvent>> events_;
+  thread_event_lists legacy_events_; // tensor mem alloc, start/stop
 };

From e9a219b213c57ec96ca210f9568e515200154398 Mon Sep 17 00:00:00 2001
From: ilia-cher
Date: Mon, 2 Nov 2020 12:47:15 -0800
Subject: [PATCH 06/59] Update on "Use libkineto in profiler"

Summary:
Adding ability to use Kineto (CUPTI) to profile CUDA kernels

Test Plan:
USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py
develop install
python test/test_profiler.py
```
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
                                      sgemm_32x32x32_NN         0.00%       0.000us         0.00%       0.000us       0.000us      12.000us        63.16%      12.000us      12.000us             1
void
at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/autograd/profiler.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index ac8574b00ead..6fb37c2a25a8 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -777,7 +777,7 @@ class FunctionEvent(FormattedTimesMixin): def __init__( self, id, node_id, name, thread, cpu_start, cpu_end, fwd_thread=None, input_shapes=None, stack=None, scope=0, cpu_memory_usage=0, cuda_memory_usage=0, is_async=False, - is_remote=True, sequence_nr=-1, device_id=-1): + is_remote=True, sequence_nr=-1): self.id: int = id self.node_id: int = node_id self.name: str = name @@ -796,7 +796,6 @@ def __init__( self.is_async: bool = is_async self.is_remote: bool = is_remote self.sequence_nr: int = sequence_nr - self.device_id: int = device_id def append_kernel(self, name, device, start, end): self.kernels.append(Kernel(name, device, Interval(start, end))) @@ -848,21 +847,15 @@ def self_cpu_time_total(self): @property def cuda_time_total(self): - if self.device_id >= 0: - return self.cpu_interval.elapsed_us() return sum(kinfo.interval.elapsed_us() for kinfo in self.kernels) @property def self_cuda_time_total(self): - if self.device_id >= 0: - return self.cuda_time_total - sum([child.cuda_time_total for child in self.cpu_children]) return sum(kinfo.interval.elapsed_us() for kinfo in self.kernels) - \ sum([child.cuda_time_total for child in self.cpu_children]) @property def cpu_time_total(self): - if self.device_id >= 0: - return 0 return self.cpu_interval.elapsed_us() @property @@ -1097,7 +1090,6 @@ def adjusted_time(cuda_record, cuda_records_map): is_async=is_async, is_remote=is_remote_event, sequence_nr=start.sequence_nr(), - device_id=start.device_id(), ) # note: async events have only cpu total time if not is_async and start.has_cuda(): From 49a9fee5761f28fd2a982c4e6f61ef1058493d9b Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 13:01:56 -0800 Subject: [PATCH 07/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % 
Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.cpp | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index 9c15dd4279f4..a69ec92dd033 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -326,6 +326,29 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { return config_.profile_memory; } + void reportKinetoClientActivity(const at::RecordFunction& fn) { +#ifdef USE_KINETO + if (config_.state == ProfilerState::KINETO) { + libkineto::ClientTraceActivity op; + op.startTime = libkineto::timeSinceEpoch(fc.startTime); + op.endTime = libkineto::timeSinceEpoch(now); + op.opType = fc.name; + op.device = fc.deviceType; + op.correlation = fc.correlationId; + op.threadId = pthread_self(); + op.inputDims = folly::toJson(fc.input_shapes); + op.inputTypes = folly::toJson(fc.input_types); + op.outputDims = "null"; + op.arguments = "null"; + op.outputTypes = "null"; + op.inputNames = "null"; + op.outputNames = "null"; + return; + } +#endif + TORCH_CHECK(false, "Supported only in Kineto profiler"); + } + private: std::vector prepareCallstack(const std::vector& cs) { std::vector entries; @@ -423,6 +446,10 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { ProfilerConfig config_ = ProfilerConfig(ProfilerState::Disabled); at::CallbackHandle handle_ = 0; c10::optional>> remoteProfiledEvents_; + +#ifdef USE_KINETO + std::vector kineto_client_activities_; +#endif }; ProfilerThreadLocalState* getProfilerTLSState() { @@ -479,7 +506,7 @@ void pushProfilingCallbacks() { } #ifdef USE_KINETO if (state_ptr->config().state == ProfilerState::KINETO) { - // push new cpu trace event + state_ptr->reportKinetoClientActivity(fn); libkineto::api().popCorrelationId(); return; } From 8edb34641d900ef84f73c3b116eeef66ad6b8a34 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 13:14:02 -0800 Subject: [PATCH 08/59] Update on "Use libkineto in profiler" Summary: Adding ability to 
use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index a69ec92dd033..7e27b6b69f4e 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -330,19 +330,18 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { #ifdef USE_KINETO if (config_.state == ProfilerState::KINETO) { libkineto::ClientTraceActivity op; - op.startTime = libkineto::timeSinceEpoch(fc.startTime); + /*op.startTime = libkineto::timeSinceEpoch(fc.startTime); op.endTime = libkineto::timeSinceEpoch(now); - op.opType = fc.name; + op.opType = std::string(fn.name()); op.device = fc.deviceType; op.correlation = fc.correlationId; op.threadId = pthread_self(); op.inputDims = folly::toJson(fc.input_shapes); - op.inputTypes = folly::toJson(fc.input_types); - op.outputDims = "null"; - op.arguments = "null"; - op.outputTypes = "null"; - op.inputNames = "null"; - op.outputNames = "null"; + op.inputTypes = folly::toJson(fc.input_types);*/ + { + std::lock_guard guard(state_mutex_); + kineto_client_activities_.emplace_back(std::move(op)); + } return; } #endif @@ -449,6 +448,7 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { #ifdef USE_KINETO std::vector kineto_client_activities_; + std::vector kineto_events_; #endif }; From f28862392a29ad92960bbfadf245ad4d0a78adc2 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 13:39:36 -0800 Subject: [PATCH 09/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: 
USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.cpp | 42 ++++++++++++++++++++++++++++++-- torch/csrc/autograd/profiler.h | 16 ++++-------- 2 files changed, 45 insertions(+), 13 deletions(-) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index 7e27b6b69f4e..3337b184e884 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -181,6 +181,41 @@ size_t cur_correlation_id() { return corr_id_; } +#ifdef USE_KINETO +struct KinetoEventImpl : public KinetoEvent { + static void fromClientActivity(const libkineto::ClientTraceActivity* activity) { + + } + + static void fromDeviceActivity(const libkineto::TraceActivity* activity) { + + } + + std::string name() const override { + + } + + uint64_t deviceIndex() const override { + + } + + uint64_t startUs() const override { + + } + + uint64_t durationUs() const override { + + } + + uint64_t correlationId() const override { + + } + + private: + libkineto::TraceActivity* activity_ptr_; +} +#endif + // Profiler state struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { explicit ProfilerThreadLocalState(const ProfilerConfig& config) @@ -340,7 +375,10 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { op.inputTypes = folly::toJson(fc.input_types);*/ { std::lock_guard guard(state_mutex_); - kineto_client_activities_.emplace_back(std::move(op)); + kineto_client_activities_.emplace_back(op); + kineto_events_.emplace_back( + KinetoEventImpl::fromClientActivity( + &(kineto_client_activities_.back()))); } return; } @@ -448,7 +486,7 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { #ifdef USE_KINETO std::vector kineto_client_activities_; - std::vector 
kineto_events_; + std::vector kineto_events_; #endif }; diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index ebda8ad0f4dd..f7461fbd9632 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -23,10 +23,6 @@ struct CUevent_st; typedef std::shared_ptr CUDAEventStub; -namespace libkineto { -class TraceActivity; -} - namespace torch { namespace autograd { struct Node; @@ -431,13 +427,11 @@ enum class C10_API_ENUM KinetoDeviceType : uint16_t { }; struct TORCH_API KinetoEvent { - KinetoEvent(TraceActivity*) : activity_(activity) {} - - std::string name() const; - uint64_t deviceIndex() const; - uint64_t startUs() const; - uint64_t durationUs() const; - uint64_t correlationId() const; + virtual std::string name() const = 0; + virtual uint64_t deviceIndex() const = 0; + virtual uint64_t startUs() const = 0; + virtual uint64_t durationUs() const = 0; + virtual uint64_t correlationId() const = 0; int64_t threadId() const { return thread_id_; From 979cdfa3bb5cadc74a58ba0617d2e9574d70fa7d Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 19:24:36 -0800 Subject: [PATCH 10/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index 3337b184e884..10d8e5b6cb98 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -192,27 +192,27 @@ struct KinetoEventImpl : public KinetoEvent { } std::string name() const override { - + return activity_ptr_->name(); } uint64_t deviceIndex() const override { - + return activity_ptr_->deviceId(); } uint64_t startUs() const override { - + return activity_ptr_->timestamp(); } uint64_t durationUs() const override { - + return activity_ptr_->duration(); } uint64_t correlationId() const override { - + return activity_ptr_->correlationId(); } private: - libkineto::TraceActivity* activity_ptr_; + libkineto::TraceActivity* activity_ptr_ = nullptr; } #endif From c8cbeb00a1527aa4f1b983f4f840c94b27c09b0b Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 19:48:05 -0800 Subject: [PATCH 11/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.h | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index f7461fbd9632..18bfae272fb7 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -20,6 +20,8 @@ #include +#include + struct CUevent_st; typedef std::shared_ptr CUDAEventStub; @@ -420,12 +422,6 @@ TORCH_API ProfilerConfig getProfilerConfig(); // Writes profiled events to a stream. TORCH_API void writeProfilerEventsToStream(std::ostream& out, const std::vector& events); -enum class C10_API_ENUM KinetoDeviceType : uint16_t { - CPU = 0, - CUDA, - NUM_KINETO_DEVICE_TYPES, // must be the last one -}; - struct TORCH_API KinetoEvent { virtual std::string name() const = 0; virtual uint64_t deviceIndex() const = 0; @@ -437,7 +433,7 @@ struct TORCH_API KinetoEvent { return thread_id_; } - KinetoDeviceType deviceType() const { + c10::DeviceType deviceType() const { return device_type_; } @@ -466,7 +462,7 @@ struct TORCH_API KinetoEvent { return *this; } - KinetoEvent& deviceType(KinetoDeviceType device_type) { + KinetoEvent& deviceType(c10::DeviceType device_type) { device_type_ = device_type; return *this; } @@ -497,15 +493,8 @@ struct TORCH_API KinetoEvent { } private: - //std::string name_; - //uint64_t device_index_; - //uint64_t start_us_; - //uint64_t duration_; - //uint64_t correlation_id_; - - TraceActivity* activity_ = nullptr; int64_t thread_id_ = -1; - KinetoDeviceType device_type_ = KinetoDeviceType::CPU, + c10::DeviceType device_type_ = c10::DeviceType::CPU, int64_t fwd_thread_id_ = -1; std::vector> shapes_; int64_t sequence_nr_ = -1; From 226089cb423a3ca9e865f8015437a7107960e6d8 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 20:21:16 -0800 Subject: [PATCH 12/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ 
------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.cpp | 53 ++++++++++++++++++-------------- torch/csrc/autograd/profiler.h | 16 +++++++--- 2 files changed, 41 insertions(+), 28 deletions(-) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index 10d8e5b6cb98..d585a89704de 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -182,37 +182,44 @@ size_t cur_correlation_id() { } #ifdef USE_KINETO -struct KinetoEventImpl : public KinetoEvent { - static void fromClientActivity(const libkineto::ClientTraceActivity* activity) { - - } +std::string Kineto::name() const override { + return activity_->name(); +} - static void fromDeviceActivity(const libkineto::TraceActivity* activity) { +uint64_t Kineto::deviceIndex() const override { + return activity_->deviceId(); +} - } +uint64_t Kineto::startUs() const override { + return activity_->timestamp(); +} - std::string name() const override { - return activity_ptr_->name(); - } +uint64_t Kineto::durationUs() const override { + return activity_->duration(); +} - uint64_t deviceIndex() const override { - return activity_ptr_->deviceId(); - } +uint64_t Kineto::correlationId() const override { + return activity_->correlationId(); +} +#else +std::string Kineto::name() const override { + TORCH_CHECK(false, "Supported only with Kineto"); +} - uint64_t startUs() const override { - return activity_ptr_->timestamp(); - } +uint64_t Kineto::deviceIndex() const override { + TORCH_CHECK(false, "Supported only with Kineto"); +} - uint64_t durationUs() const override { - return activity_ptr_->duration(); - } +uint64_t Kineto::startUs() const override { + TORCH_CHECK(false, "Supported only with Kineto"); +} - uint64_t correlationId() const override { - return activity_ptr_->correlationId(); - } +uint64_t Kineto::durationUs() const override { + TORCH_CHECK(false, "Supported only with Kineto"); +} - private: - libkineto::TraceActivity* activity_ptr_ = nullptr; +uint64_t Kineto::correlationId() const override { + TORCH_CHECK(false, "Supported only with Kineto"); } #endif diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index 18bfae272fb7..cbd125565cf8 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ 
-423,11 +423,15 @@ TORCH_API ProfilerConfig getProfilerConfig(); TORCH_API void writeProfilerEventsToStream(std::ostream& out, const std::vector& events); struct TORCH_API KinetoEvent { - virtual std::string name() const = 0; - virtual uint64_t deviceIndex() const = 0; - virtual uint64_t startUs() const = 0; - virtual uint64_t durationUs() const = 0; - virtual uint64_t correlationId() const = 0; + KinetoEvent(std::unique_ptr&& activity) : activity_(activity) { + TORCH_CHECK(activity_); + } + + std::string name() const; + uint64_t deviceIndex() const; + uint64_t startUs() const; + uint64_t durationUs() const; + uint64_t correlationId() const; int64_t threadId() const { return thread_id_; @@ -500,6 +504,8 @@ struct TORCH_API KinetoEvent { int64_t sequence_nr_ = -1; std::vector stack_; uint8_t scope_ = 0; + + std::unique_ptr activity_; }; struct TORCH_API ProfilerResult { From 266b75fb295099fc5e31b9da50e90d5e86c38d7c Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 20:33:06 -0800 Subject: [PATCH 13/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.cpp | 129 ++++++++----------------------- torch/csrc/autograd/profiler.h | 30 +++++-- 2 files changed, 57 insertions(+), 102 deletions(-) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index d585a89704de..803d90964939 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -21,10 +21,6 @@ #include -#ifdef USE_KINETO -#include "libkineto.h" -#endif - namespace torch { namespace autograd { namespace profiler { namespace { @@ -181,48 +177,6 @@ size_t cur_correlation_id() { return corr_id_; } -#ifdef USE_KINETO -std::string Kineto::name() const override { - return activity_->name(); -} - -uint64_t Kineto::deviceIndex() const override { - return activity_->deviceId(); -} - -uint64_t Kineto::startUs() const override { - return activity_->timestamp(); -} - -uint64_t Kineto::durationUs() const override { - return activity_->duration(); -} - -uint64_t Kineto::correlationId() const override { - return activity_->correlationId(); -} -#else -std::string Kineto::name() const override { - TORCH_CHECK(false, "Supported only with Kineto"); -} - -uint64_t Kineto::deviceIndex() const override { - TORCH_CHECK(false, "Supported only with Kineto"); -} - -uint64_t Kineto::startUs() const override { - TORCH_CHECK(false, "Supported only with Kineto"); -} - -uint64_t Kineto::durationUs() const override { - TORCH_CHECK(false, "Supported only with Kineto"); -} - -uint64_t Kineto::correlationId() const override { - TORCH_CHECK(false, "Supported only with Kineto"); -} -#endif - // Profiler state struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { explicit ProfilerThreadLocalState(const ProfilerConfig& config) @@ -502,7 +456,7 @@ ProfilerThreadLocalState* getProfilerTLSState() { return dynamic_cast(state.get()); } -void pushProfilingCallbacks() { +void pushProfilingCallbacksLegacy() { auto state_ptr = getProfilerTLSState(); TORCH_INTERNAL_ASSERT(state_ptr, "Expected profiler state set"); auto handle = at::addThreadLocalCallback(at::RecordFunctionCallback( @@ -511,12 +465,6 @@ void pushProfilingCallbacks() { if (!state_ptr || state_ptr->config().state == ProfilerState::Disabled) { return; } -#ifdef USE_KINETO - if (state_ptr->config().state == ProfilerState::KINETO) { - libkineto::api().pushCorrelationId(next_correlation_id()); - return; - } -#endif bool record_cuda = state_ptr->config().state == ProfilerState::CUDA; if (record_cuda && 
disable_cuda_profiling.find(fn.name().str()) != disable_cuda_profiling.end()) { @@ -549,13 +497,6 @@ void pushProfilingCallbacks() { if (!state_ptr || state_ptr->config().state == ProfilerState::Disabled) { return; } -#ifdef USE_KINETO - if (state_ptr->config().state == ProfilerState::KINETO) { - state_ptr->reportKinetoClientActivity(fn); - libkineto::api().popCorrelationId(); - return; - } -#endif bool record_cuda = state_ptr->config().state == ProfilerState::CUDA; if (record_cuda && disable_cuda_profiling.find(fn.name().str()) != disable_cuda_profiling.end()) { @@ -568,6 +509,32 @@ void pushProfilingCallbacks() { state_ptr->setCallbackHandle(handle); } +#ifdef USE_KINETO +void pushProfilingCallbacks() { + auto state_ptr = getProfilerTLSState(); + TORCH_INTERNAL_ASSERT(state_ptr, "Expected profiler state set"); + auto handle = at::addThreadLocalCallback(at::RecordFunctionCallback( + [](const at::RecordFunction& fn) { + auto state_ptr = getProfilerTLSState(); + if (!state_ptr || state_ptr->config().state != ProfilerState::KINETO) { + return; + } + + libkineto::api().pushCorrelationId(next_correlation_id()); + }, + [](const at::RecordFunction& fn) { + auto state_ptr = getProfilerTLSState(); + if (!state_ptr || state_ptr->config().state != ProfilerState::KINETO) { + return; + } + state_ptr->reportKinetoClientActivity(fn); + libkineto::api().popCorrelationId(); + }) + .needsInputs(state_ptr->config().report_input_shapes) + .needsIds(true)); + state_ptr->setCallbackHandle(handle); +} + const int kCUDAWarmupStart = 5; } // namespace @@ -669,7 +636,7 @@ void enableProfilerLegacy(const ProfilerConfig& new_config) { auto state = std::make_shared(new_config); c10::ThreadLocalDebugInfo::_push(c10::DebugInfoKind::PROFILER_STATE, state); - pushProfilingCallbacks(); + pushProfilingCallbacksLegacy(); if (new_config.state == ProfilerState::CUDA) { // event recording appears to have some startup overhead, so we need to @@ -719,6 +686,7 @@ thread_event_lists disableProfilerLegacy(c10::optional p return state_ptr->consolidate(); } +#ifdef USE_KINETO void enableProfiler( const ProfilerConfig& config, const std::set& activities) { @@ -734,12 +702,10 @@ void enableProfiler( pushProfilingCallbacks(); } -#ifdef USE_KINETO while (!libkineto::api().traceActive()) { // sync? 
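// startTrace() may return before the trace session is actually live,
// so keep retrying until traceActive() reports true.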
libkineto::api().startTrace(); } //TORCH_CHECK(libkineto::api().traceActive()); -#endif state->mark("__start_profile", false); } @@ -756,47 +722,16 @@ ProfilerResult disableProfiler() { at::removeCallback(state_ptr->callbackHandle()); } -#ifdef USE_KINETO if (state_ptr->config().state == ProfilerState::KINETO) { - auto k_events = libkineto::api().stopTrace(); - std::unordered_map>> events; - for (auto& k_evt : k_events) { - auto& evt_list = events[k_evt.deviceId][k_evt.threadId]; - Event push_evt( - EventKind::PushRange, - at::StringView(k_evt.name), - k_evt.threadId, - false, - k_evt.correlationId); - push_evt.setDeviceId(k_evt.deviceId); - push_evt.setCpuUs(k_evt.startUs); - push_evt.setCorrelationId(k_evt.correlationId); - evt_list.emplace_back(std::move(push_evt)); - - Event pop_evt( - EventKind::PopRange, - at::StringView(k_evt.name), - k_evt.threadId, - false, - k_evt.correlationId); - pop_evt.setDeviceId(k_evt.deviceId); - pop_evt.setCpuUs(k_evt.endUs); - pop_evt.setCorrelationId(k_evt.correlationId); - evt_list.emplace_back(std::move(pop_evt)); - } - std::vector> events_list; - for (const auto& it : events) { - for (const auto& it2 : it.second) { - events_list.emplace_back(it2.second); - } - } + auto trace = libkineto::api().stopTrace(); + // } -#endif state_ptr->mark("__stop_profile"); // Note that this will erase the underlying events. return state_ptr->consolidate(); } +#endif void addEventList(std::vector&& profiledEvents) { auto state_ptr = getProfilerTLSState(); diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index cbd125565cf8..62984deb0ced 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -22,6 +22,10 @@ #include +#ifdef USE_KINETO +#include "libkineto.h" +#endif + struct CUevent_st; typedef std::shared_ptr CUDAEventStub; @@ -422,16 +426,31 @@ TORCH_API ProfilerConfig getProfilerConfig(); // Writes profiled events to a stream. 
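// Each matched push/pop pair is emitted as one JSON trace event
// (rendered through event_template in profiler.cpp).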
TORCH_API void writeProfilerEventsToStream(std::ostream& out, const std::vector& events); +#ifdef USE_KINETO struct TORCH_API KinetoEvent { KinetoEvent(std::unique_ptr&& activity) : activity_(activity) { TORCH_CHECK(activity_); } - std::string name() const; - uint64_t deviceIndex() const; - uint64_t startUs() const; - uint64_t durationUs() const; - uint64_t correlationId() const; + std::string name() const override { + return activity_->name(); + } + + uint64_t deviceIndex() const override { + return activity_->deviceId(); + } + + uint64_t startUs() const override { + return activity_->timestamp(); + } + + uint64_t durationUs() const override { + return activity_->duration(); + } + + uint64_t correlationId() const override { + return activity_->correlationId(); + } int64_t threadId() const { return thread_id_; @@ -534,6 +553,7 @@ TORCH_API bool kinetoAvailable(); TORCH_API void prepareProfiler( const ProfilerConfig& config, const std::set& activities); +#endif // USE_KINETO // Usage: // { From 6958eac3863aafefcf565e5a4412bbd309060e42 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 20:34:13 -0800 Subject: [PATCH 14/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.cpp | 42 ++++++++++++++++---------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index 803d90964939..e0f5c195bc10 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -211,7 +211,7 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { if (config_.state == ProfilerState::NVTX) { cuda_stubs->nvtxMarkA(name.c_str()); } else { - Event evt( + LegacyEvent evt( EventKind::Mark, at::StringView(std::move(name)), at::RecordFunction::currentThreadId(), @@ -223,7 +223,7 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { } void setOrAddRemoteProfiledEvents( - std::vector&& remoteProfiledEvents) { + std::vector&& remoteProfiledEvents) { // Lock to serialize access from multiple callback threads. 
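// The first batch of remotely profiled events initializes
// remoteProfiledEvents_; later batches are appended under the same lock.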
std::lock_guard guard(state_mutex_); if (remoteProfiledEvents_) { @@ -245,7 +245,7 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { cuda_stubs->nvtxRangePushA(getNvtxStr( fn.name(), msg, fn.seqNr(), shapes).c_str()); } else { - Event evt( + LegacyEvent evt( EventKind::PushRange, fn.name(), at::RecordFunction::currentThreadId(), @@ -282,7 +282,7 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { // called on a different thread than pushRange // As a convention, we put the async pop on the original // thread and save current thread id in pop event - Event evt( + LegacyEvent evt( EventKind::PopRange, at::StringView(""), at::RecordFunction::currentThreadId(), @@ -307,7 +307,7 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { c10::Device device) override { if (config_.profile_memory && config_.state != ProfilerState::Disabled) { uint64_t thread_id = at::RecordFunction::currentThreadId(); - Event evt( + LegacyEvent evt( EventKind::MemoryAlloc, at::StringView(""), thread_id, @@ -443,7 +443,7 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { ProfilerConfig config_ = ProfilerConfig(ProfilerState::Disabled); at::CallbackHandle handle_ = 0; - c10::optional>> remoteProfiledEvents_; + c10::optional>> remoteProfiledEvents_; #ifdef USE_KINETO std::vector kineto_client_activities_; @@ -733,13 +733,13 @@ ProfilerResult disableProfiler() { } #endif -void addEventList(std::vector&& profiledEvents) { +void addEventList(std::vector&& profiledEvents) { auto state_ptr = getProfilerTLSState(); TORCH_CHECK(state_ptr, "Profiler must be enabled."); state_ptr->setOrAddRemoteProfiledEvents(std::move(profiledEvents)); } -void Event::record(bool record_cuda) { +void LegacyEvent::record(bool record_cuda) { if (record_cuda) { cuda_stubs->record(&device_, &cuda_event, &cpu_ns_); return; @@ -747,7 +747,7 @@ void Event::record(bool record_cuda) { cpu_ns_ = getTime(); } -/* static */ Event Event::fromIValue(const at::IValue& eventIValue) { +/* static */ LegacyEvent LegacyEvent::fromIValue(const at::IValue& eventIValue) { TORCH_INTERNAL_ASSERT( eventIValue.isList(), "Expected IValue to contain type c10::impl::GenericList"); @@ -756,7 +756,7 @@ void Event::record(bool record_cuda) { ivalues.size() >= NUM_EVENT_IVALUE_IDX, "Expected at least ", NUM_EVENT_IVALUE_IDX, - " elements to reconstruct Event."); + " elements to reconstruct LegacyEvent."); // Reconstruct input shapes from ivalues. 
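// Shapes are serialized as a list of int lists, one inner list per input
// tensor (e.g. [[10, 10], [10, 10]] for a binary op on two 10x10 tensors).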
auto shapeListIValue = ivalues.get(EventIValueIdx::SHAPES); @@ -782,7 +782,7 @@ void Event::record(bool record_cuda) { shapes.emplace_back(s); } - Event evt( + LegacyEvent evt( static_cast( ivalues.get(EventIValueIdx::KIND).toInt()), // EventKind at::StringView(ivalues.get(EventIValueIdx::NAME).toStringRef()), // name @@ -802,7 +802,7 @@ void Event::record(bool record_cuda) { return evt; } -at::IValue Event::toIValue() const { +at::IValue LegacyEvent::toIValue() const { c10::impl::GenericList eventIValueList(at::AnyType::get()); eventIValueList.reserve(NUM_EVENT_IVALUE_IDX); eventIValueList.emplace_back(static_cast(kind_)); @@ -834,7 +834,7 @@ at::IValue Event::toIValue() const { return at::IValue(eventIValueList); } -double Event::cudaElapsedUs(const Event& e) const { +double LegacyEvent::cudaElapsedUs(const LegacyEvent& e) const { TORCH_CHECK(e.hasCuda() && hasCuda(), "Events were not recorded for CUDA"); TORCH_CHECK( e.device() == device(), @@ -862,10 +862,10 @@ static jit::CodeTemplate event_template(R"( "args": {} })"); -void writeProfilerEventsToStream(std::ostream& out, const std::vector& events) { +void writeProfilerEventsToStream(std::ostream& out, const std::vector& events) { TORCH_CHECK(out, "Could not open file"); - Event* profiler_start = nullptr; - for (Event* e : events) { + LegacyEvent* profiler_start = nullptr; + for (LegacyEvent* e : events) { if (0 == strcmp(e->name(), "__start_profile")) { profiler_start = e; break; @@ -879,10 +879,10 @@ void writeProfilerEventsToStream(std::ostream& out, const std::vector& e return std::hash()(p.first) ^ std::hash()(p.second); } }; - std::unordered_map, Event*, PairHash> events_map; + std::unordered_map, LegacyEvent*, PairHash> events_map; out << "[\n"; bool first = true; - for (Event* evt : events) { + for (LegacyEvent* evt : events) { if (evt->kind() == "push") { events_map[std::make_pair(evt->handle(), evt->nodeId())] = evt; } else if (evt->kind() == "pop") { @@ -892,7 +892,7 @@ void writeProfilerEventsToStream(std::ostream& out, const std::vector& e first = false; auto it = events_map.find(std::make_pair(evt->handle(), evt->nodeId())); TORCH_CHECK(it != events_map.end(), "Unmatched pop event"); - Event* evt_start = it->second; + LegacyEvent* evt_start = it->second; events_map.erase(it); jit::TemplateEnv env; @@ -923,7 +923,7 @@ void RecordProfile::init() { RecordProfile::~RecordProfile() { thread_event_lists event_lists = disableProfiler(); - std::vector events; + std::vector events; for (auto& l : event_lists) { for (auto& e : l) { events.push_back(&e); @@ -935,7 +935,7 @@ RecordProfile::~RecordProfile() { } } -void RecordProfile::processEvents(const std::vector& events) { +void RecordProfile::processEvents(const std::vector& events) { writeProfilerEventsToStream(out_, events); } From 97e5070d0277d807fbff7a5678ad432fb7945ece Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 21:00:00 -0800 Subject: [PATCH 15/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ 
------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.cpp | 6 +-- torch/csrc/autograd/profiler.h | 76 ++++++++++++++++++++------------ 2 files changed, 50 insertions(+), 32 deletions(-) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index e0f5c195bc10..c4eae2dbcfc9 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -534,6 +534,7 @@ void pushProfilingCallbacks() { .needsIds(true)); state_ptr->setCallbackHandle(handle); } +#endif const int kCUDAWarmupStart = 5; @@ -690,7 +691,7 @@ thread_event_lists disableProfilerLegacy(c10::optional p void enableProfiler( const ProfilerConfig& config, const std::set& activities) { - TORCH_CHECK(config.state == ProfilerState::KINETO && kinetoAvailable()); + TORCH_CHECK(config.state == ProfilerState::KINETO); TORCH_CHECK(!activities.empty(), "No activities specified for Kineto profiler"); auto state_ptr = getProfilerTLSState(); @@ -702,10 +703,9 @@ void enableProfiler( pushProfilingCallbacks(); } - while (!libkineto::api().traceActive()) { // sync? 
+ if (!libkineto::api().traceActive()) { libkineto::api().startTrace(); } - //TORCH_CHECK(libkineto::api().traceActive()); state->mark("__start_profile", false); } diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index 62984deb0ced..10eae6efcc67 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -428,30 +428,6 @@ TORCH_API void writeProfilerEventsToStream(std::ostream& out, const std::vector< #ifdef USE_KINETO struct TORCH_API KinetoEvent { - KinetoEvent(std::unique_ptr&& activity) : activity_(activity) { - TORCH_CHECK(activity_); - } - - std::string name() const override { - return activity_->name(); - } - - uint64_t deviceIndex() const override { - return activity_->deviceId(); - } - - uint64_t startUs() const override { - return activity_->timestamp(); - } - - uint64_t durationUs() const override { - return activity_->duration(); - } - - uint64_t correlationId() const override { - return activity_->correlationId(); - } - int64_t threadId() const { return thread_id_; } @@ -515,16 +491,57 @@ struct TORCH_API KinetoEvent { return *this; } + // Kineto fields + + KinetoEvent& activity(const libkineto::TraceActivity& activity) { + name_ = activity.name(); + deviceIndex_ = activity.deviceId(); + startUs_ = activity.timestamp(); + durationUs_ = activity.duration(); + correlationId_ = activity.correlationId(); + return *this; + } + + std::string name() const { + return name_; + } + + uint64_t deviceIndex() const { + return deviceIndex_; + } + + uint64_t startUs() const { + return startUs_; + } + + uint64_t durationUs() const { + return durationUs_; + } + + uint64_t correlationId() const { + return correlationId_; + } + + KinetoEvent& correlationId(uint64_t correlationId) { + correlationId_ = correlationId; + return *this; + } + private: - int64_t thread_id_ = -1; + int64_t thread_id_ = 0; c10::DeviceType device_type_ = c10::DeviceType::CPU, - int64_t fwd_thread_id_ = -1; + int64_t fwd_thread_id_ = 0; std::vector> shapes_; - int64_t sequence_nr_ = -1; + int64_t sequence_nr_ = 0; std::vector stack_; uint8_t scope_ = 0; - std::unique_ptr activity_; + std::string name_; + uint64_t deviceIndex_ = 0; + uint64_t startUs_ = 0; + uint64_t durationUs_ = 0; + uint64_t correlationId_ = 0; + }; struct TORCH_API ProfilerResult { @@ -549,12 +566,13 @@ TORCH_API void enableProfiler( const std::set& activities); TORCH_API ProfilerResult disableProfiler(); -TORCH_API bool kinetoAvailable(); TORCH_API void prepareProfiler( const ProfilerConfig& config, const std::set& activities); #endif // USE_KINETO +TORCH_API bool kinetoAvailable(); + // Usage: // { // RecordProfile guard("filename.trace"); From 8d111d282b96b7fa44c70a6d440d3f307e23b8c4 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 21:05:43 -0800 Subject: [PATCH 16/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ 
------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/autograd/__init__.py | 8 +++++--- torch/csrc/autograd/init.cpp | 5 ++++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/torch/autograd/__init__.py b/torch/autograd/__init__.py index e2ccf47ce923..d8d10ab4c2e6 100644 --- a/torch/autograd/__init__.py +++ b/torch/autograd/__init__.py @@ -243,7 +243,9 @@ def variable(*args, **kwargs): # Import all native method/classes from torch._C._autograd import (ProfilerActivity, ProfilerState, ProfilerConfig, ProfilerEvent, - ProfilerResult, KinetoEvent, - _enable_profiler_legacy, _disable_profiler_legacy, - _prepare_profiler, _enable_profiler, _disable_profiler, _profiler_enabled, + _enable_profiler_legacy, _disable_profiler_legacy, _profiler_enabled, _enable_record_function, _set_empty_test_observer, kineto_available) + +if kineto_available(): + from torch._C._autograd import (ProfilerResult, KinetoEvent, + _prepare_profiler, _enable_profiler, _disable_profiler) diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index 4eb781b08885..153c815050fc 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -70,6 +70,7 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { .def("scope", &LegacyEvent::scope) .def("correlation_id", &LegacyEvent::correlationId); +#ifdef USE_KINETO py::class_(m, "KinetoEvent") .def("name", &KinetoEvent::name) .def("thread_id", &KinetoEvent::threadId) @@ -87,10 +88,12 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { .def("events", &ProfilerResult::events) .def("legacy_events", &ProfilerResult::legacy_events); - m.def("kineto_available", kinetoAvailable); m.def("_enable_profiler", enableProfiler); m.def("_disable_profiler", disableProfiler); m.def("_prepare_profiler", prepareProfiler); +#endif + + m.def("kineto_available", kinetoAvailable); m.def("_enable_profiler_legacy", enableProfilerLegacy); py::class_(m, "_ProfilerDisableOptions") From bfb03607bee9686a288c409f0d8e16f55afbf7bf Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 21:16:54 -0800 Subject: [PATCH 17/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python 
test/test_profiler.py
```
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1
Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1
Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1
aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1
aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2
aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3
aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1
aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1
aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1
aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
```
[ghstack-poisoned]
---
torch/csrc/autograd/profiler.h | 37 ++++++++++++++++------------------
1 file changed, 17 insertions(+), 20 deletions(-)
diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h
index 10eae6efcc67..9df0c263c714 100644
--- a/torch/csrc/autograd/profiler.h
+++ b/torch/csrc/autograd/profiler.h
@@ -19,7 +19,6 @@
#endif
#include
-
#include
#ifdef USE_KINETO
@@ -175,7 +174,7 @@ struct TORCH_API LegacyEvent {
record(record_cuda);
}
- // Constructor to be used in conjunction with Event::fromIValue.
+ // Constructor to be used in conjunction with LegacyEvent::fromIValue.
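// (fromIValue() rebuilds events that remote workers ship back as IValue
// lists during RPC profiling.)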
LegacyEvent( EventKind kind, at::StringView name, @@ -219,6 +218,16 @@ struct TORCH_API LegacyEvent { void record(bool record_cuda); + std::string kindStr() const { + switch (kind_) { + case EventKind::Mark: return "mark"; + case EventKind::PushRange: return "push"; + case EventKind::PopRange: return "pop"; + case EventKind::MemoryAlloc: return "memory_alloc"; + } + throw std::runtime_error("unknown event kind"); + } + const char* name() const { return name_.str(); } @@ -239,16 +248,16 @@ struct TORCH_API LegacyEvent { return cpu_ns_ / (1000.0); } - void setCpuUs(double cpu_us) { - cpu_ns_ = (int64_t)(cpu_us * 1000); - } - double cudaElapsedUs(const LegacyEvent& e) const; bool hasCuda() const { return cuda_event != nullptr || (isRemote() && device_ != -1); } + int device() const { + return device_; + } + void updateMemoryStats(int64_t alloc_size, c10::Device device) { if (device.type() == c10::DeviceType::CUDA || device.type() == c10::DeviceType::HIP) { @@ -336,21 +345,11 @@ struct TORCH_API LegacyEvent { scope_ = scope; } - std::string kindStr() const { - switch (kind_) { - case EventKind::Mark: return "mark"; - case EventKind::PushRange: return "push"; - case EventKind::PopRange: return "pop"; - case EventKind::MemoryAlloc: return "memory_alloc"; - } - throw std::runtime_error("unknown event kind"); - } - private: - EventKind kind_; // signed to allow for negative intervals, initialized for safety. int64_t cpu_ns_ = 0; at::StringView name_; + EventKind kind_; uint64_t thread_id_; uint64_t fwd_thread_id_; at::RecordFunctionHandle handle_ {0}; @@ -366,7 +365,6 @@ struct TORCH_API LegacyEvent { std::vector stack_; uint8_t scope_; - uint64_t correlation_id_; }; @@ -412,7 +410,6 @@ struct RangeEventList { // NOTE: profiler mode is thread local, with automatic propagation // across thread boundary (e.g. at::launch tasks) TORCH_API void enableProfilerLegacy(const ProfilerConfig&); - using thread_event_lists = std::vector>; TORCH_API thread_event_lists disableProfilerLegacy(c10::optional profilerDisableOptions = c10::nullopt); @@ -541,7 +538,6 @@ struct TORCH_API KinetoEvent { uint64_t startUs_ = 0; uint64_t durationUs_ = 0; uint64_t correlationId_ = 0; - }; struct TORCH_API ProfilerResult { @@ -557,6 +553,7 @@ struct TORCH_API ProfilerResult { const thread_event_lists& legacy_events() const { return legacy_events_; } + private: std::vector> events_; thread_event_lists legacy_events_; // tensor mem alloc, start/stop From 1ff1a124ec35324b814c0f0c0be9ca6d2810378b Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 21:46:12 -0800 Subject: [PATCH 18/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.cpp | 50 ++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index c4eae2dbcfc9..ee79c7f9cddd 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -169,14 +169,18 @@ struct FileLineFunc { std::string funcname; }; -thread_local size_t corr_id_ = 0; +thread_local size_t corr_id_ = 1; size_t next_correlation_id() { - return ++corr_id_; -} -size_t cur_correlation_id() { - return corr_id_; + return corr_id_++; } +#ifdef USE_KINETO +struct KinetoObserverContext : public at::ObserverContext { + int64_t startUs; + uint64_t correlationId; +}; +#endif + // Profiler state struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { explicit ProfilerThreadLocalState(const ProfilerConfig& config) @@ -217,7 +221,6 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { at::RecordFunction::currentThreadId(), include_cuda && config_.state == ProfilerState::CUDA); evt.setNodeId(at::RecordFunction::getDefaultNodeId()); - evt.setCorrelationId(cur_correlation_id()); getEventList().record(std::move(evt)); } } @@ -313,7 +316,6 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { thread_id, config_.state == ProfilerState::CUDA); evt.updateMemoryStats(alloc_size, device); - evt.setCorrelationId(cur_correlation_id()); getEventList(thread_id).record(std::move(evt)); } } @@ -322,24 +324,26 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { return config_.profile_memory; } - void reportKinetoClientActivity(const at::RecordFunction& fn) { + void reportKinetoClientActivity( + const at::RecordFunction& fn, + const KinetoObserverContext& ctx) { #ifdef USE_KINETO if (config_.state == ProfilerState::KINETO) { libkineto::ClientTraceActivity op; - /*op.startTime = libkineto::timeSinceEpoch(fc.startTime); - op.endTime = libkineto::timeSinceEpoch(now); + op.startTime = ctx.startUs; + op.endTime = (getTime() / 1000); op.opType = std::string(fn.name()); - op.device = fc.deviceType; - op.correlation = fc.correlationId; + op.device = 0; // CPU + op.correlation = ctx.correlationId; + /* op.threadId = pthread_self(); op.inputDims = folly::toJson(fc.input_shapes); - op.inputTypes = folly::toJson(fc.input_types);*/ + op.inputTypes = folly::toJson(fc.input_types); + */ { std::lock_guard guard(state_mutex_); 
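+      // state_mutex_ serializes these appends: RecordFunction callbacks can
+      // fire concurrently on many threads. ctx.correlationId is the same id
+      // pushed via libkineto::api().pushCorrelationId() in the start callback,
+      // which is what lets the EXTERNAL_CORRELATION activities recorded by
+      // CUPTI tie GPU-side kernels back to this CPU op.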
kineto_client_activities_.emplace_back(op); - kineto_events_.emplace_back( - KinetoEventImpl::fromClientActivity( - &(kineto_client_activities_.back()))); + //kineto_events_.emplace_back(); } return; } @@ -520,14 +524,22 @@ void pushProfilingCallbacks() { return; } - libkineto::api().pushCorrelationId(next_correlation_id()); + auto corr_id = next_correlation_id(); + libkineto::api().pushCorrelationId(corr_id); + + auto ctx_ptr = std::make_unique(); + ctx_ptr->startUs = getTime() / 1000; + ctx_ptr->correlationId = corr_id; + return ctx_ptr; }, - [](const at::RecordFunction& fn) { + [](const at::RecordFunction& fn, at::ObserverContext* ctx_ptr) {) { auto state_ptr = getProfilerTLSState(); if (!state_ptr || state_ptr->config().state != ProfilerState::KINETO) { return; } - state_ptr->reportKinetoClientActivity(fn); + auto kineto_ctx_ptr = dynamic_cast(ctx_ptr); + TORCH_INTERNAL_ASSERT(kineto_ctx_ptr != nullptr); + state_ptr->reportKinetoClientActivity(fn, *kineto_ctx_ptr); libkineto::api().popCorrelationId(); }) .needsInputs(state_ptr->config().report_input_shapes) From b3b69d8a1b7d2c577dbf81c37448520ffe7dadff Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 22:37:26 -0800 Subject: [PATCH 19/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.cpp | 67 ++++++++++++++++++++++++--- torch/csrc/autograd/profiler.h | 77 ++++++++++++++++++++------------ 2 files changed, 109 insertions(+), 35 deletions(-) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index ee79c7f9cddd..834c7f99facd 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -169,8 +169,9 @@ struct FileLineFunc { std::string funcname; }; -thread_local size_t corr_id_ = 1; -size_t next_correlation_id() { +// TODO: figure if we can use TLS +std::atomic corr_id_ {1}; +uint64_t next_correlation_id() { return corr_id_++; } @@ -178,6 +179,13 @@ size_t next_correlation_id() { struct KinetoObserverContext : public at::ObserverContext { int64_t startUs; uint64_t correlationId; + uint64_t startThreadId; + uint64_t endThreadId; + std::vector> shapes; + int64_t sequenceNr; + uint64_t fwdThreadId; + uint8_t recFunScope; + std::vector stack; }; #endif @@ -324,7 +332,7 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { return config_.profile_memory; } - void reportKinetoClientActivity( + KinetoEvent& reportKinetoClientActivity( const at::RecordFunction& fn, const KinetoObserverContext& ctx) { #ifdef USE_KINETO @@ -335,15 +343,22 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { op.opType = std::string(fn.name()); op.device = 0; // CPU op.correlation = ctx.correlationId; + //op.inputDims = toStr(ctx.shapes); // /* op.threadId = pthread_self(); - op.inputDims = folly::toJson(fc.input_shapes); - op.inputTypes = folly::toJson(fc.input_types); */ + { std::lock_guard guard(state_mutex_); kineto_client_activities_.emplace_back(op); - //kineto_events_.emplace_back(); + kineto_events_.emplace_back(); + kineto_events_.back() + .startThreadId(ctx.startThreadId) + .endThreadId(ctx.endThreadId) + .sequenceNr(ctx.sequenceNr) + .fwdThreadId(ctx.fwdThreadId) + .scope(ctx.recFunScope) + .stack(stack); // } return; } @@ -530,6 +545,43 @@ void pushProfilingCallbacks() { auto ctx_ptr = std::make_unique(); ctx_ptr->startUs = getTime() / 1000; ctx_ptr->correlationId = corr_id; + ctx_ptr->startThreadId = at::RecordFunction::currentThreadId(); + + if (state_ptr->config().report_input_shapes) { + std::vector> inputSizes; + inputSizes.reserve(fn.inputs().size()); + for (const c10::IValue& input : fn.inputs()) { + if (!input.isTensor()) { + inputSizes.emplace_back(); + continue; + } + const at::Tensor& tensor = 
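+          // (an undefined tensor still gets an empty entry below, keeping the
+          // recorded shape list aligned with the op's input list)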
input.toTensor();
+          if (tensor.defined()) {
+            inputSizes.push_back(input.toTensor().sizes().vec());
+          } else {
+            inputSizes.emplace_back();
+          }
+        }
+        ctx_ptr->shapes = inputSizes;
+      }
+
+      ctx_ptr->sequenceNr = fn.seqNr();
+      ctx_ptr->fwdThreadId = fn.forwardThreadId();
+      ctx_ptr->recFunScope = (uint8_t)fn.scope();
+
+#ifndef C10_MOBILE
+      // backward nodes source range corresponds to the forward node
+      // TODO: consider using C++ stack trace
+      if (state_ptr->config().with_stack &&
+          fn.scope() != at::RecordScope::BACKWARD_FUNCTION) {
+        auto cs = prepareCallstack(jit::currentCallstack());
+        if (cs.empty()) {
+          cs = prepareCallstack(jit::tracer::pythonCallstack());
+        }
+        ctx_ptr->stack = callstackStr(cs);
+      }
+#endif
+      return ctx_ptr;
     },
     [](const at::RecordFunction& fn, at::ObserverContext* ctx_ptr) {) {
       auto state_ptr = getProfilerTLSState();
       if (!state_ptr || state_ptr->config().state != ProfilerState::KINETO) {
         return;
       }
       auto kineto_ctx_ptr = dynamic_cast<KinetoObserverContext*>(ctx_ptr);
       TORCH_INTERNAL_ASSERT(kineto_ctx_ptr != nullptr);
+
+      kineto_ctx_ptr->endThreadId = at::RecordFunction::currentThreadId();
+
       state_ptr->reportKinetoClientActivity(fn, *kineto_ctx_ptr);
       libkineto::api().popCorrelationId();
     })
diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h
index 9df0c263c714..3fe99d7f54a8 100644
--- a/torch/csrc/autograd/profiler.h
+++ b/torch/csrc/autograd/profiler.h
@@ -425,36 +425,53 @@ TORCH_API void writeProfilerEventsToStream(std::ostream& out, const std::vector<
 #ifdef USE_KINETO
 struct TORCH_API KinetoEvent {
-  int64_t threadId() const {
-    return thread_id_;
+  uint64_t startThreadId() const {
+    return start_thread_id_;
+  }
+
+  uint64_t endThreadId() const {
+    return end_thread_id_;
   }

   c10::DeviceType deviceType() const {
     return device_type_;
   }

-  int64_t fwdThreadId() const {
+  uint64_t fwdThreadId() const {
     return fwd_thread_id_;
   }

+  bool hasShapes() const {
+    return shapes_ != c10::nullopt;
+  }
+
   const std::vector<std::vector<int64_t>>& shapes() const {
-    return shapes_;
+    return *shapes_;
   }

   int64_t sequenceNr() const {
     return sequence_nr_;
   }

+  bool hasStack() const {
+    return stack_ != c10::nullopt;
+  }
+
   const std::vector<std::string>& stack() const {
-    return stack_;
+    return *stack_;
   }

   uint8_t scope() const {
     return scope_;
   }

-  KinetoEvent& threadId(int64_t thread_id) {
-    thread_id_ = thread_id;
+  KinetoEvent& startThreadId(uint64_t start_thread_id) {
+    start_thread_id_ = start_thread_id;
+    return *this;
+  }
+
+  KinetoEvent& endThreadId(uint64_t end_thread_id) {
+    end_thread_id_ = end_thread_id;
     return *this;
   }

@@ -463,13 +480,13 @@ struct TORCH_API KinetoEvent {
     return *this;
   }

-  KinetoEvent& fwdThreadId(int64_t fwd_thread_id) {
+  KinetoEvent& fwdThreadId(uint64_t fwd_thread_id) {
     fwd_thread_id_ = fwd_thread_id;
     return *this;
   }

   KinetoEvent& shapes(const std::vector<std::vector<int64_t>>& shapes) {
-    shapes_ = shapes;
+    *shapes_ = shapes;
     return *this;
   }

@@ -479,7 +496,7 @@ struct TORCH_API KinetoEvent {
     return *this;
   }

   KinetoEvent& stack(const std::vector<std::string>& st) {
-    stack_ = st;
+    *stack_ = st;
     return *this;
   }

@@ -492,10 +509,10 @@ struct TORCH_API KinetoEvent {
   KinetoEvent& activity(const libkineto::TraceActivity& activity) {
     name_ = activity.name();
-    deviceIndex_ = activity.deviceId();
-    startUs_ = activity.timestamp();
-    durationUs_ = activity.duration();
-    correlationId_ = activity.correlationId();
+    device_index_ = activity.deviceId();
+    start_us_ = activity.timestamp();
+    duration_us_ = activity.duration();
+    correlation_id_ = activity.correlationId();
     return *this;
   }

@@ -504,40 +521,42 @@ struct TORCH_API KinetoEvent {
   }

   uint64_t deviceIndex() const {
-    return deviceIndex_;
+    return device_index_;
   }
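   // The fluent setters above all return *this, so call sites can chain them;
   // a minimal sketch (the tid/seq/scope values are illustrative only):
   //   KinetoEvent e;
   //   e.startThreadId(tid).endThreadId(tid).sequenceNr(seq).scope(scope);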
uint64_t startUs() const { - return startUs_; + return start_us_; } uint64_t durationUs() const { - return durationUs_; + return duration_us_; } uint64_t correlationId() const { - return correlationId_; + return correlation_id_; } - KinetoEvent& correlationId(uint64_t correlationId) { - correlationId_ = correlationId; + KinetoEvent& correlationId(uint64_t correlation_id) { + correlation_id_ = correlation_id; return *this; } private: - int64_t thread_id_ = 0; - c10::DeviceType device_type_ = c10::DeviceType::CPU, - int64_t fwd_thread_id_ = 0; - std::vector> shapes_; + uint64_t start_thread_id_ = 0; + uint64_t end_thread_id_ = 0; + uint64_t fwd_thread_id_ = 0; int64_t sequence_nr_ = 0; - std::vector stack_; uint8_t scope_ = 0; + c10::DeviceType device_type_ = c10::DeviceType::CPU, + c10::optional>> shapes_; + c10::optional> stack_; + std::string name_; - uint64_t deviceIndex_ = 0; - uint64_t startUs_ = 0; - uint64_t durationUs_ = 0; - uint64_t correlationId_ = 0; + uint64_t device_index_ = 0; + uint64_t start_us_ = 0; + uint64_t duration_us_ = 0; + uint64_t correlation_id_ = 0; }; struct TORCH_API ProfilerResult { From 2faeb8a4e02680e129c1f0fed62922736103f7db Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 22:49:25 -0800 Subject: [PATCH 20/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.cpp | 115 ++++++++++++++----------------- 1 file changed, 52 insertions(+), 63 deletions(-) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index 834c7f99facd..841a293eae04 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -358,7 +358,9 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { .sequenceNr(ctx.sequenceNr) .fwdThreadId(ctx.fwdThreadId) .scope(ctx.recFunScope) - .stack(stack); // + if (!stack.empty()) { + kineto_events_.back().stack(stack); + } } return; } @@ -475,6 +477,24 @@ ProfilerThreadLocalState* getProfilerTLSState() { return dynamic_cast(state.get()); } +std::vector> inputSizes(const at::RecordFunction& fn) { + std::vector> sizes; + sizes.reserve(fn.inputs().size()); + for (const c10::IValue& input : fn.inputs()) { + if (!input.isTensor()) { + sizes.emplace_back(); + continue; + } + const at::Tensor& tensor = input.toTensor(); + if (tensor.defined()) { + sizes.push_back(input.toTensor().sizes().vec()); + } else { + sizes.emplace_back(); + } + } + return sizes; +} + void pushProfilingCallbacksLegacy() { auto state_ptr = getProfilerTLSState(); TORCH_INTERNAL_ASSERT(state_ptr, "Expected profiler state set"); @@ -492,20 +512,7 @@ void pushProfilingCallbacksLegacy() { auto* msg = (fn.seqNr() >= 0) ? 
", seq = " : ""; if (state_ptr->config().report_input_shapes) { - std::vector> inputSizes; - inputSizes.reserve(fn.inputs().size()); - for (const c10::IValue& input : fn.inputs()) { - if (!input.isTensor()) { - inputSizes.emplace_back(); - continue; - } - const at::Tensor& tensor = input.toTensor(); - if (tensor.defined()) { - inputSizes.push_back(input.toTensor().sizes().vec()); - } else { - inputSizes.emplace_back(); - } - } + auto sizes = inputSizes(fn); state_ptr->pushRange(fn, record_cuda, msg, std::move(inputSizes)); } else { state_ptr->pushRange(fn, record_cuda, msg); @@ -548,21 +555,7 @@ void pushProfilingCallbacks() { ctx_ptr->startThreadId = at::RecordFunction::currentThreadId(); if (state_ptr->config().report_input_shapes) { - std::vector> inputSizes; - inputSizes.reserve(fn.inputs().size()); - for (const c10::IValue& input : fn.inputs()) { - if (!input.isTensor()) { - inputSizes.emplace_back(); - continue; - } - const at::Tensor& tensor = input.toTensor(); - if (tensor.defined()) { - inputSizes.push_back(input.toTensor().sizes().vec()); - } else { - inputSizes.emplace_back(); - } - } - ctx_ptr->shapes = inputSizes; + ctx_ptr->shapes = inputSizes(fn); } ctx_ptr->sequenceNr = fn.seqNr(); @@ -581,7 +574,6 @@ void pushProfilingCallbacks() { ctx_ptr->stack = callstackStr(cs); } #endif - return ctx_ptr; }, [](const at::RecordFunction& fn, at::ObserverContext* ctx_ptr) {) { @@ -661,38 +653,6 @@ bool kinetoAvailable() { #endif } -void prepareProfiler( - const ProfilerConfig& config, - const std::set& activities) { -#ifdef USE_KINETO - if (config.state == ProfilerState::KINETO) { - std::set k_activities; - if (activities.count(ActivityType::CPU)) { - k_activities.insert(libkineto::ActivityType::EXTERNAL_CORRELATION); - k_activities.insert(libkineto::ActivityType::CUDA_RUNTIME); - } - //if (activities.count(ActivityType::CUDA_RUNTIME)) { - // k_activities.insert(libkineto::ActivityType::CUDA_RUNTIME); - //} - if (activities.count(ActivityType::CUDA)) { - k_activities.insert(libkineto::ActivityType::GPU_MEMCPY); - k_activities.insert(libkineto::ActivityType::GPU_MEMSET); - k_activities.insert(libkineto::ActivityType::CONCURRENT_KERNEL); - } - - if (!libkineto::api().hasProfilerRegistered()) { - libkineto::api().registerProfiler( - std::make_unique(false)); - } - libkineto::api().initProfilerIfRegistered(); - libkineto::api().prepareTrace(k_activities); - - return; - } -#endif - TORCH_CHECK(false, "Supported only in Kineto profiler"); -} - void enableProfilerLegacy(const ProfilerConfig& new_config) { TORCH_CHECK(new_config.state != ProfilerState::NVTX || cuda_stubs->enabled(), "Can't use NVTX profiler - PyTorch was compiled without CUDA"); @@ -755,6 +715,35 @@ thread_event_lists disableProfilerLegacy(c10::optional p } #ifdef USE_KINETO + +void prepareProfiler( + const ProfilerConfig& config, + const std::set& activities) { + TORCH_CHECK(config.state == ProfilerState::KINETO, + "Supported only in Kineto profiler"); + + std::set k_activities; + if (activities.count(ActivityType::CPU)) { + k_activities.insert(libkineto::ActivityType::EXTERNAL_CORRELATION); + k_activities.insert(libkineto::ActivityType::CUDA_RUNTIME); + } + //if (activities.count(ActivityType::CUDA_RUNTIME)) { + // k_activities.insert(libkineto::ActivityType::CUDA_RUNTIME); + //} + if (activities.count(ActivityType::CUDA)) { + k_activities.insert(libkineto::ActivityType::GPU_MEMCPY); + k_activities.insert(libkineto::ActivityType::GPU_MEMSET); + k_activities.insert(libkineto::ActivityType::CONCURRENT_KERNEL); + } + + if 
(!libkineto::api().hasProfilerRegistered()) { + libkineto::api().registerProfiler( + std::make_unique(false)); + } + libkineto::api().initProfilerIfRegistered(); + libkineto::api().prepareTrace(k_activities); +} + void enableProfiler( const ProfilerConfig& config, const std::set& activities) { From 67c890da1ceec84b170838d9336a73b951a040db Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 22:50:29 -0800 Subject: [PATCH 21/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index 841a293eae04..14770b06452d 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -778,14 +778,15 @@ ProfilerResult disableProfiler() { at::removeCallback(state_ptr->callbackHandle()); } + state_ptr->mark("__stop_profile"); + if (state_ptr->config().state == ProfilerState::KINETO) { auto trace = libkineto::api().stopTrace(); // } - state_ptr->mark("__stop_profile"); - // Note that this will erase the underlying events. 
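   // consolidate() drains every per-thread RangeEventList into a single
   // thread_event_lists (see ProfilerThreadLocalState::consolidate() in a
   // later hunk), so it has to run before the thread-local state is torn down.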
- return state_ptr->consolidate(); + auto legacy_events = state_ptr->consolidate(); + // } #endif From ed8babeb6b413a52218875da7b11f36011f80179 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 22:53:07 -0800 Subject: [PATCH 22/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index 14770b06452d..9b32895c2a4d 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -780,13 +780,10 @@ ProfilerResult disableProfiler() { state_ptr->mark("__stop_profile"); - if (state_ptr->config().state == ProfilerState::KINETO) { - auto trace = libkineto::api().stopTrace(); - // - } - + auto trace = libkineto::api().stopTrace(); + auto kineto_events = filterTrace(trace); auto legacy_events = state_ptr->consolidate(); - // + return ProfilerResult(kineto_events, legacy_events); } #endif From ffc11fdd1960707895124e3b184cbf3ef4270fc3 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 23:48:01 -0800 Subject: [PATCH 23/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self 
CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.cpp | 114 +++++++++--------- torch/csrc/autograd/profiler.h | 10 +- .../rpc/request_callback_no_python.cpp | 4 +- 3 files changed, 64 insertions(+), 64 deletions(-) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index 9b32895c2a4d..df13c11b7680 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -181,14 +181,42 @@ struct KinetoObserverContext : public at::ObserverContext { uint64_t correlationId; uint64_t startThreadId; uint64_t endThreadId; - std::vector> shapes; + c10::optional>> shapes; int64_t sequenceNr; uint64_t fwdThreadId; uint8_t recFunScope; - std::vector stack; + c10::optional> stack; }; #endif +std::vector prepareCallstack(const std::vector& cs) { + std::vector entries; + entries.reserve(cs.size()); + for (const auto& entry : cs) { + auto& range = entry.range; + if (range.source()) { + auto& src = range.source(); + if (src && src->filename()) { + auto line = src->starting_line_no() + + src->lineno_for_offset(range.start()); + entries.emplace_back(FileLineFunc{*(src->filename()), line, entry.filename}); + } + } + } + return entries; +} + +std::vector callstackStr(const std::vector& cs) { + std::vector cs_str; + cs_str.reserve(cs.size()); + for (const auto& entry : cs) { + std::stringstream loc; + loc << entry.filename << "(" << entry.line << "): " << entry.funcname; + cs_str.push_back(loc.str()); + } + return cs_str; +} + // Profiler state struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { explicit ProfilerThreadLocalState(const ProfilerConfig& config) @@ -332,7 +360,7 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { return config_.profile_memory; } - KinetoEvent& reportKinetoClientActivity( + void reportKinetoClientActivity( const at::RecordFunction& fn, const KinetoObserverContext& ctx) { #ifdef USE_KINETO @@ -340,26 +368,26 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { libkineto::ClientTraceActivity op; op.startTime = ctx.startUs; op.endTime = (getTime() / 1000); - op.opType = 
std::string(fn.name()); + op.opType = std::string(fn.name().str()); op.device = 0; // CPU op.correlation = ctx.correlationId; - //op.inputDims = toStr(ctx.shapes); // - /* - op.threadId = pthread_self(); - */ + if (ctx.shapes && !ctx.shapes->empty()) { + //op.inputDims = toStr(*ctx.shapes); // + } + //op.threadId = pthread_self(); { std::lock_guard guard(state_mutex_); - kineto_client_activities_.emplace_back(op); + kineto_client_activities_.emplace_back(std::move(op)); kineto_events_.emplace_back(); kineto_events_.back() .startThreadId(ctx.startThreadId) .endThreadId(ctx.endThreadId) .sequenceNr(ctx.sequenceNr) .fwdThreadId(ctx.fwdThreadId) - .scope(ctx.recFunScope) - if (!stack.empty()) { - kineto_events_.back().stack(stack); + .scope(ctx.recFunScope); + if (ctx.stack && !ctx.stack->empty()) { + kineto_events_.back().stack(*ctx.stack); } } return; @@ -369,34 +397,6 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { } private: - std::vector prepareCallstack(const std::vector& cs) { - std::vector entries; - entries.reserve(cs.size()); - for (const auto& entry : cs) { - auto& range = entry.range; - if (range.source()) { - auto& src = range.source(); - if (src && src->filename()) { - auto line = src->starting_line_no() + - src->lineno_for_offset(range.start()); - entries.emplace_back(FileLineFunc{*(src->filename()), line, entry.filename}); - } - } - } - return entries; - } - - std::vector callstackStr(const std::vector& cs) { - std::vector cs_str; - cs_str.reserve(cs.size()); - for (const auto& entry : cs) { - std::stringstream loc; - loc << entry.filename << "(" << entry.line << "): " << entry.funcname; - cs_str.push_back(loc.str()); - } - return cs_str; - } - std::string getNvtxStr( const at::StringView& name, const char* msg, @@ -468,7 +468,7 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { #ifdef USE_KINETO std::vector kineto_client_activities_; - std::vector kineto_events_; + std::vector kineto_events_; #endif }; @@ -513,7 +513,7 @@ void pushProfilingCallbacksLegacy() { auto* msg = (fn.seqNr() >= 0) ? 
", seq = " : ""; if (state_ptr->config().report_input_shapes) { auto sizes = inputSizes(fn); - state_ptr->pushRange(fn, record_cuda, msg, std::move(inputSizes)); + state_ptr->pushRange(fn, record_cuda, msg, std::move(sizes)); } else { state_ptr->pushRange(fn, record_cuda, msg); } @@ -543,7 +543,7 @@ void pushProfilingCallbacks() { [](const at::RecordFunction& fn) { auto state_ptr = getProfilerTLSState(); if (!state_ptr || state_ptr->config().state != ProfilerState::KINETO) { - return; + return std::make_unique(); } auto corr_id = next_correlation_id(); @@ -576,7 +576,7 @@ void pushProfilingCallbacks() { #endif return ctx_ptr; }, - [](const at::RecordFunction& fn, at::ObserverContext* ctx_ptr) {) { + [](const at::RecordFunction& fn, at::ObserverContext* ctx_ptr) { auto state_ptr = getProfilerTLSState(); if (!state_ptr || state_ptr->config().state != ProfilerState::KINETO) { return; @@ -736,11 +736,12 @@ void prepareProfiler( k_activities.insert(libkineto::ActivityType::CONCURRENT_KERNEL); } - if (!libkineto::api().hasProfilerRegistered()) { - libkineto::api().registerProfiler( - std::make_unique(false)); - } - libkineto::api().initProfilerIfRegistered(); + //if (!libkineto::api().hasProfilerRegistered()) { + // libkineto::api().registerProfiler( + // std::make_unique(false)); + //} + + //libkineto::api().initProfilerIfRegistered(); libkineto::api().prepareTrace(k_activities); } @@ -780,8 +781,9 @@ ProfilerResult disableProfiler() { state_ptr->mark("__stop_profile"); - auto trace = libkineto::api().stopTrace(); - auto kineto_events = filterTrace(trace); + //auto trace = std::move(libkineto::api().stopTrace()); + libkineto::api().stopTrace(); + std::vector> kineto_events; // = filterTrace(trace); auto legacy_events = state_ptr->consolidate(); return ProfilerResult(kineto_events, legacy_events); } @@ -904,7 +906,6 @@ double LegacyEvent::cudaElapsedUs(const LegacyEvent& e) const { CUDAStubs::~CUDAStubs() = default; - static jit::CodeTemplate event_template(R"( { "name": "${name}", @@ -937,9 +938,9 @@ void writeProfilerEventsToStream(std::ostream& out, const std::vectorkind() == "push") { + if (evt->kindStr() == "push") { events_map[std::make_pair(evt->handle(), evt->nodeId())] = evt; - } else if (evt->kind() == "pop") { + } else if (evt->kindStr() == "pop") { if (!first) { out << ",\n"; } @@ -960,7 +961,6 @@ void writeProfilerEventsToStream(std::ostream& out, const std::vector events; for (auto& l : event_lists) { for (auto& e : l) { diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index 3fe99d7f54a8..04cab1327dd0 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -500,8 +500,8 @@ struct TORCH_API KinetoEvent { return *this; } - KinetoEvent& scope(uint8_t scope_id) { - scope_id_ = scope_id; + KinetoEvent& scope(uint8_t scope) { + scope_ = scope; return *this; } @@ -548,7 +548,7 @@ struct TORCH_API KinetoEvent { int64_t sequence_nr_ = 0; uint8_t scope_ = 0; - c10::DeviceType device_type_ = c10::DeviceType::CPU, + c10::DeviceType device_type_ = c10::DeviceType::CPU; c10::optional>> shapes_; c10::optional> stack_; @@ -625,10 +625,10 @@ struct TORCH_API TLSProfilerGuard { c10::nullopt) : cb_(std::move(resultCallback)), profilerDisableOptions_(std::move(profilerDisableOptions)) { - enableProfiler(cfg); + enableProfilerLegacy(cfg); } ~TLSProfilerGuard() { - thread_event_lists event_lists = disableProfiler(profilerDisableOptions_); + thread_event_lists event_lists = disableProfilerLegacy(profilerDisableOptions_); if (cb_) { try { 
(*cb_)(event_lists); diff --git a/torch/csrc/distributed/rpc/request_callback_no_python.cpp b/torch/csrc/distributed/rpc/request_callback_no_python.cpp index cdce84a5d10c..7b0b83f547e3 100644 --- a/torch/csrc/distributed/rpc/request_callback_no_python.cpp +++ b/torch/csrc/distributed/rpc/request_callback_no_python.cpp @@ -92,7 +92,7 @@ std::shared_ptr RequestCallbackNoPython::processMessage( if (serverProcessGlobalProfilerStateStackEntryPtr) { // Restore thread-local profiler state. ::torch::autograd::profiler::thread_event_lists event_lists = - ::torch::autograd::profiler::disableProfiler(); + ::torch::autograd::profiler::disableProfilerLegacy(); // Put thread_local event_lists into the process-global profiler // state. profiler::processglobal::pushResultRecursive( @@ -543,7 +543,7 @@ void RequestCallbackNoPython::processRpc( torch::autograd::profiler::ProfilerDisableOptions opts( false, true); auto event_lists = - torch::autograd::profiler::disableProfiler(opts); + torch::autograd::profiler::disableProfilerLegacy(opts); if (wrappedRpcResponseFuture->hasError()) { // Propagate error // No need to propagate remote events in the case of an error. From fe76b8416b6ab1369e466586a0166a59b68d8ac1 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Tue, 3 Nov 2020 00:25:58 -0800 Subject: [PATCH 24/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- test/cpp/jit/test_misc.cpp | 12 ++++++------ torch/csrc/autograd/init.cpp | 17 +++++++++-------- torch/csrc/autograd/profiler.cpp | 14 ++++++++++++++ torch/csrc/autograd/profiler.h | 19 +++++++------------ 4 files changed, 36 insertions(+), 26 deletions(-) diff --git a/test/cpp/jit/test_misc.cpp b/test/cpp/jit/test_misc.cpp index 9ed50daaa86c..fdd155edf0f7 100644 --- a/test/cpp/jit/test_misc.cpp +++ b/test/cpp/jit/test_misc.cpp @@ -2164,7 +2164,7 @@ TEST(TLSFutureCallbacksTest, Basic) { // Since we join here, we can ensure that all callbacks corresponding to // markCompleted() have finished. t.join(); - torch::autograd::profiler::disableProfiler(); + torch::autograd::profiler::disableProfilerLegacy(); } // then() with TLS State { @@ -2182,7 +2182,7 @@ TEST(TLSFutureCallbacksTest, Basic) { std::thread t([s1 = std::move(s1)]() { s1->markCompleted(); }); t.join(); s2->wait(); - torch::autograd::profiler::disableProfiler(); + torch::autograd::profiler::disableProfilerLegacy(); } } @@ -2204,10 +2204,10 @@ TEST(ProfilerDisableInCallbackTest, Basic) { // Don't cleanup TLSState, and just consolidate. auto opts = torch::autograd::profiler::ProfilerDisableOptions(false, true); auto thread_event_lists = - torch::autograd::profiler::disableProfiler(std::move(opts)); + torch::autograd::profiler::disableProfilerLegacy(std::move(opts)); // Ensure that the events from this thread are still profiled and we obtain // the expected in events in our consolidated list when calling - // disableProfiler(). + // disableProfilerLegacy(). bool found_ones = false; bool found_add = false; for (const auto& li : thread_event_lists) { @@ -2229,7 +2229,7 @@ TEST(ProfilerDisableInCallbackTest, Basic) { s1->addCallback(verifyProfilerCb); // Disable the profiler, but do not consolidate results in the main thread. 
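  // Judging from the (false, true) call above ("don't cleanup TLSState, and
  // just consolidate"), the two flags are (cleanupTLSState, consolidate); so
  // (true, false) tears down this thread's profiler state without merging its
  // events.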
auto opts = torch::autograd::profiler::ProfilerDisableOptions(true, false); - torch::autograd::profiler::disableProfiler(std::move(opts)); + torch::autograd::profiler::disableProfilerLegacy(std::move(opts)); std::thread t([s1 = std::move(s1)]() { s1->markCompleted(at::IValue(1)); }); t.join(); @@ -2243,7 +2243,7 @@ TEST(ProfilerDisableInCallbackTest, Basic) { // Runs callback inline s1->markCompleted(at::IValue(1)); opts = torch::autograd::profiler::ProfilerDisableOptions(true, false); - torch::autograd::profiler::disableProfiler(std::move(opts)); + torch::autograd::profiler::disableProfilerLegacy(std::move(opts)); } TEST(IValueKWargsTest, Basic) { diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index 153c815050fc..b844c4349fc6 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -73,16 +73,17 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { #ifdef USE_KINETO py::class_(m, "KinetoEvent") .def("name", &KinetoEvent::name) - .def("thread_id", &KinetoEvent::threadId) + .def("start_thread_id", [](const KinetoEvent& e) { return e.startThreadId(); }) + .def("end_thread_id", [](const KinetoEvent& e) { return e.endThreadId(); }) .def("device_index", &KinetoEvent::deviceIndex) .def("start_us", &KinetoEvent::startUs) - .def("duration", &KinetoEvent::duration) - .def("correlation_id", &KinetoEvent::correlationId) - .def("fwd_thread_id", &KinetoEvent::fwdThreadId) - .def("shapes", &KinetoEvent::shapes) - .def("sequence_nr", &KinetoEvent::sequenceNr) - .def("stack", &KinetoEvent::stack) - .def("scope", &KinetoEvent::scope); + .def("duration_us", &KinetoEvent::durationUs) + .def("correlation_id", [](const KinetoEvent& e) { return e.correlationId(); }) + .def("fwd_thread_id", [](const KinetoEvent& e) { return e.fwdThreadId(); }) + .def("shapes", [](const KinetoEvent& e) { return e.shapes(); }) + .def("sequence_nr", [](const KinetoEvent& e) { return e.sequenceNr(); }) + .def("stack", [](const KinetoEvent& e) { return e.stack(); }) + .def("scope", [](const KinetoEvent& e) { return e.scope(); }); py::class_(m, "ProfilerResult") .def("events", &ProfilerResult::events) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index df13c11b7680..484e13b607eb 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -21,6 +21,10 @@ #include +#ifdef USE_KINETO +#include "libkineto.h" +#endif + namespace torch { namespace autograd { namespace profiler { namespace { @@ -787,6 +791,16 @@ ProfilerResult disableProfiler() { auto legacy_events = state_ptr->consolidate(); return ProfilerResult(kineto_events, legacy_events); } + +KinetoEvent& KinetoEvent::activity(const libkineto::TraceActivity& activity) { + name_ = activity.name(); + device_index_ = activity.deviceId(); + start_us_ = activity.timestamp(); + duration_us_ = activity.duration(); + correlation_id_ = activity.correlationId(); + return *this; +} + #endif void addEventList(std::vector&& profiledEvents) { diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index 04cab1327dd0..7ecf3a45f05b 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -21,13 +21,15 @@ #include #include -#ifdef USE_KINETO -#include "libkineto.h" -#endif - struct CUevent_st; typedef std::shared_ptr CUDAEventStub; +#ifdef USE_KINETO +namespace libkineto { +class TraceActivity; +} +#endif + namespace torch { namespace autograd { struct Node; @@ -507,14 +509,7 @@ struct TORCH_API KinetoEvent { // Kineto 
fields - KinetoEvent& activity(const libkineto::TraceActivity& activity) { - name_ = activity.name(); - device_index_ = activity.deviceId(); - start_us_ = activity.timestamp(); - duration_us_ = activity.duration(); - correlation_id_ = activity.correlationId(); - return *this; - } + KinetoEvent& activity(const libkineto::TraceActivity& activity); std::string name() const { return name_; From 76ee80c437969f3e10fb28239cd5fa85a8ca5e18 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Tue, 3 Nov 2020 08:48:28 -0800 Subject: [PATCH 25/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/init.cpp | 2 +- torch/csrc/autograd/profiler.cpp | 705 ++++++++---------------- torch/csrc/autograd/profiler.h | 331 ++++------- torch/csrc/autograd/profiler_kineto.cpp | 206 +++++++ torch/csrc/autograd/profiler_kineto.h | 186 +++++++ 5 files changed, 737 insertions(+), 693 deletions(-) create mode 100644 torch/csrc/autograd/profiler_kineto.cpp create mode 100644 torch/csrc/autograd/profiler_kineto.h diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index b844c4349fc6..56fefb103c37 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index 484e13b607eb..6eb6b37d11f6 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -21,60 +21,35 @@ #include -#ifdef USE_KINETO -#include "libkineto.h" -#endif - namespace torch { namespace autograd { namespace profiler { -namespace { - -enum EventIValueIdx { - 
KIND = 0, - NAME, - THREAD_ID, - HANDLE, - NODE_ID, - CPU_MEM_USAGE, - CPU_NS, - CUDA_RECORDED, - CUDA_MEM_USAGE, - CUDA_DEVICE, - CUDA_US, - SHAPES, - NUM_EVENT_IVALUE_IDX // must be last in list -}; - -enum ProfilerIValueIdx { - STATE = 0, - REPORT_INPUT_SHAPES, - PROFILE_MEMORY, - NUM_PROFILER_CFG_IVALUE_IDX // must be last in list -}; - -const std::unordered_set disable_cuda_profiling = { - "aten::view", - "aten::t", - "aten::transpose", - "aten::stride", - "aten::empty", - "aten::empty_like", - "aten::empty_strided", - "aten::as_strided", - "aten::expand", - "aten::resize_", - "aten::squeeze", - "aten::unsqueeze", - "aten::slice", - "aten::_unsafe_view", - "aten::size" -}; +std::vector prepareCallstack(const std::vector& cs) { + std::vector entries; + entries.reserve(cs.size()); + for (const auto& entry : cs) { + auto& range = entry.range; + if (range.source()) { + auto& src = range.source(); + if (src && src->filename()) { + auto line = src->starting_line_no() + + src->lineno_for_offset(range.start()); + entries.emplace_back(FileLineFunc{*(src->filename()), line, entry.filename}); + } + } + } + return entries; +} -CUDAStubs default_stubs; -constexpr CUDAStubs* default_stubs_addr = &default_stubs; -// Constant initialization, so it is guaranteed to be initialized before -// static initialization calls which may invoke registerCUDAMethods -static CUDAStubs* cuda_stubs = default_stubs_addr; +std::vector callstackStr(const std::vector& cs) { + std::vector cs_str; + cs_str.reserve(cs.size()); + for (const auto& entry : cs) { + std::stringstream loc; + loc << entry.filename << "(" << entry.line << "): " << entry.funcname; + cs_str.push_back(loc.str()); + } + return cs_str; +} // We decompose the profiler logic into the following components: // @@ -167,313 +142,254 @@ static CUDAStubs* cuda_stubs = default_stubs_addr; // - save profiling events into the profiling state // -struct FileLineFunc { - std::string filename; - size_t line; - std::string funcname; -}; - -// TODO: figure if we can use TLS -std::atomic corr_id_ {1}; -uint64_t next_correlation_id() { - return corr_id_++; -} - -#ifdef USE_KINETO -struct KinetoObserverContext : public at::ObserverContext { - int64_t startUs; - uint64_t correlationId; - uint64_t startThreadId; - uint64_t endThreadId; - c10::optional>> shapes; - int64_t sequenceNr; - uint64_t fwdThreadId; - uint8_t recFunScope; - c10::optional> stack; -}; -#endif - -std::vector prepareCallstack(const std::vector& cs) { - std::vector entries; - entries.reserve(cs.size()); - for (const auto& entry : cs) { - auto& range = entry.range; - if (range.source()) { - auto& src = range.source(); - if (src && src->filename()) { - auto line = src->starting_line_no() + - src->lineno_for_offset(range.start()); - entries.emplace_back(FileLineFunc{*(src->filename()), line, entry.filename}); - } - } - } - return entries; +namespace { +CUDAStubs default_stubs; +constexpr CUDAStubs* default_stubs_addr = &default_stubs; +// Constant initialization, so it is guaranteed to be initialized before +// static initialization calls which may invoke registerCUDAMethods +static CUDAStubs* cuda_stubs = default_stubs_addr; } -std::vector callstackStr(const std::vector& cs) { - std::vector cs_str; - cs_str.reserve(cs.size()); - for (const auto& entry : cs) { - std::stringstream loc; - loc << entry.filename << "(" << entry.line << "): " << entry.funcname; - cs_str.push_back(loc.str()); - } - return cs_str; +// Profiler state +inline const ProfilerConfig& ProfilerThreadLocalState::config() const { + return 
config_;
}

thread_event_lists ProfilerThreadLocalState::consolidate() {
  std::lock_guard<std::mutex> g(state_mutex_);
  thread_event_lists result;
  for (auto& kv : event_lists_map_) {
    auto& list = kv.second;
    result.emplace_back(list->consolidate());
  }
  // Consolidate remote events if applicable as well.
  if (remoteProfiledEvents_) {
    result.insert(
        result.end(),
        std::make_move_iterator(remoteProfiledEvents_->begin()),
        std::make_move_iterator(remoteProfiledEvents_->end()));
  }
  return result;
}

void ProfilerThreadLocalState::mark(std::string name, bool include_cuda) {
  if (config_.state == ProfilerState::Disabled) {
    return;
  }
  if (config_.state == ProfilerState::NVTX) {
    cuda_stubs->nvtxMarkA(name.c_str());
  } else {
    LegacyEvent evt(
        EventKind::Mark,
        at::StringView(std::move(name)),
        at::RecordFunction::currentThreadId(),
        include_cuda && config_.state == ProfilerState::CUDA);
    evt.setNodeId(at::RecordFunction::getDefaultNodeId());
    getEventList().record(std::move(evt));
  }
}

void ProfilerThreadLocalState::setOrAddRemoteProfiledEvents(
    std::vector<LegacyEvent>&& remoteProfiledEvents) {
  // Lock to serialize access from multiple callback threads.
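  // remoteProfiledEvents_ accumulates event lists shipped back over RPC from
  // other nodes (see the request_callback_no_python.cpp hunk earlier);
  // consolidate() above appends them after the local per-thread lists so
  // remote work lands in the same report.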
- std::lock_guard guard(state_mutex_); - if (remoteProfiledEvents_) { - (*remoteProfiledEvents_).emplace_back(remoteProfiledEvents); - } else { - remoteProfiledEvents_ = {std::move(remoteProfiledEvents)}; - } +void ProfilerThreadLocalState::pushRange( + const at::RecordFunction& fn, + const bool record_cuda, + const char* msg, + std::vector>&& shapes) { + if (config_.state == ProfilerState::Disabled) { + return; } - - void pushRange( - const at::RecordFunction& fn, - const bool record_cuda, - const char* msg = "", - std::vector>&& shapes = {}) { - if (config_.state == ProfilerState::Disabled) { - return; - } - if (config_.state == ProfilerState::NVTX) { - cuda_stubs->nvtxRangePushA(getNvtxStr( - fn.name(), msg, fn.seqNr(), shapes).c_str()); - } else { - LegacyEvent evt( - EventKind::PushRange, - fn.name(), - at::RecordFunction::currentThreadId(), - record_cuda, - fn.handle(), - std::move(shapes), - at::RecordFunction::getDefaultNodeId()); - evt.setSequenceNr(fn.seqNr()); - evt.setFwdThreadId(fn.forwardThreadId()); - evt.setScope((uint8_t)fn.scope()); + if (config_.state == ProfilerState::NVTX) { + cuda_stubs->nvtxRangePushA(getNvtxStr( + fn.name(), msg, fn.seqNr(), shapes).c_str()); + } else { + LegacyEvent evt( + EventKind::PushRange, + fn.name(), + at::RecordFunction::currentThreadId(), + record_cuda, + fn.handle(), + std::move(shapes), + at::RecordFunction::getDefaultNodeId()); + evt.setSequenceNr(fn.seqNr()); + evt.setFwdThreadId(fn.forwardThreadId()); + evt.setScope((uint8_t)fn.scope()); #ifndef C10_MOBILE - // backward nodes source range corresponds to the forward node - // TODO: consider using C++ stack trace - if (config_.with_stack && fn.scope() != at::RecordScope::BACKWARD_FUNCTION) { - auto cs = prepareCallstack(jit::currentCallstack()); - if (cs.empty()) { - cs = prepareCallstack(jit::tracer::pythonCallstack()); - } - evt.setStack(callstackStr(cs)); + // backward nodes source range corresponds to the forward node + // TODO: consider using C++ stack trace + if (config_.with_stack && fn.scope() != at::RecordScope::BACKWARD_FUNCTION) { + auto cs = prepareCallstack(jit::currentCallstack()); + if (cs.empty()) { + cs = prepareCallstack(jit::tracer::pythonCallstack()); } -#endif - getEventList().record(std::move(evt)); + evt.setStack(callstackStr(cs)); } +#endif + getEventList().record(std::move(evt)); } +} - void popRange(const at::RecordFunction& fn, const bool record_cuda) { - if (config_.state == ProfilerState::Disabled) { - return; - } - if (config_.state == ProfilerState::NVTX) { - cuda_stubs->nvtxRangePop(); - } else { - // In some cases RecordFunction (and popRange) may be - // called on a different thread than pushRange - // As a convention, we put the async pop on the original - // thread and save current thread id in pop event - LegacyEvent evt( - EventKind::PopRange, - at::StringView(""), - at::RecordFunction::currentThreadId(), - record_cuda, - fn.handle()); - evt.setNodeId(at::RecordFunction::getDefaultNodeId()); - getEventList(fn.threadId()).record(std::move(evt)); - } +void ProfilerThreadLocalState::popRange(const at::RecordFunction& fn, const bool record_cuda) { + if (config_.state == ProfilerState::Disabled) { + return; } - - void setCallbackHandle(at::CallbackHandle handle) { - handle_ = handle; + if (config_.state == ProfilerState::NVTX) { + cuda_stubs->nvtxRangePop(); + } else { + // In some cases RecordFunction (and popRange) may be + // called on a different thread than pushRange + // As a convention, we put the async pop on the original + // thread and save 
current thread id in pop event + LegacyEvent evt( + EventKind::PopRange, + at::StringView(""), + at::RecordFunction::currentThreadId(), + record_cuda, + fn.handle()); + evt.setNodeId(at::RecordFunction::getDefaultNodeId()); + getEventList(fn.threadId()).record(std::move(evt)); } +} - at::CallbackHandle callbackHandle() const { - return handle_; - } +void ProfilerThreadLocalState::setCallbackHandle(at::CallbackHandle handle) { + handle_ = handle; +} - void reportMemoryUsage( - void* /* unused */, - int64_t alloc_size, - c10::Device device) override { - if (config_.profile_memory && config_.state != ProfilerState::Disabled) { - uint64_t thread_id = at::RecordFunction::currentThreadId(); - LegacyEvent evt( - EventKind::MemoryAlloc, - at::StringView(""), - thread_id, - config_.state == ProfilerState::CUDA); - evt.updateMemoryStats(alloc_size, device); - getEventList(thread_id).record(std::move(evt)); - } - } +at::CallbackHandle ProfilerThreadLocalState::callbackHandle() const { + return handle_; +} - bool memoryProfilingEnabled() const override { - return config_.profile_memory; +void ProfilerThreadLocalState::reportMemoryUsage( + void* /* unused */, + int64_t alloc_size, + c10::Device device) { + if (config_.profile_memory && config_.state != ProfilerState::Disabled) { + uint64_t thread_id = at::RecordFunction::currentThreadId(); + LegacyEvent evt( + EventKind::MemoryAlloc, + at::StringView(""), + thread_id, + config_.state == ProfilerState::CUDA); + evt.updateMemoryStats(alloc_size, device); + getEventList(thread_id).record(std::move(evt)); } +} - void reportKinetoClientActivity( - const at::RecordFunction& fn, - const KinetoObserverContext& ctx) { -#ifdef USE_KINETO - if (config_.state == ProfilerState::KINETO) { - libkineto::ClientTraceActivity op; - op.startTime = ctx.startUs; - op.endTime = (getTime() / 1000); - op.opType = std::string(fn.name().str()); - op.device = 0; // CPU - op.correlation = ctx.correlationId; - if (ctx.shapes && !ctx.shapes->empty()) { - //op.inputDims = toStr(*ctx.shapes); // - } - //op.threadId = pthread_self(); - - { - std::lock_guard guard(state_mutex_); - kineto_client_activities_.emplace_back(std::move(op)); - kineto_events_.emplace_back(); - kineto_events_.back() - .startThreadId(ctx.startThreadId) - .endThreadId(ctx.endThreadId) - .sequenceNr(ctx.sequenceNr) - .fwdThreadId(ctx.fwdThreadId) - .scope(ctx.recFunScope); - if (ctx.stack && !ctx.stack->empty()) { - kineto_events_.back().stack(*ctx.stack); - } - } - return; - } -#endif - TORCH_CHECK(false, "Supported only in Kineto profiler"); - } +bool ProfilerThreadLocalState::memoryProfilingEnabled() const { + return config_.profile_memory; +} - private: - std::string getNvtxStr( - const at::StringView& name, - const char* msg, - int64_t sequence_nr, - const std::vector>& shapes) const { - if (sequence_nr >= 0 || shapes.size() > 0) { - std::stringstream s; +std::string ProfilerThreadLocalState::getNvtxStr( + const at::StringView& name, + const char* msg, + int64_t sequence_nr, + const std::vector>& shapes) const { + if (sequence_nr >= 0 || shapes.size() > 0) { + std::stringstream s; #ifdef __HIP_PLATFORM_HCC__ - s << name.str(); + s << name.str(); #endif - if (sequence_nr >= 0) { + if (sequence_nr >= 0) { #ifdef __HIP_PLATFORM_HCC__ - s << msg << sequence_nr; + s << msg << sequence_nr; #else - s << name.str() << msg << sequence_nr; + s << name.str() << msg << sequence_nr; #endif - } - if (shapes.size() > 0) { - s << ", sizes = ["; - for (size_t idx = 0; idx < shapes.size(); ++idx) { - if (shapes[idx].size() > 0) 
{ - s << "["; - for (size_t dim = 0; dim < shapes[idx].size(); ++dim) { - s << shapes[idx][dim]; - if (dim < shapes[idx].size() - 1) { - s << ", "; - } + } + if (shapes.size() > 0) { + s << ", sizes = ["; + for (size_t idx = 0; idx < shapes.size(); ++idx) { + if (shapes[idx].size() > 0) { + s << "["; + for (size_t dim = 0; dim < shapes[idx].size(); ++dim) { + s << shapes[idx][dim]; + if (dim < shapes[idx].size() - 1) { + s << ", "; } - s << "]"; - } else { - s << "[]"; - } - if (idx < shapes.size() - 1) { - s << ", "; } + s << "]"; + } else { + s << "[]"; + } + if (idx < shapes.size() - 1) { + s << ", "; } - s << "]"; } - return s.str(); - } else { - return name.str(); + s << "]"; } + return s.str(); + } else { + return name.str(); } +} - RangeEventList& getEventList(int64_t thread_id = -1) { - if (thread_id < 0) { - thread_id = at::RecordFunction::currentThreadId(); - } - RangeEventList* list_ptr = nullptr; - std::lock_guard guard(state_mutex_); - auto it = event_lists_map_.find(thread_id); - if (it != event_lists_map_.end()) { - list_ptr = it->second.get(); - } else { - auto event_list = std::make_shared(); - event_lists_map_[thread_id] = event_list; - list_ptr = event_list.get(); - } - return *list_ptr; +RangeEventList& ProfilerThreadLocalState::getEventList(int64_t thread_id) { + if (thread_id < 0) { + thread_id = at::RecordFunction::currentThreadId(); } + RangeEventList* list_ptr = nullptr; + std::lock_guard guard(state_mutex_); + auto it = event_lists_map_.find(thread_id); + if (it != event_lists_map_.end()) { + list_ptr = it->second.get(); + } else { + auto event_list = std::make_shared(); + event_lists_map_[thread_id] = event_list; + list_ptr = event_list.get(); + } + return *list_ptr; +} + +namespace { - std::mutex state_mutex_; - std::unordered_map> - event_lists_map_; +enum EventIValueIdx { + KIND = 0, + NAME, + THREAD_ID, + HANDLE, + NODE_ID, + CPU_MEM_USAGE, + CPU_NS, + CUDA_RECORDED, + CUDA_MEM_USAGE, + CUDA_DEVICE, + CUDA_US, + SHAPES, + NUM_EVENT_IVALUE_IDX // must be last in list +}; - ProfilerConfig config_ = ProfilerConfig(ProfilerState::Disabled); - at::CallbackHandle handle_ = 0; - c10::optional>> remoteProfiledEvents_; +enum ProfilerIValueIdx { + STATE = 0, + REPORT_INPUT_SHAPES, + PROFILE_MEMORY, + NUM_PROFILER_CFG_IVALUE_IDX // must be last in list +}; -#ifdef USE_KINETO - std::vector kineto_client_activities_; - std::vector kineto_events_; -#endif +const std::unordered_set disable_cuda_profiling = { + "aten::view", + "aten::t", + "aten::transpose", + "aten::stride", + "aten::empty", + "aten::empty_like", + "aten::empty_strided", + "aten::as_strided", + "aten::expand", + "aten::resize_", + "aten::squeeze", + "aten::unsqueeze", + "aten::slice", + "aten::_unsafe_view", + "aten::size" }; ProfilerThreadLocalState* getProfilerTLSState() { @@ -539,66 +455,6 @@ void pushProfilingCallbacksLegacy() { state_ptr->setCallbackHandle(handle); } -#ifdef USE_KINETO -void pushProfilingCallbacks() { - auto state_ptr = getProfilerTLSState(); - TORCH_INTERNAL_ASSERT(state_ptr, "Expected profiler state set"); - auto handle = at::addThreadLocalCallback(at::RecordFunctionCallback( - [](const at::RecordFunction& fn) { - auto state_ptr = getProfilerTLSState(); - if (!state_ptr || state_ptr->config().state != ProfilerState::KINETO) { - return std::make_unique(); - } - - auto corr_id = next_correlation_id(); - libkineto::api().pushCorrelationId(corr_id); - - auto ctx_ptr = std::make_unique(); - ctx_ptr->startUs = getTime() / 1000; - ctx_ptr->correlationId = corr_id; - ctx_ptr->startThreadId = 
at::RecordFunction::currentThreadId(); - - if (state_ptr->config().report_input_shapes) { - ctx_ptr->shapes = inputSizes(fn); - } - - ctx_ptr->sequenceNr = fn.seqNr(); - ctx_ptr->fwdThreadId = fn.forwardThreadId(); - ctx_ptr->recFunScope = (uint8_t)fn.scope(); - -#ifndef C10_MOBILE - // backward nodes source range corresponds to the forward node - // TODO: consider using C++ stack trace - if (state_ptr->config().with_stack && - fn.scope() != at::RecordScope::BACKWARD_FUNCTION) { - auto cs = prepareCallstack(jit::currentCallstack()); - if (cs.empty()) { - cs = prepareCallstack(jit::tracer::pythonCallstack()); - } - ctx_ptr->stack = callstackStr(cs); - } -#endif - return ctx_ptr; - }, - [](const at::RecordFunction& fn, at::ObserverContext* ctx_ptr) { - auto state_ptr = getProfilerTLSState(); - if (!state_ptr || state_ptr->config().state != ProfilerState::KINETO) { - return; - } - auto kineto_ctx_ptr = dynamic_cast(ctx_ptr); - TORCH_INTERNAL_ASSERT(kineto_ctx_ptr != nullptr); - - kineto_ctx_ptr->endThreadId = at::RecordFunction::currentThreadId(); - - state_ptr->reportKinetoClientActivity(fn, *kineto_ctx_ptr); - libkineto::api().popCorrelationId(); - }) - .needsInputs(state_ptr->config().report_input_shapes) - .needsIds(true)); - state_ptr->setCallbackHandle(handle); -} -#endif - const int kCUDAWarmupStart = 5; } // namespace @@ -649,14 +505,6 @@ bool profilerEnabled() { return state_ptr && state_ptr->config().state != ProfilerState::Disabled; } -bool kinetoAvailable() { -#ifdef USE_KINETO - return true; -#else - return false; -#endif -} - void enableProfilerLegacy(const ProfilerConfig& new_config) { TORCH_CHECK(new_config.state != ProfilerState::NVTX || cuda_stubs->enabled(), "Can't use NVTX profiler - PyTorch was compiled without CUDA"); @@ -718,91 +566,6 @@ thread_event_lists disableProfilerLegacy(c10::optional p return state_ptr->consolidate(); } -#ifdef USE_KINETO - -void prepareProfiler( - const ProfilerConfig& config, - const std::set& activities) { - TORCH_CHECK(config.state == ProfilerState::KINETO, - "Supported only in Kineto profiler"); - - std::set k_activities; - if (activities.count(ActivityType::CPU)) { - k_activities.insert(libkineto::ActivityType::EXTERNAL_CORRELATION); - k_activities.insert(libkineto::ActivityType::CUDA_RUNTIME); - } - //if (activities.count(ActivityType::CUDA_RUNTIME)) { - // k_activities.insert(libkineto::ActivityType::CUDA_RUNTIME); - //} - if (activities.count(ActivityType::CUDA)) { - k_activities.insert(libkineto::ActivityType::GPU_MEMCPY); - k_activities.insert(libkineto::ActivityType::GPU_MEMSET); - k_activities.insert(libkineto::ActivityType::CONCURRENT_KERNEL); - } - - //if (!libkineto::api().hasProfilerRegistered()) { - // libkineto::api().registerProfiler( - // std::make_unique(false)); - //} - - //libkineto::api().initProfilerIfRegistered(); - libkineto::api().prepareTrace(k_activities); -} - -void enableProfiler( - const ProfilerConfig& config, - const std::set& activities) { - TORCH_CHECK(config.state == ProfilerState::KINETO); - TORCH_CHECK(!activities.empty(), "No activities specified for Kineto profiler"); - - auto state_ptr = getProfilerTLSState(); - TORCH_CHECK(!state_ptr, "Profiler is already enabled on this thread"); - auto state = std::make_shared(config); - c10::ThreadLocalDebugInfo::_push(c10::DebugInfoKind::PROFILER_STATE, state); - - if (activities.count(ActivityType::CPU)) { - pushProfilingCallbacks(); - } - - if (!libkineto::api().traceActive()) { - libkineto::api().startTrace(); - } - - state->mark("__start_profile", false); -} - 
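
The Kineto entry points removed above reappear in profiler_kineto.cpp later in this series. The lifecycle they implement has three phases: prepareTrace() selects activity types before any collection starts, startTrace() begins collection, and stopTrace() flushes it. As a rough illustration of the pairing, here is a minimal RAII sketch in plain C++; TraceHooks and its members are hypothetical stand-ins, not the libkineto API:

```cpp
#include <functional>
#include <utility>

// Hypothetical stand-ins for prepareTrace/startTrace/stopTrace.
struct TraceHooks {
  std::function<void()> prepare;
  std::function<void()> start;
  std::function<void()> stop;
};

// Scope guard mirroring the enableProfiler()/disableProfiler() pairing
// above: collection is configured and started on entry and always
// stopped on scope exit, even if the profiled code throws.
class TraceScope {
 public:
  explicit TraceScope(TraceHooks hooks) : hooks_(std::move(hooks)) {
    if (hooks_.prepare) hooks_.prepare();
    if (hooks_.start) hooks_.start();
  }
  ~TraceScope() {
    if (hooks_.stop) hooks_.stop();
  }
  TraceScope(const TraceScope&) = delete;
  TraceScope& operator=(const TraceScope&) = delete;

 private:
  TraceHooks hooks_;
};

// Usage sketch:
//   TraceScope scope({prepareFn, startFn, stopFn});
//   ... run the workload to be profiled ...
```

The Python-side profile context manager gives the same guarantee at the user level through its __enter__/__exit__ pair.
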
-ProfilerResult disableProfiler() {
-  // all the DebugInfoBase objects are scope based and supposed to use DebugInfoGuard
-  auto state = c10::ThreadLocalDebugInfo::_pop(c10::DebugInfoKind::PROFILER_STATE);
-
-  auto state_ptr = static_cast<ProfilerThreadLocalState*>(state.get());
-  TORCH_CHECK(state_ptr && state_ptr->config().state == ProfilerState::KINETO,
-      "Can't disable Kineto profiler when it's not running");
-
-  if (state_ptr->callbackHandle() > 0) {
-    at::removeCallback(state_ptr->callbackHandle());
-  }
-
-  state_ptr->mark("__stop_profile");
-
-  //auto trace = std::move(libkineto::api().stopTrace());
-  libkineto::api().stopTrace();
-  std::vector<std::shared_ptr<KinetoEvent>> kineto_events; // = filterTrace(trace);
-  auto legacy_events = state_ptr->consolidate();
-  return ProfilerResult(kineto_events, legacy_events);
-}
-
-KinetoEvent& KinetoEvent::activity(const libkineto::TraceActivity& activity) {
-  name_ = activity.name();
-  device_index_ = activity.deviceId();
-  start_us_ = activity.timestamp();
-  duration_us_ = activity.duration();
-  correlation_id_ = activity.correlationId();
-  return *this;
-}
-
-#endif
-
 void addEventList(std::vector<LegacyEvent>&& profiledEvents) {
   auto state_ptr = getProfilerTLSState();
   TORCH_CHECK(state_ptr, "Profiler must be enabled.");
diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h
index 7ecf3a45f05b..b2468b158694 100644
--- a/torch/csrc/autograd/profiler.h
+++ b/torch/csrc/autograd/profiler.h
@@ -19,17 +19,12 @@
 #endif
 #include
-#include
+
+#include

 struct CUevent_st;
 typedef std::shared_ptr<CUevent_st> CUDAEventStub;

-#ifdef USE_KINETO
-namespace libkineto {
-class TraceActivity;
-}
-#endif
-
 namespace torch { namespace autograd {

 struct Node;

@@ -95,61 +90,6 @@ inline int64_t getTime() {
 #endif
 }

-// A struct to control settings of disableProfiler options.
-struct TORCH_API ProfilerDisableOptions {
-  ProfilerDisableOptions() = default;
-  ProfilerDisableOptions(bool shouldCleanupTLSState, bool shouldConsolidate)
-      : cleanupTLSState(shouldCleanupTLSState),
-        consolidate(shouldConsolidate) {}
-  // Whether we should clean up profiler states that are thread local, such as
-  // ThreadLocalDebugInfo and thread local RecordFunction callbacks.
-  bool cleanupTLSState = true;
-  // Whether we should consolidate all currently recorded profiled events. If
-  // false, will not consolidate and other threads can continue to write to the
-  // event lists.
-  bool consolidate = true;
-};
-
-enum class C10_API_ENUM ProfilerState {
-  Disabled = 0,
-  CPU, // CPU-only profiling
-  CUDA, // CPU + CUDA events
-  NVTX, // only emit NVTX markers
-  KINETO, // use libkineto
-  NUM_PROFILER_STATES, // must be the last one
-};
-
-enum class C10_API_ENUM ActivityType {
-  CPU = 0,
-  // CUDA_RUNTIME, // CUDA host events
-  CUDA, // CUDA kernels
-  NUM_KINETO_ACTIVITIES, // must be the last one
-};
-
-struct TORCH_API ProfilerConfig {
-  ProfilerConfig(
-      ProfilerState state,
-      bool report_input_shapes = false,
-      bool profile_memory = false,
-      bool with_stack = false)
-      : state(state),
-        report_input_shapes(report_input_shapes),
-        profile_memory(profile_memory),
-        with_stack(with_stack) {}
-  ~ProfilerConfig();
-  ProfilerState state;
-  bool report_input_shapes;
-  bool profile_memory;
-  bool with_stack;
-
-  // Returns IValues corresponding to ProfilerConfig struct, to be used for
-  // serialization.
-  at::IValue toIValue() const;
-
-  // Reconstructs a ProfilerConfig from IValues given by toIValue.
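-  // (The serialized form is an IValue list indexed by the ProfilerIValueIdx
-  // enum in profiler.cpp: STATE, REPORT_INPUT_SHAPES, PROFILE_MEMORY;
-  // presumably this is what lets a config cross a process boundary for
-  // remote profiling, cf. setOrAddRemoteProfiledEvents above.)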
- static ProfilerConfig fromIValue(const at::IValue& profilerConfigIValue); -}; - enum class C10_API_ENUM EventKind : uint16_t { Mark, PushRange, @@ -409,6 +349,54 @@ struct RangeEventList { static const size_t kReservedCapacity = 1024; }; +enum class C10_API_ENUM ProfilerState { + Disabled = 0, + CPU, // CPU-only profiling + CUDA, // CPU + CUDA events + NVTX, // only emit NVTX markers + KINETO, // use libkineto + NUM_PROFILER_STATES, // must be the last one +}; + +struct TORCH_API ProfilerConfig { + ProfilerConfig( + ProfilerState state, + bool report_input_shapes = false, + bool profile_memory = false, + bool with_stack = false) + : state(state), + report_input_shapes(report_input_shapes), + profile_memory(profile_memory), + with_stack(with_stack) {} + ~ProfilerConfig(); + ProfilerState state; + bool report_input_shapes; + bool profile_memory; + bool with_stack; + + // Returns IValues corresponding to ProfilerConfig struct, to be used for + // serialization. + at::IValue toIValue() const; + + // Reconstructs a ProfilerConfig from IValues given by toIValue. + static ProfilerConfig fromIValue(const at::IValue& profilerConfigIValue); +}; + +// A struct to control settings of disableProfiler options. +struct TORCH_API ProfilerDisableOptions { + ProfilerDisableOptions() = default; + ProfilerDisableOptions(bool shouldCleanupTLSState, bool shouldConsolidate) + : cleanupTLSState(shouldCleanupTLSState), + consolidate(shouldConsolidate) {} + // Whether we should clean up profiler states that are thread local, such as + // ThreadLocalDebugInfo and thread local RecordFunction callbacks. + bool cleanupTLSState = true; + // Whether we should consolidate all currently recorded profiled events. If + // false, will not consolidate and other threads can continue to write to the + // event lists. + bool consolidate = true; +}; + // NOTE: profiler mode is thread local, with automatic propagation // across thread boundary (e.g. at::launch tasks) TORCH_API void enableProfilerLegacy(const ProfilerConfig&); @@ -425,165 +413,6 @@ TORCH_API ProfilerConfig getProfilerConfig(); // Writes profiled events to a stream. 
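 // (See the RecordProfile usage note further below; that guard appears to be
 // the main consumer of this hook, dumping collected events when it goes out
 // of scope.)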
TORCH_API void writeProfilerEventsToStream(std::ostream& out, const std::vector& events); -#ifdef USE_KINETO -struct TORCH_API KinetoEvent { - uint64_t startThreadId() const { - return start_thread_id_; - } - - uint64_t endThreadId() const { - return end_thread_id_; - } - - c10::DeviceType deviceType() const { - return device_type_; - } - - uint64_t fwdThreadId() const { - return fwd_thread_id_; - } - - bool hasShapes() const { - return shapes_ != c10::nullopt; - } - - const std::vector>& shapes() const { - return *shapes_; - } - - int64_t sequenceNr() const { - return sequence_nr_; - } - - bool hasStack() const { - return stack_ != c10::nullopt; - } - - const std::vector& stack() const { - return *stack_; - } - - uint8_t scope() const { - return scope_; - } - - KinetoEvent& startThreadId(uint64_t start_thread_id) { - start_thread_id_ = start_thread_id; - return *this; - } - - KinetoEvent& endThreadId(uint64_t end_thread_id) { - end_thread_id_ = end_thread_id; - return *this; - } - - KinetoEvent& deviceType(c10::DeviceType device_type) { - device_type_ = device_type; - return *this; - } - - KinetoEvent& fwdThreadId(uint64_t fwd_thread_id) { - fwd_thread_id_ = fwd_thread_id; - return *this; - } - - KinetoEvent& shapes(const std::vector>& shapes) { - *shapes_ = shapes; - return *this; - } - - KinetoEvent& sequenceNr(int64_t sequence_nr) { - sequence_nr_ = sequence_nr_; - return *this; - } - - KinetoEvent& stack(const std::vector& st) { - *stack_ = st; - return *this; - } - - KinetoEvent& scope(uint8_t scope) { - scope_ = scope; - return *this; - } - - // Kineto fields - - KinetoEvent& activity(const libkineto::TraceActivity& activity); - - std::string name() const { - return name_; - } - - uint64_t deviceIndex() const { - return device_index_; - } - - uint64_t startUs() const { - return start_us_; - } - - uint64_t durationUs() const { - return duration_us_; - } - - uint64_t correlationId() const { - return correlation_id_; - } - - KinetoEvent& correlationId(uint64_t correlation_id) { - correlation_id_ = correlation_id; - return *this; - } - - private: - uint64_t start_thread_id_ = 0; - uint64_t end_thread_id_ = 0; - uint64_t fwd_thread_id_ = 0; - int64_t sequence_nr_ = 0; - uint8_t scope_ = 0; - - c10::DeviceType device_type_ = c10::DeviceType::CPU; - c10::optional>> shapes_; - c10::optional> stack_; - - std::string name_; - uint64_t device_index_ = 0; - uint64_t start_us_ = 0; - uint64_t duration_us_ = 0; - uint64_t correlation_id_ = 0; -}; - -struct TORCH_API ProfilerResult { - ProfilerResult( - const std::vector>& events, - const thread_event_lists& legacy_events) - : events_(events), legacy_events_(legacy_events) {} - - const std::vector> events() const { - return events_; - } - - const thread_event_lists& legacy_events() const { - return legacy_events_; - } - - private: - std::vector> events_; - thread_event_lists legacy_events_; // tensor mem alloc, start/stop -}; -TORCH_API void enableProfiler( - const ProfilerConfig& config, - const std::set& activities); -TORCH_API ProfilerResult disableProfiler(); - -TORCH_API void prepareProfiler( - const ProfilerConfig& config, - const std::set& activities); -#endif // USE_KINETO - -TORCH_API bool kinetoAvailable(); - // Usage: // { // RecordProfile guard("filename.trace"); @@ -638,5 +467,65 @@ struct TORCH_API TLSProfilerGuard { const c10::optional profilerDisableOptions_; }; +struct FileLineFunc { + std::string filename; + size_t line; + std::string funcname; +}; +std::vector prepareCallstack(const std::vector& cs); +std::vector 
callstackStr(const std::vector& cs); +std::vector> inputSizes(const at::RecordFunction& fn); + +struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { + explicit ProfilerThreadLocalState(const ProfilerConfig& config) + : config_(config), remoteProfiledEvents_{c10::nullopt} {} + ~ProfilerThreadLocalState() override = default; + + inline const ProfilerConfig& config() const; + + thread_event_lists consolidate(); + + void mark(std::string name, bool include_cuda = true); + + void setOrAddRemoteProfiledEvents( + std::vector&& remoteProfiledEvents); + + void pushRange( + const at::RecordFunction& fn, + const bool record_cuda, + const char* msg = "", + std::vector>&& shapes = {}); + + void popRange(const at::RecordFunction& fn, const bool record_cuda); + + void setCallbackHandle(at::CallbackHandle handle); + + at::CallbackHandle callbackHandle() const; + + void reportMemoryUsage( + void* /* unused */, + int64_t alloc_size, + c10::Device device) override; + + bool memoryProfilingEnabled() const override; + private: + std::string getNvtxStr( + const at::StringView& name, + const char* msg, + int64_t sequence_nr, + const std::vector>& shapes) const; + + RangeEventList& getEventList(int64_t thread_id = -1); + + std::mutex state_mutex_; + std::unordered_map> + event_lists_map_; + + ProfilerConfig config_ = ProfilerConfig(ProfilerState::Disabled); + at::CallbackHandle handle_ = 0; + c10::optional>> remoteProfiledEvents_; +}; + + } // namespace profiler }} // namespace torch::autograd diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp new file mode 100644 index 000000000000..171cea0d3496 --- /dev/null +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -0,0 +1,206 @@ +#include + +#ifdef USE_KINETO +#include "libkineto.h" + +namespace torch { namespace autograd { namespace profiler { + +namespace { +// TODO: TLS +std::atomic corr_id_ {1}; +uint64_t next_correlation_id() { + return corr_id_++; +} + +struct KinetoObserverContext : public at::ObserverContext { + int64_t startUs; + uint64_t correlationId; + uint64_t startThreadId; + uint64_t endThreadId; + c10::optional>> shapes; + int64_t sequenceNr; + uint64_t fwdThreadId; + uint8_t recFunScope; + c10::optional> stack; +}; + +void reportKinetoClientActivity( + const at::RecordFunction& fn, + const KinetoObserverContext& ctx) { + /*TORCH_CHECK((config_.state == ProfilerState::KINETO, + "Supported only in Kineto profiler"); + op.startTime = ctx.startUs; + op.endTime = (getTime() / 1000); + op.opType = std::string(fn.name().str()); + op.device = 0; // CPU + op.correlation = ctx.correlationId; + if (ctx.shapes && !ctx.shapes->empty()) { + //op.inputDims = toStr(*ctx.shapes); // + } + //op.threadId = pthread_self(); + + { + std::lock_guard guard(state_mutex_); + kineto_client_activities_.emplace_back(std::move(op)); + kineto_events_.emplace_back(); + kineto_events_.back() + .startThreadId(ctx.startThreadId) + .endThreadId(ctx.endThreadId) + .sequenceNr(ctx.sequenceNr) + .fwdThreadId(ctx.fwdThreadId) + .scope(ctx.recFunScope); + if (ctx.stack && !ctx.stack->empty()) { + kineto_events_.back().stack(*ctx.stack); + } + }*/ +} + +void pushProfilingCallbacks() { + auto state_ptr = getProfilerTLSState(); + TORCH_INTERNAL_ASSERT(state_ptr, "Expected profiler state set"); + auto handle = at::addThreadLocalCallback(at::RecordFunctionCallback( + [](const at::RecordFunction& fn) { + auto state_ptr = getProfilerTLSState(); + if (!state_ptr || state_ptr->config().state != ProfilerState::KINETO) { + return 
std::make_unique(); + } + + auto corr_id = next_correlation_id(); + libkineto::api().pushCorrelationId(corr_id); + + auto ctx_ptr = std::make_unique(); + ctx_ptr->startUs = getTime() / 1000; + ctx_ptr->correlationId = corr_id; + ctx_ptr->startThreadId = at::RecordFunction::currentThreadId(); + + if (state_ptr->config().report_input_shapes) { + ctx_ptr->shapes = inputSizes(fn); + } + + ctx_ptr->sequenceNr = fn.seqNr(); + ctx_ptr->fwdThreadId = fn.forwardThreadId(); + ctx_ptr->recFunScope = (uint8_t)fn.scope(); + +#ifndef C10_MOBILE + // backward nodes source range corresponds to the forward node + // TODO: consider using C++ stack trace + if (state_ptr->config().with_stack && + fn.scope() != at::RecordScope::BACKWARD_FUNCTION) { + auto cs = prepareCallstack(jit::currentCallstack()); + if (cs.empty()) { + cs = prepareCallstack(jit::tracer::pythonCallstack()); + } + ctx_ptr->stack = callstackStr(cs); + } +#endif + return ctx_ptr; + }, + [](const at::RecordFunction& fn, at::ObserverContext* ctx_ptr) { + auto state_ptr = getProfilerTLSState(); + if (!state_ptr || state_ptr->config().state != ProfilerState::KINETO) { + return; + } + auto kineto_ctx_ptr = dynamic_cast(ctx_ptr); + TORCH_INTERNAL_ASSERT(kineto_ctx_ptr != nullptr); + + kineto_ctx_ptr->endThreadId = at::RecordFunction::currentThreadId(); + + state_ptr->reportKinetoClientActivity(fn, *kineto_ctx_ptr); + libkineto::api().popCorrelationId(); + }) + .needsInputs(state_ptr->config().report_input_shapes) + .needsIds(true)); + state_ptr->setCallbackHandle(handle); +} + +} // namespace + +void prepareProfiler( + const ProfilerConfig& config, + const std::set& activities) { + TORCH_CHECK(config.state == ProfilerState::KINETO, + "Supported only in Kineto profiler"); + + std::set k_activities; + if (activities.count(ActivityType::CPU)) { + k_activities.insert(libkineto::ActivityType::EXTERNAL_CORRELATION); + k_activities.insert(libkineto::ActivityType::CUDA_RUNTIME); + } + //if (activities.count(ActivityType::CUDA_RUNTIME)) { + // k_activities.insert(libkineto::ActivityType::CUDA_RUNTIME); + //} + if (activities.count(ActivityType::CUDA)) { + k_activities.insert(libkineto::ActivityType::GPU_MEMCPY); + k_activities.insert(libkineto::ActivityType::GPU_MEMSET); + k_activities.insert(libkineto::ActivityType::CONCURRENT_KERNEL); + } + + //if (!libkineto::api().hasProfilerRegistered()) { + // libkineto::api().registerProfiler( + // std::make_unique(false)); + //} + //libkineto::api().initProfilerIfRegistered(); + libkineto::api().prepareTrace(k_activities); +} + +void enableProfiler( + const ProfilerConfig& config, + const std::set& activities) { + TORCH_CHECK(config.state == ProfilerState::KINETO); + TORCH_CHECK(!activities.empty(), "No activities specified for Kineto profiler"); + + auto state_ptr = getProfilerTLSState(); + TORCH_CHECK(!state_ptr, "Profiler is already enabled on this thread"); + auto state = std::make_shared(config); + c10::ThreadLocalDebugInfo::_push(c10::DebugInfoKind::PROFILER_STATE, state); + + if (activities.count(ActivityType::CPU)) { + pushProfilingCallbacks(); + } + + if (!libkineto::api().traceActive()) { + libkineto::api().startTrace(); + } + + state->mark("__start_profile", false); +} + +ProfilerResult disableProfiler() { + // all the DebugInfoBase objects are scope based and supposed to use DebugInfoGuard + auto state = c10::ThreadLocalDebugInfo::_pop(c10::DebugInfoKind::PROFILER_STATE); + + auto state_ptr = static_cast(state.get()); + TORCH_CHECK(state_ptr && state_ptr->config().state == ProfilerState::KINETO, + "Can't 
disable Kineto profiler when it's not running"); + + if (state_ptr->callbackHandle() > 0) { + at::removeCallback(state_ptr->callbackHandle()); + } + + state_ptr->mark("__stop_profile"); + + //auto trace = std::move(libkineto::api().stopTrace()); + libkineto::api().stopTrace(); + std::vector> kineto_events; // = filterTrace(trace); + auto legacy_events = state_ptr->consolidate(); + return ProfilerResult(kineto_events, legacy_events); +} + +KinetoEvent& KinetoEvent::activity(const libkineto::TraceActivity& activity) { + name_ = activity.name(); + device_index_ = activity.deviceId(); + start_us_ = activity.timestamp(); + duration_us_ = activity.duration(); + correlation_id_ = activity.correlationId(); + return *this; +} + +bool kinetoAvailable() { +#ifdef USE_KINETO + return true; +#else + return false; +#endif +} + +}}} diff --git a/torch/csrc/autograd/profiler_kineto.h b/torch/csrc/autograd/profiler_kineto.h new file mode 100644 index 000000000000..671a2dc5cc0e --- /dev/null +++ b/torch/csrc/autograd/profiler_kineto.h @@ -0,0 +1,186 @@ +#pragma once + +#include + +#include + +#ifdef USE_KINETO +namespace libkineto { +class TraceActivity; +} +#endif + +namespace torch { +namespace autograd { +namespace profiler { + +enum class C10_API_ENUM ActivityType { + CPU = 0, + // CUDA_RUNTIME, // CUDA host events + CUDA, // CUDA kernels + NUM_KINETO_ACTIVITIES, // must be the last one +}; + +#ifdef USE_KINETO +struct TORCH_API KinetoEvent { + uint64_t startThreadId() const { + return start_thread_id_; + } + + uint64_t endThreadId() const { + return end_thread_id_; + } + + c10::DeviceType deviceType() const { + return device_type_; + } + + uint64_t fwdThreadId() const { + return fwd_thread_id_; + } + + bool hasShapes() const { + return shapes_ != c10::nullopt; + } + + const std::vector>& shapes() const { + return *shapes_; + } + + int64_t sequenceNr() const { + return sequence_nr_; + } + + bool hasStack() const { + return stack_ != c10::nullopt; + } + + const std::vector& stack() const { + return *stack_; + } + + uint8_t scope() const { + return scope_; + } + + KinetoEvent& startThreadId(uint64_t start_thread_id) { + start_thread_id_ = start_thread_id; + return *this; + } + + KinetoEvent& endThreadId(uint64_t end_thread_id) { + end_thread_id_ = end_thread_id; + return *this; + } + + KinetoEvent& deviceType(c10::DeviceType device_type) { + device_type_ = device_type; + return *this; + } + + KinetoEvent& fwdThreadId(uint64_t fwd_thread_id) { + fwd_thread_id_ = fwd_thread_id; + return *this; + } + + KinetoEvent& shapes(const std::vector>& shapes) { + *shapes_ = shapes; + return *this; + } + + KinetoEvent& sequenceNr(int64_t sequence_nr) { + sequence_nr_ = sequence_nr_; + return *this; + } + + KinetoEvent& stack(const std::vector& st) { + *stack_ = st; + return *this; + } + + KinetoEvent& scope(uint8_t scope) { + scope_ = scope; + return *this; + } + + // Kineto fields + + KinetoEvent& activity(const libkineto::TraceActivity& activity); + + std::string name() const { + return name_; + } + + uint64_t deviceIndex() const { + return device_index_; + } + + uint64_t startUs() const { + return start_us_; + } + + uint64_t durationUs() const { + return duration_us_; + } + + uint64_t correlationId() const { + return correlation_id_; + } + + KinetoEvent& correlationId(uint64_t correlation_id) { + correlation_id_ = correlation_id; + return *this; + } + + private: + uint64_t start_thread_id_ = 0; + uint64_t end_thread_id_ = 0; + uint64_t fwd_thread_id_ = 0; + int64_t sequence_nr_ = 0; + uint8_t scope_ = 0; + + 
c10::DeviceType device_type_ = c10::DeviceType::CPU; + c10::optional>> shapes_; + c10::optional> stack_; + + std::string name_; + uint64_t device_index_ = 0; + uint64_t start_us_ = 0; + uint64_t duration_us_ = 0; + uint64_t correlation_id_ = 0; +}; + +struct TORCH_API ProfilerResult { + ProfilerResult( + const std::vector>& events, + const thread_event_lists& legacy_events) + : events_(events), legacy_events_(legacy_events) {} + + const std::vector> events() const { + return events_; + } + + const thread_event_lists& legacy_events() const { + return legacy_events_; + } + + private: + std::vector> events_; + thread_event_lists legacy_events_; // tensor mem alloc, start/stop +}; + +TORCH_API void enableProfiler( + const ProfilerConfig& config, + const std::set& activities); + +TORCH_API ProfilerResult disableProfiler(); + +TORCH_API void prepareProfiler( + const ProfilerConfig& config, + const std::set& activities); +#endif // USE_KINETO + +TORCH_API bool kinetoAvailable(); + +} // namespace profiler +}} // namespace torch::autograd From 5761ea2617caec4f7a0181827e61b156ed99fbf6 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Tue, 3 Nov 2020 09:07:37 -0800 Subject: [PATCH 26/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- tools/build_variables.bzl | 1 + torch/csrc/autograd/profiler.h | 15 ++++++++++----- torch/csrc/autograd/profiler_kineto.cpp | 11 ++++++++++- 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 65f5ec1c6903..fd04c1326263 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -75,6 +75,7 @@ jit_core_sources = [ core_sources_common = [ "torch/csrc/autograd/profiler.cpp", + "torch/csrc/autograd/profiler_kineto.cpp", "torch/csrc/jit/frontend/edit_distance.cpp", "torch/csrc/jit/frontend/string_to_type.cpp", "torch/csrc/jit/mobile/type_parser.cpp", diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index b2468b158694..ca0a42f823a1 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -467,16 +467,16 @@ struct TORCH_API TLSProfilerGuard { const c10::optional profilerDisableOptions_; }; -struct FileLineFunc { +struct TORCH_API FileLineFunc { std::string filename; size_t line; std::string funcname; }; -std::vector prepareCallstack(const std::vector& cs); -std::vector callstackStr(const std::vector& cs); -std::vector> inputSizes(const at::RecordFunction& fn); +TORCH_API std::vector prepareCallstack(const std::vector& cs); +TORCH_API std::vector callstackStr(const std::vector& cs); +TORCH_API std::vector> inputSizes(const at::RecordFunction& fn); -struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { +struct TORCH_API ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { explicit ProfilerThreadLocalState(const ProfilerConfig& config) : config_(config), remoteProfiledEvents_{c10::nullopt} {} ~ProfilerThreadLocalState() override = default; @@ -508,6 +508,11 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { c10::Device device) override; bool memoryProfilingEnabled() const override; + + virtual void reportClientActivity( + const at::RecordFunction& fn, + const at::ObserverContext& ctx) {} + private: std::string getNvtxStr( const at::StringView& name, diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp index 171cea0d3496..244f3a851340 100644 --- a/torch/csrc/autograd/profiler_kineto.cpp +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -1,5 +1,8 @@ #include +#include +#include + #ifdef USE_KINETO #include "libkineto.h" @@ -55,6 +58,11 @@ void reportKinetoClientActivity( }*/ } +ProfilerThreadLocalState* getProfilerTLSState() 
{ + const auto& state = c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::PROFILER_STATE); + return dynamic_cast(state.get()); +} + void pushProfilingCallbacks() { auto state_ptr = getProfilerTLSState(); TORCH_INTERNAL_ASSERT(state_ptr, "Expected profiler state set"); @@ -105,7 +113,7 @@ void pushProfilingCallbacks() { kineto_ctx_ptr->endThreadId = at::RecordFunction::currentThreadId(); - state_ptr->reportKinetoClientActivity(fn, *kineto_ctx_ptr); + //state_ptr->reportKinetoClientActivity(fn, *kineto_ctx_ptr); libkineto::api().popCorrelationId(); }) .needsInputs(state_ptr->config().report_input_shapes) @@ -194,6 +202,7 @@ KinetoEvent& KinetoEvent::activity(const libkineto::TraceActivity& activity) { correlation_id_ = activity.correlationId(); return *this; } +#endif bool kinetoAvailable() { #ifdef USE_KINETO From dde5ec3c7684893530774ec04469077a829ec7d5 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Tue, 3 Nov 2020 09:20:11 -0800 Subject: [PATCH 27/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.cpp | 38 ++++++++++++++++---------------- torch/csrc/autograd/profiler.h | 2 +- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index 6eb6b37d11f6..2fe3bdb451cc 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -151,7 +151,7 @@ static CUDAStubs* cuda_stubs = default_stubs_addr; } // Profiler state -inline const ProfilerConfig& ProfilerThreadLocalState::config() const { +const ProfilerConfig& ProfilerThreadLocalState::config() const { return config_; } @@ -349,6 +349,24 @@ RangeEventList& ProfilerThreadLocalState::getEventList(int64_t thread_id) { return *list_ptr; } +std::vector> inputSizes(const at::RecordFunction& fn) { + std::vector> sizes; + sizes.reserve(fn.inputs().size()); + for (const c10::IValue& input : fn.inputs()) { + if (!input.isTensor()) { + sizes.emplace_back(); + continue; + } + const at::Tensor& tensor = input.toTensor(); + if (tensor.defined()) { + sizes.push_back(input.toTensor().sizes().vec()); + } else { + sizes.emplace_back(); + } + } + return sizes; +} + namespace { enum EventIValueIdx { @@ -397,24 +415,6 @@ ProfilerThreadLocalState* getProfilerTLSState() { return dynamic_cast(state.get()); } -std::vector> inputSizes(const at::RecordFunction& fn) { - std::vector> sizes; - sizes.reserve(fn.inputs().size()); - for (const c10::IValue& input : fn.inputs()) { - if (!input.isTensor()) { - sizes.emplace_back(); - continue; - } - const at::Tensor& tensor = input.toTensor(); - if (tensor.defined()) { - sizes.push_back(input.toTensor().sizes().vec()); - } else { - sizes.emplace_back(); - } - } - return sizes; -} - void pushProfilingCallbacksLegacy() { auto state_ptr = getProfilerTLSState(); TORCH_INTERNAL_ASSERT(state_ptr, "Expected profiler state set"); diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index ca0a42f823a1..1c0694e3c8a9 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -481,7 +481,7 @@ struct TORCH_API ProfilerThreadLocalState : public c10::MemoryReportingInfoBase : config_(config), remoteProfiledEvents_{c10::nullopt} {} ~ProfilerThreadLocalState() override = default; - inline const ProfilerConfig& config() const; + const ProfilerConfig& config() const; thread_event_lists consolidate(); From 3a25bd2a0a81cb08dc327789cab1639042354a7e Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Tue, 3 Nov 2020 09:50:54 
-0800 Subject: [PATCH 28/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.h | 4 +- torch/csrc/autograd/profiler_kineto.cpp | 83 ++++++++++++------------- torch/csrc/autograd/profiler_kineto.h | 13 ++++ 3 files changed, 56 insertions(+), 44 deletions(-) diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index 1c0694e3c8a9..657a990019fa 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -511,9 +511,9 @@ struct TORCH_API ProfilerThreadLocalState : public c10::MemoryReportingInfoBase virtual void reportClientActivity( const at::RecordFunction& fn, - const at::ObserverContext& ctx) {} + const at::ObserverContext* ctx) {} - private: + protected: std::string getNvtxStr( const at::StringView& name, const char* msg, diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp index 244f3a851340..b6d1acc49c92 100644 --- a/torch/csrc/autograd/profiler_kineto.cpp +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -15,48 +15,47 @@ uint64_t next_correlation_id() { return corr_id_++; } -struct KinetoObserverContext : public at::ObserverContext { - int64_t startUs; - uint64_t correlationId; - uint64_t startThreadId; - uint64_t endThreadId; - c10::optional>> shapes; - int64_t sequenceNr; - uint64_t fwdThreadId; - uint8_t recFunScope; - c10::optional> stack; -}; - -void reportKinetoClientActivity( - const at::RecordFunction& fn, - const KinetoObserverContext& ctx) { - /*TORCH_CHECK((config_.state == ProfilerState::KINETO, - "Supported only in Kineto profiler"); - op.startTime = ctx.startUs; - op.endTime = (getTime() / 1000); 
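-  // (Note: getTime() returns nanoseconds, so the division by 1000 here and
-  // for ctx.startUs produces microsecond timestamps, matching the "Us"
-  // suffix on the KinetoObserverContext fields.)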
- op.opType = std::string(fn.name().str()); - op.device = 0; // CPU - op.correlation = ctx.correlationId; - if (ctx.shapes && !ctx.shapes->empty()) { - //op.inputDims = toStr(*ctx.shapes); // - } - //op.threadId = pthread_self(); - - { - std::lock_guard guard(state_mutex_); - kineto_client_activities_.emplace_back(std::move(op)); - kineto_events_.emplace_back(); - kineto_events_.back() - .startThreadId(ctx.startThreadId) - .endThreadId(ctx.endThreadId) - .sequenceNr(ctx.sequenceNr) - .fwdThreadId(ctx.fwdThreadId) - .scope(ctx.recFunScope); - if (ctx.stack && !ctx.stack->empty()) { - kineto_events_.back().stack(*ctx.stack); +struct TORCH_API KinetoThreadLocalState : public ProfilerThreadLocalState { + using ProfilerThreadLocalState::ProfilerThreadLocalState; + virtual ~KinetoThreadLocalState() override = default; + + virtual void reportClientActivity( + const at::RecordFunction& fn, + const at::ObserverContext* observer_ctx) override { + auto ctx = dynamic_cast(observer_ctx); + TORCH_CHECK(ctx); + TORCH_CHECK(config_.state == ProfilerState::KINETO, + "Supported only in Kineto profiler"); + libkineto::ClientTraceActivity op; + op.startTime = ctx->startUs; + op.endTime = (getTime() / 1000); + op.opType = std::string(fn.name().str()); + op.device = 0; // CPU + op.correlation = ctx->correlationId; + if (ctx->shapes && !ctx->shapes->empty()) { + //op.inputDims = toStr(*ctx->shapes); // } - }*/ -} + //op.threadId = pthread_self(); + + { + std::lock_guard guard(state_mutex_); + kineto_client_activities_.emplace_back(std::move(op)); + kineto_events_.emplace_back(); + kineto_events_.back() + .startThreadId(ctx->startThreadId) + .endThreadId(ctx->endThreadId) + .sequenceNr(ctx->sequenceNr) + .fwdThreadId(ctx->fwdThreadId) + .scope(ctx->recFunScope); + if (ctx->stack && !ctx->stack->empty()) { + kineto_events_.back().stack(*ctx->stack); + } + } + } + + std::vector kineto_client_activities_; + std::vector kineto_events_; +}; ProfilerThreadLocalState* getProfilerTLSState() { const auto& state = c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::PROFILER_STATE); @@ -113,7 +112,7 @@ void pushProfilingCallbacks() { kineto_ctx_ptr->endThreadId = at::RecordFunction::currentThreadId(); - //state_ptr->reportKinetoClientActivity(fn, *kineto_ctx_ptr); + state_ptr->reportClientActivity(fn, kineto_ctx_ptr); libkineto::api().popCorrelationId(); }) .needsInputs(state_ptr->config().report_input_shapes) diff --git a/torch/csrc/autograd/profiler_kineto.h b/torch/csrc/autograd/profiler_kineto.h index 671a2dc5cc0e..090069ea00ee 100644 --- a/torch/csrc/autograd/profiler_kineto.h +++ b/torch/csrc/autograd/profiler_kineto.h @@ -22,6 +22,19 @@ enum class C10_API_ENUM ActivityType { }; #ifdef USE_KINETO + +struct KinetoObserverContext : public at::ObserverContext { + int64_t startUs; + uint64_t correlationId; + uint64_t startThreadId; + uint64_t endThreadId; + c10::optional>> shapes; + int64_t sequenceNr; + uint64_t fwdThreadId; + uint8_t recFunScope; + c10::optional> stack; +}; + struct TORCH_API KinetoEvent { uint64_t startThreadId() const { return start_thread_id_; From 6023998d80e2d6c13fb674614aaf6c8ab41e0479 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Tue, 3 Nov 2020 09:54:08 -0800 Subject: [PATCH 29/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ 
------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler_kineto.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp index b6d1acc49c92..1c8993218164 100644 --- a/torch/csrc/autograd/profiler_kineto.cpp +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -57,9 +57,9 @@ struct TORCH_API KinetoThreadLocalState : public ProfilerThreadLocalState { std::vector kineto_events_; }; -ProfilerThreadLocalState* getProfilerTLSState() { +KinetoThreadLocalState* getProfilerTLSState() { const auto& state = c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::PROFILER_STATE); - return dynamic_cast(state.get()); + return dynamic_cast(state.get()); } void pushProfilingCallbacks() { @@ -158,7 +158,7 @@ void enableProfiler( auto state_ptr = getProfilerTLSState(); TORCH_CHECK(!state_ptr, "Profiler is already enabled on this thread"); - auto state = std::make_shared(config); + auto state = std::make_shared(config); c10::ThreadLocalDebugInfo::_push(c10::DebugInfoKind::PROFILER_STATE, state); if (activities.count(ActivityType::CPU)) { @@ -176,7 +176,7 @@ ProfilerResult disableProfiler() { // all the DebugInfoBase objects are scope based and supposed to use DebugInfoGuard auto state = c10::ThreadLocalDebugInfo::_pop(c10::DebugInfoKind::PROFILER_STATE); - auto state_ptr = static_cast(state.get()); + auto state_ptr = static_cast(state.get()); TORCH_CHECK(state_ptr && state_ptr->config().state == ProfilerState::KINETO, "Can't disable Kineto profiler when it's not running"); From 0bc66a67af9771e4fbd55ccabdd9f7c0cde489c6 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Tue, 3 Nov 2020 10:08:16 -0800 Subject: [PATCH 30/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python 
test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler_kineto.cpp | 11 +++++++++++ torch/csrc/autograd/profiler_kineto.h | 1 - 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp index 1c8993218164..74eba5bcd272 100644 --- a/torch/csrc/autograd/profiler_kineto.cpp +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -55,6 +55,8 @@ struct TORCH_API KinetoThreadLocalState : public ProfilerThreadLocalState { std::vector kineto_client_activities_; std::vector kineto_events_; + + std::unique_ptr cpu_trace; }; KinetoThreadLocalState* getProfilerTLSState() { @@ -161,6 +163,11 @@ void enableProfiler( auto state = std::make_shared(config); c10::ThreadLocalDebugInfo::_push(c10::DebugInfoKind::PROFILER_STATE, state); + state->cpu_trace = std::make_unique(); + state->cpu_trace->span.startTime = getTime() / 1000; + state->cpu_trace->gpuOpCount = -1; + state->cpu_trace->span.name = "PyTorch Profiler"; + if (activities.count(ActivityType::CPU)) { pushProfilingCallbacks(); } @@ -186,6 +193,10 @@ ProfilerResult disableProfiler() { state_ptr->mark("__stop_profile"); + state_ptr->cpu_trace->span.endTime = getTime() / 1000; + + libkineto::api().transferCpuTrace(std::move(state_ptr->cpu_trace)); + //auto trace = std::move(libkineto::api().stopTrace()); libkineto::api().stopTrace(); std::vector> kineto_events; // = filterTrace(trace); diff --git a/torch/csrc/autograd/profiler_kineto.h b/torch/csrc/autograd/profiler_kineto.h index 090069ea00ee..4f429afb72c3 100644 --- a/torch/csrc/autograd/profiler_kineto.h +++ b/torch/csrc/autograd/profiler_kineto.h @@ -145,7 +145,6 @@ struct TORCH_API KinetoEvent { return *this; } - private: uint64_t start_thread_id_ = 0; uint64_t end_thread_id_ = 0; uint64_t fwd_thread_id_ = 0; From 
aa2d09e753953777feda2a8100602c87e3c4d668 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Tue, 3 Nov 2020 10:28:13 -0800 Subject: [PATCH 31/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler_kineto.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp index 74eba5bcd272..f1108247b27d 100644 --- a/torch/csrc/autograd/profiler_kineto.cpp +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -39,7 +39,7 @@ struct TORCH_API KinetoThreadLocalState : public ProfilerThreadLocalState { { std::lock_guard guard(state_mutex_); - kineto_client_activities_.emplace_back(std::move(op)); + cpu_trace->ops.emplace_back(std::move(op)); kineto_events_.emplace_back(); kineto_events_.back() .startThreadId(ctx->startThreadId) @@ -53,7 +53,6 @@ struct TORCH_API KinetoThreadLocalState : public ProfilerThreadLocalState { } } - std::vector kineto_client_activities_; std::vector kineto_events_; std::unique_ptr cpu_trace; @@ -197,7 +196,7 @@ ProfilerResult disableProfiler() { libkineto::api().transferCpuTrace(std::move(state_ptr->cpu_trace)); - //auto trace = std::move(libkineto::api().stopTrace()); + //auto trace = libkineto::api().stopTrace(); libkineto::api().stopTrace(); std::vector> kineto_events; // = filterTrace(trace); auto legacy_events = state_ptr->consolidate(); From 1556a7c21ebd795e323352805534ff7af76f170e Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Tue, 3 Nov 2020 12:22:01 -0800 Subject: [PATCH 32/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test 
From 1556a7c21ebd795e323352805534ff7af76f170e Mon Sep 17 00:00:00 2001
From: ilia-cher
Date: Tue, 3 Nov 2020 12:22:01 -0800
Subject: [PATCH 32/59] Update on "Use libkineto in profiler"

Summary:
Adding ability to use Kineto (CUPTI) to profile CUDA kernels

Test Plan:
USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install
python test/test_profiler.py

[ghstack-poisoned]
---
 tools/build_variables.bzl                                 | 2 +-
 torch/csrc/autograd/{profiler.cpp => profiler_legacy.cpp} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename torch/csrc/autograd/{profiler.cpp => profiler_legacy.cpp} (100%)

diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl
index fd04c1326263..08a57d0549a9 100644
--- a/tools/build_variables.bzl
+++ b/tools/build_variables.bzl
@@ -74,7 +74,7 @@ jit_core_sources = [
 # list for the shared files.
core_sources_common = [ - "torch/csrc/autograd/profiler.cpp", + "torch/csrc/autograd/profiler_legacy.cpp", "torch/csrc/autograd/profiler_kineto.cpp", "torch/csrc/jit/frontend/edit_distance.cpp", "torch/csrc/jit/frontend/string_to_type.cpp", diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler_legacy.cpp similarity index 100% rename from torch/csrc/autograd/profiler.cpp rename to torch/csrc/autograd/profiler_legacy.cpp From 4a0fec9d9a96c607ad54eaf80c63a15be3ff7071 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Tue, 3 Nov 2020 12:33:47 -0800 Subject: [PATCH 33/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.h | 536 +------------------------- torch/csrc/autograd/profiler_kineto.h | 2 +- torch/csrc/autograd/profiler_legacy.h | 536 ++++++++++++++++++++++++++ 3 files changed, 539 insertions(+), 535 deletions(-) create mode 100644 torch/csrc/autograd/profiler_legacy.h diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index 657a990019fa..7ac44096cda7 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -1,536 +1,4 @@ #pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifndef _WIN32 -#include -#endif -#if defined(C10_IOS) && defined(C10_MOBILE) -#include // for gettimeofday() -#endif - -#include - -#include - -struct CUevent_st; -typedef std::shared_ptr CUDAEventStub; - -namespace torch { namespace autograd { - -struct Node; - -namespace profiler { - -struct TORCH_API CUDAStubs { - virtual void record(int* device, CUDAEventStub* event, int64_t* cpu_ns) 
{ - fail(); - } - virtual float elapsed(const CUDAEventStub* event, const CUDAEventStub* event2) { - fail(); - return 0.f; - } - virtual void nvtxMarkA(const char* name) { - fail(); - } - virtual void nvtxRangePushA(const char* name) { - fail(); - } - virtual void nvtxRangePop() { - fail(); - } - virtual bool enabled() { - return false; - } - virtual void onEachDevice(std::function op) { - fail(); - } - virtual void synchronize() { - fail(); - } - virtual ~CUDAStubs(); - -private: - void fail() { - AT_ERROR("CUDA used in profiler but not enabled."); - } -}; - -TORCH_API void registerCUDAMethods(CUDAStubs* stubs); - -constexpr inline size_t ceilToMultiple(size_t a, size_t b) { - return ((a + b - 1) / b) * b; -} - -inline int64_t getTime() { -#if defined(C10_IOS) && defined(C10_MOBILE) -// clock_gettime is only available on iOS 10.0 or newer. Unlike OS X, iOS can't rely on -// CLOCK_REALTIME, as it is defined no matter if clock_gettime is implemented or not - struct timeval now; - gettimeofday(&now, NULL); - return static_cast(now.tv_sec) * 1000000000 + static_cast(now.tv_usec) * 1000; -#elif defined(_WIN32) || defined(__MACH__) - using namespace std::chrono; - using clock = std::conditional::type; - return duration_cast(clock::now().time_since_epoch()).count(); -#else - // clock_gettime is *much* faster than std::chrono implementation on Linux - struct timespec t{}; - clock_gettime(CLOCK_MONOTONIC, &t); - return static_cast(t.tv_sec) * 1000000000 + static_cast(t.tv_nsec); -#endif -} - -enum class C10_API_ENUM EventKind : uint16_t { - Mark, - PushRange, - PopRange, - MemoryAlloc, -}; - -// To be deprecated, once we switch to Kineto profiling -struct TORCH_API LegacyEvent { - LegacyEvent( - EventKind kind, - at::StringView name, - uint16_t thread_id, - bool record_cuda, - at::RecordFunctionHandle handle = 0, - std::vector>&& shapes = {}, - int node_id = -1) - : name_(std::move(name)), - kind_(kind), - thread_id_(thread_id), - handle_(handle), - shapes_(shapes), - node_id_(node_id) { - record(record_cuda); - } - - // Constructor to be used in conjunction with LegacyEvent::fromIValue. - LegacyEvent( - EventKind kind, - at::StringView name, - uint16_t thread_id, - at::RecordFunctionHandle handle, - std::vector>&& shapes, - int node_id, - bool is_remote, - int64_t cpu_memory_usage, - int64_t cpu_ns, - bool cuda_recorded, - int64_t cuda_memory_usage = 0, - int device = -1, - double cuda_us = -1) - : cpu_ns_(cpu_ns), - name_(std::move(name)), - kind_(kind), - thread_id_(thread_id), - handle_(handle), - shapes_(shapes), - cpu_memory_usage_(cpu_memory_usage), - cuda_memory_usage_(cuda_memory_usage), - device_(device), - node_id_(node_id), - is_remote_(is_remote), - cuda_us_(cuda_us) { - // Sanity check values that were deserialized - TORCH_INTERNAL_ASSERT(cpu_ns_ > 0); - if (cuda_recorded) { - TORCH_INTERNAL_ASSERT(device_ >= 0); - TORCH_INTERNAL_ASSERT(cuda_us_ >= 0); - } - } - - // Returns IValues corresponding to event structure, to be used for - // serialization. - at::IValue toIValue() const; - - // Reconstructs an event from IValues given by toIValue. 
- static LegacyEvent fromIValue(const at::IValue& eventIValue); - - void record(bool record_cuda); - - std::string kindStr() const { - switch (kind_) { - case EventKind::Mark: return "mark"; - case EventKind::PushRange: return "push"; - case EventKind::PopRange: return "pop"; - case EventKind::MemoryAlloc: return "memory_alloc"; - } - throw std::runtime_error("unknown event kind"); - } - - const char* name() const { - return name_.str(); - } - - uint64_t threadId() const { - return thread_id_; - } - - std::vector> shapes() const { - return shapes_; - } - - double cpuElapsedUs(const LegacyEvent& e) const { - return (e.cpu_ns_ - cpu_ns_)/(1000.0); - } - - double cpuUs() const { - return cpu_ns_ / (1000.0); - } - - double cudaElapsedUs(const LegacyEvent& e) const; - - bool hasCuda() const { - return cuda_event != nullptr || (isRemote() && device_ != -1); - } - - int device() const { - return device_; - } - - void updateMemoryStats(int64_t alloc_size, c10::Device device) { - if (device.type() == c10::DeviceType::CUDA || - device.type() == c10::DeviceType::HIP) { - cuda_memory_usage_ = alloc_size; - } else if (device.type() == c10::DeviceType::CPU || - device.type() == c10::DeviceType::MKLDNN || - device.type() == c10::DeviceType::IDEEP) { - cpu_memory_usage_ = alloc_size; - } else { - LOG(WARNING) << "Unsupported memory profiling device: " << device; - } - } - - int64_t cpuMemoryUsage() const { - return cpu_memory_usage_; - } - - int64_t cudaMemoryUsage() const { - return cuda_memory_usage_; - } - - at::RecordFunctionHandle handle() const { - return handle_; - } - - // Node ID corresponding to this event. - int nodeId( ) const { - return node_id_; - } - - // Set Node ID on this event. - void setNodeId(int node_id) { - node_id_ = node_id; - } - - void setName(at::StringView newName_) { - name_ = std::move(newName_); - } - - bool isRemote() const { - return is_remote_; - } - - void setCudaUs(int64_t cuda_us) { - cuda_us_ = cuda_us; - } - - void setSequenceNr(int64_t sequence_nr) { - sequence_nr_ = sequence_nr; - } - - int64_t sequenceNr() const { - return sequence_nr_; - } - - void setCorrelationId(uint64_t correlation_id) { - correlation_id_ = correlation_id; - } - - uint64_t correlationId() const { - return correlation_id_; - } - - const std::vector& stack() const { - return stack_; - } - - void setStack(const std::vector& stack) { - stack_ = stack; - } - - uint64_t fwdThreadId() const { - return fwd_thread_id_; - } - - void setFwdThreadId(uint64_t fwd_thread_id) { - fwd_thread_id_ = fwd_thread_id; - } - - uint8_t scope() const { - return scope_; - } - - void setScope(uint8_t scope) { - scope_ = scope; - } - - private: - // signed to allow for negative intervals, initialized for safety. - int64_t cpu_ns_ = 0; - at::StringView name_; - EventKind kind_; - uint64_t thread_id_; - uint64_t fwd_thread_id_; - at::RecordFunctionHandle handle_ {0}; - std::vector> shapes_; - int64_t cpu_memory_usage_ = 0; - int64_t cuda_memory_usage_ = 0; - int device_ = -1; - CUDAEventStub cuda_event = nullptr; - int node_id_ = 0; - bool is_remote_ = false; - int64_t cuda_us_ = -1; - int64_t sequence_nr_ = -1; - - std::vector stack_; - uint8_t scope_; - uint64_t correlation_id_; -}; - -// a linked-list of fixed sized vectors, to avoid -// a std::vector resize from taking a large amount of time inside -// a profiling event -struct RangeEventList { - RangeEventList() { - events_.reserve(kReservedCapacity); - } - - template - void record(Args&&... 
args) { - std::lock_guard guard(mutex_); - events_.emplace_back(std::forward(args)...); - } - - std::vector consolidate() { - std::lock_guard lock(mutex_); - std::vector result; - result.insert( - result.begin(), - std::make_move_iterator(events_.begin()), - std::make_move_iterator(events_.end())); - events_.erase(events_.begin(), events_.end()); - return result; - } - - size_t size() { - std::lock_guard lock(mutex_); - return events_.size(); - } - - private: - // This mutex is used to serialize access when different threads are writing - // to the same instance of RangeEventList. - std::mutex mutex_; - std::vector events_; - - static const size_t kReservedCapacity = 1024; -}; - -enum class C10_API_ENUM ProfilerState { - Disabled = 0, - CPU, // CPU-only profiling - CUDA, // CPU + CUDA events - NVTX, // only emit NVTX markers - KINETO, // use libkineto - NUM_PROFILER_STATES, // must be the last one -}; - -struct TORCH_API ProfilerConfig { - ProfilerConfig( - ProfilerState state, - bool report_input_shapes = false, - bool profile_memory = false, - bool with_stack = false) - : state(state), - report_input_shapes(report_input_shapes), - profile_memory(profile_memory), - with_stack(with_stack) {} - ~ProfilerConfig(); - ProfilerState state; - bool report_input_shapes; - bool profile_memory; - bool with_stack; - - // Returns IValues corresponding to ProfilerConfig struct, to be used for - // serialization. - at::IValue toIValue() const; - - // Reconstructs a ProfilerConfig from IValues given by toIValue. - static ProfilerConfig fromIValue(const at::IValue& profilerConfigIValue); -}; - -// A struct to control settings of disableProfiler options. -struct TORCH_API ProfilerDisableOptions { - ProfilerDisableOptions() = default; - ProfilerDisableOptions(bool shouldCleanupTLSState, bool shouldConsolidate) - : cleanupTLSState(shouldCleanupTLSState), - consolidate(shouldConsolidate) {} - // Whether we should clean up profiler states that are thread local, such as - // ThreadLocalDebugInfo and thread local RecordFunction callbacks. - bool cleanupTLSState = true; - // Whether we should consolidate all currently recorded profiled events. If - // false, will not consolidate and other threads can continue to write to the - // event lists. - bool consolidate = true; -}; - -// NOTE: profiler mode is thread local, with automatic propagation -// across thread boundary (e.g. at::launch tasks) -TORCH_API void enableProfilerLegacy(const ProfilerConfig&); -using thread_event_lists = std::vector>; -TORCH_API thread_event_lists disableProfilerLegacy(c10::optional profilerDisableOptions = c10::nullopt); - -// adds profiledEvents to the current thread local recorded events. Each event -// will be marked with node ID given by fromNodeId. -TORCH_API void addEventList(std::vector&& profiledEvents); -// Returns if the profiler is currently enabled in the current thread. -TORCH_API bool profilerEnabled(); -// Retrieve the thread_local ProfilerConfig. -TORCH_API ProfilerConfig getProfilerConfig(); -// Writes profiled events to a stream. 
-TORCH_API void writeProfilerEventsToStream(std::ostream& out, const std::vector& events); - -// Usage: -// { -// RecordProfile guard("filename.trace"); -// // code you want to profile -// } -// Then open filename.trace in chrome://tracing -struct TORCH_API RecordProfile { - RecordProfile(std::ostream& out); - RecordProfile(const std::string& filename); - - ~RecordProfile(); -private: - void init(); - std::unique_ptr file_; - std::ostream& out_; - void processEvents(const std::vector& events); -}; - -// A guard that enables the profiler, taking in an optional callback to process -// the results -// Usage: -// { -// TLSProfilerGuard g([](thread_event_lists profilerResults) { -// // process profilerResults -// }); -// Code to profile -// } -struct TORCH_API TLSProfilerGuard { - explicit TLSProfilerGuard( - const ProfilerConfig& cfg, - c10::optional> - resultCallback = c10::nullopt, - c10::optional profilerDisableOptions = - c10::nullopt) - : cb_(std::move(resultCallback)), - profilerDisableOptions_(std::move(profilerDisableOptions)) { - enableProfilerLegacy(cfg); - } - ~TLSProfilerGuard() { - thread_event_lists event_lists = disableProfilerLegacy(profilerDisableOptions_); - if (cb_) { - try { - (*cb_)(event_lists); - } catch (const std::exception& e) { - LOG(ERROR) << "Got error processing profiler events: " << e.what(); - } - } - } - - private: - c10::optional> cb_; - const c10::optional profilerDisableOptions_; -}; - -struct TORCH_API FileLineFunc { - std::string filename; - size_t line; - std::string funcname; -}; -TORCH_API std::vector prepareCallstack(const std::vector& cs); -TORCH_API std::vector callstackStr(const std::vector& cs); -TORCH_API std::vector> inputSizes(const at::RecordFunction& fn); - -struct TORCH_API ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { - explicit ProfilerThreadLocalState(const ProfilerConfig& config) - : config_(config), remoteProfiledEvents_{c10::nullopt} {} - ~ProfilerThreadLocalState() override = default; - - const ProfilerConfig& config() const; - - thread_event_lists consolidate(); - - void mark(std::string name, bool include_cuda = true); - - void setOrAddRemoteProfiledEvents( - std::vector&& remoteProfiledEvents); - - void pushRange( - const at::RecordFunction& fn, - const bool record_cuda, - const char* msg = "", - std::vector>&& shapes = {}); - - void popRange(const at::RecordFunction& fn, const bool record_cuda); - - void setCallbackHandle(at::CallbackHandle handle); - - at::CallbackHandle callbackHandle() const; - - void reportMemoryUsage( - void* /* unused */, - int64_t alloc_size, - c10::Device device) override; - - bool memoryProfilingEnabled() const override; - - virtual void reportClientActivity( - const at::RecordFunction& fn, - const at::ObserverContext* ctx) {} - - protected: - std::string getNvtxStr( - const at::StringView& name, - const char* msg, - int64_t sequence_nr, - const std::vector>& shapes) const; - - RangeEventList& getEventList(int64_t thread_id = -1); - - std::mutex state_mutex_; - std::unordered_map> - event_lists_map_; - - ProfilerConfig config_ = ProfilerConfig(ProfilerState::Disabled); - at::CallbackHandle handle_ = 0; - c10::optional>> remoteProfiledEvents_; -}; - - -} // namespace profiler -}} // namespace torch::autograd +#include +#include diff --git a/torch/csrc/autograd/profiler_kineto.h b/torch/csrc/autograd/profiler_kineto.h index 4f429afb72c3..f4e5582fd6fe 100644 --- a/torch/csrc/autograd/profiler_kineto.h +++ b/torch/csrc/autograd/profiler_kineto.h @@ -1,6 +1,6 @@ #pragma once -#include 
+#include #include diff --git a/torch/csrc/autograd/profiler_legacy.h b/torch/csrc/autograd/profiler_legacy.h new file mode 100644 index 000000000000..657a990019fa --- /dev/null +++ b/torch/csrc/autograd/profiler_legacy.h @@ -0,0 +1,536 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef _WIN32 +#include +#endif +#if defined(C10_IOS) && defined(C10_MOBILE) +#include // for gettimeofday() +#endif + +#include + +#include + +struct CUevent_st; +typedef std::shared_ptr CUDAEventStub; + +namespace torch { namespace autograd { + +struct Node; + +namespace profiler { + +struct TORCH_API CUDAStubs { + virtual void record(int* device, CUDAEventStub* event, int64_t* cpu_ns) { + fail(); + } + virtual float elapsed(const CUDAEventStub* event, const CUDAEventStub* event2) { + fail(); + return 0.f; + } + virtual void nvtxMarkA(const char* name) { + fail(); + } + virtual void nvtxRangePushA(const char* name) { + fail(); + } + virtual void nvtxRangePop() { + fail(); + } + virtual bool enabled() { + return false; + } + virtual void onEachDevice(std::function op) { + fail(); + } + virtual void synchronize() { + fail(); + } + virtual ~CUDAStubs(); + +private: + void fail() { + AT_ERROR("CUDA used in profiler but not enabled."); + } +}; + +TORCH_API void registerCUDAMethods(CUDAStubs* stubs); + +constexpr inline size_t ceilToMultiple(size_t a, size_t b) { + return ((a + b - 1) / b) * b; +} + +inline int64_t getTime() { +#if defined(C10_IOS) && defined(C10_MOBILE) +// clock_gettime is only available on iOS 10.0 or newer. Unlike OS X, iOS can't rely on +// CLOCK_REALTIME, as it is defined no matter if clock_gettime is implemented or not + struct timeval now; + gettimeofday(&now, NULL); + return static_cast(now.tv_sec) * 1000000000 + static_cast(now.tv_usec) * 1000; +#elif defined(_WIN32) || defined(__MACH__) + using namespace std::chrono; + using clock = std::conditional::type; + return duration_cast(clock::now().time_since_epoch()).count(); +#else + // clock_gettime is *much* faster than std::chrono implementation on Linux + struct timespec t{}; + clock_gettime(CLOCK_MONOTONIC, &t); + return static_cast(t.tv_sec) * 1000000000 + static_cast(t.tv_nsec); +#endif +} + +enum class C10_API_ENUM EventKind : uint16_t { + Mark, + PushRange, + PopRange, + MemoryAlloc, +}; + +// To be deprecated, once we switch to Kineto profiling +struct TORCH_API LegacyEvent { + LegacyEvent( + EventKind kind, + at::StringView name, + uint16_t thread_id, + bool record_cuda, + at::RecordFunctionHandle handle = 0, + std::vector>&& shapes = {}, + int node_id = -1) + : name_(std::move(name)), + kind_(kind), + thread_id_(thread_id), + handle_(handle), + shapes_(shapes), + node_id_(node_id) { + record(record_cuda); + } + + // Constructor to be used in conjunction with LegacyEvent::fromIValue. 
+ LegacyEvent( + EventKind kind, + at::StringView name, + uint16_t thread_id, + at::RecordFunctionHandle handle, + std::vector>&& shapes, + int node_id, + bool is_remote, + int64_t cpu_memory_usage, + int64_t cpu_ns, + bool cuda_recorded, + int64_t cuda_memory_usage = 0, + int device = -1, + double cuda_us = -1) + : cpu_ns_(cpu_ns), + name_(std::move(name)), + kind_(kind), + thread_id_(thread_id), + handle_(handle), + shapes_(shapes), + cpu_memory_usage_(cpu_memory_usage), + cuda_memory_usage_(cuda_memory_usage), + device_(device), + node_id_(node_id), + is_remote_(is_remote), + cuda_us_(cuda_us) { + // Sanity check values that were deserialized + TORCH_INTERNAL_ASSERT(cpu_ns_ > 0); + if (cuda_recorded) { + TORCH_INTERNAL_ASSERT(device_ >= 0); + TORCH_INTERNAL_ASSERT(cuda_us_ >= 0); + } + } + + // Returns IValues corresponding to event structure, to be used for + // serialization. + at::IValue toIValue() const; + + // Reconstructs an event from IValues given by toIValue. + static LegacyEvent fromIValue(const at::IValue& eventIValue); + + void record(bool record_cuda); + + std::string kindStr() const { + switch (kind_) { + case EventKind::Mark: return "mark"; + case EventKind::PushRange: return "push"; + case EventKind::PopRange: return "pop"; + case EventKind::MemoryAlloc: return "memory_alloc"; + } + throw std::runtime_error("unknown event kind"); + } + + const char* name() const { + return name_.str(); + } + + uint64_t threadId() const { + return thread_id_; + } + + std::vector> shapes() const { + return shapes_; + } + + double cpuElapsedUs(const LegacyEvent& e) const { + return (e.cpu_ns_ - cpu_ns_)/(1000.0); + } + + double cpuUs() const { + return cpu_ns_ / (1000.0); + } + + double cudaElapsedUs(const LegacyEvent& e) const; + + bool hasCuda() const { + return cuda_event != nullptr || (isRemote() && device_ != -1); + } + + int device() const { + return device_; + } + + void updateMemoryStats(int64_t alloc_size, c10::Device device) { + if (device.type() == c10::DeviceType::CUDA || + device.type() == c10::DeviceType::HIP) { + cuda_memory_usage_ = alloc_size; + } else if (device.type() == c10::DeviceType::CPU || + device.type() == c10::DeviceType::MKLDNN || + device.type() == c10::DeviceType::IDEEP) { + cpu_memory_usage_ = alloc_size; + } else { + LOG(WARNING) << "Unsupported memory profiling device: " << device; + } + } + + int64_t cpuMemoryUsage() const { + return cpu_memory_usage_; + } + + int64_t cudaMemoryUsage() const { + return cuda_memory_usage_; + } + + at::RecordFunctionHandle handle() const { + return handle_; + } + + // Node ID corresponding to this event. + int nodeId( ) const { + return node_id_; + } + + // Set Node ID on this event. 
+ void setNodeId(int node_id) { + node_id_ = node_id; + } + + void setName(at::StringView newName_) { + name_ = std::move(newName_); + } + + bool isRemote() const { + return is_remote_; + } + + void setCudaUs(int64_t cuda_us) { + cuda_us_ = cuda_us; + } + + void setSequenceNr(int64_t sequence_nr) { + sequence_nr_ = sequence_nr; + } + + int64_t sequenceNr() const { + return sequence_nr_; + } + + void setCorrelationId(uint64_t correlation_id) { + correlation_id_ = correlation_id; + } + + uint64_t correlationId() const { + return correlation_id_; + } + + const std::vector& stack() const { + return stack_; + } + + void setStack(const std::vector& stack) { + stack_ = stack; + } + + uint64_t fwdThreadId() const { + return fwd_thread_id_; + } + + void setFwdThreadId(uint64_t fwd_thread_id) { + fwd_thread_id_ = fwd_thread_id; + } + + uint8_t scope() const { + return scope_; + } + + void setScope(uint8_t scope) { + scope_ = scope; + } + + private: + // signed to allow for negative intervals, initialized for safety. + int64_t cpu_ns_ = 0; + at::StringView name_; + EventKind kind_; + uint64_t thread_id_; + uint64_t fwd_thread_id_; + at::RecordFunctionHandle handle_ {0}; + std::vector> shapes_; + int64_t cpu_memory_usage_ = 0; + int64_t cuda_memory_usage_ = 0; + int device_ = -1; + CUDAEventStub cuda_event = nullptr; + int node_id_ = 0; + bool is_remote_ = false; + int64_t cuda_us_ = -1; + int64_t sequence_nr_ = -1; + + std::vector stack_; + uint8_t scope_; + uint64_t correlation_id_; +}; + +// a linked-list of fixed sized vectors, to avoid +// a std::vector resize from taking a large amount of time inside +// a profiling event +struct RangeEventList { + RangeEventList() { + events_.reserve(kReservedCapacity); + } + + template + void record(Args&&... args) { + std::lock_guard guard(mutex_); + events_.emplace_back(std::forward(args)...); + } + + std::vector consolidate() { + std::lock_guard lock(mutex_); + std::vector result; + result.insert( + result.begin(), + std::make_move_iterator(events_.begin()), + std::make_move_iterator(events_.end())); + events_.erase(events_.begin(), events_.end()); + return result; + } + + size_t size() { + std::lock_guard lock(mutex_); + return events_.size(); + } + + private: + // This mutex is used to serialize access when different threads are writing + // to the same instance of RangeEventList. + std::mutex mutex_; + std::vector events_; + + static const size_t kReservedCapacity = 1024; +}; + +enum class C10_API_ENUM ProfilerState { + Disabled = 0, + CPU, // CPU-only profiling + CUDA, // CPU + CUDA events + NVTX, // only emit NVTX markers + KINETO, // use libkineto + NUM_PROFILER_STATES, // must be the last one +}; + +struct TORCH_API ProfilerConfig { + ProfilerConfig( + ProfilerState state, + bool report_input_shapes = false, + bool profile_memory = false, + bool with_stack = false) + : state(state), + report_input_shapes(report_input_shapes), + profile_memory(profile_memory), + with_stack(with_stack) {} + ~ProfilerConfig(); + ProfilerState state; + bool report_input_shapes; + bool profile_memory; + bool with_stack; + + // Returns IValues corresponding to ProfilerConfig struct, to be used for + // serialization. + at::IValue toIValue() const; + + // Reconstructs a ProfilerConfig from IValues given by toIValue. + static ProfilerConfig fromIValue(const at::IValue& profilerConfigIValue); +}; + +// A struct to control settings of disableProfiler options. 
+struct TORCH_API ProfilerDisableOptions { + ProfilerDisableOptions() = default; + ProfilerDisableOptions(bool shouldCleanupTLSState, bool shouldConsolidate) + : cleanupTLSState(shouldCleanupTLSState), + consolidate(shouldConsolidate) {} + // Whether we should clean up profiler states that are thread local, such as + // ThreadLocalDebugInfo and thread local RecordFunction callbacks. + bool cleanupTLSState = true; + // Whether we should consolidate all currently recorded profiled events. If + // false, will not consolidate and other threads can continue to write to the + // event lists. + bool consolidate = true; +}; + +// NOTE: profiler mode is thread local, with automatic propagation +// across thread boundary (e.g. at::launch tasks) +TORCH_API void enableProfilerLegacy(const ProfilerConfig&); +using thread_event_lists = std::vector>; +TORCH_API thread_event_lists disableProfilerLegacy(c10::optional profilerDisableOptions = c10::nullopt); + +// adds profiledEvents to the current thread local recorded events. Each event +// will be marked with node ID given by fromNodeId. +TORCH_API void addEventList(std::vector&& profiledEvents); +// Returns if the profiler is currently enabled in the current thread. +TORCH_API bool profilerEnabled(); +// Retrieve the thread_local ProfilerConfig. +TORCH_API ProfilerConfig getProfilerConfig(); +// Writes profiled events to a stream. +TORCH_API void writeProfilerEventsToStream(std::ostream& out, const std::vector& events); + +// Usage: +// { +// RecordProfile guard("filename.trace"); +// // code you want to profile +// } +// Then open filename.trace in chrome://tracing +struct TORCH_API RecordProfile { + RecordProfile(std::ostream& out); + RecordProfile(const std::string& filename); + + ~RecordProfile(); +private: + void init(); + std::unique_ptr file_; + std::ostream& out_; + void processEvents(const std::vector& events); +}; + +// A guard that enables the profiler, taking in an optional callback to process +// the results +// Usage: +// { +// TLSProfilerGuard g([](thread_event_lists profilerResults) { +// // process profilerResults +// }); +// Code to profile +// } +struct TORCH_API TLSProfilerGuard { + explicit TLSProfilerGuard( + const ProfilerConfig& cfg, + c10::optional> + resultCallback = c10::nullopt, + c10::optional profilerDisableOptions = + c10::nullopt) + : cb_(std::move(resultCallback)), + profilerDisableOptions_(std::move(profilerDisableOptions)) { + enableProfilerLegacy(cfg); + } + ~TLSProfilerGuard() { + thread_event_lists event_lists = disableProfilerLegacy(profilerDisableOptions_); + if (cb_) { + try { + (*cb_)(event_lists); + } catch (const std::exception& e) { + LOG(ERROR) << "Got error processing profiler events: " << e.what(); + } + } + } + + private: + c10::optional> cb_; + const c10::optional profilerDisableOptions_; +}; + +struct TORCH_API FileLineFunc { + std::string filename; + size_t line; + std::string funcname; +}; +TORCH_API std::vector prepareCallstack(const std::vector& cs); +TORCH_API std::vector callstackStr(const std::vector& cs); +TORCH_API std::vector> inputSizes(const at::RecordFunction& fn); + +struct TORCH_API ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { + explicit ProfilerThreadLocalState(const ProfilerConfig& config) + : config_(config), remoteProfiledEvents_{c10::nullopt} {} + ~ProfilerThreadLocalState() override = default; + + const ProfilerConfig& config() const; + + thread_event_lists consolidate(); + + void mark(std::string name, bool include_cuda = true); + + void 
setOrAddRemoteProfiledEvents( + std::vector&& remoteProfiledEvents); + + void pushRange( + const at::RecordFunction& fn, + const bool record_cuda, + const char* msg = "", + std::vector>&& shapes = {}); + + void popRange(const at::RecordFunction& fn, const bool record_cuda); + + void setCallbackHandle(at::CallbackHandle handle); + + at::CallbackHandle callbackHandle() const; + + void reportMemoryUsage( + void* /* unused */, + int64_t alloc_size, + c10::Device device) override; + + bool memoryProfilingEnabled() const override; + + virtual void reportClientActivity( + const at::RecordFunction& fn, + const at::ObserverContext* ctx) {} + + protected: + std::string getNvtxStr( + const at::StringView& name, + const char* msg, + int64_t sequence_nr, + const std::vector>& shapes) const; + + RangeEventList& getEventList(int64_t thread_id = -1); + + std::mutex state_mutex_; + std::unordered_map> + event_lists_map_; + + ProfilerConfig config_ = ProfilerConfig(ProfilerState::Disabled); + at::CallbackHandle handle_ = 0; + c10::optional>> remoteProfiledEvents_; +}; + + +} // namespace profiler +}} // namespace torch::autograd From bb6396a74b54a6babc6082dcafa00f0549af0beb Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Tue, 3 Nov 2020 13:15:11 -0800 Subject: [PATCH 34/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 
0.00%       0.000us         0.00%       0.000us       0.000us       2.750us        14.47%       2.750us       2.750us             1
Memcpy HtoD (Pagable -> Device)                                  0.00%       0.000us         0.00%       0.000us       0.000us       2.250us        11.84%       2.250us       2.250us             1
Memcpy DtoH (Device -> Pagable)                                  0.00%       0.000us         0.00%       0.000us       0.000us       2.000us        10.53%       2.000us       2.000us             1
aten::mm                                                        25.87%     364.400ms        25.87%     364.426ms     364.426ms       0.000us         0.00%       0.000us       0.000us             1
aten::empty                                                      0.00%      39.585us         0.00%      39.585us      19.792us       0.000us         0.00%       0.000us       0.000us             2
aten::stride                                                     0.00%       3.363us         0.00%       3.363us       1.121us       0.000us         0.00%       0.000us       0.000us             3
aten::add                                                       74.12%        1.044s        74.12%        1.044s        1.044s       0.000us         0.00%       0.000us       0.000us             1
aten::to                                                         0.00%      13.155us         0.01%     116.398us     116.398us       0.000us         0.00%       0.000us       0.000us             1
aten::empty_strided                                              0.00%      30.365us         0.00%      30.365us      30.365us       0.000us         0.00%       0.000us       0.000us             1
aten::copy_                                                      0.01%      72.878us         0.01%      72.878us      72.878us       0.000us         0.00%       0.000us       0.000us             1
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
```

[ghstack-poisoned]
---
 torch/csrc/autograd/init.cpp            |  2 +-
 torch/csrc/autograd/profiler_kineto.cpp | 37 +++++++++++++++++++++----
 torch/csrc/autograd/profiler_legacy.h   |  4 +++
 3 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp
index 56fefb103c37..b844c4349fc6 100644
--- a/torch/csrc/autograd/init.cpp
+++ b/torch/csrc/autograd/init.cpp
@@ -4,7 +4,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp
index d9864ec515c3..cbb52a06d446 100644
--- a/torch/csrc/autograd/profiler_kineto.cpp
+++ b/torch/csrc/autograd/profiler_kineto.cpp
@@ -3,18 +3,44 @@
 #include
 #include
 
+#include <sstream>
+
+#ifndef _WIN32
+#include <pthread.h>
+#endif
+
 #ifdef USE_KINETO
 #include "libkineto.h"
 
 namespace torch { namespace autograd { namespace profiler {
 namespace {
-// TODO: TLS
+// TODO: consider TLS
 std::atomic<uint64_t> corr_id_ {1};
 uint64_t next_correlation_id() {
   return corr_id_++;
 }
 
+std::string shapesToStr(const std::vector<std::vector<int64_t>>& shapes) {
+  std::ostringstream oss;
+  oss << "[";
+  for (auto t_idx = 0; t_idx < shapes.size(); ++t_idx) {
+    if (t_idx > 0) {
+      oss << ", ";
+    }
+    oss << "[";
+    for (auto s_idx = 0; s_idx < shapes[t_idx].size(); ++s_idx) {
+      if (s_idx > 0) {
+        oss << ", ";
+      }
+      oss << shapes[t_idx][s_idx];
+    }
+    oss << "]";
+  }
+  oss << "]";
+  return oss.str();
+}
+
 struct TORCH_API KinetoThreadLocalState : public ProfilerThreadLocalState {
   using ProfilerThreadLocalState::ProfilerThreadLocalState;
   virtual ~KinetoThreadLocalState() override = default;
@@ -28,15 +54,16 @@ struct TORCH_API KinetoThreadLocalState : public ProfilerThreadLocalState {
       "Supported only in Kineto profiler");
     libkineto::ClientTraceActivity op;
     op.startTime = ctx->startUs;
-    op.endTime = (getTime() / 1000);
+    op.endTime = getTimeUs();
     op.opType = std::string(fn.name().str());
     op.device = 0; // CPU
     op.correlation = ctx->correlationId;
     if (ctx->shapes && !ctx->shapes->empty()) {
-      //op.inputDims = toStr(*ctx->shapes); //
+      op.inputDims = shapesToStr(*ctx->shapes);
     }
-    //op.threadId = pthread_self();
-
+#ifndef _WIN32
+    op.threadId = pthread_self();
+#endif
     {
       std::lock_guard<std::mutex> guard(state_mutex_);
       cpu_trace->ops.emplace_back(std::move(op));
diff --git a/torch/csrc/autograd/profiler_legacy.h b/torch/csrc/autograd/profiler_legacy.h
index 657a990019fa..c3efd63a16c4 100644
--- a/torch/csrc/autograd/profiler_legacy.h
+++ b/torch/csrc/autograd/profiler_legacy.h
@@ -90,6 +90,10 @@ inline int64_t getTime() {
 #endif
 }
 
+inline int64_t getTimeUs() {
+  return getTime() / 1000;
+}
+
 enum class C10_API_ENUM EventKind : uint16_t {
   Mark,
   PushRange,
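getTimeUs() above is just getTime() / 1000, truncating the nanosecond clock to microseconds to match libkineto's timestamps. An equivalent self-contained helper using std::chrono (a sketch; the in-tree getTime() prefers clock_gettime(CLOCK_MONOTONIC) on Linux for speed):

```cpp
#include <chrono>
#include <cstdint>

// Microseconds from a monotonic clock; integer division truncates toward
// zero exactly like the getTime() / 1000 in the hunk above.
inline int64_t getTimeUsSketch() {
  using namespace std::chrono;
  return duration_cast<microseconds>(
             steady_clock::now().time_since_epoch())
      .count();
}
```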
From 60b5dee47a0dede585b266b3c9b00ddb84cc3a44 Mon Sep 17 00:00:00 2001
From: ilia-cher
Date: Tue, 3 Nov 2020 13:21:59 -0800
Subject: [PATCH 35/59] Update on "Use libkineto in profiler"

Summary:
Adding ability to use Kineto (CUPTI) to profile CUDA kernels

Test Plan:
USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install
python test/test_profiler.py

[ghstack-poisoned]
---
 torch/csrc/autograd/profiler_kineto.cpp | 44 ++++++++++++++-----------
 1 file changed, 24 insertions(+), 20 deletions(-)

diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp
index cbb52a06d446..530ea007eed7 100644
--- a/torch/csrc/autograd/profiler_kineto.cpp
+++ b/torch/csrc/autograd/profiler_kineto.cpp
@@ -11,35 +11,19 @@
 
 #ifdef USE_KINETO
 #include "libkineto.h"
+#endif
 
 namespace torch { namespace autograd { namespace profiler {
 
+#ifdef USE_KINETO
 namespace {
 // TODO: consider TLS
-std::atomic<uint64_t> corr_id_ {1};
 uint64_t next_correlation_id() {
+  static std::atomic<uint64_t> corr_id_ {1};
   return corr_id_++;
 }
 
-std::string shapesToStr(const std::vector<std::vector<int64_t>>& shapes) {
-  std::ostringstream oss;
-  oss << "[";
-  for (auto t_idx = 0; t_idx < shapes.size(); ++t_idx) {
-    if (t_idx > 0) {
-      oss << ", ";
-    }
-    oss << "[";
-    for (auto s_idx = 0; s_idx < shapes[t_idx].size(); ++s_idx) {
-      if (s_idx > 0) {
-        oss << ", ";
-      }
-      oss << shapes[t_idx][s_idx];
-    }
-    oss << "]";
-  }
-  oss << "]";
-  return oss.str();
-}
+std::string shapesToStr(const std::vector<std::vector<int64_t>>& shapes);
 
 struct TORCH_API KinetoThreadLocalState : public ProfilerThreadLocalState {
   using ProfilerThreadLocalState::ProfilerThreadLocalState;
@@ -148,6 +132,26 @@ void pushProfilingCallbacks() {
   state_ptr->setCallbackHandle(handle);
 }
 
+std::string shapesToStr(const std::vector<std::vector<int64_t>>& shapes) {
+  std::ostringstream oss;
+  oss << "[";
+  for (auto t_idx = 0; t_idx < shapes.size(); ++t_idx) {
+    if (t_idx > 0) {
+      oss << ", ";
+    }
+    oss << "[";
+    for (auto s_idx = 0; s_idx < shapes[t_idx].size(); ++s_idx) {
+      if (s_idx > 0) {
+        oss << ", ";
+      }
+      oss << shapes[t_idx][s_idx];
+    }
+    oss << "]";
+  }
+  oss << "]";
+  return oss.str();
+}
+
 } // namespace
 
 void prepareProfiler(
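shapesToStr(), whose definition the diff above moves below its callers, renders each op's input shapes as a nested list string. A standalone copy with a small driver (size_t indices substituted for the original's int-typed auto indices, which would draw a signed/unsigned comparison warning):

```cpp
#include <cstdint>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

std::string shapesToStr(const std::vector<std::vector<int64_t>>& shapes) {
  std::ostringstream oss;
  oss << "[";
  for (size_t t_idx = 0; t_idx < shapes.size(); ++t_idx) {
    if (t_idx > 0) {
      oss << ", ";
    }
    oss << "[";
    for (size_t s_idx = 0; s_idx < shapes[t_idx].size(); ++s_idx) {
      if (s_idx > 0) {
        oss << ", ";
      }
      oss << shapes[t_idx][s_idx];
    }
    oss << "]";
  }
  oss << "]";
  return oss.str();
}

int main() {
  // Shapes for z = torch.mm(x, y) with two 10x10 inputs, as in the test plan:
  std::cout << shapesToStr({{10, 10}, {10, 10}}) << "\n";  // [[10, 10], [10, 10]]
}
```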
From e1a5480012e27741c52d6806692cec99cbd71624 Mon Sep 17 00:00:00 2001
From: ilia-cher
Date: Tue, 3 Nov 2020 13:24:40 -0800
Subject: [PATCH 36/59] Update on "Use libkineto in profiler"

Summary:
Adding ability to use Kineto (CUPTI) to profile CUDA kernels

Test Plan:
USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install
python test/test_profiler.py

[ghstack-poisoned]
---
 torch/csrc/autograd/profiler_kineto.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp
index 530ea007eed7..33ce86603280 100644
--- a/torch/csrc/autograd/profiler_kineto.cpp
+++ b/torch/csrc/autograd/profiler_kineto.cpp
@@ -88,7 +88,7 @@ void pushProfilingCallbacks() {
         libkineto::api().pushCorrelationId(corr_id);
 
         auto ctx_ptr = std::make_unique<KinetoObserverContext>();
-        ctx_ptr->startUs = getTime() / 1000;
+        ctx_ptr->startUs = getTimeUs();
         ctx_ptr->correlationId = corr_id;
         ctx_ptr->startThreadId = at::RecordFunction::currentThreadId();
 
@@ -194,7 +194,7 @@ void enableProfiler(
   c10::ThreadLocalDebugInfo::_push(c10::DebugInfoKind::PROFILER_STATE, state);
 
   state->cpu_trace = std::make_unique<libkineto::CpuTraceBuffer>();
-  state->cpu_trace->span.startTime = getTime() / 1000;
+  state->cpu_trace->span.startTime = getTimeUs();
   state->cpu_trace->gpuOpCount = -1;
   state->cpu_trace->span.name = "PyTorch Profiler";
 
@@ -229,7 +229,7 @@ ProfilerResult disableProfiler() {
 
   state_ptr->mark("__stop_profile");
 
-  state_ptr->cpu_trace->span.endTime = getTime() / 1000;
+  state_ptr->cpu_trace->span.endTime = getTimeUs();
 
   libkineto::api().transferCpuTrace(std::move(state_ptr->cpu_trace));

From 38a37dd6eeab50f5d577d992d9e92b4062fa05c5 Mon Sep 17 00:00:00 2001
From: ilia-cher
Date: Tue, 3 Nov 2020 13:31:16 -0800
Subject: [PATCH 37/59] Update on "Use libkineto in profiler"

Summary:
Adding ability to use Kineto (CUPTI) to profile CUDA kernels

Test Plan:
USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install
python test/test_profiler.py

[ghstack-poisoned]
---
 torch/csrc/autograd/profiler_kineto.cpp |  2 +-
 torch/csrc/autograd/profiler_legacy.cpp |  8 --------
 torch/csrc/autograd/profiler_legacy.h   | 12 ++++++++++--
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp
index 33ce86603280..27a395182496 100644
--- a/torch/csrc/autograd/profiler_kineto.cpp
+++ b/torch/csrc/autograd/profiler_kineto.cpp
@@ -223,7 +223,7 @@ ProfilerResult disableProfiler() {
   TORCH_CHECK(state_ptr && state_ptr->config().state == ProfilerState::KINETO,
       "Can't disable Kineto profiler when it's not running");
 
-  if (state_ptr->callbackHandle() > 0) {
+  if (state_ptr->hasCallbackHandle()) {
     at::removeCallback(state_ptr->callbackHandle());
   }
 
diff --git a/torch/csrc/autograd/profiler_legacy.cpp b/torch/csrc/autograd/profiler_legacy.cpp
index 2fe3bdb451cc..a8e37d45ee7e 100644
--- a/torch/csrc/autograd/profiler_legacy.cpp
+++ b/torch/csrc/autograd/profiler_legacy.cpp
@@ -260,14 +260,6 @@ void ProfilerThreadLocalState::popRange(const at::RecordFunction& fn, const bool
   }
 }
 
-void ProfilerThreadLocalState::setCallbackHandle(at::CallbackHandle handle) {
-  handle_ = handle;
-}
-
-at::CallbackHandle ProfilerThreadLocalState::callbackHandle() const {
-  return handle_;
-}
-
 void ProfilerThreadLocalState::reportMemoryUsage(
     void* /* unused */,
     int64_t alloc_size,
diff --git a/torch/csrc/autograd/profiler_legacy.h b/torch/csrc/autograd/profiler_legacy.h
index c3efd63a16c4..4ea5c0e830ce 100644
--- a/torch/csrc/autograd/profiler_legacy.h
+++ b/torch/csrc/autograd/profiler_legacy.h
@@ -502,9 +502,17 @@ struct TORCH_API ProfilerThreadLocalState : public c10::MemoryReportingInfoBase
 
   void popRange(const at::RecordFunction& fn, const bool record_cuda);
 
-  void setCallbackHandle(at::CallbackHandle handle);
+  void setCallbackHandle(at::CallbackHandle handle) {
+    handle_ = handle;
+  }
 
-  at::CallbackHandle callbackHandle() const;
+  at::CallbackHandle callbackHandle() const {
+    return handle_;
+  }
+
+  bool hasCallbackHandle() {
+    return handle_ > 0;
+  }
 
   void reportMemoryUsage(
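hasCallbackHandle() treats 0 as the "never registered" sentinel, which is what lets disableProfiler() skip at::removeCallback() when CPU callbacks were never pushed. The guard in miniature (stand-in types, not the real at::CallbackHandle machinery):

```cpp
#include <cstdint>

using CallbackHandle = uint64_t;  // 0 doubles as "no callback registered"

void removeCallback(CallbackHandle /*handle*/) { /* unregister */ }

struct StateSketch {
  CallbackHandle handle_ = 0;

  void setCallbackHandle(CallbackHandle handle) { handle_ = handle; }
  CallbackHandle callbackHandle() const { return handle_; }
  bool hasCallbackHandle() const { return handle_ > 0; }
};

void disableSketch(StateSketch& state) {
  // CPU callbacks are only pushed when ActivityType::CPU was requested, so
  // the handle may legitimately still be the 0 sentinel here.
  if (state.hasCallbackHandle()) {
    removeCallback(state.callbackHandle());
  }
}
```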
From c6c603972ac71d0c2793eb136176879740dc8a4a Mon Sep 17 00:00:00 2001
From: ilia-cher
Date: Tue, 3 Nov 2020 14:16:52 -0800
Subject: [PATCH 38/59] Update on "Use libkineto in profiler"

Summary:
Adding ability to use Kineto (CUPTI) to profile CUDA kernels

Test Plan:
USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install
python test/test_profiler.py

[ghstack-poisoned]
---
 torch/csrc/autograd/profiler_kineto.cpp | 20 ++++++++------------
 torch/csrc/autograd/profiler_kineto.h   |  9 ++++++++-
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp
index 27a395182496..d730ac6e7f8d 100644
--- a/torch/csrc/autograd/profiler_kineto.cpp
+++ b/torch/csrc/autograd/profiler_kineto.cpp
@@ -50,27 +50,30 @@ struct TORCH_API KinetoThreadLocalState : public ProfilerThreadLocalState {
 #endif
     {
       std::lock_guard<std::mutex> guard(state_mutex_);
-      cpu_trace->ops.emplace_back(std::move(op));
       kineto_events_.emplace_back();
       kineto_events_.back()
+          .activity(op)
           .startThreadId(ctx->startThreadId)
          .endThreadId(ctx->endThreadId)
           .sequenceNr(ctx->sequenceNr)
           .fwdThreadId(ctx->fwdThreadId)
-          .scope(ctx->recFunScope);
+          .scope(ctx->recFunScope)
+          .deviceType(c10::DeviceType::CPU)
+          .shapes(*ctx->shapes);
       if (ctx->stack && !ctx->stack->empty()) {
         kineto_events_.back().stack(*ctx->stack);
       }
+      cpu_trace->ops.emplace_back(std::move(op));
     }
   }
 
   std::vector<KinetoEvent> kineto_events_;
   std::unique_ptr<libkineto::CpuTraceBuffer> cpu_trace;
 };
 
 KinetoThreadLocalState* getProfilerTLSState() {
-  const auto& state = c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::PROFILER_STATE);
+  const auto& state = c10::ThreadLocalDebugInfo::get(
+      c10::DebugInfoKind::PROFILER_STATE);
   return dynamic_cast<KinetoThreadLocalState*>(state.get());
 }
 
@@ -172,6 +175,7 @@ void prepareProfiler(
     k_activities.insert(libkineto::ActivityType::GPU_MEMCPY);
     k_activities.insert(libkineto::ActivityType::GPU_MEMSET);
     k_activities.insert(libkineto::ActivityType::CONCURRENT_KERNEL);
+    k_activities.insert(libkineto::ActivityType::CUDA_RUNTIME);
   }
 
   //if (!libkineto::api().hasProfilerRegistered()) {
@@ -239,14 +243,6 @@ ProfilerResult disableProfiler() {
   return ProfilerResult(kineto_events, legacy_events);
 }
 
-KinetoEvent& KinetoEvent::activity(const libkineto::TraceActivity& activity) {
-  name_ = activity.name();
-  device_index_ = activity.deviceId();
-  start_us_ = activity.timestamp();
-  duration_us_ = activity.duration();
-  correlation_id_ = activity.correlationId();
-  return *this;
-}
 #endif
 
 bool kinetoAvailable() {
diff --git a/torch/csrc/autograd/profiler_kineto.h b/torch/csrc/autograd/profiler_kineto.h
index 4f429afb72c3..889be6456473 100644
--- a/torch/csrc/autograd/profiler_kineto.h
+++ b/torch/csrc/autograd/profiler_kineto.h
@@ -118,7 +118,14 @@ struct TORCH_API KinetoEvent {
 
   // Kineto fields
 
-  KinetoEvent& activity(const libkineto::TraceActivity& activity);
+  KinetoEvent& activity(const libkineto::TraceActivity& activity) {
+    name_ = activity.name();
+    device_index_ = activity.deviceId();
+    start_us_ = activity.timestamp();
+    duration_us_ = activity.duration();
+    correlation_id_ = activity.correlationId();
+    return *this;
+  }
 
   std::string name() const {
     return name_;
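KinetoEvent::activity(), now inlined above, copies the identifying fields off a libkineto activity and returns *this, so the recording code can chain setters in a single expression. The builder pattern in isolation (stand-in types; the real TraceActivity interface lives in libkineto):

```cpp
#include <cstdint>
#include <string>

// Illustrative stand-ins for libkineto::TraceActivity and KinetoEvent.
struct TraceActivitySketch {
  std::string name;
  int64_t deviceId = 0;
  int64_t timestamp = 0;
  int64_t duration = 0;
  int64_t correlationId = 0;
};

class KinetoEventSketch {
 public:
  // Copy the identifying fields and return *this so further setters chain,
  // e.g. event.activity(op).startThreadId(tid) as in the diff above.
  KinetoEventSketch& activity(const TraceActivitySketch& a) {
    name_ = a.name;
    device_index_ = a.deviceId;
    start_us_ = a.timestamp;
    duration_us_ = a.duration;
    correlation_id_ = a.correlationId;
    return *this;
  }
  KinetoEventSketch& startThreadId(uint64_t tid) {
    start_thread_id_ = tid;
    return *this;
  }

 private:
  std::string name_;
  int64_t device_index_ = 0;
  int64_t start_us_ = 0;
  int64_t duration_us_ = 0;
  int64_t correlation_id_ = 0;
  uint64_t start_thread_id_ = 0;
};
```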
From 17767d1a5a1791c2d245e627d4983c9a1e669a81 Mon Sep 17 00:00:00 2001
From: ilia-cher
Date: Tue, 3 Nov 2020 14:32:09 -0800
Subject: [PATCH 39/59] Update on "Use libkineto in profiler"

Summary:
Adding ability to use Kineto (CUPTI) to profile CUDA kernels

Test Plan:
USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install
python test/test_profiler.py

[ghstack-poisoned]
---
 torch/csrc/autograd/profiler_kineto.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp
index d730ac6e7f8d..48749c820156 100644
--- a/torch/csrc/autograd/profiler_kineto.cpp
+++ b/torch/csrc/autograd/profiler_kineto.cpp
@@ -59,7 +59,9 @@ struct TORCH_API KinetoThreadLocalState : public ProfilerThreadLocalState {
           .fwdThreadId(ctx->fwdThreadId)
           .scope(ctx->recFunScope)
-          .deviceType(c10::DeviceType::CPU)
-          .shapes(*ctx->shapes);
+          .deviceType(c10::DeviceType::CPU);
+      if (ctx->shapes && !ctx->shapes->empty()) {
+        kineto_events_.back().shapes(*ctx->shapes);
+      }
       if (ctx->stack && !ctx->stack->empty()) {
         kineto_events_.back().stack(*ctx->stack);
       }
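The fix above defers shape recording until the optional has been checked: dereferencing an empty c10::optional is undefined behavior. The same guard with std::optional standing in for c10::optional:

```cpp
#include <cstdint>
#include <optional>
#include <vector>

using Shapes = std::vector<std::vector<int64_t>>;

struct EventSketch {
  Shapes shapes_;
  EventSketch& shapes(const Shapes& s) {
    shapes_ = s;
    return *this;
  }
};

void recordShapes(EventSketch& ev, const std::optional<Shapes>& shapes) {
  // Test the optional itself first, then reach the inner empty() check
  // through operator-> on the optional, as in the hunk above.
  if (shapes && !shapes->empty()) {
    ev.shapes(*shapes);
  }
}
```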
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
sgemm_32x32x32_NN  0.00%  0.000us  0.00%  0.000us  0.000us  12.000us  63.16%  12.000us  12.000us  1
void at::native::vectorized_elementwise_kernel<4, at...  0.00%  0.000us  0.00%  0.000us  0.000us  2.750us  14.47%  2.750us  2.750us  1
Memcpy HtoD (Pagable -> Device)  0.00%  0.000us  0.00%  0.000us  0.000us  2.250us  11.84%  2.250us  2.250us  1
Memcpy DtoH (Device -> Pagable)  0.00%  0.000us  0.00%  0.000us  0.000us  2.000us  10.53%  2.000us  2.000us  1
aten::mm  25.87%  364.400ms  25.87%  364.426ms  364.426ms  0.000us  0.00%  0.000us  0.000us  1
aten::empty  0.00%  39.585us  0.00%  39.585us  19.792us  0.000us  0.00%  0.000us  0.000us  2
aten::stride  0.00%  3.363us  0.00%  3.363us  1.121us  0.000us  0.00%  0.000us  0.000us  3
aten::add  74.12%  1.044s  74.12%  1.044s  1.044s  0.000us  0.00%  0.000us  0.000us  1
aten::to  0.00%  13.155us  0.01%  116.398us  116.398us  0.000us  0.00%  0.000us  0.000us  1
aten::empty_strided  0.00%  30.365us  0.00%  30.365us  30.365us  0.000us  0.00%  0.000us  0.000us  1
aten::copy_  0.01%  72.878us  0.01%  72.878us  72.878us  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
```

[ghstack-poisoned]
---
 torch/csrc/autograd/init.cpp            | 44 ++++++++++++++------
 torch/csrc/autograd/profiler_kineto.cpp | 54 ++++++++++++++++++-------
 torch/csrc/autograd/profiler_kineto.h   | 29 +++++++------
 3 files changed, 87 insertions(+), 40 deletions(-)

diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp
index b844c4349fc6..94d5476f6080 100644
--- a/torch/csrc/autograd/init.cpp
+++ b/torch/csrc/autograd/init.cpp
@@ -73,21 +73,41 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) {
 #ifdef USE_KINETO
   py::class_<KinetoEvent>(m, "KinetoEvent")
     .def("name", &KinetoEvent::name)
-    .def("start_thread_id", [](const KinetoEvent& e) { return e.startThreadId(); })
-    .def("end_thread_id", [](const KinetoEvent& e) { return e.endThreadId(); })
+    .def("start_thread_id", [](const KinetoEvent& e) {
+      return e.startThreadId();
+    })
+    .def("end_thread_id", [](const KinetoEvent& e) {
+      return e.endThreadId();
+    })
     .def("device_index", &KinetoEvent::deviceIndex)
     .def("start_us", &KinetoEvent::startUs)
     .def("duration_us", &KinetoEvent::durationUs)
-    .def("correlation_id", [](const KinetoEvent& e) { return e.correlationId(); })
-    .def("fwd_thread_id", [](const KinetoEvent& e) { return e.fwdThreadId(); })
-    .def("shapes", [](const KinetoEvent& e) { return e.shapes(); })
-    .def("sequence_nr", [](const KinetoEvent& e) { return e.sequenceNr(); })
-    .def("stack", [](const KinetoEvent& e) { return e.stack(); })
-    .def("scope", [](const KinetoEvent& e) { return e.scope(); });
-
-  py::class_<ProfilerResult>(m, "ProfilerResult")
-    .def("events", &ProfilerResult::events)
-    .def("legacy_events", &ProfilerResult::legacy_events);
+    .def("correlation_id", [](const KinetoEvent& e) {
+      return e.correlationId();
+    })
+    .def("fwd_thread_id", [](const KinetoEvent& e) {
+      return e.fwdThreadId();
+    })
+    .def("shapes", [](const KinetoEvent& e) {
+      return e.shapes();
+    })
+    .def("sequence_nr", [](const KinetoEvent& e) {
+      return e.sequenceNr();
+    })
+    .def("stack", [](const KinetoEvent& e) {
+      return e.stack();
+    })
+    .def("scope", [](const KinetoEvent& e) {
+      return e.scope();
+    });
+
+  py::class_<ProfilerResultWrapper>(m, "ProfilerResult")
+    .def("events", [](const ProfilerResultWrapper& r) {
+      return r.result_->events();
+    })
+    .def("legacy_events", [](const ProfilerResultWrapper& r) {
+      return r.result_->legacy_events();
+    });

   m.def("_enable_profiler", enableProfiler);
   m.def("_disable_profiler", disableProfiler);
diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp
index 48749c820156..09f2563d4b63 100644
--- a/torch/csrc/autograd/profiler_kineto.cpp
+++ b/torch/csrc/autograd/profiler_kineto.cpp
@@ -34,8 +34,9 @@ struct TORCH_API KinetoThreadLocalState : public ProfilerThreadLocalState {
       const at::ObserverContext* observer_ctx) override {
     auto ctx = dynamic_cast<const KinetoObserverContext*>(observer_ctx);
     TORCH_CHECK(ctx);
-    TORCH_CHECK(config_.state == ProfilerState::KINETO,
-        "Supported only in Kineto profiler");
+    if (!ctx) {
+      return;
+    }
     libkineto::ClientTraceActivity op;
     op.startTime = ctx->startUs;
     op.endTime = getTimeUs();
@@ -58,8 +59,8 @@ struct TORCH_API KinetoThreadLocalState : public ProfilerThreadLocalState {
           .sequenceNr(ctx->sequenceNr)
           .fwdThreadId(ctx->fwdThreadId)
           .scope(ctx->recFunScope)
-          .deviceType(c10::DeviceType::CPU)
-      if (ctx->shapes && !ctx->shapes.empty()) {
+          .deviceType(c10::DeviceType::CPU);
+      if (ctx->shapes && !ctx->shapes->empty()) {
         kineto_events_.back().shapes(*ctx->shapes);
       }
       if (ctx->stack && !ctx->stack->empty()) {
@@ -69,6 +70,15 @@ struct TORCH_API KinetoThreadLocalState : public ProfilerThreadLocalState {
     }
   }

+  void addTraceEvents(libkineto::ActivityTraceInterface& trace) {
+    // tbd
+  }
+
+  std::vector<std::vector<KinetoEvent>> events() {
+    // tbd
+    return std::vector<std::vector<KinetoEvent>>();
+  }
+
   std::vector<KinetoEvent> kineto_events_;
   std::unique_ptr<libkineto::CpuTraceBuffer> cpu_trace;
 };
@@ -215,13 +225,7 @@ void enableProfiler(
   state->mark("__start_profile", false);
 }

-std::vector<std::vector<KinetoEvent>> filterTrace(
-    std::unique_ptr<libkineto::ActivityTraceInterface>&& trace) {
-  // tbd
-  return std::vector<std::vector<KinetoEvent>>();
-}
-
-ProfilerResult disableProfiler() {
+ProfilerResultWrapper disableProfiler() {
   // all the DebugInfoBase objects are scope based and supposed to use DebugInfoGuard
   auto state = c10::ThreadLocalDebugInfo::_pop(c10::DebugInfoKind::PROFILER_STATE);
@@ -239,12 +243,32 @@ ProfilerResultWrapper disableProfiler() {

   libkineto::api().transferCpuTrace(std::move(state_ptr->cpu_trace));

-  std::vector<std::vector<KinetoEvent>> kineto_events = filterTrace(
-      std::move(libkineto::api().stopTrace()));
-  auto legacy_events = state_ptr->consolidate();
-  return ProfilerResult(kineto_events, legacy_events);
+  auto trace = std::move(libkineto::api().stopTrace());
+  TORCH_CHECK(trace);
+  state_ptr->addTraceEvents(*trace);
+  return ProfilerResultWrapper(std::make_shared<ProfilerResult>(
+      std::move(state_ptr->events()),
+      std::move(state_ptr->consolidate()),
+      std::move(trace)));
 }

+KinetoEvent& KinetoEvent::activity(const libkineto::TraceActivity& activity) {
+  name_ = activity.name();
+  device_index_ = activity.deviceId();
+  start_us_ = activity.timestamp();
+  duration_us_ = activity.duration();
+  correlation_id_ = activity.correlationId();
+  return *this;
+}
+
+ProfilerResult::ProfilerResult(
+    std::vector<std::vector<KinetoEvent>> events,
+    thread_event_lists legacy_events,
+    std::unique_ptr<libkineto::ActivityTraceInterface> trace)
+  : events_(std::move(events)),
+    legacy_events_(std::move(legacy_events)),
+    trace_(std::move(trace)) {}
+
 #endif

 bool kinetoAvailable() {
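(Editorial aside, not part of the patch: with the entry points registered above, the Python-side call sequence this series converges on looks roughly as follows. This is a minimal sketch under stated assumptions, not an excerpt: `config` stands for a torch.autograd.ProfilerConfig built the way torch/autograd/profiler.py builds it, and `run_workload` is a hypothetical placeholder for user code.)

```
# Sketch of the intended flow, assuming the bindings registered above:
# _enable_profiler/_disable_profiler plus the wrapped ProfilerResult
# exposing events() and legacy_events().
torch.autograd._enable_profiler(config)       # start collecting
run_workload()                                # hypothetical user code
result = torch.autograd._disable_profiler()   # ProfilerResult wrapper
kineto_events = result.events()               # Kineto-sourced events
legacy_events = result.legacy_events()        # events not yet ported to Kineto
```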
diff --git a/torch/csrc/autograd/profiler_kineto.h b/torch/csrc/autograd/profiler_kineto.h
index 889be6456473..916cba70a1ca 100644
--- a/torch/csrc/autograd/profiler_kineto.h
+++ b/torch/csrc/autograd/profiler_kineto.h
@@ -3,10 +3,12 @@

 #include
 #include
+//#include

 #ifdef USE_KINETO
 namespace libkineto {
 class TraceActivity;
+class ActivityTraceInterface;
 }
 #endif
@@ -118,14 +120,7 @@ struct TORCH_API KinetoEvent {

   // Kineto fields

-  KinetoEvent& activity(const libkineto::TraceActivity& activity) {
-    name_ = activity.name();
-    device_index_ = activity.deviceId();
-    start_us_ = activity.timestamp();
-    duration_us_ = activity.duration();
-    correlation_id_ = activity.correlationId();
-    return *this;
-  }
+  KinetoEvent& activity(const libkineto::TraceActivity& activity);

   std::string name() const {
     return name_;
@@ -171,11 +166,11 @@ struct TORCH_API KinetoEvent {

 struct TORCH_API ProfilerResult {
   ProfilerResult(
-      const std::vector<std::vector<KinetoEvent>>& events,
-      const thread_event_lists& legacy_events)
-    : events_(events), legacy_events_(legacy_events) {}
+      std::vector<std::vector<KinetoEvent>> events,
+      thread_event_lists legacy_events,
+      std::unique_ptr<libkineto::ActivityTraceInterface> trace);

-  const std::vector<std::vector<KinetoEvent>> events() const {
+  const std::vector<std::vector<KinetoEvent>>& events() const {
     return events_;
   }

@@ -186,13 +181,21 @@ struct TORCH_API ProfilerResult {
 private:
   std::vector<std::vector<KinetoEvent>> events_;
   thread_event_lists legacy_events_; // tensor mem alloc, start/stop
+  std::unique_ptr<libkineto::ActivityTraceInterface> trace_;
+};
+
+// avoid unique_ptr copy issues when using pybind
+struct TORCH_API ProfilerResultWrapper {
+  ProfilerResultWrapper(const std::shared_ptr<ProfilerResult>& result)
+    : result_(result) {}
+  std::shared_ptr<ProfilerResult> result_;
 };

 TORCH_API void enableProfiler(
     const ProfilerConfig& config,
     const std::set<ActivityType>& activities);

-TORCH_API ProfilerResult disableProfiler();
+TORCH_API ProfilerResultWrapper disableProfiler();

 TORCH_API void prepareProfiler(
     const ProfilerConfig& config,

From 043dcd2ac4e6f6e3b86c0c04546ca2bc6e376363 Mon Sep 17 00:00:00 2001
From: ilia-cher
Date: Tue, 3 Nov 2020 16:52:37 -0800
Subject: [PATCH 41/59] Update on "Use libkineto in profiler"

Summary:
Adding ability to use Kineto (CUPTI) to profile CUDA kernels

Test Plan:
USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install
python test/test_profiler.py
```
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
sgemm_32x32x32_NN  0.00%  0.000us  0.00%  0.000us  0.000us  12.000us  63.16%  12.000us  12.000us  1
void at::native::vectorized_elementwise_kernel<4, at...
0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler_kineto.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp index 09f2563d4b63..f812f0b83d38 100644 --- a/torch/csrc/autograd/profiler_kineto.cpp +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -194,7 +194,7 @@ void prepareProfiler( // libkineto::api().registerProfiler( // std::make_unique(false)); //} - //libkineto::api().initProfilerIfRegistered(); + libkineto::api().initProfilerIfRegistered(); libkineto::api().prepareTrace(k_activities); } From 67d4acb9ad6d173d07edbccfc6bb46fe08aecf1e Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Wed, 11 Nov 2020 12:29:17 -0800 Subject: [PATCH 42/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/autograd/profiler.py | 77 +++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 34 deletions(-) diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index bcf10dfb8699..380325464575 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -89,15 +89,15 @@ def populate_cpu_children(self): for thread_id, thread_events in threads: thread_events_ = sorted( thread_events, - key=lambda event: [event.cpu_interval.start, -event.cpu_interval.end], + key=lambda event: [event.time_range.start, -event.time_range.end], ) current_events: List[FunctionEvent] = [] cur_end = 0 for event in thread_events_: while len(current_events) > 0: parent = current_events[-1] - if event.cpu_interval.start >= parent.cpu_interval.end or \ - event.cpu_interval.end > parent.cpu_interval.end: + if event.time_range.start >= parent.time_range.end or \ + event.time_range.end > parent.time_range.end: # this can't be a parent current_events.pop() else: @@ -205,8 +205,8 @@ def export_chrome_trace(self, path): '"args": {}}, ' % ( evt.name, - evt.cpu_interval.start, - evt.cpu_interval.elapsed_us(), + evt.time_range.start, + evt.time_range.elapsed_us(), evt.thread if not evt.is_remote else f'" node_id:{evt.node_id}, thread_id:{evt.thread} "', @@ -222,7 +222,7 @@ def export_chrome_trace(self, path): '"pid": "CPU functions", ' '"id": %s, ' '"cat": "cpu_to_cuda", ' - '"args": {}}, ' % (evt.name, evt.cpu_interval.start, + '"args": {}}, ' % (evt.name, evt.time_range.start, evt.thread, next_id)) f.write('{"name": "%s", ' '"ph": "f", ' @@ -433,22 +433,15 @@ def __exit__(self, exc_type, exc_val, exc_tb): if not self.enabled: return if self.kineto_activities: - result = torch.autograd._disable_profiler() - # - for evt_list in result.legacy_events(): - for evt in evt_list: - print(evt, evt.kind(), flush=True) - print() - for evt in result.events(): - print(" ", evt.name(), evt.start_thread_id(), evt.end_thread_id(), evt.device_index(), evt.device_resource_id(), evt.start_us(), evt.duration_us(), evt.correlation_id(), evt.fwd_thread_id()) - # - self.function_events = parse_profiler_result(result) + results = torch.autograd._disable_profiler() + parsed_results = parse_kineto_results(results) else: records = torch.autograd._disable_profiler_legacy() - self.function_events = EventList( - parse_event_records(records), - use_cuda=self.use_cuda, - profile_memory=self.profile_memory) + parsed_results = parse_legacy_records(records) + 
self.function_events = EventList( + parsed_results, + use_cuda=self.use_cuda, + profile_memory=self.profile_memory) if self.with_stack: self.function_events.set_backward_stacktraces() return False @@ -779,13 +772,13 @@ def elapsed_us(self): class FunctionEvent(FormattedTimesMixin): """Profiling information about a single function.""" def __init__( - self, id, node_id, name, thread, cpu_start, cpu_end, fwd_thread=None, input_shapes=None, + self, id, node_id, name, thread, start_us, end_us, fwd_thread=None, input_shapes=None, stack=None, scope=0, cpu_memory_usage=0, cuda_memory_usage=0, is_async=False, is_remote=True, sequence_nr=-1): self.id: int = id self.node_id: int = node_id self.name: str = name - self.cpu_interval: Interval = Interval(cpu_start, cpu_end) + self.time_range: Interval = Interval(start_us, end_us) self.thread: int = thread self.fwd_thread: Optional[int] = fwd_thread self.kernels: List[Kernel] = [] @@ -860,7 +853,7 @@ def self_cuda_time_total(self): @property def cpu_time_total(self): - return self.cpu_interval.elapsed_us() + return self.time_range.elapsed_us() @property def key(self): @@ -868,14 +861,14 @@ def key(self): def __repr__(self): return ( - ''.format( self.id, self.node_id, self.cpu_time_str, - self.cpu_interval.start, - self.cpu_interval.end, + self.time_range.start, + self.time_range.end, str([child.id for child in self.cpu_children]), self.cuda_time_str, self.name, @@ -971,10 +964,26 @@ def __missing__(self, key): self[key] = torch._C._demangle(key) if len(key) > 1 else key return self[key] -def parse_event_records(thread_records): +# Parsing of kineto profiler events +def parse_kineto_results(result): + # + for evt_list in result.legacy_events(): + for evt in evt_list: + print(evt, evt.kind(), flush=True) + print() + for evt in result.events(): + print(" ", evt.name(), evt.start_thread_id(), evt.end_thread_id(), evt.device_index(), evt.device_resource_id(), evt.start_us(), evt.duration_us(), evt.correlation_id(), evt.fwd_thread_id()) + # + return [] + # result.events() has most of the events - PyTorch op-level and device-level events + # result.legacy_events() has events not yet ported to kineto + # (e.g. start/stop marks, tensor memory allocator events) + +# Parsing of legacy profiler events +def parse_legacy_records(thread_records): def get_record_key(record): """ - Returns a tuple to be used by parse_event_records for correlating start and + Returns a tuple to be used by parse_legacy_records for correlating start and end records. """ return (record.handle(), record.node_id()) @@ -1083,8 +1092,8 @@ def adjusted_time(cuda_record, cuda_records_map): node_id=record.node_id(), name=string_table[start.name()], thread=start.thread_id(), - cpu_start=start_record.cpu_elapsed_us(start), - cpu_end=start_record.cpu_elapsed_us(record), + start_us=start_record.cpu_elapsed_us(start), + end_us=start_record.cpu_elapsed_us(record), fwd_thread=start.fwd_thread_id(), input_shapes=start.shapes(), stack=[entry for entry in start.stack() if filter_stack_entry(entry)], @@ -1122,7 +1131,7 @@ def adjusted_time(cuda_record, cuda_records_map): # granularity of the given clock tick)--we always show # the outermost nested call first. This adds stability # in how FunctionEvents appear - functions.sort(key=lambda evt: [evt.cpu_interval.start, -evt.cpu_interval.end]) + functions.sort(key=lambda evt: [evt.time_range.start, -evt.time_range.end]) return functions @@ -1169,8 +1178,8 @@ def parse_nvprof_trace(path): node_id=0, # missing a node_id when calling FunctionEvent. 
This is just to ensure # that pytorch doesn't crash when creating a FunctionEvent() object name=strings[row['name']], - cpu_start=row['start_time'], - cpu_end=row['end_time'], + start_us=row['start_time'], + end_us=row['end_time'], thread=0) # TODO: find in sqlite database functions.append(evt) functions_map[evt.id] = evt @@ -1201,7 +1210,7 @@ def parse_nvprof_trace(path): row['kernel_start'], row['kernel_end']) - functions.sort(key=lambda evt: evt.cpu_interval.start) + functions.sort(key=lambda evt: evt.time_range.start) return functions From 9f1d24fa938d8f4ca31080b9136d398e8cc7703c Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Wed, 11 Nov 2020 12:53:49 -0800 Subject: [PATCH 43/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/autograd/profiler.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index 380325464575..0d48ff1f1b41 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -966,19 +966,22 @@ def __missing__(self, key): # Parsing of kineto profiler events def parse_kineto_results(result): - # - for evt_list in result.legacy_events(): - for evt in evt_list: - print(evt, evt.kind(), flush=True) - print() - for evt in result.events(): - print(" ", evt.name(), evt.start_thread_id(), evt.end_thread_id(), evt.device_index(), evt.device_resource_id(), evt.start_us(), evt.duration_us(), evt.correlation_id(), evt.fwd_thread_id()) - # - return [] # result.events() has most of the events - PyTorch op-level and device-level events # result.legacy_events() has events not yet ported to kineto # (e.g. 
start/stop marks, tensor memory allocator events) + # First, find __start_profile mark to get the absolute time of the start of the trace + start_record = None + for record in itertools.chain(*result.legacy_events()): + if record.kind() == 'mark' and record.name() == '__start_profile': + assert start_record is None + start_record = record + assert start_record is not None, "Invalid profiler output, __start_profile is missing" + + # Create and return FunctionEvent list + function_events = [] + return function_events + # Parsing of legacy profiler events def parse_legacy_records(thread_records): def get_record_key(record): From e86420571e5458128a62d3b0ee11bc5b8b18f80b Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Wed, 11 Nov 2020 13:04:28 -0800 Subject: [PATCH 44/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/autograd/profiler.py | 80 +++++++++++++++++++++++++++----------- 1 file changed, 58 insertions(+), 22 deletions(-) diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index 0d48ff1f1b41..4e3c0b9ca15b 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -964,22 +964,78 @@ def __missing__(self, key): self[key] = torch._C._demangle(key) if len(key) > 1 else key return self[key] +def filter_stack_entry(entry): + filtered_entries = [ + ("autograd/__init__", "_make_grads"), + ("autograd/__init__", "backward"), + ("torch/tensor", "backward"), + ("_internal/common_utils", "prof_callable"), + ("_internal/common_utils", "prof_func_call"), + ("_internal/common_utils", "prof_meth_call"), + ] + return all([not (f[0] in entry and f[1] in entry) for f in filtered_entries]) + +def 
filter_name(name): + # ignoring the following utility ops + filtered_out_names = [ + "profiler::_record_function_enter", + "profiler::_record_function_exit", + "aten::is_leaf", + "aten::output_nr", + "aten::_version", + ] + return name in filtered_out_names + # Parsing of kineto profiler events def parse_kineto_results(result): # result.events() has most of the events - PyTorch op-level and device-level events # result.legacy_events() has events not yet ported to kineto # (e.g. start/stop marks, tensor memory allocator events) - # First, find __start_profile mark to get the absolute time of the start of the trace + # First, find __start_profile mark to get the absolute time of the start of the trace; + # save memory allocation records start_record = None + mem_records = [] for record in itertools.chain(*result.legacy_events()): if record.kind() == 'mark' and record.name() == '__start_profile': assert start_record is None start_record = record + if record.kind() == 'memory_alloc': + mem_records.append(record) assert start_record is not None, "Invalid profiler output, __start_profile is missing" # Create and return FunctionEvent list function_events = [] + for kineto_event in result.events(): + fe = FunctionEvent( + id=record.handle(), + node_id=record.node_id(), + name=string_table[start.name()], + thread=start.thread_id(), + start_us=start_record.cpu_elapsed_us(start), + end_us=start_record.cpu_elapsed_us(record), + fwd_thread=start.fwd_thread_id(), + input_shapes=start.shapes(), + stack=[entry for entry in start.stack() if filter_stack_entry(entry)], + scope=start.scope(), + cpu_memory_usage=cpu_memory_usage, + cuda_memory_usage=cuda_memory_usage, + is_async=is_async, + is_remote=is_remote_event, + sequence_nr=start.sequence_nr(), + ) + # note: async events have only cpu total time + if not is_async and start.has_cuda(): + cuda_start = adjusted_time(start, cuda_records) + cuda_end = adjusted_time(record, cuda_records) + if (cuda_end - cuda_start) > 0: + fe.append_kernel( + start.name(), + start.device(), + cuda_start, + cuda_end) + function_events.append(fe) + function_events.sort(key=lambda evt: [evt.time_range.start, -evt.time_range.end]) return function_events # Parsing of legacy profiler events @@ -998,26 +1054,6 @@ def get_record_key(record): record_stack = [] string_table = StringTable() - # ignoring the following utility ops - filtered_out_names = [ - "profiler::_record_function_enter", - "profiler::_record_function_exit", - "aten::is_leaf", - "aten::output_nr", - "aten::_version", - ] - - def filter_stack_entry(entry): - filtered_entries = [ - ("autograd/__init__", "_make_grads"), - ("autograd/__init__", "backward"), - ("torch/tensor", "backward"), - ("_internal/common_utils", "prof_callable"), - ("_internal/common_utils", "prof_func_call"), - ("_internal/common_utils", "prof_meth_call"), - ] - return all([not (f[0] in entry and f[1] in entry) for f in filtered_entries]) - # cuda start events and the overall profiler start event don't happen # at exactly the same time because we need to record an event on each device # and each record takes ~4us. 
So we adjust here by the difference @@ -1054,7 +1090,7 @@ def adjusted_time(cuda_record, cuda_records_map): prev_record = None for record in thread_record_list: record_key = get_record_key(record) - if (record.name() in filtered_out_names or + if (filter_name(record.name()) or record_key in filtered_handles): filtered_handles.add(record_key) continue From 380b874e628290877b75a9da955336a75e145004 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Wed, 11 Nov 2020 14:17:00 -0800 Subject: [PATCH 45/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/autograd/profiler.py | 44 ++++++++------------ torch/csrc/autograd/init.cpp | 53 ++++++++++++++++++------- torch/csrc/autograd/profiler_kineto.cpp | 18 +++++++++ torch/csrc/autograd/profiler_kineto.h | 2 + 4 files changed, 76 insertions(+), 41 deletions(-) diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index 4e3c0b9ca15b..17d854a74f4d 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -772,9 +772,9 @@ def elapsed_us(self): class FunctionEvent(FormattedTimesMixin): """Profiling information about a single function.""" def __init__( - self, id, node_id, name, thread, start_us, end_us, fwd_thread=None, input_shapes=None, + self, id, name, thread, start_us, end_us, fwd_thread=None, input_shapes=None, stack=None, scope=0, cpu_memory_usage=0, cuda_memory_usage=0, is_async=False, - is_remote=True, sequence_nr=-1): + is_remote=False, sequence_nr=-1, node_id=0): self.id: int = id self.node_id: int = node_id self.name: str = name @@ -1005,35 +1005,25 @@ def parse_kineto_results(result): assert start_record is not None, "Invalid profiler output, 
__start_profile is missing"

     # Create and return FunctionEvent list
+    string_table = StringTable()
     function_events = []
     for kineto_event in result.events():
+        fe_start_us = kineto_event.start_us() - start_record.start_us()
         fe = FunctionEvent(
-            id=record.handle(),
-            node_id=record.node_id(),
-            name=string_table[start.name()],
-            thread=start.thread_id(),
-            start_us=start_record.cpu_elapsed_us(start),
-            end_us=start_record.cpu_elapsed_us(record),
-            fwd_thread=start.fwd_thread_id(),
-            input_shapes=start.shapes(),
-            stack=[entry for entry in start.stack() if filter_stack_entry(entry)],
-            scope=start.scope(),
-            cpu_memory_usage=cpu_memory_usage,
-            cuda_memory_usage=cuda_memory_usage,
-            is_async=is_async,
-            is_remote=is_remote_event,
-            sequence_nr=start.sequence_nr(),
+            id=kineto_event.correlation_id(),
+            name=string_table[kineto_event.name()],
+            thread=kineto_event.start_thread_id(),
+            start_us=fe_start_us,
+            end_us=fe_start_us + kineto_event.duration_us(),
+            fwd_thread=kineto_event.fwd_thread_id(),
+            input_shapes=kineto_event.shapes(),
+            stack=[entry for entry in kineto_event.stack() if filter_stack_entry(entry)],
+            scope=kineto_event.scope(),
+            #cpu_memory_usage=cpu_memory_usage,
+            #cuda_memory_usage=cuda_memory_usage,
+            is_async=kineto_event.start_thread_id() != kineto_event.end_thread_id(),
+            sequence_nr=kineto_event.sequence_nr(),
         )
-        # note: async events have only cpu total time
-        if not is_async and start.has_cuda():
-            cuda_start = adjusted_time(start, cuda_records)
-            cuda_end = adjusted_time(record, cuda_records)
-            if (cuda_end - cuda_start) > 0:
-                fe.append_kernel(
-                    start.name(),
-                    start.device(),
-                    cuda_start,
-                    cuda_end)
         function_events.append(fe)
     function_events.sort(key=lambda evt: [evt.time_range.start, -evt.time_range.end])
     return function_events
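(Editorial aside, not part of the patch: the loop above leans on the fact that Kineto timestamps are absolute while FunctionEvent expects times relative to the trace start, which is why the `__start_profile` mark is located first. A hedged sketch of that conversion, with `kineto_event` and `start_record` as in parse_kineto_results above:)

```
# Illustration of the timestamp conversion done above:
# KinetoEvent.start_us() is absolute (us since unix epoch), while
# FunctionEvent works in us relative to the __start_profile mark.
trace_start_us = start_record.start_us()
rel_start_us = kineto_event.start_us() - trace_start_us
rel_end_us = rel_start_us + kineto_event.duration_us()
```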
.def("shapes", [](const KinetoEvent& e) { - return e.shapes(); - }) - .def("sequence_nr", [](const KinetoEvent& e) { - return e.sequenceNr(); + if (e.hasShapes()) { + return e.shapes(); + } else { + return std::vector>(); + } }) + // stack traces of the PyTorch CPU events .def("stack", [](const KinetoEvent& e) { - return e.stack(); + if (e.hasStack()) { + return e.stack(); + } else { + return std::vector(); + } }) + // type of the RecordFunction that generated this PyTorch CPU event + // (op, torchscript function, user label, etc) .def("scope", [](const KinetoEvent& e) { return e.scope(); }) - .def("activity_type", [](const KinetoEvent& e) { - return e.activityType(); - }); + // device number, for CPU - process id + .def("device_index", &KinetoEvent::deviceIndex) + // for CUDA - stream id, for CPU - start thread id + .def("device_resource_id", &KinetoEvent::deviceResourceId) + // device type, currently: CPU or CUDA + .def("device_type", &KinetoEvent::deviceType); py::class_(m, "ProfilerResult") .def("events", [](const ProfilerResultWrapper& r) { diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp index e855674bc860..721a698bacaf 100644 --- a/torch/csrc/autograd/profiler_kineto.cpp +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -282,6 +282,24 @@ KinetoEvent& KinetoEvent::activity(const libkineto::TraceActivity& activity) { return *this; } +c10::DeviceType KinetoEvent::deviceType() const { + switch (activity_type_) { + case (uint8_t)libkineto::ActivityType::CPU_OP: + return c10::DeviceType::CPU; + case (uint8_t)libkineto::ActivityType::GPU_MEMCPY: + return c10::DeviceType::CUDA; + case (uint8_t)libkineto::ActivityType::GPU_MEMSET: + return c10::DeviceType::CUDA; + case (uint8_t)libkineto::ActivityType::CONCURRENT_KERNEL: + return c10::DeviceType::CUDA; + case (uint8_t)libkineto::ActivityType::EXTERNAL_CORRELATION: + return c10::DeviceType::CPU; + case (uint8_t)libkineto::ActivityType::CUDA_RUNTIME: + return c10::DeviceType::CPU; + } + TORCH_CHECK(false, "Unknown activity type"); +} + KinetoEvent::KinetoEvent() : activity_type_((uint8_t)libkineto::ActivityType::CPU_OP) {} ProfilerResult::ProfilerResult( diff --git a/torch/csrc/autograd/profiler_kineto.h b/torch/csrc/autograd/profiler_kineto.h index c4e08834af0e..998667da6d97 100644 --- a/torch/csrc/autograd/profiler_kineto.h +++ b/torch/csrc/autograd/profiler_kineto.h @@ -144,6 +144,8 @@ struct TORCH_API KinetoEvent { return device_resource_id_; } + c10::DeviceType deviceType() const; + uint64_t start_thread_id_ = 0; uint64_t end_thread_id_ = 0; uint64_t fwd_thread_id_ = 0; From 165bb7c75613e25793e2cb90c483b88b0e6a3177 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Wed, 11 Nov 2020 14:19:52 -0800 Subject: [PATCH 46/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 
From 165bb7c75613e25793e2cb90c483b88b0e6a3177 Mon Sep 17 00:00:00 2001
From: ilia-cher
Date: Wed, 11 Nov 2020 14:19:52 -0800
Subject: [PATCH 46/59] Update on "Use libkineto in profiler"

Summary:
Adding ability to use Kineto (CUPTI) to profile CUDA kernels

Test Plan:
USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install
python test/test_profiler.py
```
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
sgemm_32x32x32_NN  0.00%  0.000us  0.00%  0.000us  0.000us  12.000us  63.16%  12.000us  12.000us  1
void at::native::vectorized_elementwise_kernel<4, at...  0.00%  0.000us  0.00%  0.000us  0.000us  2.750us  14.47%  2.750us  2.750us  1
Memcpy HtoD (Pagable -> Device)  0.00%  0.000us  0.00%  0.000us  0.000us  2.250us  11.84%  2.250us  2.250us  1
Memcpy DtoH (Device -> Pagable)  0.00%  0.000us  0.00%  0.000us  0.000us  2.000us  10.53%  2.000us  2.000us  1
aten::mm  25.87%  364.400ms  25.87%  364.426ms  364.426ms  0.000us  0.00%  0.000us  0.000us  1
aten::empty  0.00%  39.585us  0.00%  39.585us  19.792us  0.000us  0.00%  0.000us  0.000us  2
aten::stride  0.00%  3.363us  0.00%  3.363us  1.121us  0.000us  0.00%  0.000us  0.000us  3
aten::add  74.12%  1.044s  74.12%  1.044s  1.044s  0.000us  0.00%  0.000us  0.000us  1
aten::to  0.00%  13.155us  0.01%  116.398us  116.398us  0.000us  0.00%  0.000us  0.000us  1
aten::empty_strided  0.00%  30.365us  0.00%  30.365us  30.365us  0.000us  0.00%  0.000us  0.000us  1
aten::copy_  0.01%  72.878us  0.01%  72.878us  72.878us  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
```

[ghstack-poisoned]
---
 torch/csrc/autograd/init.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp
index 647683cc19da..6afdab010b76 100644
--- a/torch/csrc/autograd/init.cpp
+++ b/torch/csrc/autograd/init.cpp
@@ -74,11 +74,11 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) {
   py::class_<KinetoEvent>(m, "KinetoEvent")
     // name of the event
     .def("name", &KinetoEvent::name)
-    // start callback PyTorch thread id
+    // PyTorch thread id of the start callback
     .def("start_thread_id", [](const KinetoEvent& e) {
       return e.startThreadId();
     })
-    // end callback PyTorch thread id
+    // PyTorch thread id of the end callback
     .def("end_thread_id", [](const KinetoEvent& e) {
       return e.endThreadId();
     })
@@ -96,7 +96,7 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) {
     .def("start_us", &KinetoEvent::startUs)
     // duration in us
     .def("duration_us", &KinetoEvent::durationUs)
-    // used to correlate between high-level PyTorch events
+    // used for correlation between high-level PyTorch events
     // and low-level device events
     .def("correlation_id", [](const KinetoEvent& e) {
       return e.correlationId();
@@ -117,7 +117,7 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) {
       }
     })
-    // type of the RecordFunction that generated this PyTorch CPU event
+    // type of the RecordFunction that generated a PyTorch CPU event
     // (op, torchscript function, user label, etc)
     .def("scope", [](const KinetoEvent& e) {
       return e.scope();

From 445b8c1300435b1f14501694ec769715802699ea Mon Sep 17 00:00:00 2001
From: ilia-cher
Date: Wed, 11 Nov 2020 16:06:58 -0800
Subject: [PATCH 47/59] Update on "Use libkineto in profiler"

Summary:
Adding ability to use Kineto (CUPTI) to profile CUDA kernels

Test Plan:
USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install
python test/test_profiler.py
```
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/autograd/profiler.py | 80 ++++++++++++++++++++++++++++-------- torch/csrc/autograd/init.cpp | 4 +- 2 files changed, 67 insertions(+), 17 deletions(-) diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index 17d854a74f4d..24b88c41865e 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -62,7 +62,7 @@ def populate_cpu_children(self): # Some events can be async (i.e. start and end on different threads), # since it's generally undefined how to attribute children ranges to # async ranges, we do not use them when calculating nested ranges and stats - sync_events = [evt for evt in self if not evt.is_async] + sync_events = [evt for evt in self if not evt.is_async and evt.device_type == 0] events = sorted( sync_events, key=attrgetter("thread"), @@ -774,7 +774,7 @@ class FunctionEvent(FormattedTimesMixin): def __init__( self, id, name, thread, start_us, end_us, fwd_thread=None, input_shapes=None, stack=None, scope=0, cpu_memory_usage=0, cuda_memory_usage=0, is_async=False, - is_remote=False, sequence_nr=-1, node_id=0): + is_remote=False, sequence_nr=-1, node_id=0, device_type=0): self.id: int = id self.node_id: int = node_id self.name: str = name @@ -793,8 +793,10 @@ def __init__( self.is_async: bool = is_async self.is_remote: bool = is_remote self.sequence_nr: int = sequence_nr + self.device_type: int = device_type def append_kernel(self, name, device, start, end): + assert self.device_type == 0 # CPU self.kernels.append(Kernel(name, device, Interval(start, end))) def append_cpu_child(self, child): @@ -803,7 +805,9 @@ def append_cpu_child(self, child): One is supposed to append only direct children to the event to have correct self cpu time being reported. """ + assert(self.device_type == 0) # CPU assert(isinstance(child, FunctionEvent)) + assert(child.device_type == 0) self.cpu_children.append(child) def set_cpu_parent(self, parent): @@ -813,14 +817,16 @@ def set_cpu_parent(self, parent): the child's range interval is completely inside the parent's. We use this connection to determine the event is from top-level op or not. 
""" + assert(self.device_type == 0) # CPU assert(isinstance(parent, FunctionEvent)) + assert(parent.device_type == 0) self.cpu_parent = parent # Note: async events don't have children, are not used when computing 'self' # metrics of other events, have only total cpu time @property def self_cpu_memory_usage(self): - if self.is_async: + if self.is_async or self.device_type != 0: # CPU return 0 return self.cpu_memory_usage - sum( [child.cpu_memory_usage for child in self.cpu_children] @@ -828,7 +834,7 @@ def self_cpu_memory_usage(self): @property def self_cuda_memory_usage(self): - if self.is_async: + if self.is_async or self.device_type != 0: # CPU return 0 return self.cuda_memory_usage - sum( [child.cuda_memory_usage for child in self.cpu_children] @@ -836,7 +842,7 @@ def self_cuda_memory_usage(self): @property def self_cpu_time_total(self): - if self.is_async: + if self.is_async or self.device_type != 0: return 0 return self.cpu_time_total - sum( [child.cpu_time_total for child in self.cpu_children] @@ -844,16 +850,31 @@ def self_cpu_time_total(self): @property def cuda_time_total(self): - return sum(kinfo.interval.elapsed_us() for kinfo in self.kernels) + if self.is_async: + return 0 + if self.device_type == 0: # CPU + return sum(kinfo.interval.elapsed_us() for kinfo in self.kernels) + else: + assert self.device_type == 1 # CUDA + return self.time_range.elapsed_us() @property def self_cuda_time_total(self): - return sum(kinfo.interval.elapsed_us() for kinfo in self.kernels) - \ - sum([child.cuda_time_total for child in self.cpu_children]) + if self.is_async: + return 0 + if self.device_type == 0: # CPU + return self.cuda_time_total - \ + sum([child.cuda_time_total for child in self.cpu_children]) + else: + assert(self.device_type == 1) # CUDA + return self.cuda_time_total @property def cpu_time_total(self): - return self.time_range.elapsed_us() + if self.device_type == 0: # CPU + return self.time_range.elapsed_us() + else: + return 0 @property def key(self): @@ -861,10 +882,12 @@ def key(self): def __repr__(self): return ( - ''.format( self.id, + self.name, + self.device_type, self.node_id, self.cpu_time_str, self.time_range.start, @@ -1008,22 +1031,46 @@ def parse_kineto_results(result): string_table = StringTable() function_events = [] for kineto_event in result.events(): - fe_start_us = kineto_event.start_us() - start_record.start_us() + rel_start_us = kineto_event.start_us() - start_record.start_us() + rel_end_us = rel_start_us + kineto_event.duration_us() + abs_end_us = kineto_event.start_us() + kineto_event.duration_us() + + cpu_memory_usage = 0 + cuda_memory_usage = 0 + if kineto_event.device_type() == 0: # CPU + # find the corresponding memory allocation events + for mem_record in mem_records: + if (mem_record.start_us() >= kineto_event.start_us() and + mem_record.start_us() <= abs_end_us): + cpu_memory_usage += mem_record.cpu_memory_usage() + cuda_memory_usage += mem_record.cuda_memory_usage() + is_async = kineto_event.start_thread_id() != kineto_event.end_thread_id() fe = FunctionEvent( id=kineto_event.correlation_id(), name=string_table[kineto_event.name()], thread=kineto_event.start_thread_id(), - start_us=fe_start_us, - end_us=fe_start_us + kineto_event.duration_us(), + start_us=rel_start_us, + end_us=rel_end_us, fwd_thread=kineto_event.fwd_thread_id(), input_shapes=kineto_event.shapes(), stack=[entry for entry in kineto_event.stack() if filter_stack_entry(entry)], scope=kineto_event.scope(), - #cpu_memory_usage=cpu_memory_usage, - #cuda_memory_usage=cuda_memory_usage, - 
is_async=kineto_event.start_thread_id() != kineto_event.end_thread_id(), + cpu_memory_usage=cpu_memory_usage, + cuda_memory_usage=cuda_memory_usage, + is_async=is_async, sequence_nr=kineto_event.sequence_nr(), + device_type=kineto_event.device_type(), ) + # associate CUDA kernels with a CPU event + if kineto_event.device_type() == 0 and not is_async: + for evt in result.events(): + if evt.device_type == 1: # CUDA + if evt.correlation_id == kineto_event.correlation_id: + fe.append_kernel( + evt.name(), + evt.device_index(), + evt.start_us(), + evt.start_us() + evt.duration_us()) function_events.append(fe) function_events.sort(key=lambda evt: [evt.time_range.start, -evt.time_range.end]) return function_events @@ -1132,6 +1179,7 @@ def adjusted_time(cuda_record, cuda_records_map): is_async=is_async, is_remote=is_remote_event, sequence_nr=start.sequence_nr(), + device_type=0, # CPU ) # note: async events have only cpu total time if not is_async and start.has_cuda(): diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index 6afdab010b76..b187a4f8524c 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -127,7 +127,9 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { // for CUDA - stream id, for CPU - start thread id .def("device_resource_id", &KinetoEvent::deviceResourceId) // device type, currently: CPU or CUDA - .def("device_type", &KinetoEvent::deviceType); + .def("device_type", [](const KinetoEvent& e) { + return (uint8_t)e.deviceType(); + }); py::class_(m, "ProfilerResult") .def("events", [](const ProfilerResultWrapper& r) { From 7c317f5cd49b980c321ce36ec73bcc936051c316 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Wed, 11 Nov 2020 17:13:26 -0800 Subject: [PATCH 48/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- test/test_autograd.py | 57 ++++++++++++------------- test/test_profiler.py | 5 ++- torch/autograd/profiler.py | 30 ++++++------- torch/csrc/autograd/profiler_kineto.cpp | 1 + 4 files changed, 46 insertions(+), 47 deletions(-) diff --git a/test/test_autograd.py b/test/test_autograd.py index 177a9b4c7805..365c72fff471 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -33,7 +33,7 @@ suppress_warnings, slowTest, load_tests, random_symmetric_matrix, IS_WINDOWS, IS_MACOS, CudaMemoryLeakCheck) -from torch.autograd import Variable, Function, detect_anomaly +from torch.autograd import Variable, Function, detect_anomaly, kineto_available from torch.autograd.function import InplaceFunction from torch.testing import randn_like from torch.testing._internal.common_methods_invocations import (method_tests, @@ -2989,7 +2989,7 @@ def gen_matrices(p): https://github.com/pytorch/pytorch/issues/34086""") def test_profiler_tracing(self): t1, t2 = torch.ones(1), torch.ones(1) - with torch.autograd.profiler.profile() as prof: + with torch.autograd.profiler.profile(use_kineto=kineto_available()) as prof: torch.add(t1, t2) with tempfile.NamedTemporaryFile(mode="w+") as f: @@ -3004,7 +3004,7 @@ def test_profiler_tracing(self): device = torch.device("cuda:0") t1, t2 = torch.ones(1, device=device), torch.ones(1, device=device) - with torch.autograd.profiler.profile(use_cuda=True) as prof: + with torch.autograd.profiler.profile(use_cuda=True, use_kineto=kineto_available()) as prof: torch.add(t1, t2) with tempfile.NamedTemporaryFile(mode="w+") as f: @@ -3015,7 +3015,7 @@ def test_profiler_tracing(self): def test_profiler(self): x = torch.randn(10, 10) - with profile() as p: + with profile(use_kineto=kineto_available()) as p: self.assertTrue(torch.autograd._profiler_enabled()) y = x * 2 + 4 @@ -3026,17 +3026,14 @@ def test_profiler(self): 'aten::empty', 'aten::add', 'aten::to', 'aten::empty_strided', 'aten::copy_', 'aten::empty'] top_level_names = ['aten::mul', 'aten::add'] - top_level_iter = iter(top_level_names) - self.assertEqual(len(p.function_events), len(names)) - for info, expected_name in zip(p.function_events, names): - if info.cpu_interval.start > last_end: - top_level_name_expected = next(top_level_iter) - self.assertEqual(info.name, top_level_name_expected) - last_end = info.cpu_interval.end - self.assertEqual(info.name, expected_name) + for evt in p.function_events: + if evt.time_range.start > last_end: + self.assertTrue(evt.name in 
+            last_end = evt.time_range.end
+            self.assertTrue(evt.name in names)
 
     def test_profiler_seq_nr(self):
-        with profile() as p:
+        with profile(use_kineto=kineto_available()) as p:
             x = torch.randn(10, 10, requires_grad=True)
             y = torch.randn(10, 10, requires_grad=True)
             z = x + y
@@ -3084,7 +3081,7 @@ def test_profiler_seq_nr(self):
 
     def test_profiler_unboxed_only(self):
         x = torch.rand(3, 4)
-        with torch.autograd.profiler.profile() as prof:
+        with torch.autograd.profiler.profile(use_kineto=kineto_available()) as prof:
             x.resize_([3, 2])
 
     def test_profiler_propagation(self):
@@ -3109,7 +3106,7 @@ def bar(x):
 
         traced_bar = torch.jit.trace(bar, x)
 
-        with profile() as p:
+        with profile(use_kineto=kineto_available()) as p:
             traced_bar(x)
 
         found_foo = False
@@ -3131,7 +3128,7 @@ def bar(x):
 
     def test_record_function_callbacks(self):
         x = torch.randn(10, 10)
-        with profile() as p:
+        with profile(use_kineto=kineto_available()) as p:
             with record_function("foo"):
                 y = x * 2 + 4
@@ -3163,8 +3160,8 @@ def get_id():
                     node_id=0,
                     name="",
                     thread=thread,
-                    cpu_start=range[0],
-                    cpu_end=range[1],
+                    start_us=range[0],
+                    end_us=range[1],
                 )
             )
 
@@ -3187,7 +3184,7 @@ def test_profiler_aggregation_table(self):
         """
         x = torch.randn(1024)
-        with torch.autograd.profiler.profile() as prof:
+        with torch.autograd.profiler.profile(use_kineto=kineto_available()) as prof:
             torch.einsum("i->", x)
 
         prof_str = str(prof)
@@ -3197,8 +3194,8 @@ def test_profiler_function_event_avg(self):
         avg = FunctionEventAvg()
-        avg.add(FunctionEvent(id=0, node_id=0, name="foo", thread=0, cpu_start=10, cpu_end=15))
-        avg.add(FunctionEvent(id=1, node_id=0, name="foo", thread=0, cpu_start=20, cpu_end=30))
+        avg.add(FunctionEvent(id=0, node_id=0, name="foo", thread=0, start_us=10, end_us=15))
+        avg.add(FunctionEvent(id=1, node_id=0, name="foo", thread=0, start_us=20, end_us=30))
         avg.add(avg)
 
         self.assertEqual(avg.key, "foo")
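The constructor rename above (cpu_start/cpu_end to start_us/end_us) feeds the same elapsed-time bookkeeping the test exercises. Roughly, in a simplified stand-in for FunctionEventAvg (a sketch, not the real class):

```
from dataclasses import dataclass
from typing import Optional

@dataclass
class EventAvgSketch:      # simplified stand-in for FunctionEventAvg
    key: Optional[str] = None
    count: int = 0
    cpu_time_total_us: int = 0

    def add(self, name: str, start_us: int, end_us: int) -> "EventAvgSketch":
        # all added events must share one key, as the assertions above expect
        assert self.key is None or self.key == name
        self.key = name
        self.count += 1
        self.cpu_time_total_us += end_us - start_us
        return self

avg = EventAvgSketch().add("foo", 10, 15).add("foo", 20, 30)
assert (avg.key, avg.count, avg.cpu_time_total_us) == ("foo", 2, 15)
```

The real FunctionEventAvg additionally merges CUDA times, memory counters, and whole FunctionEventAvg instances, which is what the avg.add(avg) line in the test exercises.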
@@ -3217,7 +3214,7 @@ def test_profiler_shapes(self):
         layer1 = torch.nn.Linear(20, 30)
         layer2 = torch.nn.Linear(30, 40)
         input = torch.randn(128, 20)
-        with profile(record_shapes=True) as prof:
+        with profile(record_shapes=True, use_kineto=kineto_available()) as prof:
             layer2(layer1(input))
 
         print(prof.function_events)
@@ -3233,18 +3230,18 @@ def test_profiler_shapes(self):
         last_end = 0
         for event in prof.function_events:
-            if event.cpu_interval.start > last_end:
+            if event.time_range.start > last_end:
                 name_expected, input_shape_expected = next(expected_iter)
                 if name_expected is not None:
                     self.assertEqual(event.name, name_expected)
                 self.assertEqual(event.input_shapes, input_shape_expected)
-                last_end = event.cpu_interval.end
+                last_end = event.time_range.end
 
     def test_profiler_no_cuda(self):
         print("")
         layer = torch.nn.Linear(20, 30)
         x = torch.randn(128, 20)
-        with profile(use_cuda=False) as prof:
+        with profile(use_cuda=False, use_kineto=kineto_available()) as prof:
             layer(x)
 
         prof_str = str(prof)
@@ -3256,7 +3253,7 @@ def test_profiler_aggregation_lstm(self):
         print("")
         rnn = torch.nn.LSTM(10, 20, 2)
         total_time_s = 0
-        with profile(record_shapes=True) as prof:
+        with profile(record_shapes=True, use_kineto=kineto_available()) as prof:
             for i in range(20):
                 input = torch.randn(5, 3, 10)
                 h = torch.randn(2, 3, 20)
@@ -3293,7 +3290,7 @@ def test_memory_profiler(self):
         def run_profiler(tensor_creation_fn, metric):
             # collecting allocs / deallocs
-            with profile(profile_memory=True, record_shapes=True) as prof:
+            with profile(profile_memory=True, record_shapes=True, use_kineto=kineto_available()) as prof:
                 x = None
                 with record_function("test_user_scope_alloc"):
                     x = tensor_creation_fn()
@@ -3385,7 +3382,7 @@ def create_mkldnn_tensor():
         # check partial overlap of tensor allocation with memory profiler
         x = torch.rand(10, 10)
-        with profile(profile_memory=True, record_shapes=True, use_kineto=kineto_available()) as prof:
+        with profile(profile_memory=True, record_shapes=True, use_kineto=kineto_available()) as prof:
             del x
             x = torch.rand(10, 10)
             del x
@@ -3411,7 +3408,7 @@ def forward(x):
 
         forward(x)
 
-        with profile() as p:
+        with profile(use_kineto=kineto_available()) as p:
             forward(x)
 
         events = p.function_events
@@ -3436,7 +3433,7 @@ def forward(x):
         def f(x, y):
             return x + y
 
-        with profile() as p:
+        with profile(use_kineto=kineto_available()) as p:
             f(1, 2)
 
         self.assertTrue('my_func' in str(p))
diff --git a/test/test_profiler.py b/test/test_profiler.py
index 44973546429e..6d7618ca8a0b 100644
--- a/test/test_profiler.py
+++ b/test/test_profiler.py
@@ -7,6 +7,7 @@
 from torch.testing._internal.common_utils import (
     TestCase, run_tests, TEST_WITH_ASAN, IS_WINDOWS)
 from torch.autograd.profiler import profile
+from torch.autograd import kineto_available
 
 try:
     import psutil
@@ -73,7 +74,7 @@ def forward(self, x):
 
         mod = DummyModule()
 
-        with profile(with_stack=True) as p:
+        with profile(with_stack=True, use_kineto=kineto_available()) as p:
             x = torch.randn(10, 10, requires_grad=True)
             y = torch.randn(10, 10, requires_grad=True)
             z = x + y
@@ -99,7 +100,7 @@ def forward(self, x):
 
         torch._C._set_graph_executor_optimize(prev_opt)
 
-    @unittest.skipIf(not torch.autograd.kineto_available(), "Kineto is required")
+    @unittest.skipIf(not kineto_available(), "Kineto is required")
     @unittest.skipIf(not torch.cuda.is_available(), "CUDA is required")
     def test_kineto(self):
         x = torch.randn(10, 10).cuda()
diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py
index 24b88c41865e..04858f94deca 100644
--- a/torch/autograd/profiler.py
+++ b/torch/autograd/profiler.py
@@ -6,7 +6,7 @@
 from collections import defaultdict, namedtuple
 from operator import attrgetter
 
-from typing import List, Dict, Tuple, Optional
+from typing import List, Tuple, Optional
 
 try:
     # Available in Python >= 3.2
@@ -120,7 +120,7 @@ def set_backward_stacktraces(self):
         def bw_parent(evt):
             if evt is None:
                 return None
-            elif evt.scope == 1: # BACKWARD_FUNCTION
+            elif evt.scope == 1:  # BACKWARD_FUNCTION
                 return evt
             else:
                 return bw_parent(evt.cpu_parent)
@@ -671,7 +671,7 @@ def __enter__(self):
             raise RuntimeError("NVTX annotation context manager is not reentrant")
         self.entered = True
         torch.cuda.synchronize()
-        torch.autograd._enable_profiler(
+        torch.autograd._enable_profiler_legacy(
             torch.autograd.ProfilerConfig(
                 torch.autograd.ProfilerState.NVTX,
                 self.record_shapes,
@@ -684,7 +684,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
         if not self.enabled:
             return
         torch.cuda.synchronize()
-        torch.autograd._disable_profiler()
+        torch.autograd._disable_profiler_legacy()
         return False
 
 
@@ -796,7 +796,7 @@ def __init__(
         self.device_type: int = device_type
 
     def append_kernel(self, name, device, start, end):
-        assert self.device_type == 0 # CPU
+        assert self.device_type == 0  # CPU
         self.kernels.append(Kernel(name, device, Interval(start, end)))
 
     def append_cpu_child(self, child):
@@ -805,7 +805,7 @@ def append_cpu_child(self, child):
         One is supposed to append only direct children to the event to have
        correct self cpu time being reported.
""" - assert(self.device_type == 0) # CPU + assert(self.device_type == 0) # CPU assert(isinstance(child, FunctionEvent)) assert(child.device_type == 0) self.cpu_children.append(child) @@ -817,7 +817,7 @@ def set_cpu_parent(self, parent): the child's range interval is completely inside the parent's. We use this connection to determine the event is from top-level op or not. """ - assert(self.device_type == 0) # CPU + assert(self.device_type == 0) # CPU assert(isinstance(parent, FunctionEvent)) assert(parent.device_type == 0) self.cpu_parent = parent @@ -826,7 +826,7 @@ def set_cpu_parent(self, parent): # metrics of other events, have only total cpu time @property def self_cpu_memory_usage(self): - if self.is_async or self.device_type != 0: # CPU + if self.is_async or self.device_type != 0: # CPU return 0 return self.cpu_memory_usage - sum( [child.cpu_memory_usage for child in self.cpu_children] @@ -834,7 +834,7 @@ def self_cpu_memory_usage(self): @property def self_cuda_memory_usage(self): - if self.is_async or self.device_type != 0: # CPU + if self.is_async or self.device_type != 0: # CPU return 0 return self.cuda_memory_usage - sum( [child.cuda_memory_usage for child in self.cpu_children] @@ -852,26 +852,26 @@ def self_cpu_time_total(self): def cuda_time_total(self): if self.is_async: return 0 - if self.device_type == 0: # CPU + if self.device_type == 0: # CPU return sum(kinfo.interval.elapsed_us() for kinfo in self.kernels) else: - assert self.device_type == 1 # CUDA + assert self.device_type == 1 # CUDA return self.time_range.elapsed_us() @property def self_cuda_time_total(self): if self.is_async: return 0 - if self.device_type == 0: # CPU + if self.device_type == 0: # CPU return self.cuda_time_total - \ sum([child.cuda_time_total for child in self.cpu_children]) else: - assert(self.device_type == 1) # CUDA + assert(self.device_type == 1) # CUDA return self.cuda_time_total @property def cpu_time_total(self): - if self.device_type == 0: # CPU + if self.device_type == 0: # CPU return self.time_range.elapsed_us() else: return 0 @@ -1179,7 +1179,7 @@ def adjusted_time(cuda_record, cuda_records_map): is_async=is_async, is_remote=is_remote_event, sequence_nr=start.sequence_nr(), - device_type=0, # CPU + device_type=0, # CPU ) # note: async events have only cpu total time if not is_async and start.has_cuda(): diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp index 721a698bacaf..9c1340a9a7e4 100644 --- a/torch/csrc/autograd/profiler_kineto.cpp +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -193,6 +193,7 @@ void prepareProfiler( libkineto::ActivityType::GPU_MEMCPY, libkineto::ActivityType::GPU_MEMSET, libkineto::ActivityType::CONCURRENT_KERNEL, + // also including CUDA_RUNTIME libkineto::ActivityType::CUDA_RUNTIME, }; From c904443bde4dc227e1aa0ca13b5aab6ed7d62c32 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Wed, 11 Nov 2020 19:15:38 -0800 Subject: [PATCH 49/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py python test/test_autograd.py -k test_profile python test/test_autograd.py -k test_record ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self 
CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls Node ID ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 11.000us 64.71% 11.000us 11.000us 1 0 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 3.000us 17.65% 3.000us 3.000us 1 0 Memcpy DtoH (Device -> Pageable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 11.76% 2.000us 2.000us 1 0 Memcpy HtoD (Pageable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 1.000us 5.88% 1.000us 1.000us 1 0 aten::mm 13.86% 421.014ms 27.73% 842.019ms 421.010ms 0.000us 0.00% 0.000us 0.000us 2 0 aten::empty 0.00% 25.000us 0.00% 25.000us 12.500us 0.000us 0.00% 0.000us 0.000us 2 0 aten::stride 0.00% 0.000us 0.00% 0.000us 0.000us 0.000us 0.00% 0.000us 0.000us 3 0 aten::add 36.55% 1.110s 73.11% 2.220s 1.110s 0.000us 0.00% 0.000us 0.000us 2 0 aten::to 0.00% 9.000us 0.00% 99.000us 99.000us 0.000us 0.00% 0.000us 0.000us 1 0 aten::empty_strided 0.00% 21.000us 0.00% 21.000us 21.000us 0.000us 0.00% 0.000us 0.000us 1 0 aten::copy_ 0.00% 69.000us 0.00% 133.000us 66.500us 0.000us 0.00% 0.000us 0.000us 2 0 cudaFree 13.00% 394.907ms 13.00% 394.907ms 394.907ms 0.000us 0.00% 0.000us 0.000us 1 0 cudaDeviceGetAttribute 0.00% 1.000us 0.00% 1.000us 0.091us 0.000us 0.00% 0.000us 0.000us 11 0 cudaMalloc 0.02% 632.000us 0.02% 632.000us 210.667us 0.000us 0.00% 0.000us 0.000us 3 0 cudaMemcpy 0.00% 20.000us 0.00% 20.000us 20.000us 0.000us 0.00% 0.000us 0.000us 1 0 cudaEventCreateWithFlags 0.00% 9.000us 0.00% 9.000us 0.562us 0.000us 0.00% 0.000us 0.000us 16 0 cudaLaunchKernel 36.55% 1.110s 36.55% 1.110s 555.021ms 0.000us 0.00% 0.000us 0.000us 2 0 cudaMemcpyAsync 0.00% 33.000us 0.00% 33.000us 33.000us 0.000us 0.00% 0.000us 0.000us 1 0 cudaStreamSynchronize 0.00% 4.000us 0.00% 4.000us 4.000us 0.000us 0.00% 0.000us 0.000us 1 0 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/autograd/profiler.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index 04858f94deca..fb6c8dfc0bb2 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -1027,6 +1027,13 @@ def parse_kineto_results(result): mem_records.append(record) assert start_record is not None, "Invalid profiler output, __start_profile is missing" + cuda_corr_map = {} + for kineto_event in result.events(): + if kineto_event.device_type() == 1: # CUDA + if kineto_event.correlation_id() not in cuda_corr_map: + cuda_corr_map[kineto_event.correlation_id()] = [] + cuda_corr_map[kineto_event.correlation_id()].append(kineto_event) + # Create and return FunctionEvent list string_table = StringTable() function_events = [] @@ -1062,15 +1069,14 @@ def parse_kineto_results(result): device_type=kineto_event.device_type(), ) # associate CUDA kernels with a CPU event - if kineto_event.device_type() == 0 and not is_async: - for evt in result.events(): - if evt.device_type == 1: # CUDA - if evt.correlation_id == kineto_event.correlation_id: - fe.append_kernel( - evt.name(), - evt.device_index(), - evt.start_us(), - evt.start_us() + evt.duration_us()) + if (kineto_event.device_type() == 0 and not is_async and + kineto_event.correlation_id() in cuda_corr_map): + for evt in cuda_corr_map[kineto_event.correlation_id()]: + fe.append_kernel( + evt.name(), + evt.device_index(), + evt.start_us(), + evt.start_us() + evt.duration_us()) function_events.append(fe) function_events.sort(key=lambda evt: [evt.time_range.start, -evt.time_range.end]) return function_events From 1f600f84fac69e36666fb4d04b2e06d31628c9ca Mon Sep 17 00:00:00 2001 
From: ilia-cher Date: Wed, 11 Nov 2020 23:26:04 -0800 Subject: [PATCH 50/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py python test/test_autograd.py -k test_profile python test/test_autograd.py -k test_record ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls Node ID ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 11.000us 64.71% 11.000us 11.000us 1 0 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 3.000us 17.65% 3.000us 3.000us 1 0 Memcpy DtoH (Device -> Pageable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 11.76% 2.000us 2.000us 1 0 Memcpy HtoD (Pageable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 1.000us 5.88% 1.000us 1.000us 1 0 aten::mm 13.86% 421.014ms 27.73% 842.019ms 421.010ms 0.000us 0.00% 0.000us 0.000us 2 0 aten::empty 0.00% 25.000us 0.00% 25.000us 12.500us 0.000us 0.00% 0.000us 0.000us 2 0 aten::stride 0.00% 0.000us 0.00% 0.000us 0.000us 0.000us 0.00% 0.000us 0.000us 3 0 aten::add 36.55% 1.110s 73.11% 2.220s 1.110s 0.000us 0.00% 0.000us 0.000us 2 0 aten::to 0.00% 9.000us 0.00% 99.000us 99.000us 0.000us 0.00% 0.000us 0.000us 1 0 aten::empty_strided 0.00% 21.000us 0.00% 21.000us 21.000us 0.000us 0.00% 0.000us 0.000us 1 0 aten::copy_ 0.00% 69.000us 0.00% 133.000us 66.500us 0.000us 0.00% 0.000us 0.000us 2 0 cudaFree 13.00% 394.907ms 13.00% 394.907ms 394.907ms 0.000us 0.00% 0.000us 0.000us 1 0 cudaDeviceGetAttribute 0.00% 1.000us 0.00% 1.000us 0.091us 0.000us 0.00% 0.000us 0.000us 11 0 cudaMalloc 0.02% 632.000us 0.02% 632.000us 210.667us 0.000us 0.00% 0.000us 0.000us 3 0 cudaMemcpy 0.00% 20.000us 0.00% 20.000us 20.000us 0.000us 0.00% 0.000us 0.000us 1 0 cudaEventCreateWithFlags 0.00% 9.000us 0.00% 9.000us 0.562us 0.000us 0.00% 0.000us 0.000us 16 0 cudaLaunchKernel 36.55% 1.110s 36.55% 1.110s 555.021ms 0.000us 0.00% 0.000us 0.000us 2 0 cudaMemcpyAsync 0.00% 33.000us 0.00% 33.000us 33.000us 0.000us 0.00% 0.000us 0.000us 1 0 cudaStreamSynchronize 0.00% 4.000us 0.00% 4.000us 4.000us 0.000us 0.00% 0.000us 0.000us 1 0 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- test/test_autograd.py | 2 + torch/autograd/profiler.py | 90 ++++++++++++++++----------- torch/csrc/autograd/profiler_kineto.h | 4 +- 3 files changed, 57 insertions(+), 39 deletions(-) diff --git a/test/test_autograd.py b/test/test_autograd.py index 365c72fff471..e3e0c4f5fa7d 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -3039,6 +3039,8 @@ def test_profiler_seq_nr(self): z = x + y s = z.sum() s.backward() + print(p.key_averages().table( + sort_by="self_cpu_time_total", row_limit=-1)) # expecting aten::add, aten::sum to have the sequence numbers, # expecting the corresponding backward nodes to have the same numbers # as the forward ops diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index fb6c8dfc0bb2..db6abfabccc8 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -37,14 +37,57 @@ def __init__(self, *args, **kwargs): use_cuda = kwargs.pop('use_cuda', True) profile_memory = kwargs.pop('profile_memory', False) super(EventList, self).__init__(*args, **kwargs) - self._cpu_children_populated = False self._use_cuda = use_cuda self._profile_memory = profile_memory + self._tree_built = False + + def build_tree(self): + self._populate_cpu_children() + self._remove_dup_nodes() + self._set_kernels() + self._set_backward_stacktraces() + self._tree_built = True def __str__(self): return self.table() - def populate_cpu_children(self): + def _remove_dup_nodes(self): + while True: + to_delete = [] + for idx in range(len(self)): + if (self[idx].cpu_parent is not None and + self[idx].cpu_parent.name == self[idx].name and + len(self[idx].cpu_parent.cpu_children) == 1): + self[idx].cpu_parent.cpu_children += self[idx].cpu_children + for ch in 
self[idx].cpu_children:
+                        ch.cpu_parent = self[idx].cpu_parent
+                    to_delete.append(idx)
+            if len(to_delete) == 0:
+                break
+            new_evts = [ev for ind, ev in enumerate(self) if ind not in to_delete]
+            self.clear()
+            self.extend(new_evts)
+
+    def _set_kernels(self):
+        # associate CUDA kernels with CPU events
+        cuda_corr_map = {}
+        for evt in self:
+            if evt.device_type == 1:  # CUDA
+                if evt.id not in cuda_corr_map:
+                    cuda_corr_map[evt.id] = []
+                cuda_corr_map[evt.id].append(evt)
+
+        for evt in self:
+            if (evt.device_type == 0 and not evt.is_async and
+                    evt.id in cuda_corr_map):
+                for k_evt in cuda_corr_map[evt.id]:
+                    evt.append_kernel(
+                        k_evt.name(),
+                        k_evt.device_index(),
+                        k_evt.start_us(),
+                        k_evt.start_us() + k_evt.duration_us())
+
     def _populate_cpu_children(self):
         """Populates child events into each underlying FunctionEvent object.
         One event is a child of another if [s1, e1) is inside [s2, e2). Where
         s1 and e1 would be start and end of the child event's interval. And
@@ -56,8 +99,6 @@ def populate_cpu_children(self):
         If for any reason two intervals intersect only partially, this function
         will not record a parent child relationship between then.
         """
-        if self.cpu_children_populated:
-            return
 
         # Some events can be async (i.e. start and end on different threads),
         # since it's generally undefined how to attribute children ranges to
@@ -112,11 +153,7 @@ def populate_cpu_children(self):
 
                 current_events.append(event)
 
-        self._cpu_children_populated = True
-
-    def set_backward_stacktraces(self):
-        self.populate_cpu_children()
-
+    def _set_backward_stacktraces(self):
         def bw_parent(evt):
             if evt is None:
                 return None
@@ -127,7 +164,7 @@ def bw_parent(evt):
 
         fwd_stacks = {}
         for evt in self:
-            if bw_parent(evt) is None:
+            if bw_parent(evt) is None and evt.stack is not None:
                 t = (evt.sequence_nr, evt.thread)
                 if t not in fwd_stacks:
                     fwd_stacks[t] = evt.stack
@@ -142,15 +179,10 @@ def bw_parent(evt):
             else:
                 evt.stack = []
 
-    @property
     def self_cpu_time_total(self):
         return sum([event.self_cpu_time_total for event in self])
 
-    @property
-    def cpu_children_populated(self):
-        return self._cpu_children_populated
-
     def table(self, sort_by=None, row_limit=100, max_src_column_width=75, header=None, top_level_events_only=False):
         """Prints an EventList as a nicely formatted table.
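The nesting rule that _populate_cpu_children applies can be sketched independently of the profiler: sort events by (start, -end) and keep a stack of currently open intervals. Below is a minimal version assuming well-nested, same-thread intervals with unique names; the real implementation additionally buckets by thread, skips async events, and (as the docstring above says) does not record partially overlapping pairs:

```
def populate_children(events):
    """events: list of (name, start, end); returns {name: [child names]}."""
    children = {name: [] for name, _, _ in events}
    order = sorted(events, key=lambda e: (e[1], -e[2]))
    stack = []  # currently open (enclosing) intervals
    for ev in order:
        name, start, end = ev
        # close intervals that ended before this one starts
        while stack and stack[-1][2] <= start:
            stack.pop()
        if stack:
            # enclosed by the innermost open interval -> direct child
            children[stack[-1][0]].append(name)
        stack.append(ev)
    return children

evts = [("aten::add", 0, 100), ("aten::empty", 10, 20), ("aten::copy_", 30, 90)]
assert populate_children(evts) == {
    "aten::add": ["aten::empty", "aten::copy_"],
    "aten::empty": [],
    "aten::copy_": [],
}
```

Recording only direct children is what makes self CPU time (total minus children) come out right.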
""" - self.populate_cpu_children() + assert self._tree_built stats = defaultdict(FunctionEventAvg) def get_key(event, group_by_input_shapes, group_by_stack_n): @@ -442,8 +474,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): parsed_results, use_cuda=self.use_cuda, profile_memory=self.profile_memory) - if self.with_stack: - self.function_events.set_backward_stacktraces() + self.function_events.build_tree() return False def __repr__(self): @@ -454,13 +485,11 @@ def __repr__(self): def __str__(self): if self.function_events is None: return '' - self.function_events.populate_cpu_children() return str(self.function_events) def _check_finish(self): if self.function_events is None: raise RuntimeError("can't export a trace that didn't finish running") - self.function_events.populate_cpu_children() def table(self, sort_by=None, row_limit=100, max_src_column_width=75, header=None, top_level_events_only=False): self._check_finish() @@ -774,7 +803,7 @@ class FunctionEvent(FormattedTimesMixin): def __init__( self, id, name, thread, start_us, end_us, fwd_thread=None, input_shapes=None, stack=None, scope=0, cpu_memory_usage=0, cuda_memory_usage=0, is_async=False, - is_remote=False, sequence_nr=-1, node_id=0, device_type=0): + is_remote=False, sequence_nr=-1, node_id=-1, device_type=0): self.id: int = id self.node_id: int = node_id self.name: str = name @@ -1027,17 +1056,12 @@ def parse_kineto_results(result): mem_records.append(record) assert start_record is not None, "Invalid profiler output, __start_profile is missing" - cuda_corr_map = {} - for kineto_event in result.events(): - if kineto_event.device_type() == 1: # CUDA - if kineto_event.correlation_id() not in cuda_corr_map: - cuda_corr_map[kineto_event.correlation_id()] = [] - cuda_corr_map[kineto_event.correlation_id()].append(kineto_event) - # Create and return FunctionEvent list string_table = StringTable() function_events = [] for kineto_event in result.events(): + if filter_name(kineto_event.name()): + continue rel_start_us = kineto_event.start_us() - start_record.start_us() rel_end_us = rel_start_us + kineto_event.duration_us() abs_end_us = kineto_event.start_us() + kineto_event.duration_us() @@ -1068,16 +1092,8 @@ def parse_kineto_results(result): sequence_nr=kineto_event.sequence_nr(), device_type=kineto_event.device_type(), ) - # associate CUDA kernels with a CPU event - if (kineto_event.device_type() == 0 and not is_async and - kineto_event.correlation_id() in cuda_corr_map): - for evt in cuda_corr_map[kineto_event.correlation_id()]: - fe.append_kernel( - evt.name(), - evt.device_index(), - evt.start_us(), - evt.start_us() + evt.duration_us()) function_events.append(fe) + function_events.sort(key=lambda evt: [evt.time_range.start, -evt.time_range.end]) return function_events diff --git a/torch/csrc/autograd/profiler_kineto.h b/torch/csrc/autograd/profiler_kineto.h index 998667da6d97..2c1c1974e9a8 100644 --- a/torch/csrc/autograd/profiler_kineto.h +++ b/torch/csrc/autograd/profiler_kineto.h @@ -97,7 +97,7 @@ struct TORCH_API KinetoEvent { } KinetoEvent& sequenceNr(int64_t sequence_nr) { - sequence_nr_ = sequence_nr_; + sequence_nr_ = sequence_nr; return *this; } @@ -149,7 +149,7 @@ struct TORCH_API KinetoEvent { uint64_t start_thread_id_ = 0; uint64_t end_thread_id_ = 0; uint64_t fwd_thread_id_ = 0; - int64_t sequence_nr_ = 0; + int64_t sequence_nr_ = -1; uint8_t scope_ = 0; uint8_t activity_type_; From 5aacc1ca50641a292b213d64ddd78a76b75a6722 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Wed, 11 Nov 2020 23:39:20 -0800 Subject: [PATCH 
51/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py python test/test_autograd.py -k test_profile python test/test_autograd.py -k test_record ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls Node ID ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 11.000us 64.71% 11.000us 11.000us 1 0 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 3.000us 17.65% 3.000us 3.000us 1 0 Memcpy DtoH (Device -> Pageable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 11.76% 2.000us 2.000us 1 0 Memcpy HtoD (Pageable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 1.000us 5.88% 1.000us 1.000us 1 0 aten::mm 13.86% 421.014ms 27.73% 842.019ms 421.010ms 0.000us 0.00% 0.000us 0.000us 2 0 aten::empty 0.00% 25.000us 0.00% 25.000us 12.500us 0.000us 0.00% 0.000us 0.000us 2 0 aten::stride 0.00% 0.000us 0.00% 0.000us 0.000us 0.000us 0.00% 0.000us 0.000us 3 0 aten::add 36.55% 1.110s 73.11% 2.220s 1.110s 0.000us 0.00% 0.000us 0.000us 2 0 aten::to 0.00% 9.000us 0.00% 99.000us 99.000us 0.000us 0.00% 0.000us 0.000us 1 0 aten::empty_strided 0.00% 21.000us 0.00% 21.000us 21.000us 0.000us 0.00% 0.000us 0.000us 1 0 aten::copy_ 0.00% 69.000us 0.00% 133.000us 66.500us 0.000us 0.00% 0.000us 0.000us 2 0 cudaFree 13.00% 394.907ms 13.00% 394.907ms 394.907ms 0.000us 0.00% 0.000us 0.000us 1 0 cudaDeviceGetAttribute 0.00% 1.000us 0.00% 1.000us 0.091us 0.000us 0.00% 0.000us 0.000us 11 0 cudaMalloc 0.02% 632.000us 0.02% 632.000us 210.667us 0.000us 0.00% 0.000us 0.000us 3 0 cudaMemcpy 0.00% 20.000us 0.00% 20.000us 20.000us 0.000us 0.00% 0.000us 0.000us 1 0 cudaEventCreateWithFlags 0.00% 9.000us 0.00% 9.000us 0.562us 0.000us 0.00% 0.000us 0.000us 16 0 cudaLaunchKernel 36.55% 1.110s 36.55% 1.110s 555.021ms 0.000us 0.00% 0.000us 0.000us 2 0 cudaMemcpyAsync 0.00% 33.000us 0.00% 33.000us 33.000us 0.000us 0.00% 0.000us 0.000us 1 0 cudaStreamSynchronize 0.00% 4.000us 0.00% 4.000us 4.000us 0.000us 0.00% 0.000us 0.000us 1 0 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/autograd/profiler.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index db6abfabccc8..a37a08d5377a 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -82,10 +82,10 @@ def _set_kernels(self): evt.id in cuda_corr_map): for k_evt in cuda_corr_map[evt.id]: evt.append_kernel( - k_evt.name(), - k_evt.device_index(), - k_evt.start_us(), - k_evt.start_us() + k_evt.duration_us()) + k_evt.name, + k_evt.device_index, + k_evt.time_range.start, + k_evt.time_range.end) def _populate_cpu_children(self): """Populates child events into each underlying FunctionEvent object. 
@@ -803,7 +803,7 @@ class FunctionEvent(FormattedTimesMixin): def __init__( self, id, name, thread, start_us, end_us, fwd_thread=None, input_shapes=None, stack=None, scope=0, cpu_memory_usage=0, cuda_memory_usage=0, is_async=False, - is_remote=False, sequence_nr=-1, node_id=-1, device_type=0): + is_remote=False, sequence_nr=-1, node_id=-1, device_type=0, device_index=0): self.id: int = id self.node_id: int = node_id self.name: str = name @@ -823,6 +823,7 @@ def __init__( self.is_remote: bool = is_remote self.sequence_nr: int = sequence_nr self.device_type: int = device_type + self.device_index: int = device_index def append_kernel(self, name, device, start, end): assert self.device_type == 0 # CPU @@ -1091,6 +1092,7 @@ def parse_kineto_results(result): is_async=is_async, sequence_nr=kineto_event.sequence_nr(), device_type=kineto_event.device_type(), + device_index=kineto_event.device_index(), ) function_events.append(fe) From 651f5565d40c781b71f87d3dd0f85a2563adb693 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Thu, 12 Nov 2020 10:23:02 -0800 Subject: [PATCH 52/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py python test/test_autograd.py -k test_profile python test/test_autograd.py -k test_record ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls Node ID ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 11.000us 64.71% 11.000us 11.000us 1 0 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.000us 17.65% 3.000us 3.000us 1 0 Memcpy DtoH (Device -> Pageable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 11.76% 2.000us 2.000us 1 0 Memcpy HtoD (Pageable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 1.000us 5.88% 1.000us 1.000us 1 0 aten::mm 13.86% 421.014ms 27.73% 842.019ms 421.010ms 0.000us 0.00% 0.000us 0.000us 2 0 aten::empty 0.00% 25.000us 0.00% 25.000us 12.500us 0.000us 0.00% 0.000us 0.000us 2 0 aten::stride 0.00% 0.000us 0.00% 0.000us 0.000us 0.000us 0.00% 0.000us 0.000us 3 0 aten::add 36.55% 1.110s 73.11% 2.220s 1.110s 0.000us 0.00% 0.000us 0.000us 2 0 aten::to 0.00% 9.000us 0.00% 99.000us 99.000us 0.000us 0.00% 0.000us 0.000us 1 0 aten::empty_strided 0.00% 21.000us 0.00% 21.000us 21.000us 0.000us 0.00% 0.000us 0.000us 1 0 aten::copy_ 0.00% 69.000us 0.00% 133.000us 66.500us 0.000us 0.00% 0.000us 0.000us 2 0 cudaFree 13.00% 394.907ms 13.00% 394.907ms 394.907ms 0.000us 0.00% 0.000us 0.000us 1 0 cudaDeviceGetAttribute 0.00% 1.000us 0.00% 1.000us 0.091us 0.000us 0.00% 0.000us 0.000us 11 0 cudaMalloc 0.02% 632.000us 0.02% 632.000us 210.667us 0.000us 0.00% 0.000us 0.000us 3 0 cudaMemcpy 0.00% 20.000us 0.00% 20.000us 20.000us 0.000us 0.00% 0.000us 0.000us 1 0 cudaEventCreateWithFlags 0.00% 9.000us 0.00% 9.000us 0.562us 0.000us 0.00% 0.000us 0.000us 16 0 cudaLaunchKernel 36.55% 1.110s 36.55% 1.110s 555.021ms 0.000us 0.00% 0.000us 0.000us 2 0 cudaMemcpyAsync 0.00% 33.000us 0.00% 33.000us 33.000us 0.000us 0.00% 0.000us 0.000us 1 0 cudaStreamSynchronize 0.00% 4.000us 0.00% 4.000us 4.000us 0.000us 0.00% 0.000us 0.000us 1 0 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/autograd/profiler.py 
| 4 +++- torch/csrc/autograd/profiler_kineto.cpp | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index a37a08d5377a..fa40f0d0a402 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -883,7 +883,9 @@ def cuda_time_total(self): if self.is_async: return 0 if self.device_type == 0: # CPU - return sum(kinfo.interval.elapsed_us() for kinfo in self.kernels) + # account for the kernels in the children ops + return (sum(kinfo.interval.elapsed_us() for kinfo in self.kernels) + + sum(ch.cuda_time_total for ch in self.cpu_children)) else: assert self.device_type == 1 # CUDA return self.time_range.elapsed_us() diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp index 9c1340a9a7e4..0de6b048fa4f 100644 --- a/torch/csrc/autograd/profiler_kineto.cpp +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -279,7 +279,11 @@ KinetoEvent& KinetoEvent::activity(const libkineto::TraceActivity& activity) { start_us_ = activity.timestamp(); duration_us_ = activity.duration(); correlation_id_ = activity.correlationId(); + //std::cerr << "DEBUG: " name_ << ": setting corr. id to " << correlation_id_ << std::endl; activity_type_ = (uint8_t)activity.type(); + //if (activity.linkedActivity()) { + // std::cerr << "DEBUG: linkedActivity: " << activity.name() << " " << activity.linkedActivity()->deviceId() << " " << activity.linkedActivity()->resourceId() << std::endl; + //} return *this; } From 9997011df74bb27e6170a4a0be1a329cf773311e Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Thu, 12 Nov 2020 16:43:26 -0800 Subject: [PATCH 53/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py python test/test_autograd.py -k test_profile python test/test_autograd.py -k test_record ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls Node ID ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 11.000us 64.71% 11.000us 11.000us 1 0 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.000us 17.65% 3.000us 3.000us 1 0 Memcpy DtoH (Device -> Pageable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 11.76% 2.000us 2.000us 1 0 Memcpy HtoD (Pageable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 1.000us 5.88% 1.000us 1.000us 1 0 aten::mm 13.86% 421.014ms 27.73% 842.019ms 421.010ms 0.000us 0.00% 0.000us 0.000us 2 0 aten::empty 0.00% 25.000us 0.00% 25.000us 12.500us 0.000us 0.00% 0.000us 0.000us 2 0 aten::stride 0.00% 0.000us 0.00% 0.000us 0.000us 0.000us 0.00% 0.000us 0.000us 3 0 aten::add 36.55% 1.110s 73.11% 2.220s 1.110s 0.000us 0.00% 0.000us 0.000us 2 0 aten::to 0.00% 9.000us 0.00% 99.000us 99.000us 0.000us 0.00% 0.000us 0.000us 1 0 aten::empty_strided 0.00% 21.000us 0.00% 21.000us 21.000us 0.000us 0.00% 0.000us 0.000us 1 0 aten::copy_ 0.00% 69.000us 0.00% 133.000us 66.500us 0.000us 0.00% 0.000us 0.000us 2 0 cudaFree 13.00% 394.907ms 13.00% 394.907ms 394.907ms 0.000us 0.00% 0.000us 0.000us 1 0 cudaDeviceGetAttribute 0.00% 1.000us 0.00% 1.000us 0.091us 0.000us 0.00% 0.000us 0.000us 11 0 cudaMalloc 0.02% 632.000us 0.02% 632.000us 210.667us 0.000us 0.00% 0.000us 0.000us 3 0 cudaMemcpy 0.00% 20.000us 0.00% 20.000us 20.000us 0.000us 0.00% 0.000us 0.000us 1 0 cudaEventCreateWithFlags 0.00% 9.000us 0.00% 9.000us 0.562us 0.000us 0.00% 0.000us 0.000us 16 0 cudaLaunchKernel 36.55% 1.110s 36.55% 1.110s 555.021ms 0.000us 0.00% 0.000us 0.000us 2 0 cudaMemcpyAsync 0.00% 33.000us 0.00% 33.000us 33.000us 0.000us 0.00% 0.000us 0.000us 1 0 cudaStreamSynchronize 0.00% 4.000us 0.00% 4.000us 4.000us 0.000us 0.00% 0.000us 0.000us 1 0 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- test/test_profiler.py | 18 
++++++--- torch/autograd/profiler.py | 49 +++++++++++++------------ torch/csrc/autograd/init.cpp | 4 +- torch/csrc/autograd/profiler_kineto.cpp | 24 ++++++++++-- torch/csrc/autograd/profiler_kineto.h | 5 +++ torch/csrc/autograd/profiler_legacy.h | 4 -- 6 files changed, 67 insertions(+), 37 deletions(-) diff --git a/test/test_profiler.py b/test/test_profiler.py index 6d7618ca8a0b..797ad0995913 100644 --- a/test/test_profiler.py +++ b/test/test_profiler.py @@ -100,15 +100,22 @@ def forward(self, x): torch._C._set_graph_executor_optimize(prev_opt) + def payload(self): + x = torch.randn(10, 10).cuda() + y = torch.randn(10, 10).cuda() + z = torch.mm(x, y) + z = z + y + z = z.cpu() + @unittest.skipIf(not kineto_available(), "Kineto is required") @unittest.skipIf(not torch.cuda.is_available(), "CUDA is required") def test_kineto(self): - x = torch.randn(10, 10).cuda() - y = torch.randn(10, 10).cuda() + with profile(use_cuda=True, use_kineto=True): + self.payload() + + # rerun to avoid initial start overhead with profile(use_cuda=True, use_kineto=True) as p: - z = torch.mm(x, y) - z = z + y - z = z.cpu() + self.payload() print(p.key_averages().table( sort_by="self_cuda_time_total", row_limit=-1)) found_gemm = False @@ -120,6 +127,7 @@ def test_kineto(self): found_memcpy = True self.assertTrue(found_gemm) self.assertTrue(found_memcpy) + # p.export_chrome_trace("/tmp/test_trace.json") if __name__ == '__main__': run_tests() diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index fa40f0d0a402..92d0e2848a47 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -44,7 +44,6 @@ def __init__(self, *args, **kwargs): def build_tree(self): self._populate_cpu_children() self._remove_dup_nodes() - self._set_kernels() self._set_backward_stacktraces() self._tree_built = True @@ -68,25 +67,6 @@ def _remove_dup_nodes(self): self.clear() self.extend(new_evts) - def _set_kernels(self): - # associate CUDA kernels with CPU events - cuda_corr_map = {} - for evt in self: - if evt.device_type == 1: # CUDA - if evt.id not in cuda_corr_map: - cuda_corr_map[evt.id] = [] - cuda_corr_map[evt.id].append(evt) - - for evt in self: - if (evt.device_type == 0 and not evt.is_async and - evt.id in cuda_corr_map): - for k_evt in cuda_corr_map[evt.id]: - evt.append_kernel( - k_evt.name, - k_evt.device_index, - k_evt.time_range.start, - k_evt.time_range.end) - def _populate_cpu_children(self): """Populates child events into each underlying FunctionEvent object. One event is a child of another if [s1, e1) is inside [s2, e2). 
diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp
index b187a4f8524c..d67d823f24c0 100644
--- a/torch/csrc/autograd/init.cpp
+++ b/torch/csrc/autograd/init.cpp
@@ -129,7 +129,9 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) {
     // device type, currently: CPU or CUDA
     .def("device_type", [](const KinetoEvent& e) {
       return (uint8_t)e.deviceType();
-    });
+    })
+    // correlation id of a linked event
+    .def("linked_correlation_id", &KinetoEvent::linkedCorrelationId);
 
   py::class_<ProfilerResultWrapper>(m, "ProfilerResult")
     .def("events", [](const ProfilerResultWrapper& r) {
diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp
index 0de6b048fa4f..62c95f8cf83a 100644
--- a/torch/csrc/autograd/profiler_kineto.cpp
+++ b/torch/csrc/autograd/profiler_kineto.cpp
@@ -20,6 +20,11 @@ uint64_t next_correlation_id() {
   return corr_id_++;
 }
 
+inline int64_t getTimeUs() {
+  using namespace std::chrono;
+  return duration_cast<microseconds>(high_resolution_clock::now().time_since_epoch()).count();
+}
+
 std::string shapesToStr(const std::vector<std::vector<int64_t>>& shapes);
 
 struct TORCH_API KinetoThreadLocalState : public
ProfilerThreadLocalState { // if (ctx->shapes && !ctx->shapes->empty()) { // op.inputDims = shapesToStr(*ctx->shapes); // } + + // Not setting atm + op.inputTypes = "[]"; + op.arguments = "[]"; + op.outputDims = "[]"; + op.outputTypes = "[]"; + op.inputNames = "[]"; + op.outputNames = "[]"; + + // op.threadId = pthread_self(); { std::lock_guard guard(state_mutex_); @@ -82,6 +97,8 @@ struct TORCH_API KinetoThreadLocalState : public ProfilerThreadLocalState { for (auto idx = 0; idx < cpu_trace->activities.size(); ++idx) { if (kineto_events_[idx].hasShapes()) { cpu_trace->activities[idx].inputDims = shapesToStr(kineto_events_[idx].shapes()); + } else { + cpu_trace->activities[idx].inputDims = "[]"; } } } @@ -237,9 +254,7 @@ void enableProfiler( pushProfilingCallbacks(); } - if (!libkineto::api().activityProfiler().isActive()) { - libkineto::api().activityProfiler().startTrace(); - } + libkineto::api().activityProfiler().startTrace(); state->mark("__start_profile", false); } @@ -284,6 +299,9 @@ KinetoEvent& KinetoEvent::activity(const libkineto::TraceActivity& activity) { //if (activity.linkedActivity()) { // std::cerr << "DEBUG: linkedActivity: " << activity.name() << " " << activity.linkedActivity()->deviceId() << " " << activity.linkedActivity()->resourceId() << std::endl; //} + if (activity.linkedActivity()) { + linked_correlation_id_ = activity.linkedActivity()->correlationId(); + } return *this; } diff --git a/torch/csrc/autograd/profiler_kineto.h b/torch/csrc/autograd/profiler_kineto.h index 2c1c1974e9a8..732dd6ed5f2d 100644 --- a/torch/csrc/autograd/profiler_kineto.h +++ b/torch/csrc/autograd/profiler_kineto.h @@ -140,6 +140,10 @@ struct TORCH_API KinetoEvent { return *this; } + uint64_t linkedCorrelationId() const { + return linked_correlation_id_; + } + int64_t deviceResourceId() const { return device_resource_id_; } @@ -161,6 +165,7 @@ struct TORCH_API KinetoEvent { uint64_t start_us_ = 0; uint64_t duration_us_ = 0; uint64_t correlation_id_ = 0; + uint64_t linked_correlation_id_ = 0; int64_t device_resource_id_ = 0; }; diff --git a/torch/csrc/autograd/profiler_legacy.h b/torch/csrc/autograd/profiler_legacy.h index bc1381e40469..86c9f81f7fee 100644 --- a/torch/csrc/autograd/profiler_legacy.h +++ b/torch/csrc/autograd/profiler_legacy.h @@ -90,10 +90,6 @@ inline int64_t getTime() { #endif } -inline int64_t getTimeUs() { - return getTime() / 1000; -} - enum class C10_API_ENUM EventKind : uint16_t { Mark, PushRange, From bde96f63b9cb6ca614be902cdfefe4f97aa08e56 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Thu, 12 Nov 2020 22:37:15 -0800 Subject: [PATCH 54/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py python test/test_autograd.py -k test_profile python test/test_autograd.py -k test_record ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Memcpy HtoD (Pageable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 33.33% 2.000us 1.000us 2 sgemm_32x32x32_NN 
0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 33.33% 2.000us 2.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 1.000us 16.67% 1.000us 1.000us 1 Memcpy DtoH (Device -> Pageable) 0.00% 0.000us 0.00% 0.000us 0.000us 1.000us 16.67% 1.000us 1.000us 1 aten::randn 5.17% 74.000us 6.71% 96.000us 48.000us 0.000us 0.00% 0.000us 0.000us 2 aten::empty 1.33% 19.000us 1.33% 19.000us 4.750us 0.000us 0.00% 0.000us 0.000us 4 aten::normal_ 1.05% 15.000us 1.05% 15.000us 7.500us 0.000us 0.00% 0.000us 0.000us 2 aten::to 77.90% 1.114ms 91.61% 1.310ms 436.667us 0.000us 0.00% 3.000us 1.000us 3 aten::empty_strided 2.52% 36.000us 2.52% 36.000us 12.000us 0.000us 0.00% 0.000us 0.000us 3 aten::copy_ 2.73% 39.000us 11.19% 160.000us 53.333us 0.000us 0.00% 3.000us 1.000us 3 cudaMemcpyAsync 4.34% 62.000us 4.34% 62.000us 20.667us 0.000us 0.00% 0.000us 0.000us 3 cudaStreamSynchronize 1.61% 23.000us 1.61% 23.000us 7.667us 0.000us 0.00% 0.000us 0.000us 3 aten::mm 0.21% 3.000us 7.20% 103.000us 103.000us 0.000us 0.00% 2.000us 2.000us 1 aten::stride 0.21% 3.000us 0.21% 3.000us 1.000us 0.000us 0.00% 0.000us 0.000us 3 cudaLaunchKernel 2.45% 35.000us 2.45% 35.000us 17.500us 0.000us 0.00% 0.000us 0.000us 2 aten::add 0.49% 7.000us 4.27% 61.000us 61.000us 0.000us 0.00% 1.000us 1.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- benchmarks/profiler_benchmark/profiler_bench.py | 3 ++- torch/csrc/autograd/profiler_kineto.cpp | 3 +++ torch/csrc/autograd/profiler_kineto.h | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/benchmarks/profiler_benchmark/profiler_bench.py b/benchmarks/profiler_benchmark/profiler_bench.py index 9f7e7fc2ac06..8b6a596c0e3b 100644 --- a/benchmarks/profiler_benchmark/profiler_bench.py +++ b/benchmarks/profiler_benchmark/profiler_bench.py @@ -51,11 +51,12 @@ def parallel_task(x): INTERNAL_ITER = args.internal_iter for profiling_enabled in [False, True]: - print("Profiling {}, tensor size {}x{}, use cuda: {}, with stacks: {}, use script: {}".format( + print("Profiling {}, tensor size {}x{}, use cuda: {}, use kineto: {}, with stacks: {}, use script: {}".format( "enabled" if profiling_enabled else "disabled", args.profiling_tensor_size, args.profiling_tensor_size, args.with_cuda, + args.use_kineto, args.with_stack, args.use_script)) diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp index 704bb0df790f..bf4a39b5b13d 100644 --- a/torch/csrc/autograd/profiler_kineto.cpp +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -335,7 +335,10 @@ ProfilerResult::ProfilerResult( ProfilerResult::~ProfilerResult() {} void ProfilerResult::save(const std::string& path) { + // Kineto's save is destructive + TORCH_CHECK(!saved_, "Trace is already saved"); trace_->save(path); + saved_ = true; } #endif diff --git a/torch/csrc/autograd/profiler_kineto.h b/torch/csrc/autograd/profiler_kineto.h index 667b5da31551..a1c2b2122e41 100644 --- a/torch/csrc/autograd/profiler_kineto.h +++ b/torch/csrc/autograd/profiler_kineto.h @@ -190,6 +190,7 @@ struct TORCH_API ProfilerResult { void save(const std::string& path); private: + bool saved_ = false; std::vector events_; thread_event_lists legacy_events_; std::unique_ptr trace_; From b1a0292af7df5a4fd6fa3a45bdeb7e09bf8d4d41 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Fri, 13 Nov 2020 11:48:33 -0800 Subject: [PATCH 55/59] 
Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py python test/test_autograd.py -k test_profile python test/test_autograd.py -k test_record ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Memcpy HtoD (Pageable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 33.33% 2.000us 1.000us 2 sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 33.33% 2.000us 2.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 1.000us 16.67% 1.000us 1.000us 1 Memcpy DtoH (Device -> Pageable) 0.00% 0.000us 0.00% 0.000us 0.000us 1.000us 16.67% 1.000us 1.000us 1 aten::randn 5.17% 74.000us 6.71% 96.000us 48.000us 0.000us 0.00% 0.000us 0.000us 2 aten::empty 1.33% 19.000us 1.33% 19.000us 4.750us 0.000us 0.00% 0.000us 0.000us 4 aten::normal_ 1.05% 15.000us 1.05% 15.000us 7.500us 0.000us 0.00% 0.000us 0.000us 2 aten::to 77.90% 1.114ms 91.61% 1.310ms 436.667us 0.000us 0.00% 3.000us 1.000us 3 aten::empty_strided 2.52% 36.000us 2.52% 36.000us 12.000us 0.000us 0.00% 0.000us 0.000us 3 aten::copy_ 2.73% 39.000us 11.19% 160.000us 53.333us 0.000us 0.00% 3.000us 1.000us 3 cudaMemcpyAsync 4.34% 62.000us 4.34% 62.000us 20.667us 0.000us 0.00% 0.000us 0.000us 3 cudaStreamSynchronize 1.61% 23.000us 1.61% 23.000us 7.667us 0.000us 0.00% 0.000us 0.000us 3 aten::mm 0.21% 3.000us 7.20% 103.000us 103.000us 0.000us 0.00% 2.000us 2.000us 1 aten::stride 0.21% 3.000us 0.21% 3.000us 1.000us 0.000us 0.00% 0.000us 0.000us 3 cudaLaunchKernel 2.45% 35.000us 2.45% 35.000us 17.500us 0.000us 0.00% 0.000us 0.000us 2 aten::add 0.49% 7.000us 4.27% 61.000us 61.000us 0.000us 0.00% 1.000us 1.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- test/test_jit.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index c7df2c33350f..5014af17e490 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -2552,10 +2552,10 @@ def fn(x): for e in prof.function_events: if e.name == "aten::mul": self.assertTrue(e.thread not in mul_events) - mul_events[e.thread] = e.cpu_interval.elapsed_us() + mul_events[e.thread] = e.time_range.elapsed_us() elif e.name == "other_fn": self.assertTrue(e.thread not in other_fn_events) - other_fn_events[e.thread] = e.cpu_interval.elapsed_us() + other_fn_events[e.thread] = e.time_range.elapsed_us() self.assertTrue(len(mul_events) == 2) self.assertTrue(len(other_fn_events) == 2) From 09a4762f0322423bbeb84dd367c7fdda3969112b Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Tue, 17 Nov 2020 00:57:59 -0800 Subject: [PATCH 56/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python 
From 09a4762f0322423bbeb84dd367c7fdda3969112b Mon Sep 17 00:00:00 2001
From: ilia-cher
Date: Tue, 17 Nov 2020 00:57:59 -0800
Subject: [PATCH 56/59] Update on "Use libkineto in profiler"

Summary:
Adding ability to use Kineto (CUPTI) to profile CUDA kernels

Test Plan:
USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install
python test/test_profiler.py
python test/test_autograd.py -k test_profile
python test/test_autograd.py -k test_record

[ghstack-poisoned]
---
 torch/_C/_autograd.pyi                     | 26 ++++++++++++++++---
 torch/autograd/profiler.py                 |  8 +++---
 torch/csrc/autograd/init.cpp               |  8 +++---
 .../rpc/server_process_global_profiler.py  |  2 +-
 4 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/torch/_C/_autograd.pyi b/torch/_C/_autograd.pyi
index 926457fe80ee..2f4368d1a003 100644
--- a/torch/_C/_autograd.pyi
+++ b/torch/_C/_autograd.pyi
@@ -1,4 +1,4 @@
-from typing import List
+from typing import List, Set
 from enum import Enum

 # Defined in tools/autograd/init.cpp
@@ -8,7 +8,11 @@ class ProfilerState(Enum):
     CPU = ...
     CUDA = ...
     NVTX = ...
+    KINETO = ...

+class ProfilerActivity(Enum):
+    CPU = ...
+    CUDA = ...

 class ProfilerConfig:
     def __init__(
@@ -37,9 +41,25 @@ class ProfilerEvent:
     def thread_id(self) -> int: ...
     ...

+class KinetoEvent:
+    def name(self) -> str: ...
+    def device_index(self) -> int: ...
+    def start_us(self) -> int: ...
+    def duration_us(self) -> int: ...
+    ...
+
+class ProfilerResult:
+    def events(self) -> List[KinetoEvent]: ...
+    def legacy_events(self) -> List[List[ProfilerEvent]]: ...
+    def save(self, path: str) -> None: ...

-def _enable_profiler(config: ProfilerConfig) -> None: ...
-def _disable_profiler() -> List[List[ProfilerEvent]]: ...
+def _enable_profiler(config: ProfilerConfig, activities: Set[ProfilerActivity]) -> None: ...
+def _prepare_profiler(config: ProfilerConfig, activities: Set[ProfilerActivity]) -> None: ...
+def _disable_profiler() -> ProfilerResult: ...
 def _profiler_enabled() -> bool: ...
+def kineto_available() -> bool: ...
 def _enable_record_function(enable: bool) -> None: ...
 def _set_empty_test_observer(is_global: bool, sampling_prob: float) -> None: ...
+
+def _enable_profiler_legacy(config: ProfilerConfig) -> None: ...
+def _disable_profiler_legacy() -> List[List[ProfilerEvent]]: ...
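For orientation, this is roughly how the re-typed entry points in the stub above fit together when driven by hand; the `profile` context manager normally does all of this internally. A sketch only, assuming a CUDA build with Kineto compiled in; the signatures are taken from the stub and nothing beyond them is guaranteed:

```python
import torch
from torch._C._autograd import (ProfilerActivity, ProfilerConfig, ProfilerState,
                                _prepare_profiler, _enable_profiler,
                                _disable_profiler, kineto_available)

if kineto_available() and torch.cuda.is_available():
    # ProfilerConfig(state, report_input_shapes, profile_memory, with_stack)
    config = ProfilerConfig(ProfilerState.KINETO, False, False, False)
    activities = {ProfilerActivity.CPU, ProfilerActivity.CUDA}

    _prepare_profiler(config, activities)  # set up CUPTI before tracing starts
    _enable_profiler(config, activities)
    torch.mm(torch.randn(8, 8).cuda(), torch.randn(8, 8).cuda())
    result = _disable_profiler()           # returns a ProfilerResult
    print(len(result.events()), "Kineto events collected")
```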
diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py
index 614ad123eee7..090cc209d77a 100644
--- a/torch/autograd/profiler.py
+++ b/torch/autograd/profiler.py
@@ -6,7 +6,7 @@
 from collections import defaultdict, namedtuple
 from operator import attrgetter

-from typing import List, Tuple, Optional
+from typing import Dict, List, Tuple, Optional

 try:
     # Available in Python >= 3.2
@@ -275,9 +275,9 @@ def key_averages(self, group_by_input_shapes=False, group_by_stack_n=0):
             An EventList containing FunctionEventAvg objects.
         """
         assert self._tree_built
-        stats = defaultdict(FunctionEventAvg)
+        stats: Dict[Tuple[str, ...], FunctionEventAvg] = defaultdict(FunctionEventAvg)

-        def get_key(event, group_by_input_shapes, group_by_stack_n):
+        def get_key(event, group_by_input_shapes, group_by_stack_n) -> Tuple[str, ...]:
             key = [str(event.key), str(event.node_id)]
             if group_by_input_shapes:
                 key.append(str(event.input_shapes))
@@ -1046,7 +1046,7 @@ def parse_kineto_results(result):
     # Create and return FunctionEvent list
     string_table = StringTable()
     function_events = []
-    cuda_corr_map = {}
+    cuda_corr_map: Dict[int, List[torch.autograd.KinetoEvent]] = {}
     for kineto_event in result.events():
         if filter_name(kineto_event.name()):
             continue

diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp
index 244531c6ed8b..ea06f475a629 100644
--- a/torch/csrc/autograd/init.cpp
+++ b/torch/csrc/autograd/init.cpp
@@ -133,10 +133,10 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) {
     // correlation id of a linked event
     .def("linked_correlation_id", &KinetoEvent::linkedCorrelationId);

-  py::class_<ProfilerResult>(m, "ProfilerResult")
-    .def("events", &ProfilerResult::events)
-    .def("legacy_events", &ProfilerResult::legacy_events)
-    .def("save", &ProfilerResult::save);
+  py::class_<ProfilerResult>(m, "ProfilerResult")
+      .def("events", &ProfilerResult::events)
+      .def("legacy_events", &ProfilerResult::legacy_events)
+      .def("save", &ProfilerResult::save);

   m.def("_enable_profiler", enableProfiler);
   m.def("_disable_profiler", disableProfiler);
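The typed `stats` and `get_key` above sit on the public `key_averages()` path: the grouping key is exactly that tuple of event name and node id, plus input shapes and a stack prefix when requested. A quick usage sketch with public API only:

```python
import torch
from torch.autograd import profiler

with profiler.profile(record_shapes=True) as prof:
    torch.mm(torch.randn(16, 16), torch.randn(16, 16))
    torch.mm(torch.randn(4, 4), torch.randn(4, 4))

# With group_by_input_shapes=True the two aten::mm calls stay separate rows,
# because the shape component of their keys differs.
print(prof.key_averages(group_by_input_shapes=True)
          .table(sort_by="cpu_time_total"))
```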
diff --git a/torch/distributed/rpc/server_process_global_profiler.py b/torch/distributed/rpc/server_process_global_profiler.py
index 0f4ba8d53817..bd06003aa506 100644
--- a/torch/distributed/rpc/server_process_global_profiler.py
+++ b/torch/distributed/rpc/server_process_global_profiler.py
@@ -145,7 +145,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
         process_global_function_events = []
         for thread_local_events in process_global_events:
             # Parse from ``Event``s to ``FunctionEvent``s.
-            thread_local_function_events = torch.autograd.profiler.parse_event_records(
+            thread_local_function_events = torch.autograd.profiler.parse_legacy_records(
                 thread_local_events
             )
             thread_local_function_events.sort(

From cafee0f71a0c51f0eb44e8ec85338098937e3047 Mon Sep 17 00:00:00 2001
From: ilia-cher
Date: Tue, 17 Nov 2020 01:44:54 -0800
Subject: [PATCH 57/59] Update on "Use libkineto in profiler"

Summary:
Adding ability to use Kineto (CUPTI) to profile CUDA kernels

Test Plan:
USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install
python test/test_profiler.py
python test/test_autograd.py -k test_profile
python test/test_autograd.py -k test_record

[ghstack-poisoned]
---
 CMakeLists.txt                                  | 2 +-
 benchmarks/profiler_benchmark/profiler_bench.py | 1 -
 torch/autograd/profiler.py                      | 6 +++---
 torch/csrc/autograd/profiler_kineto.cpp         | 2 +-
 torch/csrc/autograd/profiler_legacy.cpp         | 4 +---
 torch/csrc/autograd/profiler_legacy.h           | 2 +-
 6 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4547807f9e36..fe8d21fc0766 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -524,7 +524,7 @@ if(USE_KINETO AND (NOT USE_CUDA))
   set(USE_KINETO OFF)
 endif()

-if (USE_KINETO AND MSVC)
+if(USE_KINETO AND MSVC)
   message(STATUS "Not using libkineto in a Windows build.")
   set(USE_KINETO OFF)
 endif()
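The CMake gating above decides at build time whether libkineto is linked in at all (and keeps it off for Windows builds). The runtime counterpart is `kineto_available()`, so scripts meant to run on both kinds of builds can fall back to the legacy profiler; a sketch:

```python
import torch
from torch.autograd import profiler

use_kineto = torch.autograd.kineto_available()  # False when built without USE_KINETO
with profiler.profile(use_cuda=torch.cuda.is_available(),
                      use_kineto=use_kineto) as prof:
    torch.add(torch.ones(8), torch.ones(8))

print("kineto enabled:", use_kineto)
print(prof.key_averages().table(row_limit=5))
```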
diff --git a/benchmarks/profiler_benchmark/profiler_bench.py b/benchmarks/profiler_benchmark/profiler_bench.py
index 8b6a596c0e3b..0cc8c33a1334 100644
--- a/benchmarks/profiler_benchmark/profiler_bench.py
+++ b/benchmarks/profiler_benchmark/profiler_bench.py
@@ -1,5 +1,4 @@
 import argparse
-import statistics
 import sys
 import timeit
 import torch

diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py
index 090cc209d77a..28b40dc32620 100644
--- a/torch/autograd/profiler.py
+++ b/torch/autograd/profiler.py
@@ -869,7 +869,7 @@ def cuda_time_total(self):
         if self.device_type == 0:  # CPU
             # account for the kernels in the children ops
             return (sum(kinfo.interval.elapsed_us() for kinfo in self.kernels) +
-                sum(ch.cuda_time_total for ch in self.cpu_children))
+                    sum(ch.cuda_time_total for ch in self.cpu_children))
         else:
             assert self.device_type == 1  # CUDA
             return self.time_range.elapsed_us()
@@ -1056,11 +1056,11 @@ def parse_kineto_results(result):
         cpu_memory_usage = 0
         cuda_memory_usage = 0
-        if kineto_event.device_type() == 0: # CPU
+        if kineto_event.device_type() == 0:  # CPU
             # find the corresponding memory allocation events
             for mem_record in mem_records:
                 if (mem_record.start_us() >= kineto_event.start_us() and
-                    mem_record.start_us() <= abs_end_us):
+                        mem_record.start_us() <= abs_end_us):
                     cpu_memory_usage += mem_record.cpu_memory_usage()
                     cuda_memory_usage += mem_record.cuda_memory_usage()
         is_async = kineto_event.start_thread_id() != kineto_event.end_thread_id()

diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp
index bf4a39b5b13d..ff7be1b5cff1 100644
--- a/torch/csrc/autograd/profiler_kineto.cpp
+++ b/torch/csrc/autograd/profiler_kineto.cpp
@@ -7,7 +7,7 @@
 #ifdef USE_KINETO
 #include
-#include "libkineto.h"
+#include <libkineto.h>
 #endif

 namespace torch { namespace autograd { namespace profiler {

diff --git a/torch/csrc/autograd/profiler_legacy.cpp b/torch/csrc/autograd/profiler_legacy.cpp
index a8e37d45ee7e..3e3d458debc4 100644
--- a/torch/csrc/autograd/profiler_legacy.cpp
+++ b/torch/csrc/autograd/profiler_legacy.cpp
@@ -455,8 +455,6 @@ void registerCUDAMethods(CUDAStubs* stubs) {
   cuda_stubs = stubs;
 }

-ProfilerConfig::~ProfilerConfig() = default;
-
 at::IValue ProfilerConfig::toIValue() const {
   c10::impl::GenericList eventIValueList(at::AnyType::get());
   eventIValueList.reserve(NUM_PROFILER_CFG_IVALUE_IDX);
@@ -675,7 +673,7 @@ double LegacyEvent::cudaElapsedUs(const LegacyEvent& e) const {

 CUDAStubs::~CUDAStubs() = default;

-static jit::CodeTemplate event_template(R"(
+static const jit::CodeTemplate event_template(R"(
 {
   "name": "${name}",
   "ph": "X",

diff --git a/torch/csrc/autograd/profiler_legacy.h b/torch/csrc/autograd/profiler_legacy.h
index 86c9f81f7fee..9fb03c0b6ccd 100644
--- a/torch/csrc/autograd/profiler_legacy.h
+++ b/torch/csrc/autograd/profiler_legacy.h
@@ -368,7 +368,7 @@ struct TORCH_API ProfilerConfig {
         report_input_shapes(report_input_shapes),
         profile_memory(profile_memory),
         with_stack(with_stack) {}
-  ~ProfilerConfig();
+  ~ProfilerConfig() = default;
   ProfilerState state;
   bool report_input_shapes;
   bool profile_memory;

From 55028371704b7f9290d96dfafb90bdbbd234bb1b Mon Sep 17 00:00:00 2001
From: ilia-cher
Date: Tue, 17 Nov 2020 10:55:10 -0800
Subject: [PATCH 58/59] Update on "Use libkineto in profiler"

Summary:
Adding ability to use Kineto (CUPTI) to profile CUDA kernels

Test Plan:
USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install
python test/test_profiler.py
python test/test_autograd.py -k test_profile
python test/test_autograd.py -k test_record

[ghstack-poisoned]
---
 torch/autograd/__init__.py              |  4 +--
 torch/autograd/profiler.py              | 44 +++++++++++++------------
 torch/csrc/autograd/init.cpp            | 19 +++++++++--
 torch/csrc/autograd/profiler_kineto.cpp |  4 ---
 4 files changed, 42 insertions(+), 29 deletions(-)

diff --git a/torch/autograd/__init__.py b/torch/autograd/__init__.py
index d97210806a90..a89dc018a885 100644
--- a/torch/autograd/__init__.py
+++ b/torch/autograd/__init__.py
@@ -18,7 +18,7 @@
 from .grad_mode import no_grad, enable_grad, set_grad_enabled
 from .anomaly_mode import detect_anomaly, set_detect_anomaly
 from ..overrides import has_torch_function, handle_torch_function
-from . import profiler
+# from . import profiler
 from . import functional

 __all__ = ['Variable', 'Function', 'backward', 'grad_mode']

@@ -251,7 +251,7 @@ def variable(*args, **kwargs):
     raise RuntimeError("autograd initialization failed")

 # Import all native method/classes
-from torch._C._autograd import (ProfilerActivity, ProfilerState, ProfilerConfig, ProfilerEvent,
+from torch._C._autograd import (DeviceType, ProfilerActivity, ProfilerState, ProfilerConfig, ProfilerEvent,
                                 _enable_profiler_legacy, _disable_profiler_legacy, _profiler_enabled,
                                 _enable_record_function, _set_empty_test_observer, kineto_available)

diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py
index 28b40dc32620..64d2151ec83f 100644
--- a/torch/autograd/profiler.py
+++ b/torch/autograd/profiler.py
@@ -1,6 +1,7 @@
 import itertools
 from typing import Any
 import torch
+from torch.autograd import DeviceType
 from torch.futures import Future

 from collections import defaultdict, namedtuple
@@ -83,7 +84,7 @@ def _populate_cpu_children(self):
         # Some events can be async (i.e. start and end on different threads),
         # since it's generally undefined how to attribute children ranges to
         # async ranges, we do not use them when calculating nested ranges and stats
-        sync_events = [evt for evt in self if not evt.is_async and evt.device_type == 0]
+        sync_events = [evt for evt in self if not evt.is_async and evt.device_type == DeviceType.CPU]
         events = sorted(
             sync_events,
             key=attrgetter("thread"),
@@ -340,7 +341,8 @@ class profile(object):
         use_kineto (bool, default False): experimental support for Kineto profiler

-        use_cpu (default True) - whether to profile CPU events
+        use_cpu (default True) - whether to profile CPU events; setting to False requires
+            use_kineto=True and can be used to lower the overhead for GPU-only profiling

     .. warning:
         Enabling memory profiling or source attribution incurs additional profiler
@@ -787,7 +789,7 @@ class FunctionEvent(FormattedTimesMixin):
     def __init__(
             self, id, name, thread, start_us, end_us, fwd_thread=None, input_shapes=None,
             stack=None, scope=0, cpu_memory_usage=0, cuda_memory_usage=0, is_async=False,
-            is_remote=False, sequence_nr=-1, node_id=-1, device_type=0, device_index=0):
+            is_remote=False, sequence_nr=-1, node_id=-1, device_type=DeviceType.CPU, device_index=0):
         self.id: int = id
         self.node_id: int = node_id
         self.name: str = name
@@ -806,11 +808,11 @@ def __init__(
         self.is_async: bool = is_async
         self.is_remote: bool = is_remote
         self.sequence_nr: int = sequence_nr
-        self.device_type: int = device_type
+        self.device_type: DeviceType = device_type
         self.device_index: int = device_index

     def append_kernel(self, name, device, start, end):
-        assert self.device_type == 0  # CPU
+        assert self.device_type == DeviceType.CPU
         self.kernels.append(Kernel(name, device, Interval(start, end)))

     def append_cpu_child(self, child):
@@ -819,9 +821,9 @@ def append_cpu_child(self, child):
         One is supposed to append only direct children to the event to have
         correct self cpu time being reported.
         """
-        assert(self.device_type == 0)  # CPU
+        assert(self.device_type == DeviceType.CPU)
         assert(isinstance(child, FunctionEvent))
-        assert(child.device_type == 0)
+        assert(child.device_type == DeviceType.CPU)
         self.cpu_children.append(child)

     def set_cpu_parent(self, parent):
@@ -831,16 +833,16 @@ def set_cpu_parent(self, parent):
         the child's range interval is completely inside the parent's. We use
         this connection to determine the event is from top-level op or not.
""" - assert(self.device_type == 0) # CPU + assert(self.device_type == DeviceType.CPU) assert(isinstance(parent, FunctionEvent)) - assert(parent.device_type == 0) + assert(parent.device_type == DeviceType.CPU) self.cpu_parent = parent # Note: async events don't have children, are not used when computing 'self' # metrics of other events, have only total cpu time @property def self_cpu_memory_usage(self): - if self.is_async or self.device_type != 0: # CPU + if self.is_async or self.device_type != DeviceType.CPU: return 0 return self.cpu_memory_usage - sum( [child.cpu_memory_usage for child in self.cpu_children] @@ -848,7 +850,7 @@ def self_cpu_memory_usage(self): @property def self_cuda_memory_usage(self): - if self.is_async or self.device_type != 0: # CPU + if self.is_async or self.device_type != DeviceType.CPU: return 0 return self.cuda_memory_usage - sum( [child.cuda_memory_usage for child in self.cpu_children] @@ -856,7 +858,7 @@ def self_cuda_memory_usage(self): @property def self_cpu_time_total(self): - if self.is_async or self.device_type != 0: + if self.is_async or self.device_type != DeviceType.CPU: return 0 return self.cpu_time_total - sum( [child.cpu_time_total for child in self.cpu_children] @@ -866,28 +868,28 @@ def self_cpu_time_total(self): def cuda_time_total(self): if self.is_async: return 0 - if self.device_type == 0: # CPU + if self.device_type == DeviceType.CPU: # account for the kernels in the children ops return (sum(kinfo.interval.elapsed_us() for kinfo in self.kernels) + sum(ch.cuda_time_total for ch in self.cpu_children)) else: - assert self.device_type == 1 # CUDA + assert self.device_type == DeviceType.CUDA return self.time_range.elapsed_us() @property def self_cuda_time_total(self): if self.is_async: return 0 - if self.device_type == 0: # CPU + if self.device_type == DeviceType.CPU: return self.cuda_time_total - \ sum([child.cuda_time_total for child in self.cpu_children]) else: - assert(self.device_type == 1) # CUDA + assert(self.device_type == DeviceType.CUDA) return self.cuda_time_total @property def cpu_time_total(self): - if self.device_type == 0: # CPU + if self.device_type == DeviceType.CPU: return self.time_range.elapsed_us() else: return 0 @@ -1056,7 +1058,7 @@ def parse_kineto_results(result): cpu_memory_usage = 0 cuda_memory_usage = 0 - if kineto_event.device_type() == 0: # CPU + if kineto_event.device_type() == DeviceType.CPU: # find the corresponding memory allocation events for mem_record in mem_records: if (mem_record.start_us() >= kineto_event.start_us() and @@ -1082,7 +1084,7 @@ def parse_kineto_results(result): device_index=kineto_event.device_index(), ) function_events.append(fe) - if kineto_event.device_type() == 1: # CUDA + if kineto_event.device_type() == DeviceType.CUDA: corr_id = kineto_event.linked_correlation_id() if corr_id > 0 and corr_id not in cuda_corr_map: cuda_corr_map[corr_id] = [] @@ -1090,7 +1092,7 @@ def parse_kineto_results(result): # associate CUDA kernels with CPU events for fe in function_events: - if (fe.device_type == 0 and not fe.is_async and + if (fe.device_type == DeviceType.CPU and not fe.is_async and fe.id in cuda_corr_map): for k_evt in cuda_corr_map[fe.id]: fe.append_kernel( @@ -1206,7 +1208,7 @@ def adjusted_time(cuda_record, cuda_records_map): is_async=is_async, is_remote=is_remote_event, sequence_nr=start.sequence_nr(), - device_type=0, # CPU + device_type=DeviceType.CPU, ) # note: async events have only cpu total time if not is_async and start.has_cuda(): diff --git a/torch/csrc/autograd/init.cpp 
diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp
index ea06f475a629..78336ded0d88 100644
--- a/torch/csrc/autograd/init.cpp
+++ b/torch/csrc/autograd/init.cpp
@@ -1,5 +1,6 @@
 #include
+#include <c10/core/DeviceType.h>
 #include
 #include
 #include
@@ -70,6 +71,20 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) {
     .def("correlation_id", &LegacyEvent::correlationId)
     .def("start_us", &LegacyEvent::cpuUs);

+  py::enum_<c10::DeviceType>(m, "DeviceType")
+      .value("CPU", c10::DeviceType::CPU)
+      .value("CUDA", c10::DeviceType::CUDA)
+      .value("MKLDNN", c10::DeviceType::MKLDNN)
+      .value("OPENGL", c10::DeviceType::OPENGL)
+      .value("OPENCL", c10::DeviceType::OPENCL)
+      .value("IDEEP", c10::DeviceType::IDEEP)
+      .value("HIP", c10::DeviceType::HIP)
+      .value("FPGA", c10::DeviceType::FPGA)
+      .value("MSNPU", c10::DeviceType::MSNPU)
+      .value("XLA", c10::DeviceType::XLA)
+      .value("Vulkan", c10::DeviceType::Vulkan)
+      .value("Metal", c10::DeviceType::Metal);
+
 #ifdef USE_KINETO
   py::class_<KinetoEvent>(m, "KinetoEvent")
     // name of the event
@@ -126,9 +141,9 @@
     .def("device_index", &KinetoEvent::deviceIndex)
     // for CUDA - stream id, for CPU - start thread id
     .def("device_resource_id", &KinetoEvent::deviceResourceId)
-    // device type, currently: CPU or CUDA
+    // device type
     .def("device_type", [](const KinetoEvent& e) {
-      return (uint8_t)e.deviceType();
+      return e.deviceType();
     })
     // correlation id of a linked event
     .def("linked_correlation_id", &KinetoEvent::linkedCorrelationId);
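With the `py::enum_` registration above, Python-side code can compare against real `c10::DeviceType` values instead of magic integers, which is what the profiler.py changes in this patch rely on. A short sketch:

```python
import torch
from torch.autograd import DeviceType, profiler

with profiler.profile(use_kineto=torch.autograd.kineto_available()) as prof:
    torch.mm(torch.randn(8, 8), torch.randn(8, 8))

# device_type is now a DeviceType enum member, not a bare int
cpu_events = [e for e in prof.function_events if e.device_type == DeviceType.CPU]
print(len(cpu_events), "CPU-side events")
```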
id to " << correlation_id_ << std::endl; activity_type_ = (uint8_t)activity.type(); - //if (activity.linkedActivity()) { - // std::cerr << "DEBUG: linkedActivity: " << activity.name() << " " << activity.linkedActivity()->deviceId() << " " << activity.linkedActivity()->resourceId() << std::endl; - //} if (activity.linkedActivity()) { linked_correlation_id_ = activity.linkedActivity()->correlationId(); } From f70a95c581d1bbe0e7d4522f2d6329297bbce9d1 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Fri, 20 Nov 2020 06:07:43 -0800 Subject: [PATCH 59/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py python test/test_autograd.py -k test_profile python test/test_autograd.py -k test_record ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Memcpy HtoD (Pageable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 33.33% 2.000us 1.000us 2 sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 33.33% 2.000us 2.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 1.000us 16.67% 1.000us 1.000us 1 Memcpy DtoH (Device -> Pageable) 0.00% 0.000us 0.00% 0.000us 0.000us 1.000us 16.67% 1.000us 1.000us 1 aten::randn 5.17% 74.000us 6.71% 96.000us 48.000us 0.000us 0.00% 0.000us 0.000us 2 aten::empty 1.33% 19.000us 1.33% 19.000us 4.750us 0.000us 0.00% 0.000us 0.000us 4 aten::normal_ 1.05% 15.000us 1.05% 15.000us 7.500us 0.000us 0.00% 0.000us 0.000us 2 aten::to 77.90% 1.114ms 91.61% 1.310ms 436.667us 0.000us 0.00% 3.000us 1.000us 3 aten::empty_strided 2.52% 36.000us 2.52% 36.000us 12.000us 0.000us 0.00% 0.000us 0.000us 3 aten::copy_ 2.73% 39.000us 11.19% 160.000us 53.333us 0.000us 0.00% 3.000us 1.000us 3 cudaMemcpyAsync 4.34% 62.000us 4.34% 62.000us 20.667us 0.000us 0.00% 0.000us 0.000us 3 cudaStreamSynchronize 1.61% 23.000us 1.61% 23.000us 7.667us 0.000us 0.00% 0.000us 0.000us 3 aten::mm 0.21% 3.000us 7.20% 103.000us 103.000us 0.000us 0.00% 2.000us 2.000us 1 aten::stride 0.21% 3.000us 0.21% 3.000us 1.000us 0.000us 0.00% 0.000us 0.000us 3 cudaLaunchKernel 2.45% 35.000us 2.45% 35.000us 17.500us 0.000us 0.00% 0.000us 0.000us 2 aten::add 0.49% 7.000us 4.27% 61.000us 61.000us 0.000us 0.00% 1.000us 1.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- benchmarks/profiler_benchmark/profiler_bench.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/benchmarks/profiler_benchmark/profiler_bench.py b/benchmarks/profiler_benchmark/profiler_bench.py index 0cc8c33a1334..75cd490fed2e 100644 --- a/benchmarks/profiler_benchmark/profiler_bench.py +++ b/benchmarks/profiler_benchmark/profiler_bench.py @@ -37,7 +37,8 @@ def parallel_task(x): parser.add_argument('--profiling_tensor_size', default=1, type=int) parser.add_argument('--workload', default='loop', 
---
 benchmarks/profiler_benchmark/profiler_bench.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/benchmarks/profiler_benchmark/profiler_bench.py b/benchmarks/profiler_benchmark/profiler_bench.py
index 0cc8c33a1334..75cd490fed2e 100644
--- a/benchmarks/profiler_benchmark/profiler_bench.py
+++ b/benchmarks/profiler_benchmark/profiler_bench.py
@@ -37,7 +37,8 @@
     parser.add_argument('--profiling_tensor_size', default=1, type=int)
     parser.add_argument('--workload', default='loop', type=str)
     parser.add_argument('--internal_iter', default=256, type=int)
-    parser.add_argument('--timer_min_run_time', default=100, type=int)
+    parser.add_argument('--timer_min_run_time', default=10, type=int)
+    parser.add_argument('--cuda_only', action='store_true')

     args = parser.parse_args()
@@ -83,7 +84,8 @@ def payload():
         with torch.autograd.profiler.profile(
             use_cuda=args.with_cuda,
             with_stack=args.with_stack,
-            use_kineto=args.use_kineto) as prof:
+            use_kineto=args.use_kineto,
+            use_cpu=not args.cuda_only) as prof:
             x = workload(input_x)
             return x
     else: